ONE - On-device Neural Engine
Common.h
/*
 * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
 * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef __NNFW_CKER_COMMON_H__
#define __NNFW_CKER_COMMON_H__

#include "cker/neon/neon_check.h"
#include "cker/Utils.h"

#include <cassert>

namespace nnfw
{
namespace cker
{

inline void BiasAndClamp(float clamp_min, float clamp_max, int bias_size, const float *bias_data,
                         int array_size, float *array_data)
{
  // Note: see b/132215220: in May 2019 we thought it would be OK to replace
  // this with the Eigen one-liner:
  //   return (array.colwise() + bias).cwiseMax(clamp_min).cwiseMin(clamp_max);
  // This turned out to severely regress performance: +4ms (i.e. 8%) on
  // MobileNet v2 / 1.0 / 224. So we keep custom NEON code for now.
  assert((array_size % bias_size) == 0);
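  // array_data is viewed as (array_size / bias_size) consecutive groups of
  // bias_size values; the same bias vector is added to each group before
  // clamping to [clamp_min, clamp_max].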
#ifdef USE_NEON
  float *array_ptr = array_data;
  float *array_end_ptr = array_ptr + array_size;
  const auto clamp_min_vec = vdupq_n_f32(clamp_min);
  const auto clamp_max_vec = vdupq_n_f32(clamp_max);
  for (; array_ptr != array_end_ptr; array_ptr += bias_size)
  {
    int i = 0;
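    // Main loop: process 16 floats per iteration using four NEON q-registers.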
    for (; i <= bias_size - 16; i += 16)
    {
      auto b0 = vld1q_f32(bias_data + i);
      auto b1 = vld1q_f32(bias_data + i + 4);
      auto b2 = vld1q_f32(bias_data + i + 8);
      auto b3 = vld1q_f32(bias_data + i + 12);
      auto a0 = vld1q_f32(array_ptr + i);
      auto a1 = vld1q_f32(array_ptr + i + 4);
      auto a2 = vld1q_f32(array_ptr + i + 8);
      auto a3 = vld1q_f32(array_ptr + i + 12);
      auto x0 = vaddq_f32(a0, b0);
      auto x1 = vaddq_f32(a1, b1);
      auto x2 = vaddq_f32(a2, b2);
      auto x3 = vaddq_f32(a3, b3);
      x0 = vmaxq_f32(clamp_min_vec, x0);
      x1 = vmaxq_f32(clamp_min_vec, x1);
      x2 = vmaxq_f32(clamp_min_vec, x2);
      x3 = vmaxq_f32(clamp_min_vec, x3);
      x0 = vminq_f32(clamp_max_vec, x0);
      x1 = vminq_f32(clamp_max_vec, x1);
      x2 = vminq_f32(clamp_max_vec, x2);
      x3 = vminq_f32(clamp_max_vec, x3);
      vst1q_f32(array_ptr + i, x0);
      vst1q_f32(array_ptr + i + 4, x1);
      vst1q_f32(array_ptr + i + 8, x2);
      vst1q_f32(array_ptr + i + 12, x3);
    }
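    // Tail loop: process the remaining floats four at a time.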
    for (; i <= bias_size - 4; i += 4)
    {
      auto b = vld1q_f32(bias_data + i);
      auto a = vld1q_f32(array_ptr + i);
      auto x = vaddq_f32(a, b);
      x = vmaxq_f32(clamp_min_vec, x);
      x = vminq_f32(clamp_max_vec, x);
      vst1q_f32(array_ptr + i, x);
    }
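    // Scalar remainder when bias_size is not a multiple of 4.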
    for (; i < bias_size; i++)
    {
      array_ptr[i] =
        ActivationFunctionWithMinMax(array_ptr[i] + bias_data[i], clamp_min, clamp_max);
    }
  }
#else // not NEON
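  // Portable scalar fallback: same arithmetic as the NEON path, no intrinsics.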
  for (int array_offset = 0; array_offset < array_size; array_offset += bias_size)
  {
    for (int i = 0; i < bias_size; i++)
    {
      array_data[array_offset + i] = ActivationFunctionWithMinMax(
        array_data[array_offset + i] + bias_data[i], clamp_min, clamp_max);
    }
  }
#endif
}

} // namespace cker
} // namespace nnfw

#endif // __NNFW_CKER_COMMON_H__
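
A minimal usage sketch of BiasAndClamp follows; it is not part of the header. The include path "cker/operation/Common.h" and the sample sizes and values are assumptions for illustration, and the portable non-NEON path makes the call valid on any platform. Note that array_size must be a multiple of bias_size, as the assert above enforces.

// Hypothetical standalone example; the header path below is an assumption.
#include <iostream>
#include <vector>

#include "cker/operation/Common.h"

int main()
{
  // Two groups of four channels: array_size (8) is a multiple of bias_size (4).
  std::vector<float> bias = {0.5f, -0.5f, 1.0f, 0.0f};
  std::vector<float> data = {-2.0f, 0.25f, 5.0f, 3.0f, 1.0f, -1.0f, 0.0f, 7.0f};

  // Add the per-channel bias, then clamp to [0, 6] (a ReLU6-style range).
  nnfw::cker::BiasAndClamp(0.0f, 6.0f, static_cast<int>(bias.size()), bias.data(),
                           static_cast<int>(data.size()), data.data());

  for (float v : data)
    std::cout << v << ' '; // prints: 0 0 6 3 1.5 0 1 6
  std::cout << '\n';
  return 0;
}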