ONE - On-device Neural Engine
Loading...
Searching...
No Matches
optimized_ops Namespace Reference

Data Structures

struct  FloatDepthwiseConvKernel
 

Functions

template<bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
void FloatDepthwiseConvAccumRow (int stride, int input_depth, int input_width, const float *input_data, int pad_width, int depth_multiplier, int filter_width, const float *filter_data, int out_x_buffer_start, int out_x_buffer_end, int output_depth, float *acc_buffer)
 
void FloatDepthwiseConvAccumRowGeneric (int stride, int input_depth, int input_width, const float *input_data, int pad_width, int depth_multiplier, int filter_width, const float *filter_data, int out_x_buffer_start, int out_x_buffer_end, int output_depth, float *acc_buffer)
 
void DepthwiseConvInitAccBuffer (int num_output_pixels, int output_depth, const float *bias_data, float *acc_buffer)
 
template<FusedActivationFunctionType Ac>
void DepthwiseConv (const float *input_data, const Dims< 4 > &input_dims, const float *filter_data, const Dims< 4 > &filter_dims, const float *bias_data, const Dims< 4 > &bias_dims, int stride_width, int stride_height, int pad_width, int pad_height, int depth_multiplier, float *output_data, const Dims< 4 > &output_dims)
 

Function Documentation

◆ DepthwiseConv()

void optimized_ops::DepthwiseConv ( const float *  input_data,
const Dims< 4 > &  input_dims,
const float *  filter_data,
const Dims< 4 > &  filter_dims,
const float *  bias_data,
const Dims< 4 > &  bias_dims,
int  stride_width,
int  stride_height,
int  pad_width,
int  pad_height,
int  depth_multiplier,
float *  output_data,
const Dims< 4 > &  output_dims 
)

Definition at line 165 of file DepthwiseConv2D.float.cpp.

169{
170 static_assert(
171 Ac == FusedActivationFunctionType::kNone || Ac == FusedActivationFunctionType::kRelu ||
172 Ac == FusedActivationFunctionType::kRelu6 || Ac == FusedActivationFunctionType::kRelu1,
173 "");
174 const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
175 const int output_depth = MatchingArraySize(filter_dims, 0, output_dims, 0);
176 const int input_height = ArraySize(input_dims, 2);
177 const int input_width = ArraySize(input_dims, 1);
178 const int input_depth = ArraySize(input_dims, 0);
179 const int filter_height = ArraySize(filter_dims, 2);
180 const int filter_width = ArraySize(filter_dims, 1);
181 const int output_height = ArraySize(output_dims, 2);
182 const int output_width = ArraySize(output_dims, 1);
183#if 0 // TODO-NNRT : Check if assertion is needed, output depth some times not equal to input *
184 // depthmultiplier
185 DCHECK(output_depth == input_depth * depth_multiplier);
186#endif
187
188 static const int kAccBufferMaxSize = 1024;
189 float acc_buffer[kAccBufferMaxSize];
190 DCHECK_GE(kAccBufferMaxSize, output_depth);
191 const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth;
192 const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth;
193 DCHECK_LE(kOutputPixelsInAccBuffer * output_depth, kAccBufferActualSize);
194 DCHECK_LE(kAccBufferActualSize, kAccBufferMaxSize);
195 DCHECK_GE(kOutputPixelsInAccBuffer, 1);
196
197 // row_accum_func will point to the core accumulation function to be used
198 // for this DepthwiseConv op.
199 auto *row_accum_func = FloatDepthwiseConvAccumRowGeneric;
200
201 const int kMaxFixedDepthMultiplier = 16;
202 int fixed_depth_multiplier = 0;
203 if (depth_multiplier <= kMaxFixedDepthMultiplier)
204 {
205 fixed_depth_multiplier = depth_multiplier;
206 }
207 // kMaxUnrolling is the max number of output values that we aim to handle
208 // in one unrolled iteration of the inner loop. For practical performance
209 // reasons, it is limited by the number of available registers. We could
210 // fine-tune it depending on the architecture, but that's not worth doing
211 // since this whole code is not very optimized to begin with. The
212 // present value reflects what's realistic on ARM 32bit NEON with 16 128-bit
213 // vector registers.
214 const int kMaxUnrolling = 8;
215 int fixed_input_depth = 0;
216 if (fixed_depth_multiplier && input_depth * fixed_depth_multiplier <= kMaxUnrolling)
217 {
218 fixed_input_depth = input_depth;
219 }
220
221 // Now that we have determined row_accum_func, we can start work.
222 float *output_ptr = output_data;
223 for (int b = 0; b < batches; ++b)
224 {
225 for (int out_y = 0; out_y < output_height; ++out_y)
226 {
227 const int in_y_origin = (out_y * stride_height) - pad_height;
228 const int filter_y_start = std::max(0, -in_y_origin);
229 const int filter_y_end = std::min(filter_height, input_height - in_y_origin);
230 for (int out_x_buffer_start = 0; out_x_buffer_start < output_width;
231 out_x_buffer_start += kOutputPixelsInAccBuffer)
232 {
233 const int out_x_buffer_end =
234 std::min(output_width, out_x_buffer_start + kOutputPixelsInAccBuffer);
235 // We call a 'pixel' a group of activation that share all but the
236 // 'depth'/'channel' coordinate. num_output_pixels is the number of
237 // output pixels that we will accumulate in this loop iteration.
238 const int num_output_pixels = out_x_buffer_end - out_x_buffer_start;
239 // Initialize our local accumulator with the bias values, so we don't
240 // have to add them later.
241 DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data, acc_buffer);
242 // Accumulation loop. Most of the time should be spent in here.
243 for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y)
244 {
245 const int in_y = in_y_origin + filter_y;
246 row_accum_func(stride_width, input_depth, input_width,
247 input_data + in_y * input_dims.strides[2] + b * input_dims.strides[3],
248 pad_width, depth_multiplier, filter_width,
249 filter_data + filter_y * filter_dims.strides[2], out_x_buffer_start,
250 out_x_buffer_end, output_depth, acc_buffer);
251 }
252 // Finished accumulating. Now store to destination.
253 const int num_output_values = output_depth * num_output_pixels;
254 int i = 0;
255 // Handle leftover values, one by one. This is very slow.
256 for (; i < num_output_values; i++)
257 {
258 float acc = acc_buffer[i];
259 if (Ac == FusedActivationFunctionType::kRelu)
260 {
261 acc = std::max(0.f, acc);
262 }
263 else if (Ac == FusedActivationFunctionType::kRelu6)
264 {
265 acc = std::max(0.f, std::min(6.f, acc));
266 }
267 else if (Ac == FusedActivationFunctionType::kRelu1)
268 {
269 acc = std::max(-1.f, std::min(1.f, acc));
270 }
271 *output_ptr++ = acc;
272 }
273 }
274 }
275 }
276}
int ArraySize(const Dims< N > &array, int index)
Definition Dims.h:76
#define DCHECK_GE(x, y)
Definition Macro.h:33
#define DCHECK_LE(x, y)
Definition Macro.h:41
#define DCHECK(condition)
Definition Macro.h:25
int MatchingArraySize(const ArrayType1 &array1, int index1, const ArrayType2 &array2, int index2)
Definition Array.h:31
void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth, const float *bias_data, float *acc_buffer)
int strides[N]
Definition Dims.h:28

References ArraySize(), DCHECK, DCHECK_GE, DCHECK_LE, DepthwiseConvInitAccBuffer(), FloatDepthwiseConvAccumRowGeneric(), kNone, kRelu, kRelu1, kRelu6, MatchingArraySize(), and Dims< N >::strides.

◆ DepthwiseConvInitAccBuffer()

void optimized_ops::DepthwiseConvInitAccBuffer ( int  num_output_pixels,
int  output_depth,
const float *  bias_data,
float *  acc_buffer 
)
inline

Definition at line 154 of file DepthwiseConv2D.float.cpp.

156{
157 for (int i = 0; i < num_output_pixels; i++)
158 {
159 memcpy(acc_buffer + i * output_depth, bias_data, sizeof(acc_buffer[0]) * output_depth);
160 }
161}

Referenced by DepthwiseConv().

◆ FloatDepthwiseConvAccumRow()

template<bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
void optimized_ops::FloatDepthwiseConvAccumRow ( int  stride,
int  input_depth,
int  input_width,
const float *  input_data,
int  pad_width,
int  depth_multiplier,
int  filter_width,
const float *  filter_data,
int  out_x_buffer_start,
int  out_x_buffer_end,
int  output_depth,
float *  acc_buffer 
)

Definition at line 44 of file DepthwiseConv2D.float.cpp.

48{
49 // Sanity check parameters. This is important in particular to ensure
50 // that we keep the number of template instantiations minimal, so we don't
51 // increase binary size unnecessarily.
52 static_assert(kFixedDepthMultiplier || !kFixedInputDepth, "");
53 static_assert(kFixedInputDepth || kAllowStrided, "");
54 DCHECK(stride == 1 || kAllowStrided);
55 if (kFixedInputDepth)
56 {
57 DCHECK_EQ(input_depth, kFixedInputDepth);
58 }
59 if (kFixedDepthMultiplier)
60 {
61 DCHECK_EQ(depth_multiplier, kFixedDepthMultiplier);
62 }
63 DCHECK_EQ(output_depth, input_depth * depth_multiplier);
64 const int input_ptr_increment = stride * input_depth;
65 const float *filter_base_ptr = filter_data;
66 for (int filter_x = 0; filter_x < filter_width; ++filter_x)
67 {
68 // For the current (filter_x, filter_y) point in the filter,
69 // compute the boundaries of the corresponding output row segment.
70 int out_x_loop_start_unclampled = 0;
71 int out_x_loop_end_unclampled = 0;
72 if (kAllowStrided)
73 {
74 if (stride == 2)
75 {
76 out_x_loop_start_unclampled = (pad_width - filter_x + 1) / 2;
77 out_x_loop_end_unclampled = (pad_width + input_width - filter_x + 1) / 2;
78 }
79 else if (stride == 4)
80 {
81 out_x_loop_start_unclampled = (pad_width - filter_x + 3) / 4;
82 out_x_loop_end_unclampled = (pad_width + input_width - filter_x + 3) / 4;
83 }
84 else
85 {
86 out_x_loop_start_unclampled = (pad_width - filter_x + stride - 1) / stride;
87 out_x_loop_end_unclampled = (pad_width + input_width - filter_x + stride - 1) / stride;
88 }
89 }
90 else
91 {
92 out_x_loop_start_unclampled = pad_width - filter_x;
93 out_x_loop_end_unclampled = pad_width + input_width - filter_x;
94 }
95 // The kernel will have to iterate on the segment of the
96 // output row that starts at out_x_loop_start and out_x_loop_end.
97 const int out_x_loop_start = std::max(out_x_buffer_start, out_x_loop_start_unclampled);
98 const int out_x_loop_end = std::min(out_x_buffer_end, out_x_loop_end_unclampled);
99
100 float *acc_buffer_ptr = acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
101 const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x;
102 const float *input_ptr = input_data + in_x_origin * input_depth;
103 const int num_output_pixels = out_x_loop_end - out_x_loop_start;
104 FloatDepthwiseConvKernel<kAllowStrided, kFixedInputDepth, kFixedDepthMultiplier>::Run(
105 num_output_pixels, input_depth, depth_multiplier, input_ptr, input_ptr_increment,
106 filter_base_ptr, acc_buffer_ptr);
107 filter_base_ptr += output_depth;
108 }
109}
#define DCHECK_EQ(x, y)
Definition Macro.h:29
list input_data
Definition infer.py:29

References DCHECK, and DCHECK_EQ.

◆ FloatDepthwiseConvAccumRowGeneric()

void optimized_ops::FloatDepthwiseConvAccumRowGeneric ( int  stride,
int  input_depth,
int  input_width,
const float *  input_data,
int  pad_width,
int  depth_multiplier,
int  filter_width,
const float *  filter_data,
int  out_x_buffer_start,
int  out_x_buffer_end,
int  output_depth,
float *  acc_buffer 
)
inline

Definition at line 114 of file DepthwiseConv2D.float.cpp.

120{
121 const float *filter_base_ptr = filter_data;
122 for (int filter_x = 0; filter_x < filter_width; ++filter_x)
123 {
124 const int out_x_loop_start =
125 std::max(out_x_buffer_start, (pad_width - filter_x + stride - 1) / stride);
126 const int out_x_loop_end =
127 std::min(out_x_buffer_end, (pad_width + input_width - filter_x + stride - 1) / stride);
128
129 float *acc_buffer_ptr = acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
130 const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x;
131 const float *input_ptr = input_data + in_x_origin * input_depth;
132 const int input_ptr_increment = (stride - 1) * input_depth;
133 for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++)
134 {
135 const float *filter_ptr = filter_base_ptr;
136 for (int ic = 0; ic < input_depth; ++ic)
137 {
138 const float input_val = *input_ptr++;
139 for (int m = 0; m < depth_multiplier; m++)
140 {
141 const float filter_val = *filter_ptr++;
142 *acc_buffer_ptr++ += filter_val * input_val;
143 }
144 }
145 input_ptr += input_ptr_increment;
146 }
147 filter_base_ptr += output_depth;
148 }
149}

References m.

Referenced by DepthwiseConv().