ONE - On-device Neural Engine
DepthwiseConvUint8.h
1/*
2 * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
3 * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18#ifndef __NNFW_CKER_OPTIMIZED_DEPTHWISE_CONV_UINT8_H__
19#define __NNFW_CKER_OPTIMIZED_DEPTHWISE_CONV_UINT8_H__
20
21#include "cker/Shape.h"
22#include "cker/Types.h"
23#include "cker/Utils.h"
24#include "cker/neon/neon_check.h"
25
26#include <fixedpoint/fixedpoint.h>
27#include <public/gemmlowp.h>
28
29namespace nnfw
30{
31namespace cker
32{
33namespace optimized
34{
35namespace depthwise_conv
36{
37
38// Implementation of quantized DepthwiseConv
39
40template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
41struct QuantizedDepthwiseConvKernel
42{
43};
44
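// Note: each specialization's Run() walks num_output_pixels output pixels, adds
// input_offset / filter_offset to the raw uint8 values, multiplies them, and
// accumulates into acc_buffer_ptr, which advances by
// input_depth * depth_multiplier int32 accumulators per output pixel.
// A kFixedInputDepth / kFixedDepthMultiplier of 0 means the runtime arguments are
// used instead, and kAllowStrided means input_ptr_increment is applied between pixels.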
45#ifdef USE_NEON
46template <> struct QuantizedDepthwiseConvKernel<true, 8, 2>
47{
48 static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
49 const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
50 const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
51 {
52 (void)input_depth;
53 (void)depth_multiplier;
54 // Load the filters, add filter_offset.
55 uint8x8x2_t filter_u8;
56 filter_u8.val[0] = vld1_u8(filter_ptr);
57 filter_u8.val[1] = vld1_u8(filter_ptr + 8);
58 int16x8_t filter[2];
59 for (int i = 0; i < 2; i++)
60 {
61 filter[i] =
62 vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i])), vdupq_n_s16(filter_offset));
63 }
64 // Handle one output pixel at a time.
65 for (int outp = 0; outp < num_output_pixels; outp++)
66 {
67 // Load the accumulators from acc_buffer
68 int32x4x2_t acc[2];
69 for (int i = 0; i < 2; i++)
70 {
71 acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
72 acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
73 }
74 // Load the inputs, add input_offset.
75 const uint8x8_t input_u8 = vld1_u8(input_ptr);
76 input_ptr += input_ptr_increment;
77 const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
78 const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
79 // Duplicate the input values, 2-fold
80 const int16x8x2_t input_dup2 = vzipq_s16(input, input);
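// Note: vzipq_s16(input, input) interleaves the vector with itself, giving
// val[0] = [i0 i0 i1 i1 i2 i2 i3 i3] and val[1] = [i4 i4 i5 i5 i6 i6 i7 i7],
// so each input channel lines up with its two filter taps (depth multiplier 2).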
81 // Multiply-accumulate
82 for (int i = 0; i < 2; i++)
83 {
84 acc[0].val[i] =
85 vmlal_s16(acc[0].val[i], vget_low_s16(filter[i]), vget_low_s16(input_dup2.val[i]));
86 acc[1].val[i] =
87 vmlal_s16(acc[1].val[i], vget_high_s16(filter[i]), vget_high_s16(input_dup2.val[i]));
88 }
89 // Store the accumulators back to acc_buffer
90 for (int i = 0; i < 2; i++)
91 {
92 vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
93 vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
94 }
95 acc_buffer_ptr += 16;
96 }
97 }
98};
99
100template <> struct QuantizedDepthwiseConvKernel<false, 8, 1>
101{
102 static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
103 const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
104 const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
105 {
106 (void)input_depth;
107 (void)depth_multiplier;
108 (void)input_ptr_increment;
109 // Load the filters, add filter_offset.
110 const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
111 const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
112 const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
113
114 int outp = 0;
115 // Handle 2 output pixels at a time.
116 for (; outp <= num_output_pixels - 2; outp += 2)
117 {
118 // Load the accumulators from acc_buffer.
119 int32x4_t acc[4];
120 for (int i = 0; i < 4; i++)
121 {
122 acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
123 }
124 // Load the inputs, add input_offset.
125 uint8x8_t input_u8[2];
126 for (int i = 0; i < 2; i++)
127 {
128 input_u8[i] = vld1_u8(input_ptr + 8 * i);
129 }
130 input_ptr += 16;
131 int16x8_t input[2];
132 for (int i = 0; i < 2; i++)
133 {
134 input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));
135 }
136 for (int i = 0; i < 2; i++)
137 {
138 input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
139 }
140 // Multiply-accumulate.
141 acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input[0]));
142 acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input[0]));
143 acc[2] = vmlal_s16(acc[2], vget_low_s16(filter), vget_low_s16(input[1]));
144 acc[3] = vmlal_s16(acc[3], vget_high_s16(filter), vget_high_s16(input[1]));
145 // Store the accumulators back to acc_buffer
146 for (int i = 0; i < 4; i++)
147 {
148 vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
149 }
150 acc_buffer_ptr += 16;
151 }
152 // Handle 1 output pixel at a time.
153 for (; outp < num_output_pixels; outp++)
154 {
155 // Load the accumulators from acc_buffer.
156 int32x4_t acc[2];
157 acc[0] = vld1q_s32(acc_buffer_ptr);
158 acc[1] = vld1q_s32(acc_buffer_ptr + 4);
159
160 // Load the inputs, add input_offset.
161 const uint8x8_t input_u8 = vld1_u8(input_ptr);
162 input_ptr += 8;
163 const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
164 const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
165 // Multiply-accumulate.
166 acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input));
167 acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input));
168 // Store the accumulators back to acc_buffer
169 vst1q_s32(acc_buffer_ptr, acc[0]);
170 vst1q_s32(acc_buffer_ptr + 4, acc[1]);
171 acc_buffer_ptr += 8;
172 }
173 }
174};
175
176template <> struct QuantizedDepthwiseConvKernel<false, 4, 2>
177{
178 static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
179 const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
180 const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
181 {
182 (void)input_depth;
183 (void)depth_multiplier;
184 (void)input_ptr_increment;
185 // Load the filters, add filter_offset.
186 const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
187 const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
188 const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
189
190 int outp = 0;
191 // Handle 2 output pixels at a time.
192 for (; outp <= num_output_pixels - 2; outp += 2)
193 {
194 // Load the accumulators from acc_buffer
195 int32x4_t acc[4];
196 for (int i = 0; i < 4; i++)
197 {
198 acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
199 }
200 // Load the inputs, add input_offset.
201 const uint8x8_t input_u8 = vld1_u8(input_ptr);
202 input_ptr += 8;
203 const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
204 const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
205 // Duplicate the input values, 2-fold
206 const int16x8x2_t input_dup2 = vzipq_s16(input, input);
207 // Multiply-accumulate
208 for (int i = 0; i < 2; i++)
209 {
210 acc[2 * i + 0] =
211 vmlal_s16(acc[2 * i + 0], vget_low_s16(filter), vget_low_s16(input_dup2.val[i]));
212 acc[2 * i + 1] =
213 vmlal_s16(acc[2 * i + 1], vget_high_s16(filter), vget_high_s16(input_dup2.val[i]));
214 }
215 // Store the accumulators back to acc_buffer
216 for (int i = 0; i < 4; i++)
217 {
218 vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
219 }
220 acc_buffer_ptr += 16;
221 }
222 // Handle one output pixel at a time.
223 for (; outp < num_output_pixels; outp++)
224 {
225 // Load the accumulators from acc_buffer
226 int32x4_t acc[2];
227 for (int i = 0; i < 2; i++)
228 {
229 acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
230 }
231 // Load the inputs, add input_offset.
232 uint8x8_t input_u8 = vdup_n_u8(0);
233 input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
234 input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
235 input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
236 input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
237 input_ptr += 4;
238 const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
239 const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
240 // Duplicate the input values, 2-fold
241 const int16x4x2_t input_dup2 = vzip_s16(input, input);
242 // Multiply-accumulate
243 acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), input_dup2.val[0]);
244 acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), input_dup2.val[1]);
245 // Store the accumulators back to acc_buffer
246 for (int i = 0; i < 2; i++)
247 {
248 vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
249 }
250 acc_buffer_ptr += 8;
251 }
252 }
253};
254
255template <> struct QuantizedDepthwiseConvKernel<false, 2, 8>
256{
257 static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
258 const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
259 const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
260 {
261 (void)input_depth;
262 (void)depth_multiplier;
263 (void)input_ptr_increment;
264 // Load the filters, add filter_offset.
265 int16x8_t filter[2];
266 for (int i = 0; i < 2; i++)
267 {
268 const uint8x8_t filter_u8 = vld1_u8(filter_ptr + 8 * i);
269 const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
270 filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
271 }
272 int outp = 0;
273 // Handle two output pixels at a time.
274 for (; outp <= num_output_pixels - 2; outp += 2)
275 {
276 // Load the accumulators from acc_buffer.
277 int32x4_t acc[8];
278 for (int i = 0; i < 8; i++)
279 {
280 acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
281 }
282 // Load the inputs, add input_offset.
283 uint8x8_t input_u8 = vdup_n_u8(0);
284 input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
285 input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
286 input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
287 input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
288 input_ptr += 4;
289 const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
290 const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
291 // Multiply-accumulate.
292 acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
293 acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0);
294 acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1);
295 acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1);
296 acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]), input, 2);
297 acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]), input, 2);
298 acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]), input, 3);
299 acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]), input, 3);
300 // Store the accumulators back to acc_buffer.
301 for (int i = 0; i < 8; i++)
302 {
303 vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
304 }
305 acc_buffer_ptr += 32;
306 }
307 // Handle one output pixel at a time.
308 for (; outp < num_output_pixels; outp++)
309 {
310 // Load the accumulators from acc_buffer.
311 int32x4_t acc[4];
312 for (int i = 0; i < 4; i++)
313 {
314 acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
315 }
316 // Load the inputs, add input_offset.
317 uint8x8_t input_u8 = vdup_n_u8(0);
318 input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
319 input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
320 input_ptr += 2;
321 const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
322 const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
323
324 // Multiply-accumulate.
325 acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
326 acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0);
327 acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1);
328 acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1);
329
330 // Store the accumulators back to acc_buffer.
331 for (int i = 0; i < 4; i++)
332 {
333 vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
334 }
335 acc_buffer_ptr += 16;
336 }
337 }
338};
339
340template <> struct QuantizedDepthwiseConvKernel<false, 2, 2>
341{
342 static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
343 const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
344 const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
345 {
346 (void)input_depth;
347 (void)depth_multiplier;
348 (void)input_ptr_increment;
349 // Load the filters, add filter_offset.
350 uint8x8_t filter_u8 = vdup_n_u8(0);
351 filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
352 filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
353 filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
354 filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
355 const int16x4_t filter_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
356 const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
357
358 int outp = 0;
359 // Handle 4 output pixels at a time.
360 for (; outp <= num_output_pixels - 4; outp += 4)
361 {
362 // Load the accumulators from acc_buffer
363 int32x4_t acc[4];
364 for (int i = 0; i < 4; i++)
365 {
366 acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
367 }
368
369 // Load the inputs, add input_offset.
370 const uint8x8_t input_u8 = vld1_u8(input_ptr);
371 input_ptr += 8;
372 const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
373 const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
374 // Duplicate the input values, 2-fold
375 const int16x8x2_t input_dup2 = vzipq_s16(input, input);
376 // Multiply-accumulate
377 acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0]));
378 acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0]));
379 acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1]));
380 acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1]));
381 // Store the accumulators back to acc_buffer
382 for (int i = 0; i < 4; i++)
383 {
384 vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
385 }
386 acc_buffer_ptr += 16;
387 }
388 // Handle one output pixel at a time.
389 for (; outp < num_output_pixels; outp++)
390 {
391 // Load the accumulators from acc_buffer
392 int32x4_t acc = vld1q_s32(acc_buffer_ptr);
393
394 uint8x8_t input_u8 = vdup_n_u8(0);
395 input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
396 input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
397 input_ptr += 2;
398 const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
399 const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
400 // Duplicate the input values, 2-fold
401 const int16x4_t input_dup2 = vzip_s16(input, input).val[0];
402 // Multiply-accumulate
403 acc = vmlal_s16(acc, filter, input_dup2);
404 // Store the accumulators back to acc_buffer
405 vst1q_s32(acc_buffer_ptr, acc);
406 acc_buffer_ptr += 4;
407 }
408 }
409};
410
411template <> struct QuantizedDepthwiseConvKernel<false, 2, 1>
412{
413 static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
414 const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
415 const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
416 {
417 (void)input_depth;
418 (void)depth_multiplier;
419 (void)input_ptr_increment;
420 // Load the filters, add filter_offset.
421 uint8x8_t filter_u8 = vdup_n_u8(0);
422 filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
423 filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
424 filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2);
425 filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3);
426 const int16x4_t filter_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
427 const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
428
429 int outp = 0;
430 // Handle 8 output pixels at a time.
431 for (; outp <= num_output_pixels - 8; outp += 8)
432 {
433 // Load the accumulators from acc_buffer.
434 int32x4_t acc[4];
435 for (int i = 0; i < 4; i++)
436 {
437 acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
438 }
439 // Load the inputs, add input_offset.
440 uint8x8_t input_u8[2];
441 for (int i = 0; i < 2; i++)
442 {
443 input_u8[i] = vld1_u8(input_ptr + 8 * i);
444 }
445 input_ptr += 16;
446 int16x8_t input[2];
447 for (int i = 0; i < 2; i++)
448 {
449 input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));
450 }
451 for (int i = 0; i < 2; i++)
452 {
453 input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
454 }
455
456 // Multiply-accumulate.
457 acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input[0]));
458 acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input[0]));
459 acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input[1]));
460 acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input[1]));
461 // Store the accumulators back to acc_buffer.
462 for (int i = 0; i < 4; i++)
463 {
464 vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
465 }
466 acc_buffer_ptr += 16;
467 }
468 // Handle 4 output pixels at a time.
469 for (; outp <= num_output_pixels - 4; outp += 4)
470 {
471 // Load the accumulators from acc_buffer.
472 int32x4_t acc[2];
473 for (int i = 0; i < 2; i++)
474 {
475 acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
476 }
477 // Load the inputs, add input_offset.
478 const uint8x8_t input_u8 = vld1_u8(input_ptr);
479 input_ptr += 8;
480 const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
481 const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
482
483 // Multiply-accumulate.
484 acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input));
485 acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input));
486 // Store the accumulators back to acc_buffer.
487 for (int i = 0; i < 2; i++)
488 {
489 vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
490 }
491 acc_buffer_ptr += 8;
492 }
493 // Handle 2 output pixels at a time.
494 for (; outp <= num_output_pixels - 2; outp += 2)
495 {
496 // Load the accumulators from acc_buffer.
497 int32x4_t acc = vld1q_s32(acc_buffer_ptr);
498 // Load the inputs, add input_offset.
499 uint8x8_t input_u8 = vdup_n_u8(0);
500 input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
501 input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
502 input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
503 input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
504 input_ptr += 4;
505 const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
506 const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
507
508 // Multiply-accumulate.
509 acc = vmlal_s16(acc, filter, input);
510 // Store the accumulators back to acc_buffer.
511 vst1q_s32(acc_buffer_ptr, acc);
512 acc_buffer_ptr += 4;
513 }
514 // Handle 1 output pixel at a time.
515 for (; outp < num_output_pixels; outp++)
516 {
517 // Load the accumulators from acc_buffer.
518 int32x2_t acc = vld1_s32(acc_buffer_ptr);
519 // Load the inputs, add input_offset.
520 uint8x8_t input_u8 = vdup_n_u8(0);
521 input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
522 input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
523 input_ptr += 2;
524 const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
525 const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
526
527 // Multiply-accumulate.
528 acc = vget_low_s32(vmlal_s16(vcombine_s32(acc, acc), filter, input));
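// Note: vmlal_s16 always produces a full int32x4_t, so the two live accumulators
// are duplicated into a quad register with vcombine_s32, the widening
// multiply-accumulate runs on all four lanes, and only the low pair is stored back.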
529 // Store the accumulators back to acc_buffer.
530 vst1_s32(acc_buffer_ptr, acc);
531 acc_buffer_ptr += 2;
532 }
533 }
534};
535
536template <> struct QuantizedDepthwiseConvKernel<false, 1, 2>
537{
538 static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
539 const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
540 const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
541 {
542 (void)input_depth;
543 (void)depth_multiplier;
544 (void)input_ptr_increment;
545 // Load the filters, add filter_offset.
546 uint8x8_t filter_u8 = vdup_n_u8(0);
547 filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
548 filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
549 filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2);
550 filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3);
551 const int16x4_t filter_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
552 const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
553
554 int outp = 0;
555 // Handle 8 output pixels at a time.
556 for (; outp <= num_output_pixels - 8; outp += 8)
557 {
558 // Load the accumulators from acc_buffer
559 int32x4_t acc[4];
560 for (int i = 0; i < 4; i++)
561 {
562 acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
563 }
564
565 // Load the inputs, add input_offset.
566 const uint8x8_t input_u8 = vld1_u8(input_ptr);
567 input_ptr += 8;
568 const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
569 const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
570 // Duplicate the input values, 2-fold
571 const int16x8x2_t input_dup2 = vzipq_s16(input, input);
572 // Multiply-accumulate
573 acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0]));
574 acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0]));
575 acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1]));
576 acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1]));
577 // Store the accumulators back to acc_buffer
578 for (int i = 0; i < 4; i++)
579 {
580 vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
581 }
582 acc_buffer_ptr += 16;
583 }
584 // Handle one output pixel at a time.
585 for (; outp < num_output_pixels; outp++)
586 {
587 // Load the accumulators from acc_buffer
588 int32x2_t acc = vld1_s32(acc_buffer_ptr);
589
590 // Load the inputs, add input_offset.
591 const uint32_t input = *input_ptr++ + input_offset;
592
593 // Multiply-accumulate
594 acc = vget_low_s32(vmlal_n_s16(vcombine_s32(acc, acc), filter, input));
595 // Store the accumulators back to acc_buffer
596 vst1_s32(acc_buffer_ptr, acc);
597 acc_buffer_ptr += 2;
598 }
599 }
600};
601
602template <> struct QuantizedDepthwiseConvKernel<false, 1, 4>
603{
604 static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
605 const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
606 const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
607 {
608 (void)input_depth;
609 (void)depth_multiplier;
610 (void)input_ptr_increment;
611 // Load the filters, add filter_offset.
612 uint8x8_t filter_u8 = vdup_n_u8(0);
613 filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
614 filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
615 filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
616 filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
617 const int16x4_t filter_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
618 const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
619
620 int outp = 0;
621 // Handle 8 output pixels at a time.
622 for (; outp <= num_output_pixels - 8; outp += 8)
623 {
624 // Load the accumulators from acc_buffer
625 int32x4_t acc[8];
626 for (int i = 0; i < 8; i++)
627 {
628 acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
629 }
630
631 // Load the inputs, add input_offset.
632 uint8x8_t input_u8 = vld1_u8(input_ptr);
633 input_ptr += 8;
634 const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
635 const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
636
637 // Multiply-accumulate
638 acc[0] = vmlal_lane_s16(acc[0], filter, vget_low_s16(input), 0);
639 acc[1] = vmlal_lane_s16(acc[1], filter, vget_low_s16(input), 1);
640 acc[2] = vmlal_lane_s16(acc[2], filter, vget_low_s16(input), 2);
641 acc[3] = vmlal_lane_s16(acc[3], filter, vget_low_s16(input), 3);
642 acc[4] = vmlal_lane_s16(acc[4], filter, vget_high_s16(input), 0);
643 acc[5] = vmlal_lane_s16(acc[5], filter, vget_high_s16(input), 1);
644 acc[6] = vmlal_lane_s16(acc[6], filter, vget_high_s16(input), 2);
645 acc[7] = vmlal_lane_s16(acc[7], filter, vget_high_s16(input), 3);
646
647 // Store the accumulators back to acc_buffer
648 for (int i = 0; i < 8; i++)
649 {
650 vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
651 }
652 acc_buffer_ptr += 32;
653 }
654 // Handle 4 output pixels at a time.
655 for (; outp <= num_output_pixels - 4; outp += 4)
656 {
657 // Load the accumulators from acc_buffer
658 int32x4_t acc[4];
659 for (int i = 0; i < 4; i++)
660 {
661 acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
662 }
663
664 // Load the inputs, add input_offset.
665 uint8x8_t input_u8 = vdup_n_u8(0);
666 input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
667 input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
668 input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
669 input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
670 input_ptr += 4;
671 const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
672 const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
673
674 // Multiply-accumulate
675 acc[0] = vmlal_lane_s16(acc[0], filter, input, 0);
676 acc[1] = vmlal_lane_s16(acc[1], filter, input, 1);
677 acc[2] = vmlal_lane_s16(acc[2], filter, input, 2);
678 acc[3] = vmlal_lane_s16(acc[3], filter, input, 3);
679
680 // Store the accumulators back to acc_buffer
681 for (int i = 0; i < 4; i++)
682 {
683 vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
684 }
685 acc_buffer_ptr += 16;
686 }
687 // Handle one output pixel at a time.
688 for (; outp < num_output_pixels; outp++)
689 {
690 // Load the accumulators from acc_buffer
691 int32x4_t acc = vld1q_s32(acc_buffer_ptr);
692
693 // Load the inputs, add input_offset.
694 const uint32_t input = *input_ptr++ + input_offset;
695
696 // Multiply-accumulate
697 acc = vmlal_n_s16(acc, filter, input);
698 // Store the accumulators back to acc_buffer
699 vst1q_s32(acc_buffer_ptr, acc);
700 acc_buffer_ptr += 4;
701 }
702 }
703};
704
705template <> struct QuantizedDepthwiseConvKernel<false, 4, 1>
706{
707 static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
708 const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
709 const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
710 {
711 (void)input_depth;
712 (void)depth_multiplier;
713 (void)input_ptr_increment;
714 // Load the filters, add filter_offset.
715 uint8x8_t filter_u8 = vdup_n_u8(0);
716 filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
717 filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
718 filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
719 filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
720 const int16x4_t filter_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
721 const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
722
723 int outp = 0;
724 // Handle 4 output pixels at a time.
725 for (; outp <= num_output_pixels - 4; outp += 4)
726 {
727 // Load the accumulators from acc_buffer
728 int32x4_t acc[4];
729 for (int i = 0; i < 4; i++)
730 {
731 acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
732 }
733 // Load the inputs, add input_offset.
734 int16x8_t input[2];
735 for (int i = 0; i < 2; i++)
736 {
737 const uint8x8_t input_u8 = vld1_u8(input_ptr + 8 * i);
738 const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
739 input[i] = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
740 }
741 input_ptr += 16;
742 // Multiply-accumulate
743 for (int i = 0; i < 2; i++)
744 {
745 acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], filter, vget_low_s16(input[i]));
746 acc[2 * i + 1] = vmlal_s16(acc[2 * i + 1], filter, vget_high_s16(input[i]));
747 }
748 // Store the accumulators back to acc_buffer
749 for (int i = 0; i < 4; i++)
750 {
751 vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
752 }
753 acc_buffer_ptr += 16;
754 }
755 // Handle one output pixel at a time.
756 for (; outp < num_output_pixels; outp++)
757 {
758 // Load the accumulators from acc_buffer
759 int32x4_t acc;
760 acc = vld1q_s32(acc_buffer_ptr);
761
762 // Load the inputs, add input_offset.
763 uint8x8_t input_u8 = vdup_n_u8(0);
764 input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
765 input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
766 input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
767 input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
768 input_ptr += 4;
769 const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
770 const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
771 // Multiply-accumulate
772 acc = vmlal_s16(acc, filter, input);
773 // Store the accumulators back to acc_buffer
774 vst1q_s32(acc_buffer_ptr, acc);
775 acc_buffer_ptr += 4;
776 }
777 }
778};
779
780template <> struct QuantizedDepthwiseConvKernel<false, 4, 4>
781{
782 static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
783 const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
784 const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
785 {
786 (void)input_depth;
787 (void)depth_multiplier;
788 (void)input_ptr_increment;
789 // Load the filters, add filter_offset.
790 int16x8_t filter[2];
791 for (int i = 0; i < 2; i++)
792 {
793 const uint8x8_t filter_u8 = vld1_u8(filter_ptr + 8 * i);
794 const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
795 filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
796 }
797
798 int outp = 0;
799 // Handle 2 output pixels at a time.
800 for (; outp <= num_output_pixels - 2; outp += 2)
801 {
802 // Load the accumulators from acc_buffer
803 int32x4_t acc[8];
804 for (int i = 0; i < 8; i++)
805 {
806 acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
807 }
808
809 // Load the inputs, add input_offset.
810 uint8x8_t input_u8 = vld1_u8(input_ptr);
811 input_ptr += 8;
812 const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
813 const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
814
815 // Multiply-accumulate
816 acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), vget_low_s16(input), 0);
817 acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), vget_low_s16(input), 1);
818 acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), vget_low_s16(input), 2);
819 acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), vget_low_s16(input), 3);
820 acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]), vget_high_s16(input), 0);
821 acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]), vget_high_s16(input), 1);
822 acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]), vget_high_s16(input), 2);
823 acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]), vget_high_s16(input), 3);
824 // Store the accumulators back to acc_buffer
825 for (int i = 0; i < 8; i++)
826 {
827 vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
828 }
829 acc_buffer_ptr += 32;
830 }
831 // Handle one output pixel at a time.
832 for (; outp < num_output_pixels; outp++)
833 {
834 // Load the accumulators from acc_buffer
835 int32x4_t acc[4];
836 for (int i = 0; i < 4; i++)
837 {
838 acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
839 }
840
841 // Load the inputs, add input_offset.
842 uint8x8_t input_u8 = vdup_n_u8(0);
843 input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
844 input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
845 input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
846 input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
847 input_ptr += 4;
848 const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
849 const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
850
851 // Multiply-accumulate
852 acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
853 acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 1);
854 acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 2);
855 acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 3);
856 // Store the accumulators back to acc_buffer
857 for (int i = 0; i < 4; i++)
858 {
859 vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
860 }
861 acc_buffer_ptr += 16;
862 }
863 }
864};
865
866template <> struct QuantizedDepthwiseConvKernel<true, 0, 3>
867{
868 static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
869 const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
870 const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
871 {
872 (void)input_depth;
873 (void)depth_multiplier;
874 // We will have to duplicate bytes in a NEON register, 3-fold.
875 // We will do that by register-level table-look-up using VTBL instructions.
876 // Here we prepare the registers containing the table-lookup indices.
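// Note: with these indices, an input vector [i0 i1 i2 i3 i4 i5 i6 i7] expands via
// the three VTBL lookups into [i0 i0 i0 i1 i1 i1 i2 i2], [i2 i3 i3 i3 i4 i4 i4 i5]
// and [i5 i5 i6 i6 i6 i7 i7 i7]: every byte repeated 3-fold (depth multiplier 3).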
877 static const uint8_t dup3_indices_array[3][8] = {
878 {0, 0, 0, 1, 1, 1, 2, 2}, {2, 3, 3, 3, 4, 4, 4, 5}, {5, 5, 6, 6, 6, 7, 7, 7}};
879 uint8x8_t dup3_indices[3];
880 for (int i = 0; i < 3; i++)
881 {
882 dup3_indices[i] = vld1_u8(dup3_indices_array[i]);
883 }
884
885 // Handle one output pixel at a time.
886 for (int outp = 0; outp < num_output_pixels; outp++)
887 {
888 const uint8_t *local_filter_ptr = filter_ptr;
889 const uint8_t *local_input_ptr = input_ptr;
890 int ic = 0;
891 // Handle 8 input channels at a time.
892 for (; ic <= input_depth - 8; ic += 8)
893 {
894 // Load the filters, add filter_offset.
895 int16x8_t filter[3];
896 uint8x8x3_t filter_u8;
897 filter_u8.val[0] = vld1_u8(local_filter_ptr);
898 filter_u8.val[1] = vld1_u8(local_filter_ptr + 8);
899 filter_u8.val[2] = vld1_u8(local_filter_ptr + 16);
900 local_filter_ptr += 24;
901 for (int i = 0; i < 3; i++)
902 {
903 const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i]));
904 filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
905 }
906 // Load the inputs, duplicate 3-fold, add input_offset.
907 const uint8x8_t input_u8 = vld1_u8(local_input_ptr);
908 local_input_ptr += 8;
909
910 uint8x8_t input_u8_dup3[3];
911 for (int i = 0; i < 3; i++)
912 {
913 input_u8_dup3[i] = vtbl1_u8(input_u8, dup3_indices[i]);
914 }
915 int16x8_t input_dup3[3];
916 for (int i = 0; i < 3; i++)
917 {
918 const int16x8_t input_s16_dup3 = vreinterpretq_s16_u16(vmovl_u8(input_u8_dup3[i]));
919 input_dup3[i] = vaddq_s16(input_s16_dup3, vdupq_n_s16(input_offset));
920 }
921 // Load the accumulators from acc_buffer
922 int32x4x3_t acc[2];
923 for (int i = 0; i < 2; i++)
924 {
925 acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
926 acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
927 acc[i].val[2] = vld1q_s32(acc_buffer_ptr + 4 * i + 16);
928 }
929 // Multiply-accumulate
930 for (int j = 0; j < 3; j++)
931 {
932 acc[0].val[j] =
933 vmlal_s16(acc[0].val[j], vget_low_s16(input_dup3[j]), vget_low_s16(filter[j]));
934 acc[1].val[j] =
935 vmlal_s16(acc[1].val[j], vget_high_s16(input_dup3[j]), vget_high_s16(filter[j]));
936 }
937 // Store the accumulators back to acc_buffer
938 for (int i = 0; i < 2; i++)
939 {
940 vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
941 vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
942 vst1q_s32(acc_buffer_ptr + 4 * i + 16, acc[i].val[2]);
943 }
944 acc_buffer_ptr += 24;
945 }
946 // Handle one input channel at a time.
947 for (; ic < input_depth; ic++)
948 {
949 const int16_t input_val = *local_input_ptr++ + input_offset;
950 for (int i = 0; i < 3; i++)
951 {
952 const int16_t filter_val = local_filter_ptr[i] + filter_offset;
953 *acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val;
954 }
955 local_filter_ptr += 3;
956 }
957 input_ptr += input_ptr_increment;
958 }
959 }
960};
961
962template <> struct QuantizedDepthwiseConvKernel<true, 0, 2>
963{
964 static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
965 const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
966 const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
967 {
968 (void)input_depth;
969 (void)depth_multiplier;
970 // Handle one output pixel at a time.
971 for (int outp = 0; outp < num_output_pixels; outp++)
972 {
973 const uint8_t *local_filter_ptr = filter_ptr;
974 const uint8_t *local_input_ptr = input_ptr;
975 int ic = 0;
976 // Handle 8 input channels at a time.
977 for (; ic <= input_depth - 8; ic += 8)
978 {
979 // Load the filters, add filter_offset.
980 int16x8_t filter[2];
981 uint8x8x2_t filter_u8;
982 filter_u8.val[0] = vld1_u8(local_filter_ptr);
983 filter_u8.val[1] = vld1_u8(local_filter_ptr + 8);
984 local_filter_ptr += 16;
985 for (int i = 0; i < 2; i++)
986 {
987 const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i]));
988 filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
989 }
990 // Load the inputs, add input_offset, duplicate 2-fold.
991 const uint8x8_t input_u8 = vld1_u8(local_input_ptr);
992 local_input_ptr += 8;
993 const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
994 const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
995 const int16x8x2_t input_dup2 = vzipq_s16(input, input);
996 // Load the accumulators from acc_buffer.
997 int32x4x2_t acc[2];
998 for (int i = 0; i < 2; i++)
999 {
1000 acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
1001 acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
1002 }
1003 // Multiply-accumulate.
1004 for (int j = 0; j < 2; j++)
1005 {
1006 acc[0].val[j] =
1007 vmlal_s16(acc[0].val[j], vget_low_s16(filter[j]), vget_low_s16(input_dup2.val[j]));
1008 acc[1].val[j] =
1009 vmlal_s16(acc[1].val[j], vget_high_s16(filter[j]), vget_high_s16(input_dup2.val[j]));
1010 }
1011 // Store the accumulators back to acc_buffer.
1012 for (int i = 0; i < 2; i++)
1013 {
1014 vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
1015 vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
1016 }
1017 acc_buffer_ptr += 16;
1018 }
1019 // Handle one input channel at a time.
1020 for (; ic < input_depth; ic++)
1021 {
1022 // Load the inputs.
1023 const int16_t input_val = *local_input_ptr++ + input_offset;
1024 for (int i = 0; i < 2; i++)
1025 {
1026 const int16_t filter_val = local_filter_ptr[i] + filter_offset;
1027 *acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val;
1028 }
1029 local_filter_ptr += 2;
1030 }
1031 input_ptr += input_ptr_increment;
1032 }
1033 }
1034};
1035
1036template <> struct QuantizedDepthwiseConvKernel<true, 0, 1>
1037{
1038 static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
1039 const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
1040 const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
1041 {
1042 (void)input_depth;
1043 (void)depth_multiplier;
1044 // Handle one output pixel at a time.
1045 for (int outp = 0; outp < num_output_pixels; outp++)
1046 {
1047 const uint8_t *local_filter_ptr = filter_ptr;
1048 const uint8_t *local_input_ptr = input_ptr;
1049 int ic = 0;
1050 // Handle 16 input channels at a time.
1051 for (; ic <= input_depth - 16; ic += 16)
1052 {
1053 // Load the filters, add filter_offset.
1054 uint8x8_t filter_u8_0 = vld1_u8(local_filter_ptr + 8 * 0);
1055 uint8x8_t filter_u8_1 = vld1_u8(local_filter_ptr + 8 * 1);
1056 local_filter_ptr += 16;
1057 int16x8_t filter_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
1058 int16x8_t filter_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
1059 filter_0 = vaddq_s16(filter_0, vdupq_n_s16(filter_offset));
1060 filter_1 = vaddq_s16(filter_1, vdupq_n_s16(filter_offset));
1061 // Load the inputs, add input_offset.
1062 uint8x8_t input_u8_0 = vld1_u8(local_input_ptr + 8 * 0);
1063 uint8x8_t input_u8_1 = vld1_u8(local_input_ptr + 8 * 1);
1064 local_input_ptr += 16;
1065 int16x8_t input_0 = vreinterpretq_s16_u16(vmovl_u8(input_u8_0));
1066 int16x8_t input_1 = vreinterpretq_s16_u16(vmovl_u8(input_u8_1));
1067 input_0 = vaddq_s16(input_0, vdupq_n_s16(input_offset));
1068 input_1 = vaddq_s16(input_1, vdupq_n_s16(input_offset));
1069 // Load the accumulators from acc_buffer
1070 int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
1071 int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
1072 int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
1073 int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
1074 acc_0 = vmlal_s16(acc_0, vget_low_s16(input_0), vget_low_s16(filter_0));
1075 acc_1 = vmlal_s16(acc_1, vget_high_s16(input_0), vget_high_s16(filter_0));
1076 acc_2 = vmlal_s16(acc_2, vget_low_s16(input_1), vget_low_s16(filter_1));
1077 acc_3 = vmlal_s16(acc_3, vget_high_s16(input_1), vget_high_s16(filter_1));
1078 // Store the accumulators back to acc_buffer
1079 vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
1080 vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
1081 vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
1082 vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
1083 acc_buffer_ptr += 16;
1084 }
1085 // Handle 8 input channels at a time.
1086 for (; ic <= input_depth - 8; ic += 8)
1087 {
1088 // Load the filters, add filter_offset.
1089 const uint8x8_t filter_u8 = vld1_u8(local_filter_ptr);
1090 local_filter_ptr += 8;
1091 const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
1092 const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
1093 // Load the inputs, add input_offset.
1094 const uint8x8_t input_u8 = vld1_u8(local_input_ptr);
1095 local_input_ptr += 8;
1096 const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
1097 const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
1098 // Load the accumulators from acc_buffer
1099 int32x4_t acc[2];
1100 for (int i = 0; i < 2; i++)
1101 {
1102 acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
1103 }
1104 // Multiply-accumulate
1105 acc[0] = vmlal_s16(acc[0], vget_low_s16(input), vget_low_s16(filter));
1106 acc[1] = vmlal_s16(acc[1], vget_high_s16(input), vget_high_s16(filter));
1107 // Store the accumulators back to acc_buffer
1108 for (int i = 0; i < 2; i++)
1109 {
1110 vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
1111 }
1112 acc_buffer_ptr += 8;
1113 }
1114 // Handle one input channel at a time.
1115 for (; ic < input_depth; ic++)
1116 {
1117 const int16_t input_val = *local_input_ptr++ + input_offset;
1118 const int16_t filter_val = *local_filter_ptr++ + filter_offset;
1119 *acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val;
1120 }
1121 input_ptr += input_ptr_increment;
1122 }
1123 }
1124};
1125
1126template <> struct QuantizedDepthwiseConvKernel<true, 16, 1>
1127{
1128 static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
1129 const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
1130 const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
1131 {
1132 (void)input_depth;
1133 (void)depth_multiplier;
1134 // Load the filters, add filter_offset.
1135 uint8x8_t filter_u8[2];
1136 for (int i = 0; i < 2; i++)
1137 {
1138 filter_u8[i] = vld1_u8(filter_ptr + 8 * i);
1139 }
1140 int16x8_t filter[2];
1141 for (int i = 0; i < 2; i++)
1142 {
1143 filter[i] = vreinterpretq_s16_u16(vmovl_u8(filter_u8[i]));
1144 }
1145 for (int i = 0; i < 2; i++)
1146 {
1147 filter[i] = vaddq_s16(filter[i], vdupq_n_s16(filter_offset));
1148 }
1149 // Handle one output pixel at a time.
1150 for (int outp = 0; outp < num_output_pixels; outp++)
1151 {
1152 // Load the inputs, add input_offset.
1153 uint8x8_t input_u8[2];
1154 for (int i = 0; i < 2; i++)
1155 {
1156 input_u8[i] = vld1_u8(input_ptr + 8 * i);
1157 }
1158 input_ptr += input_ptr_increment;
1159 int16x8_t input[2];
1160 for (int i = 0; i < 2; i++)
1161 {
1162 input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));
1163 }
1164 for (int i = 0; i < 2; i++)
1165 {
1166 input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
1167 }
1168 // Load the accumulators from acc_buffer
1169 int32x4_t acc[4];
1170 for (int i = 0; i < 4; i++)
1171 {
1172 acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
1173 }
1174 // Multiply-accumulate
1175 for (int i = 0; i < 2; i++)
1176 {
1177 acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(input[i]), vget_low_s16(filter[i]));
1178 acc[2 * i + 1] =
1179 vmlal_s16(acc[2 * i + 1], vget_high_s16(input[i]), vget_high_s16(filter[i]));
1180 }
1181 // Store the accumulators back to acc_buffer
1182 for (int i = 0; i < 4; i++)
1183 {
1184 vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
1185 }
1186 acc_buffer_ptr += 16;
1187 }
1188 }
1189};
1190
1191template <> struct QuantizedDepthwiseConvKernel<true, 8, 1>
1192{
1193 static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
1194 const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
1195 const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
1196 {
1197 (void)input_depth;
1198 (void)depth_multiplier;
1199 // Load the filters, add filter_offset.
1200 const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
1201 const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
1202 const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
1203 // Handle one output pixel at a time.
1204 for (int outp = 0; outp < num_output_pixels; outp++)
1205 {
1206 // Load the inputs, add input_offset.
1207 const uint8x8_t input_u8 = vld1_u8(input_ptr);
1208 const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
1209 const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
1210 // Load the accumulators from acc_buffer
1211 int32x4_t acc[2];
1212 for (int i = 0; i < 2; i++)
1213 {
1214 acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
1215 }
1216 // Multiply-accumulate
1217 acc[0] = vmlal_s16(acc[0], vget_low_s16(input), vget_low_s16(filter));
1218 acc[1] = vmlal_s16(acc[1], vget_high_s16(input), vget_high_s16(filter));
1219 // Store the accumulators back to acc_buffer
1220 for (int i = 0; i < 2; i++)
1221 {
1222 vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
1223 }
1224 acc_buffer_ptr += 8;
1225 input_ptr += input_ptr_increment;
1226 }
1227 }
1228};
1229
1230template <> struct QuantizedDepthwiseConvKernel<true, 1, 16>
1231{
1232 static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
1233 const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
1234 const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
1235 {
1236 (void)input_depth;
1237 (void)depth_multiplier;
1238 // Load the filters, add filter_offset.
1239 uint8x8_t filter_u8[2];
1240 for (int i = 0; i < 2; i++)
1241 {
1242 filter_u8[i] = vld1_u8(filter_ptr + 8 * i);
1243 }
1244 int16x8_t filter[2];
1245 for (int i = 0; i < 2; i++)
1246 {
1247 filter[i] = vreinterpretq_s16_u16(vmovl_u8(filter_u8[i]));
1248 }
1249 for (int i = 0; i < 2; i++)
1250 {
1251 filter[i] = vaddq_s16(filter[i], vdupq_n_s16(filter_offset));
1252 }
1253 // Handle one output pixel at a time.
1254 for (int outp = 0; outp < num_output_pixels; outp++)
1255 {
1256 uint8_t input_u8 = *input_ptr;
1257 input_ptr += input_ptr_increment;
1258 int16_t input = static_cast<int16_t>(input_u8) + input_offset;
1259 // Load the accumulators from acc_buffer
1260 int32x4_t acc[4];
1261 for (int i = 0; i < 4; i++)
1262 {
1263 acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
1264 }
1265 // Multiply-accumulate
1266 for (int i = 0; i < 2; i++)
1267 {
1268 acc[2 * i + 0] = vmlal_n_s16(acc[2 * i + 0], vget_low_s16(filter[i]), input);
1269 acc[2 * i + 1] = vmlal_n_s16(acc[2 * i + 1], vget_high_s16(filter[i]), input);
1270 }
1271 // Store the accumulators back to acc_buffer
1272 for (int i = 0; i < 4; i++)
1273 {
1274 vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
1275 }
1276 acc_buffer_ptr += 16;
1277 }
1278 }
1279};
1280
1281template <> struct QuantizedDepthwiseConvKernel<true, 1, 32>
1282{
1283 static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
1284 const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
1285 const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
1286 {
1287 (void)input_depth;
1288 (void)depth_multiplier;
1289 // Load the filters, add filter_offset.
1290 uint8x8_t filter_u8_0 = vld1_u8(filter_ptr + 8 * 0);
1291 uint8x8_t filter_u8_1 = vld1_u8(filter_ptr + 8 * 1);
1292 uint8x8_t filter_u8_2 = vld1_u8(filter_ptr + 8 * 2);
1293 uint8x8_t filter_u8_3 = vld1_u8(filter_ptr + 8 * 3);
1294 int16x8_t filter_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
1295 int16x8_t filter_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
1296 int16x8_t filter_2 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_2));
1297 int16x8_t filter_3 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_3));
1298 filter_0 = vaddq_s16(filter_0, vdupq_n_s16(filter_offset));
1299 filter_1 = vaddq_s16(filter_1, vdupq_n_s16(filter_offset));
1300 filter_2 = vaddq_s16(filter_2, vdupq_n_s16(filter_offset));
1301 filter_3 = vaddq_s16(filter_3, vdupq_n_s16(filter_offset));
1302 // Handle one output pixel at a time.
1303 for (int outp = 0; outp < num_output_pixels; outp++)
1304 {
1305 uint8_t input_u8 = *input_ptr;
1306 input_ptr += input_ptr_increment;
1307 int16_t input = static_cast<int16_t>(input_u8) + input_offset;
1308 // Load the accumulators from acc_buffer
1309 int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
1310 int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
1311 int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
1312 int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
1313 int32x4_t acc_4 = vld1q_s32(acc_buffer_ptr + 4 * 4);
1314 int32x4_t acc_5 = vld1q_s32(acc_buffer_ptr + 4 * 5);
1315 int32x4_t acc_6 = vld1q_s32(acc_buffer_ptr + 4 * 6);
1316 int32x4_t acc_7 = vld1q_s32(acc_buffer_ptr + 4 * 7);
1317 // Multiply-accumulate
1318 acc_0 = vmlal_n_s16(acc_0, vget_low_s16(filter_0), input);
1319 acc_1 = vmlal_n_s16(acc_1, vget_high_s16(filter_0), input);
1320 acc_2 = vmlal_n_s16(acc_2, vget_low_s16(filter_1), input);
1321 acc_3 = vmlal_n_s16(acc_3, vget_high_s16(filter_1), input);
1322 acc_4 = vmlal_n_s16(acc_4, vget_low_s16(filter_2), input);
1323 acc_5 = vmlal_n_s16(acc_5, vget_high_s16(filter_2), input);
1324 acc_6 = vmlal_n_s16(acc_6, vget_low_s16(filter_3), input);
1325 acc_7 = vmlal_n_s16(acc_7, vget_high_s16(filter_3), input);
1326 // Store the accumulators back to acc_buffer
1327 vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
1328 vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
1329 vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
1330 vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
1331 vst1q_s32(acc_buffer_ptr + 4 * 4, acc_4);
1332 vst1q_s32(acc_buffer_ptr + 4 * 5, acc_5);
1333 vst1q_s32(acc_buffer_ptr + 4 * 6, acc_6);
1334 vst1q_s32(acc_buffer_ptr + 4 * 7, acc_7);
1335 acc_buffer_ptr += 32;
1336 }
1337 }
1338};
1339
1340template <> struct QuantizedDepthwiseConvKernel<true, 1, 20>
1341{
1342 static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
1343 const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
1344 const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
1345 {
1346 (void)input_depth;
1347 (void)depth_multiplier;
1348 // Load the filters, add filter_offset.
1349 // NEON wants to load 8 bytes at a time, but 20 is not divisible by 8.
1350 // We load the first 16 bytes into filter_u8_{0,1} as usual.
1351 // Then we load the last 8 bytes into filter_u8_x (x for 'extra').
1352 // This is redundant: the first 4 bytes of filter_u8_x are the same
1353 // as the last 4 bytes of filter_u8_1.
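// Note: filter_u8_0 covers filter bytes [0, 8), filter_u8_1 covers [8, 16) and
// filter_u8_x covers [12, 20); only the high half of filter_x (bytes 16..19)
// contributes new values, via vget_high_s16(filter_x) below.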
1354 uint8x8_t filter_u8_0 = vld1_u8(filter_ptr + 8 * 0);
1355 uint8x8_t filter_u8_1 = vld1_u8(filter_ptr + 8 * 1);
1356 uint8x8_t filter_u8_x = vld1_u8(filter_ptr + 8 * 1 + 4);
1357 int16x8_t filter_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
1358 int16x8_t filter_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
1359 int16x8_t filter_x = vreinterpretq_s16_u16(vmovl_u8(filter_u8_x));
1360 filter_0 = vaddq_s16(filter_0, vdupq_n_s16(filter_offset));
1361 filter_1 = vaddq_s16(filter_1, vdupq_n_s16(filter_offset));
1362 filter_x = vaddq_s16(filter_x, vdupq_n_s16(filter_offset));
1363 // Handle one output pixel at a time.
1364 for (int outp = 0; outp < num_output_pixels; outp++)
1365 {
1366 uint8_t input_u8 = *input_ptr;
1367 input_ptr += input_ptr_increment;
1368 int16_t input = static_cast<int16_t>(input_u8) + input_offset;
1369 // Load the accumulators from acc_buffer
1370 int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
1371 int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
1372 int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
1373 int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
1374 int32x4_t acc_4 = vld1q_s32(acc_buffer_ptr + 4 * 4);
1375 // Multiply-accumulate
1376 acc_0 = vmlal_n_s16(acc_0, vget_low_s16(filter_0), input);
1377 acc_1 = vmlal_n_s16(acc_1, vget_high_s16(filter_0), input);
1378 acc_2 = vmlal_n_s16(acc_2, vget_low_s16(filter_1), input);
1379 acc_3 = vmlal_n_s16(acc_3, vget_high_s16(filter_1), input);
1380 acc_4 = vmlal_n_s16(acc_4, vget_high_s16(filter_x), input);
1381 // Store the accumulators back to acc_buffer
1382 vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
1383 vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
1384 vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
1385 vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
1386 vst1q_s32(acc_buffer_ptr + 4 * 4, acc_4);
1387 acc_buffer_ptr += 20;
1388 }
1389 }
1390};
1391
1392template <> struct QuantizedDepthwiseConvKernel<true, 1, 8>
1393{
1394 static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
1395 const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
1396 const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
1397 {
1398 (void)input_depth;
1399 (void)depth_multiplier;
1400 // Load the filters, add filter_offset.
1401 const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
1402 const int16x8_t filter =
1403 vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(filter_u8)), vdupq_n_s16(filter_offset));
1404 // Handle one output pixel at a time.
1405 for (int outp = 0; outp < num_output_pixels; outp++)
1406 {
1407 uint8_t input_u8 = *input_ptr;
1408 input_ptr += input_ptr_increment;
1409 int16_t input = static_cast<int16_t>(input_u8) + input_offset;
1410 // Load the accumulators from acc_buffer
1411 int32x4_t acc[2];
1412 for (int i = 0; i < 2; i++)
1413 {
1414 acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
1415 }
1416 // Multiply-accumulate
1417 acc[0] = vmlal_n_s16(acc[0], vget_low_s16(filter), input);
1418 acc[1] = vmlal_n_s16(acc[1], vget_high_s16(filter), input);
1419 // Store the accumulators back to acc_buffer
1420 for (int i = 0; i < 2; i++)
1421 {
1422 vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
1423 }
1424 acc_buffer_ptr += 8;
1425 }
1426 }
1427};
1428
1429template <> struct QuantizedDepthwiseConvKernel<true, 2, 1>
1430{
1431 static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
1432 const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
1433 const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
1434 {
1435 (void)input_depth;
1436 (void)depth_multiplier;
1437 // Load the filters, add filter_offset.
1438 uint8x8_t filter_u8 = vdup_n_u8(0);
1439 filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
1440 filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
1441 filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2);
1442 filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3);
1443 const int16x4_t filter_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
1444 const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
1445
1446 int outp = 0;
1447
1448 // Handle 2 output pixels at a time.
1449 for (; outp <= num_output_pixels - 2; outp += 2)
1450 {
1451 // Load the accumulators from acc_buffer.
1452 int32x4_t acc = vld1q_s32(acc_buffer_ptr);
1453 // Load the inputs, add input_offset.
1454 uint16x4_t input_u16 = vdup_n_u16(0);
1455 input_u16 = vset_lane_u16((reinterpret_cast<const uint16_t *>(input_ptr))[0], input_u16, 0);
1456 input_ptr += input_ptr_increment;
1457 input_u16 = vset_lane_u16((reinterpret_cast<const uint16_t *>(input_ptr))[0], input_u16, 1);
1458 input_ptr += input_ptr_increment;
1459 const int16x4_t input_s16 =
1460 vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_u16(input_u16))));
1461 const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
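// Note: each vset_lane_u16 loads one pixel's two uint8 channels as a single 16-bit
// value; reinterpreting back to bytes and widening leaves [p0c0 p0c1 p1c0 p1c1] in
// the low four lanes (this relies on the little-endian byte order used here).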
1462
1463 // Multiply-accumulate.
1464 acc = vmlal_s16(acc, filter, input);
1465 // Store the accumulators back to acc_buffer.
1466 vst1q_s32(acc_buffer_ptr, acc);
1467 acc_buffer_ptr += 4;
1468 }
1469
1470 // Handle 1 output pixel at a time.
1471 for (; outp < num_output_pixels; outp++)
1472 {
1473 // Load the accumulators from acc_buffer.
1474 int32x2_t acc = vld1_s32(acc_buffer_ptr);
1475 // Load the inputs, add input_offset.
1476 uint8x8_t input_u8 = vdup_n_u8(0);
1477 input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
1478 input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
1479 input_ptr += input_ptr_increment;
1480 const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
1481 const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
1482
1483 // Multiply-accumulate.
1484 acc = vget_low_s32(vmlal_s16(vcombine_s32(acc, acc), filter, input));
1485 // Store the accumulators back to acc_buffer.
1486 vst1_s32(acc_buffer_ptr, acc);
1487 acc_buffer_ptr += 2;
1488 }
1489 }
1490};
1491
1492template <> struct QuantizedDepthwiseConvKernel<true, 4, 1>
1493{
1494 static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
1495 const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
1496 const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
1497 {
1498 (void)input_depth;
1499 (void)depth_multiplier;
1500 if (num_output_pixels <= 0)
1501 {
1502 return;
1503 }
1504
1505 // Load the filters, add filter_offset.
1506 uint8x8_t filter_u8 = vdup_n_u8(0);
1507 filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
1508 filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
1509 filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
1510 filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
1511 const int16x4_t filter_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
1512 const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
1513
1514 int outp = 0;
1515
1516    // Handle one output pixel at a time up to the second-to-last pixel: the
1517    // vld1_u8 below loads eight input bytes while only four are used, so the
1518    // last pixel is handled separately to avoid reading past the end of the row.
1519 for (; outp < num_output_pixels - 1; outp++)
1520 {
1521 // Load the accumulators from acc_buffer
1522 int32x4_t acc;
1523 acc = vld1q_s32(acc_buffer_ptr);
1524
1525 // Load the inputs, add input_offset.
1526 uint8x8_t input_u8 = vld1_u8(input_ptr);
1527 input_ptr += input_ptr_increment;
1528 const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
1529 const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
1530 // Multiply-accumulate
1531 acc = vmlal_s16(acc, filter, input);
1532 // Store the accumulators back to acc_buffer
1533 vst1q_s32(acc_buffer_ptr, acc);
1534 acc_buffer_ptr += 4;
1535 }
1536
1537 // Handle the last output pixel.
1538 // Load the accumulators from acc_buffer
1539 int32x4_t acc;
1540 acc = vld1q_s32(acc_buffer_ptr);
1541
1542 // Load the inputs, add input_offset.
1543 uint8x8_t input_u8 = vdup_n_u8(0);
1544 input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
1545 input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
1546 input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
1547 input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
1548 const int16x4_t input_s16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
1549 const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
1550 // Multiply-accumulate
1551 acc = vmlal_s16(acc, filter, input);
1552 // Store the accumulators back to acc_buffer
1553 vst1q_s32(acc_buffer_ptr, acc);
1554 }
1555};
1556
1557template <> struct QuantizedDepthwiseConvKernel<false, 12, 1>
1558{
1559 static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
1560 const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
1561 const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
1562 {
1563 (void)input_depth;
1564 (void)depth_multiplier;
1565 // Load the filters, add filter_offset.
1566 uint8x8_t filter_u8_0 = vld1_u8(filter_ptr);
1567 uint8x8_t filter_u8_1 = vld1_u8(filter_ptr + 4);
1568 int16x8_t filter_s16_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
1569 int16x8_t filter_s16_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
1570 filter_s16_0 = vaddq_s16(filter_s16_0, vdupq_n_s16(filter_offset));
1571 filter_s16_1 = vaddq_s16(filter_s16_1, vdupq_n_s16(filter_offset));
1572 int16x4_t filter_0 = vget_low_s16(filter_s16_0);
1573 int16x4_t filter_1 = vget_high_s16(filter_s16_0);
1574 int16x4_t filter_2 = vget_high_s16(filter_s16_1);
1575
1576 // Handle one output pixel at a time.
1577 for (int outp = 0; outp < num_output_pixels; outp++)
1578 {
1579 // Load the inputs, add input_offset.
1580 uint8x8_t input_u8_0 = vld1_u8(input_ptr);
1581 uint8x8_t input_u8_1 = vld1_u8(input_ptr + 4);
1582 input_ptr += input_ptr_increment;
1583 int16x8_t input_0 = vreinterpretq_s16_u16(vmovl_u8(input_u8_0));
1584 int16x8_t input_1 = vreinterpretq_s16_u16(vmovl_u8(input_u8_1));
1585 input_0 = vaddq_s16(input_0, vdupq_n_s16(input_offset));
1586 input_1 = vaddq_s16(input_1, vdupq_n_s16(input_offset));
1587
1588 // Load the accumulators from acc_buffer
1589 int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
1590 int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
1591 int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
1592
1593 // Multiply-accumulate
1594 acc_0 = vmlal_s16(acc_0, vget_low_s16(input_0), filter_0);
1595 acc_1 = vmlal_s16(acc_1, vget_high_s16(input_0), filter_1);
1596 acc_2 = vmlal_s16(acc_2, vget_high_s16(input_1), filter_2);
1597
1598 // Store the accumulators back to acc_buffer
1599 vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
1600 vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
1601 vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
1602
1603 acc_buffer_ptr += 12;
1604 }
1605 }
1606};
1607#endif
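One non-obvious detail of the <false, 12, 1> kernel above: the second vld1_u8 starts at an offset of 4 bytes, so its high half lands on channels 8-11, while channels 4-7 come from the high half of the first load. A trivial scalar sketch of that lane mapping (the helper name is illustrative only):

#include <cstdint>

// Illustrative only: channel c of a 12-wide row, recovered from two 8-byte
// loads at 'base' and 'base + 4' as in the <false, 12, 1> kernel above.
inline uint8_t ChannelFromOverlappingLoads(const uint8_t *base, int c)
{
  if (c < 8)
    return base[c];          // lanes 0..7 of the load at base (channels 0..7)
  return (base + 4)[c - 4];  // lanes 4..7 of the load at base + 4 (channels 8..11)
}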
1608
1609// Accumulates the effect of one row of the filter on a segment of one row
1610// of the output, reading the corresponding row of the input.
1611template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
1612void QuantizedDepthwiseConvAccumRow(int stride, int dilation_factor, int input_depth,
1613 int input_width, const uint8_t *input_data,
1614 int16_t input_offset, int pad_width, int depth_multiplier,
1615 int filter_width, const uint8_t *filter_data,
1616 int16_t filter_offset, int out_x_buffer_start,
1617 int out_x_buffer_end, int output_depth, int32_t *acc_buffer)
1618{
1619 // Sanity check parameters. This is important in particular to ensure
1620 // that we keep the number of template instantiations minimal, so we don't
1621 // increase binary size unnecessarily.
1622 static_assert(kFixedDepthMultiplier || !kFixedInputDepth, "");
1623 static_assert(kFixedInputDepth || kAllowStrided, "");
1624 assert(stride == 1 || kAllowStrided);
1625 if (kFixedInputDepth)
1626 {
1627 assert(input_depth == kFixedInputDepth);
1628 }
1629 if (kFixedDepthMultiplier)
1630 {
1631 assert(depth_multiplier == kFixedDepthMultiplier);
1632 }
1633 assert(output_depth == input_depth * depth_multiplier);
1634 const int input_ptr_increment = stride * input_depth;
1635 const uint8_t *filter_base_ptr = filter_data;
1636 for (int filter_x = 0; filter_x < filter_width; ++filter_x)
1637 {
1638 // For the current (filter_x, filter_y) point in the filter,
1639 // compute the boundaries of the corresponding output row segment.
1640 int out_x_loop_start_unclampled = 0;
1641 int out_x_loop_end_unclampled = 0;
1642 if (kAllowStrided)
1643 {
1644 if (stride == 2)
1645 {
1646 out_x_loop_start_unclampled = (pad_width - dilation_factor * filter_x + 1) / 2;
1647 out_x_loop_end_unclampled = (pad_width + input_width - dilation_factor * filter_x + 1) / 2;
1648 }
1649 else if (stride == 4)
1650 {
1651 out_x_loop_start_unclampled = (pad_width - dilation_factor * filter_x + 3) / 4;
1652 out_x_loop_end_unclampled = (pad_width + input_width - dilation_factor * filter_x + 3) / 4;
1653 }
1654 else
1655 {
1656 out_x_loop_start_unclampled =
1657 (pad_width - dilation_factor * filter_x + stride - 1) / stride;
1658 out_x_loop_end_unclampled =
1659 (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride;
1660 }
1661 }
1662 else
1663 {
1664 out_x_loop_start_unclampled = pad_width - dilation_factor * filter_x;
1665 out_x_loop_end_unclampled = pad_width + input_width - dilation_factor * filter_x;
1666 }
1667    // The kernel will have to iterate on the segment of the
1668    // output row that starts at out_x_loop_start and ends at out_x_loop_end.
1669 const int out_x_loop_start = std::max(out_x_buffer_start, out_x_loop_start_unclampled);
1670 const int out_x_loop_end = std::min(out_x_buffer_end, out_x_loop_end_unclampled);
1671
1672 int32_t *acc_buffer_ptr = acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
1673 const int in_x_origin = (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x;
1674 const uint8_t *input_ptr = input_data + in_x_origin * input_depth;
1675 const int num_output_pixels = out_x_loop_end - out_x_loop_start;
1676    QuantizedDepthwiseConvKernel<kAllowStrided, kFixedInputDepth, kFixedDepthMultiplier>::Run(
1677      num_output_pixels, input_depth, depth_multiplier, input_ptr, input_offset,
1678 input_ptr_increment, filter_base_ptr, filter_offset, acc_buffer_ptr);
1679 filter_base_ptr += output_depth;
1680 }
1681}
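A small worked check of the unclamped loop-bound arithmetic above, for the stride == 2 case. The numeric values are illustrative; the point is that every out_x inside the unclamped range maps to an in-bounds input column, which is what lets the inner kernels skip per-pixel bounds checks.

#include <cassert>

int main()
{
  // Illustrative values: stride 2, no dilation, pad_width 1, 8 input columns, filter_x 0.
  const int stride = 2, dilation_factor = 1, pad_width = 1, input_width = 8, filter_x = 0;
  const int start = (pad_width - dilation_factor * filter_x + 1) / 2;              // == 1
  const int end = (pad_width + input_width - dilation_factor * filter_x + 1) / 2;  // == 5
  assert(start == 1 && end == 5);
  for (int out_x = start; out_x < end; ++out_x)
  {
    // Same formula as in_x_origin above: every visited column is in bounds.
    const int in_x = out_x * stride - pad_width + dilation_factor * filter_x;  // 1, 3, 5, 7
    assert(in_x >= 0 && in_x < input_width);
  }
  return 0;
}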
1682
1683// Generic fallback of QuantizedDepthwiseConvAccumRow: portable, non-templatized.
1684inline void QuantizedDepthwiseConvAccumRowGeneric(int stride, int dilation_factor, int input_depth,
1685 int input_width, const uint8_t *input_data,
1686 int16_t input_offset, int pad_width,
1687 int depth_multiplier, int filter_width,
1688 const uint8_t *filter_data, int16_t filter_offset,
1689 int out_x_buffer_start, int out_x_buffer_end,
1690 int output_depth, int32_t *acc_buffer)
1691{
1692 const uint8_t *filter_base_ptr = filter_data;
1693 for (int filter_x = 0; filter_x < filter_width; ++filter_x)
1694 {
1695 const int out_x_loop_start =
1696 std::max(out_x_buffer_start, (pad_width - dilation_factor * filter_x + stride - 1) / stride);
1697 const int out_x_loop_end =
1698 std::min(out_x_buffer_end,
1699 (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride);
1700
1701 int32_t *acc_buffer_ptr = acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
1702 const int in_x_origin = (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x;
1703 const uint8_t *input_ptr = input_data + in_x_origin * input_depth;
1704 const int input_ptr_increment = (stride - 1) * input_depth;
1705 for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++)
1706 {
1707 const uint8_t *filter_ptr = filter_base_ptr;
1708 for (int ic = 0; ic < input_depth; ++ic)
1709 {
1710 const int16_t input_val = *input_ptr++ + input_offset;
1711 for (int m = 0; m < depth_multiplier; m++)
1712 {
1713 const int16_t filter_val = *filter_ptr++ + filter_offset;
1714 *acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val;
1715 }
1716 }
1717 input_ptr += input_ptr_increment;
1718 }
1719 filter_base_ptr += output_depth;
1720 }
1721}
1722
1723// Initializes the accumulator buffer with bias values.
1724inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth,
1725 const int32_t *bias_data, int32_t *acc_buffer)
1726{
1727 int i = 0;
1728#ifdef USE_NEON
1729 if (output_depth == 1)
1730 {
1731 const int32x4_t b = vdupq_n_s32(bias_data[0]);
1732 for (; i <= num_output_pixels - 16; i += 16)
1733 {
1734 vst1q_s32(acc_buffer + i + 0, b);
1735 vst1q_s32(acc_buffer + i + 4, b);
1736 vst1q_s32(acc_buffer + i + 8, b);
1737 vst1q_s32(acc_buffer + i + 12, b);
1738 }
1739 for (; i <= num_output_pixels - 4; i += 4)
1740 {
1741 vst1q_s32(acc_buffer + i, b);
1742 }
1743 }
1744 else if (output_depth == 2)
1745 {
1746 int32x4_t b = vdupq_n_s32(bias_data[0]);
1747 b = vsetq_lane_s32(bias_data[1], b, 1);
1748 b = vsetq_lane_s32(bias_data[1], b, 3);
1749 for (; i <= num_output_pixels - 8; i += 8)
1750 {
1751 vst1q_s32(acc_buffer + 2 * i + 0, b);
1752 vst1q_s32(acc_buffer + 2 * i + 4, b);
1753 vst1q_s32(acc_buffer + 2 * i + 8, b);
1754 vst1q_s32(acc_buffer + 2 * i + 12, b);
1755 }
1756 for (; i <= num_output_pixels - 2; i += 2)
1757 {
1758 vst1q_s32(acc_buffer + 2 * i, b);
1759 }
1760 }
1761 else if (output_depth == 4)
1762 {
1763 const int32x4_t b = vld1q_s32(bias_data);
1764 for (; i <= num_output_pixels - 4; i += 4)
1765 {
1766 vst1q_s32(acc_buffer + 4 * i + 0, b);
1767 vst1q_s32(acc_buffer + 4 * i + 4, b);
1768 vst1q_s32(acc_buffer + 4 * i + 8, b);
1769 vst1q_s32(acc_buffer + 4 * i + 12, b);
1770 }
1771 for (; i < num_output_pixels; i++)
1772 {
1773 vst1q_s32(acc_buffer + 4 * i, b);
1774 }
1775 }
1776 else if (output_depth == 8)
1777 {
1778 const int32x4_t b0 = vld1q_s32(bias_data);
1779 const int32x4_t b1 = vld1q_s32(bias_data + 4);
1780 for (; i <= num_output_pixels - 2; i += 2)
1781 {
1782 vst1q_s32(acc_buffer + 8 * i + 0, b0);
1783 vst1q_s32(acc_buffer + 8 * i + 4, b1);
1784 vst1q_s32(acc_buffer + 8 * i + 8, b0);
1785 vst1q_s32(acc_buffer + 8 * i + 12, b1);
1786 }
1787 for (; i < num_output_pixels; i++)
1788 {
1789 vst1q_s32(acc_buffer + 8 * i + 0, b0);
1790 vst1q_s32(acc_buffer + 8 * i + 4, b1);
1791 }
1792 }
1793 else if (output_depth == 16)
1794 {
1795 const int32x4_t b0 = vld1q_s32(bias_data);
1796 const int32x4_t b1 = vld1q_s32(bias_data + 4);
1797 const int32x4_t b2 = vld1q_s32(bias_data + 8);
1798 const int32x4_t b3 = vld1q_s32(bias_data + 12);
1799 for (; i < num_output_pixels; i++)
1800 {
1801 vst1q_s32(acc_buffer + 16 * i + 0, b0);
1802 vst1q_s32(acc_buffer + 16 * i + 4, b1);
1803 vst1q_s32(acc_buffer + 16 * i + 8, b2);
1804 vst1q_s32(acc_buffer + 16 * i + 12, b3);
1805 }
1806 }
1807#endif
1808 for (; i < num_output_pixels; i++)
1809 {
1810 memcpy(acc_buffer + i * output_depth, bias_data, sizeof(acc_buffer[0]) * output_depth);
1811 }
1812}
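A short usage sketch of DepthwiseConvInitAccBuffer. The include path is an assumption; the layout it illustrates (bias values repeated once per output pixel) follows directly from the scalar memcpy fallback above.

#include <cstdint>
#include <vector>

#include "cker/operation/optimized/DepthwiseConvUint8.h" // include path assumed

void ExampleInitAccBuffer()
{
  const int num_output_pixels = 3;
  const int output_depth = 8;
  const std::vector<int32_t> bias(output_depth, 100);
  std::vector<int32_t> acc_buffer(num_output_pixels * output_depth);
  nnfw::cker::optimized::depthwise_conv::DepthwiseConvInitAccBuffer(
    num_output_pixels, output_depth, bias.data(), acc_buffer.data());
  // acc_buffer now holds the bias repeated per pixel:
  // acc_buffer[p * output_depth + c] == bias[c] == 100 for every p, c.
}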
1813
1814inline void DepthwiseConvGeneral(const DepthwiseConvParams &params, const Shape &input_shape,
1815 const uint8_t *input_data, const Shape &filter_shape,
1816 const uint8_t *filter_data, const Shape &bias_shape,
1817 const int32_t *bias_data, const Shape &output_shape,
1818 uint8_t *output_data, int thread_start, int thread_end,
1819 int thread_dim)
1820{
1821 (void)bias_shape;
1822 const int stride_width = params.stride_width;
1823 const int stride_height = params.stride_height;
1824 const int pad_width = params.padding_values.width;
1825 const int pad_height = params.padding_values.height;
1826 const int depth_multiplier = params.depth_multiplier;
1827 const int32_t output_activation_min = params.quantized_activation_min;
1828 const int32_t output_activation_max = params.quantized_activation_max;
1829 const int32_t input_offset = params.input_offset;
1830 const int32_t filter_offset = params.weights_offset;
1831 const int32_t output_offset = params.output_offset;
1832 const int32_t output_multiplier = params.output_multiplier;
1833 const int output_shift = params.output_shift;
1834 const int dilation_width_factor = params.dilation_width_factor;
1835 const int dilation_height_factor = params.dilation_height_factor;
1836 const int batches = MatchingDim(input_shape, 0, output_shape, 0);
1837 const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
1838 const int input_height = input_shape.Dims(1);
1839 const int input_width = input_shape.Dims(2);
1840 const int input_depth = input_shape.Dims(3);
1841 const int filter_height = filter_shape.Dims(1);
1842 const int filter_width = filter_shape.Dims(2);
1843 const int output_height = output_shape.Dims(1);
1844 const int output_width = output_shape.Dims(2);
1845#ifdef USE_NEON
1846 const bool shift_left = (output_shift > 0);
1847 const int32_t multiplier_power_of_two = shift_left ? (1 << output_shift) : 1;
1848#endif
1849
1850 static const int kAccBufferMaxSize = 2048;
1851 int32_t acc_buffer[kAccBufferMaxSize];
1852 assert(kAccBufferMaxSize >= output_depth);
1853 const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth;
1854 [[maybe_unused]] const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth;
1855 assert(kOutputPixelsInAccBuffer * output_depth <= kAccBufferActualSize);
1856 assert(kAccBufferActualSize <= kAccBufferMaxSize);
1857 assert(kOutputPixelsInAccBuffer >= 1);
1858 assert(thread_dim == 0 || thread_dim == 1);
1859
1860 // row_accum_func will point to the core accumulation function to be used
1861 // for this DepthwiseConv op.
1862 using row_accum_func_t = decltype(&QuantizedDepthwiseConvAccumRowGeneric);
1863 row_accum_func_t row_accum_func = nullptr;
1864
1865#define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER) \
1866 if (!row_accum_func && (stride_width == 1 || ALLOW_STRIDED) && \
1867 (input_depth == FIXED_INPUT_DEPTH || FIXED_INPUT_DEPTH == 0) && \
1868 depth_multiplier == FIXED_DEPTH_MULTIPLIER) \
1869 { \
1870 row_accum_func = \
1871 QuantizedDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER>; \
1872 }
1873
1874#ifdef USE_NEON
1875  // We go over our list of kernels in decreasing order of preference
1876  // for the cases where multiple kernels could apply.
1877
1878 // Start with the fastest kernels: AllowStrided=false, fixed input depth.
1879
1890
1891 // Next come the strided kernels: AllowStrided=true, fixed input depth.
1892 // They are a bit less efficient, but allow stride!=1.
1893
1903
1904 // Finally, the kernels allowing a variable input depth,
1905 // these are the least efficient but most general kernels.
1906
1910#endif // USE_NEON
1911
1912 // No matching fast kernel found, use slow fallback.
1913 if (!row_accum_func)
1914 {
1915    row_accum_func = QuantizedDepthwiseConvAccumRowGeneric;
1916  }
1917
1918#undef TFMINI_USE_DEPTHWISECONV_KERNEL
1919
1920 const int input_height_stride = input_shape.Dims(3) * input_shape.Dims(2);
1921 const int input_batch_stride = input_height_stride * input_shape.Dims(1);
1922 const int filter_height_stride = filter_shape.Dims(3) * filter_shape.Dims(2);
1923
1924 // Now that we have determined row_accum_func, we can start work.
1925 int batch_start = 0;
1926 int batch_end = batches;
1927 int row_start = 0;
1928 int row_end = output_height;
1929 int output_ptr_offset = 0;
1930
1931 switch (thread_dim)
1932 {
1933 case 0:
1934      // Multithread along the batch axis
1935 assert(thread_start >= 0);
1936 assert(thread_end <= batches);
1937 batch_start = thread_start;
1938 batch_end = thread_end;
1939 output_ptr_offset = batch_start * FlatSizeSkipDim(output_shape, 0);
1940 break;
1941 case 1:
1942      // Multithread along the row axis
1943 assert(thread_start >= 0);
1944 assert(thread_end <= output_height);
1945 row_start = thread_start;
1946 row_end = thread_end;
1947 output_ptr_offset = row_start * output_width * output_depth;
1948 break;
1949 }
1950
1951 uint8_t *output_ptr = output_data + output_ptr_offset;
1952 int batch_step = (output_height + row_start - row_end) * output_width * output_depth;
1953 for (int b = batch_start; b < batch_end; ++b)
1954 {
1955 for (int out_y = row_start; out_y < row_end; ++out_y)
1956 {
1957 const int in_y_origin = (out_y * stride_height) - pad_height;
1958 const int filter_y_start =
1959 std::max(0, (-in_y_origin + dilation_height_factor - 1) / dilation_height_factor);
1960 const int filter_y_end =
1961 std::min(filter_height, (input_height - in_y_origin + dilation_height_factor - 1) /
1962 dilation_height_factor);
1963 for (int out_x_buffer_start = 0; out_x_buffer_start < output_width;
1964 out_x_buffer_start += kOutputPixelsInAccBuffer)
1965 {
1966 const int out_x_buffer_end =
1967 std::min(output_width, out_x_buffer_start + kOutputPixelsInAccBuffer);
1968        // We call a 'pixel' a group of activations that share all but the
1969        // 'depth'/'channel' coordinate. num_output_pixels is the number of
1970        // output pixels that we will accumulate in this loop iteration.
1971 const int num_output_pixels = out_x_buffer_end - out_x_buffer_start;
1972 // Initialize our local accumulator with the bias values, so we don't
1973 // have to add them later.
1974 DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data, acc_buffer);
1975 // Accumulation loop. Most of the time should be spent in here.
1976 for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y)
1977 {
1978 const int in_y = in_y_origin + dilation_height_factor * filter_y;
1979 row_accum_func(stride_width, dilation_width_factor, input_depth, input_width,
1980 input_data + in_y * input_height_stride + b * input_batch_stride,
1981 input_offset, pad_width, depth_multiplier, filter_width,
1982 filter_data + filter_y * filter_height_stride, filter_offset,
1983 out_x_buffer_start, out_x_buffer_end, output_depth, acc_buffer);
1984 }
1985        // Finished accumulating int32_t values. Now we need to convert them to
1986        // the final 8-bit form and store them.
1987 const int num_output_values = output_depth * num_output_pixels;
1988 int i = 0;
1989#ifdef USE_NEON
1990 using gemmlowp::RoundingDivideByPOT;
1991 const int32x4_t output_offset_vec = vdupq_n_s32(output_offset);
1992 const int32x4_t output_activation_min_vec = vdupq_n_s32(output_activation_min);
1993 const int32x4_t output_activation_max_vec = vdupq_n_s32(output_activation_max);
1994 // Handle 16 values at once.
1995 // This allows us to issue 4 mutually independent int32
1996 // multiplications (vqrdmulh), which should alleviate most of their
1997 // high latency.
1998 for (; i <= num_output_values - 16; i += 16)
1999 {
2000 int32x4_t acc[4];
2001 for (int j = 0; j < 4; j++)
2002 {
2003 acc[j] = vld1q_s32(acc_buffer + i + 4 * j);
2004 }
2005
2006 if (!shift_left)
2007 {
2008 // Fixed-point multiplication.
2009 for (int j = 0; j < 4; j++)
2010 {
2011 acc[j] = vqrdmulhq_n_s32(acc[j], output_multiplier);
2012 }
2013 for (int j = 0; j < 4; j++)
2014 {
2015 acc[j] = RoundingDivideByPOT(acc[j], -output_shift);
2016 }
2017 }
2018 else
2019 {
2020 // Fixed-point multiplication.
2021 for (int j = 0; j < 4; j++)
2022 {
2023 acc[j] = vmulq_n_s32(acc[j], multiplier_power_of_two);
2024 acc[j] = vqrdmulhq_n_s32(acc[j], output_multiplier);
2025 }
2026 }
2027 // Add the output offset.
2028 for (int j = 0; j < 4; j++)
2029 {
2030 acc[j] = vaddq_s32(acc[j], output_offset_vec);
2031 }
2032 // Apply the activation function.
2033 for (int j = 0; j < 4; j++)
2034 {
2035 acc[j] = vmaxq_s32(acc[j], output_activation_min_vec);
2036 }
2037 for (int j = 0; j < 4; j++)
2038 {
2039 acc[j] = vminq_s32(acc[j], output_activation_max_vec);
2040 }
2041 // Saturating cast to uint8_t and store to destination.
2042 int16x4_t acc_s16[4];
2043 for (int j = 0; j < 4; j++)
2044 {
2045 acc_s16[j] = vqmovn_s32(acc[j]);
2046 }
2047 const int16x8_t res_s16_0 = vcombine_s16(acc_s16[0], acc_s16[1]);
2048 const int16x8_t res_s16_1 = vcombine_s16(acc_s16[2], acc_s16[3]);
2049 const uint8x8_t res_u8_0 = vqmovun_s16(res_s16_0);
2050 const uint8x8_t res_u8_1 = vqmovun_s16(res_s16_1);
2051 vst1q_u8(output_ptr, vcombine_u8(res_u8_0, res_u8_1));
2052 output_ptr += 16;
2053 }
2054 // Handle 8 values at once.
2055 // Not as good as 16 (now we're only issuing 2 mutually independent
2056 // vqrdmulh instructions, so we're probably paying for their high
2057 // latency).
2058 for (; i <= num_output_values - 8; i += 8)
2059 {
2060 int32x4_t acc0 = vld1q_s32(acc_buffer + i);
2061 int32x4_t acc1 = vld1q_s32(acc_buffer + i + 4);
2062 if (!shift_left)
2063 {
2064 // Fixed-point multiplication.
2065 acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
2066 acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
2067 // Rounding right shift.
2068 acc0 = RoundingDivideByPOT(acc0, -output_shift);
2069 acc1 = RoundingDivideByPOT(acc1, -output_shift);
2070 }
2071 else
2072 {
2073 // Fixed-point multiplication.
2074 acc0 = vmulq_n_s32(acc0, multiplier_power_of_two);
2075 acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
2076
2077 acc1 = vmulq_n_s32(acc1, multiplier_power_of_two);
2078 acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
2079 }
2080 // Add the output offset.
2081 acc0 = vaddq_s32(acc0, output_offset_vec);
2082 acc1 = vaddq_s32(acc1, output_offset_vec);
2083 // Apply the activation function.
2084 acc0 = vmaxq_s32(acc0, output_activation_min_vec);
2085 acc1 = vmaxq_s32(acc1, output_activation_min_vec);
2086 acc0 = vminq_s32(acc0, output_activation_max_vec);
2087 acc1 = vminq_s32(acc1, output_activation_max_vec);
2088 // Saturating cast to uint8_t and store to destination.
2089 const int16x4_t acc0_s16 = vqmovn_s32(acc0);
2090 const int16x4_t acc1_s16 = vqmovn_s32(acc1);
2091 const int16x8_t res_s16 = vcombine_s16(acc0_s16, acc1_s16);
2092 const uint8x8_t res_u8 = vqmovun_s16(res_s16);
2093 vst1_u8(output_ptr, res_u8);
2094 output_ptr += 8;
2095 }
2096 // Handle 4 values at once. Now we're paying the full price of the
2097 // high latency of vqrdmulh. Also, storing only 4 bytes at the end
2098 // (without any alignment) can only be done 1 byte at a time.
2099 // Yet, that is still worth doing to minimize the amount of leftover
2100 // that will have to go through the very slow scalar code.
2101 for (; i <= num_output_values - 4; i += 4)
2102 {
2103 int32x4_t acc = vld1q_s32(acc_buffer + i);
2104 if (!shift_left)
2105 {
2106 // Fixed-point multiplication.
2107 acc = vqrdmulhq_n_s32(acc, output_multiplier);
2108 // Rounding right shift.
2109 acc = RoundingDivideByPOT(acc, -output_shift);
2110 }
2111 else
2112 {
2113 // Fixed-point multiplication.
2114 acc = vmulq_n_s32(acc, multiplier_power_of_two);
2115 acc = vqrdmulhq_n_s32(acc, output_multiplier);
2116 }
2117 // Add the output offset.
2118 acc = vaddq_s32(acc, output_offset_vec);
2119 // Apply the activation function.
2120 acc = vmaxq_s32(acc, output_activation_min_vec);
2121 acc = vminq_s32(acc, output_activation_max_vec);
2122 // Saturating cast to uint8_t and store to destination.
2123 const int16x4_t acc_s16 = vqmovn_s32(acc);
2124 const int16x8_t res_s16 = vcombine_s16(acc_s16, acc_s16);
2125 const uint8x8_t res_u8 = vqmovun_s16(res_s16);
2126 vst1_lane_u8(output_ptr + 0, res_u8, 0);
2127 vst1_lane_u8(output_ptr + 1, res_u8, 1);
2128 vst1_lane_u8(output_ptr + 2, res_u8, 2);
2129 vst1_lane_u8(output_ptr + 3, res_u8, 3);
2130 output_ptr += 4;
2131 }
2132#endif // USE_NEON
2133
2134 // Handle leftover values, one by one. This is very slow.
2135 for (; i < num_output_values; i++)
2136 {
2137 int32_t acc = acc_buffer[i];
2138 acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
2139 acc += output_offset;
2140 acc = std::max(acc, output_activation_min);
2141 acc = std::min(acc, output_activation_max);
2142 *output_ptr++ = static_cast<uint8_t>(acc);
2143 }
2144 }
2145 }
2146 output_ptr += batch_step;
2147 }
2148}
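The NEON requantization above can be hard to follow because of the two shift branches. Below is a minimal scalar sketch of the same pipeline, assuming gemmlowp's rounding semantics (SaturatingRoundingDoublingHighMul and RoundingDivideByPOT); the leftover-values loop reaches the same result via MultiplyByQuantizedMultiplier. The helper names are illustrative and not part of this header.

#include <cstdint>

// Rounding-doubling high multiply, as in gemmlowp (the INT32_MIN * INT32_MIN
// saturation corner case is omitted for brevity).
inline int32_t RoundingDoublingHighMul(int32_t a, int32_t b)
{
  const int64_t ab = static_cast<int64_t>(a) * b;
  const int32_t nudge = ab >= 0 ? (1 << 30) : (1 - (1 << 30));
  return static_cast<int32_t>((ab + nudge) / (int64_t(1) << 31));
}

// Rounding divide by a power of two, rounding half away from zero, as in gemmlowp.
inline int32_t RoundingDivideByPOTScalar(int32_t x, int exponent)
{
  const int32_t mask = static_cast<int32_t>((int64_t(1) << exponent) - 1);
  const int32_t remainder = x & mask;
  const int32_t threshold = (mask >> 1) + ((x < 0) ? 1 : 0);
  return (x >> exponent) + ((remainder > threshold) ? 1 : 0);
}

inline uint8_t RequantizeScalar(int32_t acc, int32_t output_multiplier, int output_shift,
                                int32_t output_offset, int32_t act_min, int32_t act_max)
{
  if (output_shift > 0) // the 'shift_left' branch: plain multiply by 2^shift, then vqrdmulh.
    acc = RoundingDoublingHighMul(acc * (1 << output_shift), output_multiplier);
  else // the '!shift_left' branch: vqrdmulh, then rounding right shift by -output_shift.
    acc = RoundingDivideByPOTScalar(RoundingDoublingHighMul(acc, output_multiplier),
                                    -output_shift);
  acc += output_offset;
  acc = acc < act_min ? act_min : (acc > act_max ? act_max : acc);
  return static_cast<uint8_t>(acc);
}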
2149
2150} // namespace depthwise_conv
2151
2152// template <DepthwiseConvOutputRounding kOutputRounding>
2153inline void DepthwiseConvWithRounding(const DepthwiseConvParams &params, const Shape &input_shape,
2154 const uint8_t *input_data, const Shape &filter_shape,
2155 const uint8_t *filter_data, const Shape &bias_shape,
2156 const int32_t *bias_data, const Shape &output_shape,
2157 uint8_t *output_data, int thread_start, int thread_end,
2158 int thread_dim)
2159{
2160 [[maybe_unused]] const int depth_multiplier = params.depth_multiplier;
2161 [[maybe_unused]] const int32_t output_activation_min = params.quantized_activation_min;
2162 [[maybe_unused]] const int32_t output_activation_max = params.quantized_activation_max;
2163 [[maybe_unused]] const int dilation_width_factor = params.dilation_width_factor;
2164 [[maybe_unused]] const int dilation_height_factor = params.dilation_height_factor;
2165 assert(dilation_width_factor >= 1);
2166 assert(dilation_height_factor >= 1);
2167 assert(input_shape.DimensionsCount() == 4);
2168 assert(filter_shape.DimensionsCount() == 4);
2169 assert(output_shape.DimensionsCount() == 4);
2170 assert(output_activation_min <= output_activation_max);
2171 [[maybe_unused]] const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
2172 [[maybe_unused]] const int input_depth = input_shape.Dims(3);
2173 assert(output_depth == input_depth * depth_multiplier);
2174 assert(bias_shape.FlatSize() == output_depth);
2175
2176// Enable for arm64, except for Nvidia Linux for Tegra (L4T) running on
2177// Jetson TX-2, whose compiler does not support the offsetof() macro.
2178#if defined(__aarch64__) && !defined(GOOGLE_L4T)
2179// TODO Use the code below
2180// // Dispatch to dot-product 3x3 kernels when supported.
2181//
2182// ruy::Context *ruy_context = cpu_backend_context->ruy_context();
2183// const bool has_dot_product_instructions =
2184// ruy_context != nullptr &&
2185// (ruy_context->GetRuntimeEnabledPaths() & ruy::Path::kNeonDotprod) != ruy::Path::kNone;
2186// if (has_dot_product_instructions)
2187// {
2188// using optimized_ops::depthwise_conv::DotProduct3x3KernelType;
2189// DotProduct3x3KernelType kernel_type =
2190// optimized_ops::depthwise_conv::CategorizeDotProductKernel(
2191// input_shape, filter_shape, params);
2192// if (kernel_type != DotProduct3x3KernelType::kNone)
2193// {
2194// optimized_ops::depthwise_conv::DepthwiseConvDotProduct3x3<
2195// DepthwiseConvImplementation::kUseNeon3x3DotProduct>(params, input_shape, input_data,
2196// filter_shape, filter_data,
2197// bias_shape,
2198// bias_data, output_shape,
2199// output_data);
2200// return;
2201// }
2202// }
2203//
2204// // Dispatch to non-dot-product 3x3 kernels when supported.
2205//
2206// const int stride_width = params.stride_width;
2207// const int stride_height = params.stride_height;
2208// const int pad_width = params.padding_values.width;
2209// const int pad_height = params.padding_values.height;
2210// const int output_shift = params.output_shift;
2211//
2212// // Call kernel optimized for depthwise convolutions using 3x3 filters if
2213// // parameters are supported.
2214// if (depthwise_conv::Fast3x3FilterKernelSupported(input_shape, filter_shape, stride_width,
2215// stride_height, dilation_width_factor,
2216// dilation_height_factor, pad_width, pad_height,
2217// depth_multiplier, output_shape, output_shift))
2218// {
2219// depthwise_conv::DepthwiseConv3x3Filter<kOutputRounding>(
2220// params, input_shape, input_data, filter_shape, filter_data, bias_shape, bias_data,
2221// output_shape, output_data, thread_start, thread_end, thread_dim);
2222// return;
2223// }
2224#endif
2225
2226 depthwise_conv::DepthwiseConvGeneral(params, input_shape, input_data, filter_shape, filter_data,
2227 bias_shape, bias_data, output_shape, output_data,
2228 thread_start, thread_end, thread_dim);
2229}
2230
2231inline void DepthwiseConvImpl(const DepthwiseConvParams &params, const Shape &input_shape,
2232 const uint8_t *input_data, const Shape &filter_shape,
2233 const uint8_t *filter_data, const Shape &bias_shape,
2234 const int32_t *bias_data, const Shape &output_shape,
2235 uint8_t *output_data, int thread_start, int thread_end,
2236 int thread_dim)
2237{
2238 return DepthwiseConvWithRounding(params, input_shape, input_data, filter_shape, filter_data,
2239 bias_shape, bias_data, output_shape, output_data, thread_start,
2240 thread_end, thread_dim);
2241}
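A hedged, single-threaded usage sketch of DepthwiseConvImpl. The Shape(int dimensions_count, const int32_t *dims_data) constructor and the include path are assumptions; the DepthwiseConvParams fields set here are exactly the ones read by DepthwiseConvGeneral above, and the numeric quantization values are placeholders for illustration.

#include <cstdint>
#include <vector>

#include "cker/operation/optimized/DepthwiseConvUint8.h" // include path assumed

void RunDepthwiseConvUint8Example()
{
  using namespace nnfw::cker;

  // NHWC shapes; Shape(int dimensions_count, const int32_t *dims_data) is assumed
  // to exist, mirroring TFLite's RuntimeShape.
  const int32_t in_dims[4] = {1, 5, 5, 8};
  const int32_t filter_dims[4] = {1, 3, 3, 8}; // 1 x filter_h x filter_w x output_depth
  const int32_t bias_dims[1] = {8};
  const int32_t out_dims[4] = {1, 5, 5, 8};
  const Shape input_shape(4, in_dims), filter_shape(4, filter_dims);
  const Shape bias_shape(1, bias_dims), output_shape(4, out_dims);

  std::vector<uint8_t> input(input_shape.FlatSize(), 128);
  std::vector<uint8_t> filter(filter_shape.FlatSize(), 128);
  std::vector<int32_t> bias(8, 0);
  std::vector<uint8_t> output(output_shape.FlatSize());

  DepthwiseConvParams params{};
  params.stride_width = 1;
  params.stride_height = 1;
  params.dilation_width_factor = 1;
  params.dilation_height_factor = 1;
  params.padding_values.width = 1;
  params.padding_values.height = 1;
  params.depth_multiplier = 1;
  params.input_offset = -128;            // zero points (placeholder values)
  params.weights_offset = -128;
  params.output_offset = 128;
  params.output_multiplier = 1254985768; // placeholder quantized multiplier
  params.output_shift = -7;              // multiply, then rounding right shift by 7
  params.quantized_activation_min = 0;
  params.quantized_activation_max = 255;

  // Single-threaded call: thread_dim 0 spans the whole batch of size 1.
  optimized::DepthwiseConvImpl(params, input_shape, input.data(), filter_shape, filter.data(),
                               bias_shape, bias.data(), output_shape, output.data(),
                               /*thread_start=*/0, /*thread_end=*/1, /*thread_dim=*/0);
}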
2242
2243} // namespace optimized
2244} // namespace cker
2245} // namespace nnfw
2246
2247#endif // __NNFW_CKER_OPTIMIZED_DEPTHWISE_CONV_UINT8_H__