ONE - On-device Neural Engine
Loading...
Searching...
No Matches
DepthwiseConv2D.cpp
Go to the documentation of this file.
1/*
2 * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
18
19#include "kernels/Utils.h"
20
21#include "PALDepthwiseConv2d.h"
22
23#include <stdexcept>
24
25namespace luci_interpreter
26{
27namespace kernels
28{
29
30DepthwiseConv2D::DepthwiseConv2D(const Tensor *input, const Tensor *filter, const Tensor *bias,
31 Tensor *output, Tensor *scratchpad,
32 const DepthwiseConv2DParams &params)
33 : KernelWithParams<DepthwiseConv2DParams>({input, filter, bias}, {output, scratchpad}, params)
34{
35}
36
// DepthwiseConv2D::configure()
// Shape/type-inference step: validates the supported dtype combinations,
// checks tensor ranks, computes padding and the output shape, resizes the
// output tensor, and asks the PAL to size the scratchpad tensor.
// NOTE(review): this is a doxygen extraction; the signature line (original
// line 37, presumably "void DepthwiseConv2D::configure()") and a few
// continuation lines are missing below — verify against the repository
// source before editing.
38{
39 // TensorFlow Lite (as of v2.2.0) supports the following combinations of types:
40 // | input filter bias output |
41 // ----+---------------------------+
42 // (1) | float float float float |
43 // (2) | float int8 float float | hybrid
44 // (3) | uint8 uint8 int32 uint8 | quantized
45 // (4) | int8 int8 int32 int8 | quantized per channel
46 // (5) | int16 int8 int64 int16 | quantized per channel 16x8
47 //
48 // We only support (1), (3) and (4) for now, and additionally the following:
49 // | input filter bias output |
50 // ----+---------------------------+
51 // (5) | int16 int16 int64 int16 |
52 //
// Each dtype branch checks only the (optional) bias type here; the output
// dtype is checked once, after the chain.
53 if (input()->element_type() == DataType::FLOAT32 && filter()->element_type() == DataType::FLOAT32)
54 {
55 LUCI_INTERPRETER_CHECK(bias() == nullptr || bias()->element_type() == DataType::FLOAT32);
56 }
57 else if (input()->element_type() == DataType::U8 && filter()->element_type() == DataType::U8)
58 {
59 LUCI_INTERPRETER_CHECK(bias() == nullptr || bias()->element_type() == DataType::S32);
60 }
61 else if (input()->element_type() == DataType::S8 && filter()->element_type() == DataType::S8)
62 {
// Per-channel quantized S8: one scale per output channel (filter dim 3)
// and all filter zero points must be zero (symmetric weights).
63 LUCI_INTERPRETER_CHECK(filter()->shape().num_dims() == 4);
64 LUCI_INTERPRETER_CHECK(static_cast<uint32_t>(filter()->shape().dim(3)) ==
65 filter()->scales().size());
66 for (auto zerop : filter()->zero_points())
67 {
68 LUCI_INTERPRETER_CHECK(zerop == 0);
69 }
70 LUCI_INTERPRETER_CHECK(bias() == nullptr || bias()->element_type() == DataType::S32);
71 }
72 else if (input()->element_type() == DataType::S16 && filter()->element_type() == DataType::S16)
73 {
74 LUCI_INTERPRETER_CHECK(bias() == nullptr || bias()->element_type() == DataType::S64);
75 }
76 else
77 {
78 throw std::runtime_error("luci-intp DepthwiseConv2D(1) Unsupported type.");
79 }
80 LUCI_INTERPRETER_CHECK(output()->element_type() == input()->element_type());
81
// Both input and filter must be rank-4.
82 const Shape &input_shape = input()->shape();
83 const Shape &filter_shape = filter()->shape();
84 LUCI_INTERPRETER_CHECK(input_shape.num_dims() == 4 && filter_shape.num_dims() == 4);
85
86 const int32_t batches = input_shape.dim(0);
87 const int32_t input_height = input_shape.dim(1);
88 const int32_t input_width = input_shape.dim(2);
89 // Filter format: [1, H, W, O].
90 LUCI_INTERPRETER_CHECK(filter_shape.dim(0) == 1);
91 const int32_t filter_height = filter_shape.dim(1);
92 const int32_t filter_width = filter_shape.dim(2);
93 const int32_t channels_out = filter_shape.dim(3);
94
// Bias, when present, is 1-D with one element per output channel.
95 LUCI_INTERPRETER_CHECK(bias() == nullptr || (bias()->shape().num_dims() == 1 &&
96 bias()->shape().dim(0) == channels_out));
97
// Output spatial size from padding mode, stride and (presumably) dilation.
// NOTE(review): the trailing-argument lines of both computeOutputSize calls
// (original lines 100 and 103) are missing from this extraction.
98 const int32_t output_height =
99 computeOutputSize(_params.padding, input_height, filter_height, _params.stride_height,
101 const int32_t output_width =
102 computeOutputSize(_params.padding, input_width, filter_width, _params.stride_width,
104
// NOTE(review): the first line of the _padding_height assignment (original
// line 105, presumably "_padding_height = computePadding(...") is missing.
106 input_height, filter_height, output_height);
107 _padding_width = computePadding(_params.stride_width, _params.dilation_width_factor, input_width,
108 filter_width, output_width);
109
110 output()->resize({batches, output_height, output_width, channels_out});
111
// Fill a tflite::DepthwiseParams and let the PAL size the scratchpad.
// NOTE(review): original lines 114-115 and the trailing SetupScratchpadTensor
// arguments (original lines 119-120) are missing from this extraction.
112 tflite::DepthwiseParams params{};
113
116
117 auto scratchpad = getOutputTensors()[1];
118 luci_interpreter_pal::SetupScratchpadTensor(scratchpad, params, input()->element_type(),
121}
122
// DepthwiseConv2D::execute()
// Dispatches on the input element type to the matching eval* implementation.
// NOTE(review): the signature line (original line 123, presumably
// "void DepthwiseConv2D::execute() const") is missing from this extraction.
124{
125 switch (input()->element_type())
126 {
127 case DataType::FLOAT32:
128 if (filter()->element_type() == DataType::FLOAT32)
129 {
130 evalFloat();
131 break;
132 }
// Hybrid (float input with int8 filter) is not implemented.
133 throw std::runtime_error("luci-intp DepthwiseConv2D(2) Unsupported type.");
134 case DataType::U8:
// Per-tensor vs per-channel quantization is chosen by the number of
// filter scales.
// NOTE(review): an empty scales() vector falls through both branches and
// silently produces no output — confirm this is unreachable after
// configure().
135 if (filter()->scales().size() == 1)
136 {
137 evalQuantized();
138 }
139 else if (filter()->scales().size() > 1)
140 {
141 LUCI_INTERPRETER_CHECK(filter()->shape().num_dims() == 4);
142 LUCI_INTERPRETER_CHECK(filter()->scales().size() ==
143 static_cast<size_t>(filter()->shape().dim(3)));
144 evalQuantizedPerChannel();
145 }
146 break;
147 case DataType::S8:
148 evalQuantizedS8PerChannel();
149 break;
150 case DataType::S16:
151 evalQuantizedS16();
152 break;
153 default:
154 throw std::runtime_error("luci-intp DepthwiseConv2D(3) Unsupported type.");
155 }
156}
157
// Float path: fills tflite::DepthwiseParams with the padding computed in
// configure() and the fused-activation range, then delegates to the TFLite
// reference DepthwiseConv kernel.
158void DepthwiseConv2D::evalFloat() const
159{
160 float activation_min{};
161 float activation_max{};
162 calculateActivationRange(_params.activation, &activation_min, &activation_max);
163
164 tflite::DepthwiseParams params{};
165 params.padding_values.height = _padding_height;
166 params.padding_values.width = _padding_width;
// NOTE(review): original lines 167-171 (presumably stride, dilation and
// depth-multiplier assignments into params) are missing from this extraction.
172 params.float_activation_min = activation_min;
173 params.float_activation_max = activation_max;
174
// getTensorData returns nullptr for a null bias tensor; the reference kernel
// accepts that.
175 tflite::reference_ops::DepthwiseConv(
176 params, getTensorShape(input()), getTensorData<float>(input()), getTensorShape(filter()),
177 getTensorData<float>(filter()), getTensorShape(bias()), getTensorData<float>(bias()),
178 getTensorShape(output()), getTensorData<float>(output()));
179}
180
// Per-channel quantized U8 path, computed inline (not delegated to TFLite):
// a naive 5-level loop over batch/out_y/out_x/in_channel/depth_multiplier,
// accumulating in int32 and requantizing with a per-output-channel
// multiplier/shift.
181void DepthwiseConv2D::evalQuantizedPerChannel() const
182{
183 const auto *input_data = getTensorData<uint8_t>(input());
184 const auto *filter_data = getTensorData<uint8_t>(filter());
185 const auto *bias_data = getTensorData<int32_t>(bias());
186 auto *output_data = getTensorData<uint8_t>(output());
187
188 const Shape &input_shape = input()->shape();
189 const Shape &filter_shape = filter()->shape();
190 const Shape &output_shape = output()->shape();
191
192 const int32_t batches = input_shape.dim(0);
193 const int32_t input_height = input_shape.dim(1);
194 const int32_t input_width = input_shape.dim(2);
195 const int32_t input_depth = input_shape.dim(3);
196 const int32_t filter_height = filter_shape.dim(1);
197 const int32_t filter_width = filter_shape.dim(2);
198 const int32_t output_height = output_shape.dim(1);
199 const int32_t output_width = output_shape.dim(2);
200
201 const int32_t stride_height = _params.stride_height;
202 const int32_t stride_width = _params.stride_width;
203 const int32_t dilation_height_factor = _params.dilation_height_factor;
204 const int32_t dilation_width_factor = _params.dilation_width_factor;
205 const int32_t depth_multiplier = _params.depth_multiplier;
206
207 int32_t activation_min{};
208 int32_t activation_max{};
209 calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
210
// NOTE(review): original line 212 — the initializer expression, presumably
// getQuantizedConvolutionMultiplers(input()->scale(), filter()->scales(),
// output()->scale()) — is missing from this extraction.
211 const std::vector<double> effective_output_scales =
213
// BroadcastableWrapper lets a single multiplier serve every channel while a
// per-channel vector indexes normally.
214 std::vector<ChannelQuantMultipliers> quant_multipliers_raw =
215 quantizeMultipliers(effective_output_scales);
216 BroadcastableWrapper<ChannelQuantMultipliers> quant_multipliers(quant_multipliers_raw);
217
218 for (int batch = 0; batch < batches; ++batch)
219 {
220 for (int out_y = 0; out_y < output_height; ++out_y)
221 {
222 for (int out_x = 0; out_x < output_width; ++out_x)
223 {
224 for (int in_channel = 0; in_channel < input_depth; ++in_channel)
225 {
226 for (int m = 0; m < depth_multiplier; ++m)
227 {
// Output channel layout: out_c = in_channel * depth_multiplier + m.
228 const int output_channel = m + in_channel * depth_multiplier;
229 const int in_x_origin = (out_x * stride_width) - _padding_width;
230 const int in_y_origin = (out_y * stride_height) - _padding_height;
231 int32 acc = 0;
232 for (int filter_y = 0; filter_y < filter_height; ++filter_y)
233 {
234 for (int filter_x = 0; filter_x < filter_width; ++filter_x)
235 {
236 const int in_x = in_x_origin + dilation_width_factor * filter_x;
237 const int in_y = in_y_origin + dilation_height_factor * filter_y;
238 // Zero padding by omitting the areas outside the image.
239 const bool is_point_inside_image =
240 (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height);
241 if (is_point_inside_image)
242 {
243 int32 input_val =
244 input_data[calcOffset(input_shape, batch, in_y, in_x, in_channel)];
245 int32 filter_val =
246 filter_data[calcOffset(filter_shape, 0, filter_y, filter_x, output_channel)];
// Subtract zero points before multiplying: per-channel filter zero point,
// per-tensor input zero point.
247 acc += (filter_val - filter()->zero_points()[output_channel]) *
248 (input_val - input()->zero_point());
249 }
250 }
251 }
252 if (bias_data)
253 {
254 acc += bias_data[output_channel];
255 }
// Requantize: scale by the channel's multiplier/shift, add the output zero
// point, then clamp to the fused-activation range.
256 int32_t output_multiplier = quant_multipliers[output_channel].multiplier;
257 int output_shift = quant_multipliers[output_channel].shift;
258 int32_t scaled_acc =
259 tflite::MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
260 scaled_acc += output()->zero_point();
261 scaled_acc = std::max(scaled_acc, activation_min);
262 scaled_acc = std::min(scaled_acc, activation_max);
263 output_data[calcOffset(output_shape, batch, out_y, out_x, output_channel)] =
264 static_cast<uint8_t>(scaled_acc);
265 }
266 }
267 }
268 }
269 }
270}
271
// Per-tensor quantized U8 path: computes a single effective multiplier/shift
// from input*filter/output scales and delegates to the TFLite reference
// DepthwiseConv kernel.
272void DepthwiseConv2D::evalQuantized() const
273{
274 const auto input_scale = static_cast<double>(input()->scale());
275 const auto filter_scale = static_cast<double>(filter()->scale());
276 const auto output_scale = static_cast<double>(output()->scale());
277
// real_multiplier = (S_in * S_filter) / S_out, decomposed into a fixed-point
// multiplier and shift by quantizeMultiplier().
278 const double real_multiplier = input_scale * filter_scale / output_scale;
279 int32_t output_multiplier{};
280 int output_shift{};
281 quantizeMultiplier(real_multiplier, &output_multiplier, &output_shift);
282
283 int32_t activation_min{};
284 int32_t activation_max{};
285 calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
286
287 tflite::DepthwiseParams params{};
288 params.padding_values.height = _padding_height;
289 params.padding_values.width = _padding_width;
// NOTE(review): original lines 290-294 (presumably stride, dilation and
// depth-multiplier assignments into params) are missing from this extraction.
295 // The kernel expects input and filter zero points to be negated.
296 params.input_offset = -input()->zero_point(); // Note the '-'.
297 params.weights_offset = -filter()->zero_point(); // Note the '-'.
298 params.output_offset = output()->zero_point();
299 params.output_multiplier = output_multiplier;
300 params.output_shift = output_shift;
301 params.quantized_activation_min = activation_min;
302 params.quantized_activation_max = activation_max;
303
304 tflite::reference_ops::DepthwiseConv(
305 params, getTensorShape(input()), getTensorData<uint8_t>(input()), getTensorShape(filter()),
306 getTensorData<uint8_t>(filter()), getTensorShape(bias()), getTensorData<int32_t>(bias()),
307 getTensorShape(output()), getTensorData<uint8_t>(output()));
308}
309
// Per-channel quantized S8 path: builds per-channel multiplier/shift arrays
// and delegates to the platform abstraction layer (per the symbol index,
// luci_interpreter_pal::DepthwiseConvPerChannel<int8_t>), passing the
// scratchpad tensor prepared in configure().
310void DepthwiseConv2D::evalQuantizedS8PerChannel() const
311{
312 int32_t activation_min{};
313 int32_t activation_max{};
314 calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
315
316 tflite::DepthwiseParams params{};
317
318 params.padding_type = tflite::PaddingType::kSame;
319 params.padding_values.height = _padding_height;
320 params.padding_values.width = _padding_width;
// NOTE(review): original lines 321-325 (presumably stride, dilation and
// depth-multiplier assignments into params) are missing from this extraction.
326 // The kernel expects input and filter zero points to be negated.
327 params.input_offset = -input()->zero_point(); // Note the '-'.
// Filter zero points are checked to be 0 in configure(), hence the literal.
328 params.weights_offset = 0;
329 params.output_offset = output()->zero_point();
330 params.output_multiplier = 1; // unused in tflite code
331 params.output_shift = 0; // unused in tflite code
332 params.quantized_activation_min = activation_min;
333 params.quantized_activation_max = activation_max;
334
// NOTE(review): original line 336 — the initializer expression, presumably
// getQuantizedConvolutionMultiplers(input()->scale(), filter()->scales(),
// output()->scale()) — is missing from this extraction.
335 const std::vector<double> effective_output_scales =
337
338 std::vector<ChannelQuantMultipliers> quant_multipliers =
339 quantizeMultipliers(effective_output_scales);
340
// Split the (multiplier, shift) pairs into the two flat arrays the PAL
// kernel expects.
341 std::vector<int32_t> shifts;
342 std::transform(quant_multipliers.begin(), quant_multipliers.end(), std::back_inserter(shifts),
343 [](ChannelQuantMultipliers cm) { return cm.shift; });
344 std::vector<int32_t> multipliers;
345 std::transform(quant_multipliers.begin(), quant_multipliers.end(),
346 std::back_inserter(multipliers),
347 [](ChannelQuantMultipliers cm) { return cm.multiplier; });
348
// Scratchpad data is only fetched when the tensor is allocatable; the PAL
// accepts nullptr otherwise.
349 auto scratchpad = getOutputTensors()[1];
350 int8_t *scratchpad_data = nullptr;
351 if (scratchpad->is_allocatable())
352 scratchpad_data = scratchpad->data<int8_t>();
353
// NOTE(review): the call head (original line 354, presumably
// "luci_interpreter_pal::DepthwiseConvPerChannel<int8_t>(") is missing from
// this extraction.
355 params, multipliers.data(), shifts.data(), getTensorShape(input()),
356 getTensorData<int8_t>(input()), getTensorShape(filter()), getTensorData<int8_t>(filter()),
357 getTensorShape(bias()), getTensorData<int32_t>(bias()), getTensorShape(output()),
358 getTensorData<int8_t>(output()), getTensorShape(scratchpad), scratchpad_data);
359}
360
// Per-channel quantized S16 path, computed inline: same loop structure as
// evalQuantizedPerChannel but with int64 accumulation (16x16 products plus
// int64 bias) and no zero-point handling — S16 quantization here is
// symmetric (the output zero point is never added below).
361void DepthwiseConv2D::evalQuantizedS16() const
362{
363 const auto *input_data = getTensorData<int16_t>(input());
364 const auto *filter_data = getTensorData<int16_t>(filter());
365 const auto *bias_data = getTensorData<int64_t>(bias());
366 auto *output_data = getTensorData<int16_t>(output());
367
368 const Shape &input_shape = input()->shape();
369 const Shape &filter_shape = filter()->shape();
370 const Shape &output_shape = output()->shape();
371
372 const int32_t batches = input_shape.dim(0);
373 const int32_t input_height = input_shape.dim(1);
374 const int32_t input_width = input_shape.dim(2);
375 const int32_t input_depth = input_shape.dim(3);
376 const int32_t filter_height = filter_shape.dim(1);
377 const int32_t filter_width = filter_shape.dim(2);
378 const int32_t output_height = output_shape.dim(1);
379 const int32_t output_width = output_shape.dim(2);
380
381 const int32_t stride_height = _params.stride_height;
382 const int32_t stride_width = _params.stride_width;
383 const int32_t dilation_height_factor = _params.dilation_height_factor;
384 const int32_t dilation_width_factor = _params.dilation_width_factor;
385 const int32_t depth_multiplier = _params.depth_multiplier;
386
// NOTE(review): original line 388 — the initializer expression, presumably
// getQuantizedConvolutionMultiplers(input()->scale(), filter()->scales(),
// output()->scale()) — is missing from this extraction.
387 const std::vector<double> effective_output_scales =
389
390 std::vector<ChannelQuantMultipliers> quant_multipliers_raw =
391 quantizeMultipliers(effective_output_scales);
392
// BroadcastableWrapper lets a single multiplier serve every channel while a
// per-channel vector indexes normally.
393 BroadcastableWrapper<ChannelQuantMultipliers> quant_multipliers(quant_multipliers_raw);
394
395 int32_t activation_min{};
396 int32_t activation_max{};
397 calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
398
399 for (int32_t batch = 0; batch < batches; ++batch)
400 {
401 for (int32_t out_y = 0; out_y < output_height; ++out_y)
402 {
403 for (int32_t out_x = 0; out_x < output_width; ++out_x)
404 {
405 for (int32_t in_c = 0; in_c < input_depth; ++in_c)
406 {
407 for (int32_t m = 0; m < depth_multiplier; ++m)
408 {
// Output channel layout: out_c = in_c * depth_multiplier + m.
409 const int32_t out_c = m + in_c * depth_multiplier;
410 const int32_t in_y_origin = out_y * stride_height - _padding_height;
411 const int32_t in_x_origin = out_x * stride_width - _padding_width;
412 int64_t acc = 0;
413 for (int32_t filter_y = 0; filter_y < filter_height; ++filter_y)
414 {
415 for (int32_t filter_x = 0; filter_x < filter_width; ++filter_x)
416 {
417 const int32_t in_y = in_y_origin + dilation_height_factor * filter_y;
418 const int32_t in_x = in_x_origin + dilation_width_factor * filter_x;
// Zero padding by skipping points outside the image.
419 if ((in_y >= 0 && in_y < input_height) && (in_x >= 0 && in_x < input_width))
420 {
421 const int16_t input_val =
422 input_data[calcOffset(input_shape, batch, in_y, in_x, in_c)];
423 const int16_t filter_val =
424 filter_data[calcOffset(filter_shape, 0, filter_y, filter_x, out_c)];
425 acc += static_cast<int64_t>(input_val) * static_cast<int64_t>(filter_val);
426 }
427 }
428 }
429 if (bias_data != nullptr)
430 {
431 acc += bias_data[out_c];
432 }
433
// Requantize with the channel's multiplier/shift, then clamp to the
// fused-activation range; no output zero point is applied on this path.
434 int32_t output_multiplier = quant_multipliers[out_c].multiplier;
435 int output_shift = quant_multipliers[out_c].shift;
436 int32_t scaled_acc =
437 tflite::MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
438
439 scaled_acc = std::max(scaled_acc, activation_min);
440 scaled_acc = std::min(scaled_acc, activation_max);
441
442 output_data[calcOffset(output_shape, batch, out_y, out_x, out_c)] = scaled_acc;
443 }
444 }
445 }
446 }
447 }
448}
449
450} // namespace kernels
451} // namespace luci_interpreter
const std::vector< Tensor * > & getOutputTensors() const
Definition Kernel.h:40
const DepthwiseConv2DParams & params() const
Definition Kernel.h:67
int32_t dim(int i) const
Definition Tensor.h:41
int num_dims() const
Definition Tensor.h:39
void resize(const Shape &new_shape)
Definition Tensor.cpp:56
const Shape & shape() const
Definition Tensor.h:107
float scale() const
Definition Tensor.h:109
const std::vector< int32_t > & zero_points() const
Definition Tensor.h:123
int32_t zero_point() const
Definition Tensor.h:115
DepthwiseConv2D(const Tensor *input, const Tensor *filter, const Tensor *bias, Tensor *output, Tensor *scratchpad, const DepthwiseConv2DParams &params)
#define LUCI_INTERPRETER_CHECK(cond)
Definition Utils.h:36
const luci_interpreter::RuntimeShape output_shape
list input_data
Definition infer.py:29
int32_t computePadding(int32_t stride, int32_t dilation_rate, int32_t in_size, int32_t filter_size, int32_t out_size)
Definition Utils.h:41
int32_t calcOffset(const Shape &shape, int32_t d0, int32_t d1, int32_t d2, int32_t d3)
Definition Utils.h:75
std::vector< ChannelQuantMultipliers > quantizeMultipliers(const std::vector< double > &effective_scale)
Definition Utils.h:170
tflite::RuntimeShape getTensorShape(const Tensor *tensor)
Definition Utils.h:194
void calculateActivationRange(Activation activation, T *activation_min, T *activation_max)
Definition Utils.cpp:52
void calculateActivationRangeQuantized(Activation activation, const Tensor *output, int32_t *activation_min, int32_t *activation_max)
Definition Utils.cpp:119
void quantizeMultiplier(double double_multiplier, int32_t *quantized_multiplier, int *shift)
Definition Utils.cpp:157
int32_t computeOutputSize(Padding padding, int32_t image_size, int32_t filter_size, int32_t stride, int32_t dilation_rate=1)
Definition Utils.h:59
std::vector< double > getQuantizedConvolutionMultiplers(float input_scale, const std::vector< float > &filter_scale, float output_scale)
Definition Utils.h:147
void DepthwiseConvPerChannel< int8_t >(const tflite::DepthwiseParams &params, const int32_t *output_multiplier, const int32_t *output_shift, const tflite::RuntimeShape &input_shape, const int8_t *input_data, const tflite::RuntimeShape &filter_shape, const int8_t *filter_data, const tflite::RuntimeShape &bias_shape, const int32_t *bias_data, const tflite::RuntimeShape &output_shape, int8_t *output_data, const tflite::RuntimeShape &scratchpad_shape, int8_t *scratchpad_data)
int32_t size[5]
Definition Slice.cpp:35
Definition Shape.h:28
int32_t int32
Definition topk_v2.h:27