ONE - On-device Neural Engine
Loading...
Searching...
No Matches
Conv2D.cpp
Go to the documentation of this file.
1/*
2 * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
3 * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
#include "kernels/Conv2D.h"

#include "kernels/Utils.h"

#include "PALConv2d.h"

#include <algorithm>
#include <cstdint>
#include <iterator>
#include <stdexcept>
#include <thread>
#include <vector>
26
27namespace luci_interpreter
28{
29namespace kernels
30{
31
32Conv2D::Conv2D(const Tensor *input, const Tensor *filter, const Tensor *bias, Tensor *output,
33 Tensor *scratchpad, const Conv2DParams &params)
34 : KernelWithParams<Conv2DParams>({input, filter, bias}, {output, scratchpad}, params)
35{
36}
37
39{
40 // TensorFlow Lite (as of v2.2.0) supports the following combinations of types:
41 // | input filter bias output |
42 // ----+---------------------------+
43 // (1) | float float float float |
44 // (2) | float int8 float float | hybrid
45 // (3) | uint8 uint8 int32 uint8 | quantized
46 // (4) | int8 int8 int32 int8 | quantized per channel
47 //
48 // We only support (1), (3) and (4) for now, and additionally the following:
49 // | input filter bias output |
50 // ----+---------------------------+
51 // (5) | int16 int16 int64 int16 |
52 //
53 if (input()->element_type() == DataType::FLOAT32 && filter()->element_type() == DataType::FLOAT32)
54 {
55 LUCI_INTERPRETER_CHECK(bias() == nullptr || bias()->element_type() == DataType::FLOAT32);
56 }
57 else if (input()->element_type() == DataType::U8 && filter()->element_type() == DataType::U8)
58 {
59 LUCI_INTERPRETER_CHECK(bias() == nullptr || bias()->element_type() == DataType::S32);
60 }
61 else if (input()->element_type() == DataType::S8 && filter()->element_type() == DataType::S8)
62 {
63 LUCI_INTERPRETER_CHECK(bias() == nullptr || bias()->element_type() == DataType::S32);
64 LUCI_INTERPRETER_CHECK(filter()->shape().num_dims() == 4);
65 LUCI_INTERPRETER_CHECK(filter()->scales().size() ==
66 static_cast<size_t>(filter()->shape().dim(0)));
67 for (auto zerop : filter()->zero_points())
68 {
69 LUCI_INTERPRETER_CHECK(zerop == 0);
70 }
71 }
72 else if (input()->element_type() == DataType::S16 && filter()->element_type() == DataType::S16)
73 {
74 LUCI_INTERPRETER_CHECK(bias() == nullptr || bias()->element_type() == DataType::S64);
75 }
76 else
77 {
78 throw std::runtime_error("luci-intp Conv2D(1) Unsupported type.");
79 }
80 LUCI_INTERPRETER_CHECK(output()->element_type() == input()->element_type());
81
82 const Shape &input_shape = input()->shape();
83 const Shape &filter_shape = filter()->shape();
84 LUCI_INTERPRETER_CHECK(input_shape.num_dims() == 4 && filter_shape.num_dims() == 4);
85
86 const int32_t batches = input_shape.dim(0);
87 const int32_t input_height = input_shape.dim(1);
88 const int32_t input_width = input_shape.dim(2);
89 const int32_t output_depth = filter_shape.dim(0);
90 const int32_t filter_height = filter_shape.dim(1);
91 const int32_t filter_width = filter_shape.dim(2);
92 LUCI_INTERPRETER_CHECK(filter_shape.dim(3) == input_shape.dim(3));
93
94 LUCI_INTERPRETER_CHECK(bias() == nullptr || (bias()->shape().num_dims() == 1 &&
95 bias()->shape().dim(0) == output_depth));
96
97 const int32_t output_height =
98 computeOutputSize(_params.padding, input_height, filter_height, _params.stride_height,
100 const int32_t output_width =
101 computeOutputSize(_params.padding, input_width, filter_width, _params.stride_width,
103
105 input_height, filter_height, output_height);
106 _padding_width = computePadding(_params.stride_width, _params.dilation_width_factor, input_width,
107 filter_width, output_width);
108
109 output()->resize({batches, output_height, output_width, output_depth});
110
111 // Allocate tensor for scratchpad, if needed.
112 tflite::ConvParams params{};
113 params.padding_values.height = _padding_height;
114 params.padding_values.width = _padding_width;
119 auto scratchpad = getOutputTensors()[1];
120 luci_interpreter_pal::SetupScratchpadTensor(scratchpad, input()->element_type(), params,
123
124 switch (_params.activation)
125 {
126 case Activation::NONE:
127 case Activation::RELU:
128 case Activation::RELU6:
129 case Activation::RELU_N1_TO_1:
130 break;
131 default:
132 throw std::runtime_error("Unsupported fused activation");
133 }
134}
135
136void Conv2D::execute() const
137{
138 switch (input()->element_type())
139 {
140 case DataType::FLOAT32:
141 if (filter()->element_type() == DataType::FLOAT32)
142 {
143 evalFloat();
144 break;
145 }
146 throw std::runtime_error("luci-intp Conv2D(2) Unsupported type.");
147 case DataType::U8:
148 if (filter()->scales().size() == 1)
149 {
150 evalQuantized();
151 }
152 else if (filter()->scales().size() > 1)
153 {
154 LUCI_INTERPRETER_CHECK(filter()->shape().num_dims() == 4);
155 LUCI_INTERPRETER_CHECK(filter()->scales().size() ==
156 static_cast<size_t>(filter()->shape().dim(0)));
157 evalQuantizedPerChannel();
158 }
159 break;
160 case DataType::S8:
161 evalQuantizedS8PerChannel();
162 break;
163 case DataType::S16:
164 evalQuantizedS16();
165 break;
166 default:
167 throw std::runtime_error("luci-intp Conv2D(3) Unsupported type.");
168 }
169}
170
171void Conv2D::evalFloat() const
172{
173 float activation_min{};
174 float activation_max{};
175 calculateActivationRange(_params.activation, &activation_min, &activation_max);
176
177 tflite::ConvParams params{};
178 params.padding_values.height = _padding_height;
179 params.padding_values.width = _padding_width;
184 params.float_activation_min = activation_min;
185 params.float_activation_max = activation_max;
186
187 auto scratchpad = getOutputTensors()[1];
188 float *scratchpad_data = nullptr;
189 if (scratchpad->is_allocatable())
190 scratchpad_data = scratchpad->data<float>();
191
192 luci_interpreter_pal::Conv(params, getTensorShape(input()), getTensorData<float>(input()),
193 getTensorShape(filter()), getTensorData<float>(filter()),
194 getTensorShape(bias()), getTensorData<float>(bias()),
195 getTensorShape(output()), getTensorData<float>(output()),
196 getTensorShape(scratchpad), scratchpad_data);
197}
198
199void Conv2D::evalQuantized() const
200{
201 const auto input_scale = static_cast<double>(input()->scale());
202 const auto filter_scale = static_cast<double>(filter()->scale());
203 const auto output_scale = static_cast<double>(output()->scale());
204
205 const double real_multiplier = input_scale * filter_scale / output_scale;
206 int32_t output_multiplier{};
207 int output_shift{};
208 quantizeMultiplier(real_multiplier, &output_multiplier, &output_shift);
209
210 int32_t activation_min{};
211 int32_t activation_max{};
212 calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
213
214 tflite::ConvParams params{};
215 params.padding_values.height = _padding_height;
216 params.padding_values.width = _padding_width;
221 // The kernel expects input and filter zero points to be negated.
222 params.input_offset = -input()->zero_point(); // Note the '-'.
223 params.weights_offset = -filter()->zero_point(); // Note the '-'.
224 params.output_offset = output()->zero_point();
225 params.output_multiplier = output_multiplier;
226 params.output_shift = output_shift;
227 params.quantized_activation_min = activation_min;
228 params.quantized_activation_max = activation_max;
229
230 auto scratchpad = getOutputTensors()[1];
231 luci_interpreter_pal::Conv(params, getTensorShape(input()), getTensorData<uint8_t>(input()),
232 getTensorShape(filter()), getTensorData<uint8_t>(filter()),
233 getTensorShape(bias()), getTensorData<int32_t>(bias()),
234 getTensorShape(output()), getTensorData<uint8_t>(output()),
235 getTensorShape(scratchpad), getTensorData<uint8_t>(scratchpad));
236}
237
238void Conv2D::evalQuantizedPerChannel() const
239{
240 const auto *input_data = getTensorData<uint8_t>(input());
241 const auto *filter_data = getTensorData<uint8_t>(filter());
242 const auto *bias_data = getTensorData<int32_t>(bias());
243 auto *output_data = getTensorData<uint8_t>(output());
244
245 const Shape &input_shape = input()->shape();
246 const Shape &filter_shape = filter()->shape();
247 const Shape &output_shape = output()->shape();
248
249 const int32_t batches = input_shape.dim(0);
250 const int32_t input_height = input_shape.dim(1);
251 const int32_t input_width = input_shape.dim(2);
252 const int32_t input_depth = input_shape.dim(3);
253 const int32_t output_depth = filter_shape.dim(0);
254 const int32_t filter_height = filter_shape.dim(1);
255 const int32_t filter_width = filter_shape.dim(2);
256 const int32_t output_height = output_shape.dim(1);
257 const int32_t output_width = output_shape.dim(2);
258
259 const int32_t stride_height = _params.stride_height;
260 const int32_t stride_width = _params.stride_width;
261 const int32_t dilation_height_factor = _params.dilation_height_factor;
262 const int32_t dilation_width_factor = _params.dilation_width_factor;
263
264 int32_t activation_min{};
265 int32_t activation_max{};
266 calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
267
268 const std::vector<double> effective_output_scale =
270
271 const std::vector<ChannelQuantMultipliers> multipliers_raw =
272 quantizeMultipliers(effective_output_scale);
273 BroadcastableWrapper<ChannelQuantMultipliers> quant_multipliers(multipliers_raw);
274
275 for (int32_t batch = 0; batch < batches; ++batch)
276 {
277 for (int32_t out_y = 0; out_y < output_height; ++out_y)
278 {
279 for (int32_t out_x = 0; out_x < output_width; ++out_x)
280 {
281 for (int32_t out_c = 0; out_c < output_depth; ++out_c)
282 {
283 const int32_t in_y_origin = out_y * stride_height - _padding_height;
284 const int32_t in_x_origin = out_x * stride_width - _padding_width;
285 int32_t acc = 0;
286 for (int32_t filter_y = 0; filter_y < filter_height; ++filter_y)
287 {
288 for (int32_t filter_x = 0; filter_x < filter_width; ++filter_x)
289 {
290 const int32_t in_y = in_y_origin + dilation_height_factor * filter_y;
291 const int32_t in_x = in_x_origin + dilation_width_factor * filter_x;
292 if ((in_y >= 0 && in_y < input_height) && (in_x >= 0 && in_x < input_width))
293 {
294 for (int32_t in_c = 0; in_c < input_depth; ++in_c)
295 {
296 const uint8_t input_val =
297 input_data[calcOffset(input_shape, batch, in_y, in_x, in_c)];
298 const uint8_t filter_val =
299 filter_data[calcOffset(filter_shape, out_c, filter_y, filter_x, in_c)];
300 acc += static_cast<int32_t>(input_val - input()->zero_point()) *
301 static_cast<int32_t>(filter_val - filter()->zero_points()[out_c]);
302 }
303 }
304 }
305 }
306 if (bias_data)
307 {
308 acc += bias_data[out_c];
309 }
310
311 int32_t scaled_acc = tflite::MultiplyByQuantizedMultiplier(
312 acc, quant_multipliers[out_c].multiplier, quant_multipliers[out_c].shift);
313
314 scaled_acc += output()->zero_point();
315 scaled_acc = std::max(scaled_acc, activation_min);
316 scaled_acc = std::min(scaled_acc, activation_max);
317 output_data[calcOffset(output_shape, batch, out_y, out_x, out_c)] = scaled_acc;
318 }
319 }
320 }
321 }
322}
323
324void Conv2D::evalQuantizedS8PerChannel() const
325{
326 int32_t activation_min{};
327 int32_t activation_max{};
328 calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
329
330 tflite::ConvParams params{};
331 params.padding_values.height = _padding_height;
332 params.padding_values.width = _padding_width;
337 // The kernel expects filter zero points to be negated.
338 params.input_offset = -input()->zero_point(); // Note the '-'.
339 params.weights_offset = 0; // Unused in tflite code
340 params.output_offset = output()->zero_point();
341 params.quantized_activation_min = activation_min;
342 params.quantized_activation_max = activation_max;
343
344 const std::vector<double> effective_output_scales =
346
347 std::vector<ChannelQuantMultipliers> quant_multipliers =
348 quantizeMultipliers(effective_output_scales);
349
350 std::vector<int32_t> shifts;
351 std::transform(quant_multipliers.begin(), quant_multipliers.end(), std::back_inserter(shifts),
352 [](ChannelQuantMultipliers cm) { return cm.shift; });
353 std::vector<int32_t> multipliers;
354 std::transform(quant_multipliers.begin(), quant_multipliers.end(),
355 std::back_inserter(multipliers),
356 [](ChannelQuantMultipliers cm) { return cm.multiplier; });
357
358 auto scratchpad = getOutputTensors()[1];
359 int8_t *scratchpad_data = nullptr;
360 if (scratchpad->is_allocatable())
361 scratchpad_data = scratchpad->data<int8_t>();
362
363 luci_interpreter_pal::ConvPerChannel(
364 params, multipliers.data(), shifts.data(), getTensorShape(input()),
365 getTensorData<int8_t>(input()), getTensorShape(filter()), getTensorData<int8_t>(filter()),
366 getTensorShape(bias()), getTensorData<int32_t>(bias()), getTensorShape(output()),
367 getTensorData<int8_t>(output()), getTensorShape(scratchpad), scratchpad_data);
368}
369
370void Conv2D::evalQuantizedS16() const
371{
372 const auto *input_data = getTensorData<int16_t>(input());
373 const auto *filter_data = getTensorData<int16_t>(filter());
374 const auto *bias_data = getTensorData<int64_t>(bias());
375 auto *output_data = getTensorData<int16_t>(output());
376
377 const Shape &input_shape = input()->shape();
378 const Shape &filter_shape = filter()->shape();
379 const Shape &output_shape = output()->shape();
380
381 const int32_t batches = input_shape.dim(0);
382 const int32_t input_height = input_shape.dim(1);
383 const int32_t input_width = input_shape.dim(2);
384 const int32_t input_depth = input_shape.dim(3);
385 const int32_t output_depth = filter_shape.dim(0);
386 const int32_t filter_height = filter_shape.dim(1);
387 const int32_t filter_width = filter_shape.dim(2);
388 const int32_t output_height = output_shape.dim(1);
389 const int32_t output_width = output_shape.dim(2);
390
391 const int32_t stride_height = _params.stride_height;
392 const int32_t stride_width = _params.stride_width;
393 const int32_t dilation_height_factor = _params.dilation_height_factor;
394 const int32_t dilation_width_factor = _params.dilation_width_factor;
395
396 int32_t activation_min{};
397 int32_t activation_max{};
398 calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
399
400 const std::vector<double> effective_output_scale =
402
403 const std::vector<ChannelQuantMultipliers> multipliers_raw =
404 quantizeMultipliers(effective_output_scale);
405 BroadcastableWrapper<ChannelQuantMultipliers> multipliers(multipliers_raw);
406
407 for (int32_t batch = 0; batch < batches; ++batch)
408 {
409 for (int32_t out_y = 0; out_y < output_height; ++out_y)
410 {
411 for (int32_t out_x = 0; out_x < output_width; ++out_x)
412 {
413 for (int32_t out_c = 0; out_c < output_depth; ++out_c)
414 {
415 const int32_t in_y_origin = out_y * stride_height - _padding_height;
416 const int32_t in_x_origin = out_x * stride_width - _padding_width;
417 int64_t acc = 0;
418 for (int32_t filter_y = 0; filter_y < filter_height; ++filter_y)
419 {
420 for (int32_t filter_x = 0; filter_x < filter_width; ++filter_x)
421 {
422 const int32_t in_y = in_y_origin + dilation_height_factor * filter_y;
423 const int32_t in_x = in_x_origin + dilation_width_factor * filter_x;
424 if ((in_y >= 0 && in_y < input_height) && (in_x >= 0 && in_x < input_width))
425 {
426 for (int32_t in_c = 0; in_c < input_depth; ++in_c)
427 {
428 const int16_t input_val =
429 input_data[calcOffset(input_shape, batch, in_y, in_x, in_c)];
430 const int16_t filter_val =
431 filter_data[calcOffset(filter_shape, out_c, filter_y, filter_x, in_c)];
432 acc += static_cast<int64_t>(input_val) * static_cast<int64_t>(filter_val);
433 }
434 }
435 }
436 }
437 if (bias_data)
438 {
439 acc += bias_data[out_c];
440 }
441
442 int32_t scaled_acc = tflite::MultiplyByQuantizedMultiplier(
443 acc, multipliers[out_c].multiplier, multipliers[out_c].shift);
444
445 scaled_acc = std::max(scaled_acc, activation_min);
446 scaled_acc = std::min(scaled_acc, activation_max);
447
448 output_data[calcOffset(output_shape, batch, out_y, out_x, out_c)] = scaled_acc;
449 }
450 }
451 }
452 }
453}
454
455} // namespace kernels
456} // namespace luci_interpreter
const std::vector< Tensor * > & getOutputTensors() const
Definition Kernel.h:40
const Conv2DParams & params() const
Definition Kernel.h:67
int32_t dim(int i) const
Definition Tensor.h:41
int num_dims() const
Definition Tensor.h:39
void resize(const Shape &new_shape)
Definition Tensor.cpp:56
const Shape & shape() const
Definition Tensor.h:107
float scale() const
Definition Tensor.h:109
const std::vector< int32_t > & zero_points() const
Definition Tensor.h:123
int32_t zero_point() const
Definition Tensor.h:115
const Tensor * input() const
Definition Conv2D.h:36
Conv2D(const Tensor *input, const Tensor *filter, const Tensor *bias, Tensor *output, Tensor *scratchpad, const Conv2DParams &params)
Definition Conv2D.cpp:32
const Tensor * bias() const
Definition Conv2D.h:38
void execute() const override
Definition Conv2D.cpp:136
const Tensor * filter() const
Definition Conv2D.h:37
#define LUCI_INTERPRETER_CHECK(cond)
Definition Utils.h:36
const luci_interpreter::RuntimeShape output_shape
list input_data
Definition infer.py:29
int32_t computePadding(int32_t stride, int32_t dilation_rate, int32_t in_size, int32_t filter_size, int32_t out_size)
Definition Utils.h:41
int32_t calcOffset(const Shape &shape, int32_t d0, int32_t d1, int32_t d2, int32_t d3)
Definition Utils.h:75
std::vector< ChannelQuantMultipliers > quantizeMultipliers(const std::vector< double > &effective_scale)
Definition Utils.h:170
tflite::RuntimeShape getTensorShape(const Tensor *tensor)
Definition Utils.h:194
void calculateActivationRange(Activation activation, T *activation_min, T *activation_max)
Definition Utils.cpp:52
void calculateActivationRangeQuantized(Activation activation, const Tensor *output, int32_t *activation_min, int32_t *activation_max)
Definition Utils.cpp:119
void quantizeMultiplier(double double_multiplier, int32_t *quantized_multiplier, int *shift)
Definition Utils.cpp:157
int32_t computeOutputSize(Padding padding, int32_t image_size, int32_t filter_size, int32_t stride, int32_t dilation_rate=1)
Definition Utils.h:59
std::vector< double > getQuantizedConvolutionMultiplers(float input_scale, const std::vector< float > &filter_scale, float output_scale)
Definition Utils.h:147
int32_t size[5]
Definition Slice.cpp:35
Definition Shape.h:28