ONE - On-device Neural Engine
DepthwiseConvolutionLayer.cc
Go to the documentation of this file.
/*
 * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "DepthwiseConvolutionLayer.h"

#include "cker/PortableTensorUtils.h"
#include <cker/operation/DepthwiseConv.h>

namespace onert
{
namespace backend
{
namespace cpu
{
namespace ops
{

void DepthwiseConvolutionLayer::prepareF32()
{
  if (_dilationWidth != 1 || _dilationHeight != 1)
    return;

  // The DepthwiseConvOp cpu kernel needs additional memory to run with multiple
  // threads, so we allocate it here and pass it to the kernel.
  const int64_t k_packet_size = nnfw::cker::eigen_support::kPacketSize<float>();

  const auto out_shape = getShape(_output);
  const auto filter_shape = getShape(_kernel);
  const int batch = out_shape.Dims(0);
  const int out_depth = out_shape.Dims(3);
  const int filter_rows = filter_shape.Dims(1);
  const int filter_cols = filter_shape.Dims(2);

  const int filter_spatial_size = filter_rows * filter_cols;
  const int padded_filter_inner_dim_size =
    ((out_depth + k_packet_size - 1) / k_packet_size) * k_packet_size;

  _use_padded_filter = (out_depth % k_packet_size) != 0;

  // prepare padded_filter buffer for cker
  auto padded_filter_info = ir::OperandInfo(_kernel->get_info());
  padded_filter_info.shape({batch, filter_spatial_size, padded_filter_inner_dim_size});
  _padded_filter = std::make_unique<Tensor>(padded_filter_info, nullptr);
  _padded_filter->setBuffer(std::make_shared<basic::Allocator>(_padded_filter->total_size()));

  // prepare per-thread filter buffers for cker
  const int thread_count = nnfw::cker::eigen_support::getThreadCount() + 1;

  auto filter_buffers_info = ir::OperandInfo(_kernel->get_info());
  filter_buffers_info.shape({thread_count, filter_spatial_size, padded_filter_inner_dim_size});
  _filter_buffers = std::make_unique<Tensor>(filter_buffers_info, nullptr);
  _filter_buffers->setBuffer(std::make_shared<basic::Allocator>(_filter_buffers->total_size()));
}
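The rounding above pads the filter's inner dimension up to a multiple of the SIMD packet width, and the padded-filter path is only taken when the output depth is not already a multiple of it. A minimal standalone sketch of that arithmetic (the packet size of 4 and out_depth of 10 are assumed example values; the real packet size comes from nnfw::cker::eigen_support::kPacketSize<float>()):

#include <cstdint>
#include <cstdio>

int main()
{
  const int64_t k_packet_size = 4; // assumed example value; queried from Eigen in the real code
  const int out_depth = 10;        // hypothetical channel count

  // Round out_depth up to the next multiple of the packet size.
  const int padded_inner_dim = ((out_depth + k_packet_size - 1) / k_packet_size) * k_packet_size;
  const bool use_padded_filter = (out_depth % k_packet_size) != 0;

  std::printf("padded inner dim = %d, use padded filter = %d\n", padded_inner_dim,
              use_padded_filter);
  // prints: padded inner dim = 12, use padded filter = 1
  return 0;
}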

void DepthwiseConvolutionLayer::convFloat32()
{
  float output_activation_min = 0, output_activation_max = 0;
  CalculateActivationRange(_activation, &output_activation_min, &output_activation_max);

  nnfw::cker::DepthwiseConvParams op_params;
  op_params.stride_width = _strideWidth;
  op_params.stride_height = _strideHeight;
  op_params.dilation_width_factor = _dilationWidth;
  op_params.dilation_height_factor = _dilationHeight;
  op_params.padding_values.width = _paddingLeft;
  op_params.padding_values.height = _paddingTop;
  op_params.depth_multiplier = _multiplier;
  op_params.float_activation_min = output_activation_min;
  op_params.float_activation_max = output_activation_max;

  // DepthwiseConvOp does not support dilation or different W/H strides yet,
  // so the generic kernel is used in those cases.
  if (_dilationWidth == 1 && _dilationHeight == 1 && _strideWidth == _strideHeight)
  {
    nnfw::cker::DepthwiseConvOp(op_params, getShape(_input), getBuffer<float>(_input),
                                getShape(_kernel), getBuffer<float>(_kernel), getShape(_bias),
                                getBuffer<float>(_bias), getBuffer<float>(_padded_filter.get()),
                                _use_padded_filter, getBuffer<float>(_filter_buffers.get()),
                                getShape(_output), getBuffer<float>(_output));
  }
  else
  {
    nnfw::cker::DepthwiseConv<float, float>(
      op_params, getShape(_input), getBuffer<float>(_input), getShape(_kernel),
      getBuffer<float>(_kernel), getShape(_bias), getBuffer<float>(_bias), getShape(_output),
      getBuffer<float>(_output), _external_context->ruy_context());
  }
}
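CalculateActivationRange turns the fused activation into the clamp range that the float kernels apply to every output element. A minimal sketch of that mapping, assuming the usual RELU/RELU1/RELU6 semantics (the enum and helper below are illustrative stand-ins, not the project's actual declarations):

#include <limits>
#include <stdexcept>

enum class Activation { NONE, RELU, RELU1, RELU6 }; // illustrative stand-in for ir::Activation

void activationRange(Activation act, float *act_min, float *act_max)
{
  switch (act)
  {
    case Activation::NONE: // no fused activation: leave the full float range
      *act_min = std::numeric_limits<float>::lowest();
      *act_max = std::numeric_limits<float>::max();
      break;
    case Activation::RELU: // clamp negatives to zero
      *act_min = 0.0f;
      *act_max = std::numeric_limits<float>::max();
      break;
    case Activation::RELU1: // clamp to [-1, 1]
      *act_min = -1.0f;
      *act_max = 1.0f;
      break;
    case Activation::RELU6: // clamp to [0, 6]
      *act_min = 0.0f;
      *act_max = 6.0f;
      break;
    default:
      throw std::runtime_error{"unsupported fused activation"};
  }
}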

void DepthwiseConvolutionLayer::convQ8uPerTensor()
{
  int32_t output_activation_min = 0;
  int32_t output_activation_max = 0;
  CalculateActivationRangeQuantized(_activation, _output, &output_activation_min,
                                    &output_activation_max);

  double real_multiplier = 0.0;
  int32_t output_multiplier = 0;
  int32_t output_shift = 0;
  GetQuantizedConvolutionMultiplier(_input, _kernel, _bias, _output, &real_multiplier);
  QuantizeMultiplier(real_multiplier, &output_multiplier, &output_shift);

  nnfw::cker::DepthwiseConvParams op_params;
  op_params.stride_width = _strideWidth;
  op_params.stride_height = _strideHeight;
  op_params.dilation_width_factor = _dilationWidth;
  op_params.dilation_height_factor = _dilationHeight;
  op_params.padding_values.width = _paddingLeft;
  op_params.padding_values.height = _paddingTop;
  op_params.depth_multiplier = _multiplier;
  op_params.input_offset = -_input->data_zero_point();
  op_params.weights_offset = -_kernel->data_zero_point();
  op_params.output_offset = _output->data_zero_point();
  op_params.output_multiplier = output_multiplier;
  op_params.output_shift = output_shift;
  op_params.quantized_activation_min = output_activation_min;
  op_params.quantized_activation_max = output_activation_max;

  nnfw::cker::DepthwiseConv<uint8_t, int32_t>(
    op_params, getShape(_input), getBuffer<uint8_t>(_input), getShape(_kernel),
    getBuffer<uint8_t>(_kernel), getShape(_bias), getBuffer<int32_t>(_bias), getShape(_output),
    getBuffer<uint8_t>(_output), _external_context->ruy_context());
}
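GetQuantizedConvolutionMultiplier computes the effective rescale factor input_scale * filter_scale / output_scale, and QuantizeMultiplier expresses it as a Q31 fixed-point multiplier plus a power-of-two shift so the kernel can requantize with integer math only. A minimal sketch of that decomposition, using hypothetical scales (the real helpers may round slightly differently):

#include <cmath>
#include <cstdint>
#include <cstdio>

void quantizeMultiplier(double real_multiplier, int32_t *quantized, int *shift)
{
  // real_multiplier = q * 2^shift, with q in [0.5, 1).
  const double q = std::frexp(real_multiplier, shift);
  int64_t q_fixed = static_cast<int64_t>(std::llround(q * (1LL << 31)));
  if (q_fixed == (1LL << 31)) // rounding pushed q up to exactly 1.0
  {
    q_fixed /= 2;
    ++*shift;
  }
  *quantized = static_cast<int32_t>(q_fixed);
}

int main()
{
  // hypothetical scales: input 0.5, filter 0.25, output 1.0
  int32_t multiplier = 0;
  int shift = 0;
  quantizeMultiplier(0.5 * 0.25 / 1.0, &multiplier, &shift);
  std::printf("multiplier=%d shift=%d\n", multiplier, shift);
  // prints: multiplier=1073741824 shift=-2   (0.125 = 0.5 * 2^-2)
  return 0;
}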

void DepthwiseConvolutionLayer::convQ8uPerChannel()
{
  nnfw::cker::DepthwiseConvParams op_params;
  op_params.padding_values.width = _paddingLeft;
  op_params.padding_values.height = _paddingTop;
  op_params.stride_width = _strideWidth;
  op_params.stride_height = _strideHeight;
  op_params.dilation_width_factor = _dilationWidth;
  op_params.dilation_height_factor = _dilationHeight;
  op_params.depth_multiplier = _multiplier;
  op_params.input_offset = -_input->data_zero_point();
  op_params.output_offset = _output->data_zero_point();
  int32_t output_activation_min = 0;
  int32_t output_activation_max = 0;
  CalculateActivationRangeQuantized(_activation, _output, &output_activation_min,
                                    &output_activation_max);
  op_params.quantized_activation_min = output_activation_min;
  op_params.quantized_activation_max = output_activation_max;
  // NOTE: The following fields of ConvParams are not used:
  // padding_type, weights_offset, output_{multiplier,shift}, float_activation_{min,max}

  nnfw::cker::reference_integer_ops::DepthwiseConvPerChannel(
    op_params, _per_channel_output_multiplier.data(), _per_channel_output_shift.data(),
    getShape(_input), getBuffer<uint8_t>(_input), getShape(_kernel), getBuffer<uint8_t>(_kernel),
    _kernel->data_zero_points().data(), getShape(_bias), getBuffer<int32_t>(_bias),
    getShape(_output), getBuffer<uint8_t>(_output));
}

void DepthwiseConvolutionLayer::convQ8i()
{
  if (!_prepared)
  {
    prepareQ8i();
    _prepared = true;
  }

  int32_t output_activation_min = 0;
  int32_t output_activation_max = 0;
  CalculateActivationRangeQuantized(_activation, _output, &output_activation_min,
                                    &output_activation_max);

  nnfw::cker::DepthwiseConvParams op_params;
  op_params.padding_type = nnfw::cker::PaddingType::kSame;
  op_params.padding_values.width = _paddingLeft;
  op_params.padding_values.height = _paddingTop;
  op_params.depth_multiplier = _multiplier;
  op_params.stride_width = _strideWidth;
  op_params.stride_height = _strideHeight;
  op_params.dilation_width_factor = _dilationWidth;
  op_params.dilation_height_factor = _dilationHeight;
  op_params.input_offset = -_input->data_zero_point();
  op_params.weights_offset = 0;
  op_params.output_offset = _output->data_zero_point();
  op_params.quantized_activation_min = output_activation_min;
  op_params.quantized_activation_max = output_activation_max;

  nnfw::cker::optimized_integer_ops::DepthwiseConvPerChannel(
    op_params, _per_channel_output_multiplier.data(), _per_channel_output_shift.data(),
    getShape(_input), getBuffer<int8_t>(_input), getShape(_kernel), getBuffer<int8_t>(_kernel),
    getShape(_bias), getBuffer<int32_t>(_bias), getShape(_output), getBuffer<int8_t>(_output),
    _external_context->ruy_context());
}
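Per output element, the quantized kernels accumulate in int32 and then rescale back to the narrow type using the per-channel multiplier/shift pairs prepared in prepareQ8i(), add the output zero point, and clamp to the activation range. A minimal sketch of that final step, using plain double arithmetic instead of the fixed-point multiplier for clarity (illustrative only):

#include <algorithm>
#include <cmath>
#include <cstdint>

int8_t requantize(int32_t acc, double effective_scale, int32_t output_offset,
                  int32_t act_min, int32_t act_max)
{
  // Scale the int32 accumulator back into the int8 domain and shift by the output zero point.
  int32_t out = static_cast<int32_t>(std::lround(acc * effective_scale)) + output_offset;
  // Apply the fused-activation clamp, then narrow to int8.
  out = std::max(act_min, std::min(act_max, out));
  return static_cast<int8_t>(out);
}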

void DepthwiseConvolutionLayer::convQ8iHybridPerChannel()
{
  if (!_prepared)
  {
    prepareQ8iHybridPerChannel();
    _prepared = true;
  }

  float output_activation_min = 0, output_activation_max = 0;
  CalculateActivationRange(_activation, &output_activation_min, &output_activation_max);

  auto input_shape = getShape(_input);
  const int batch_size = input_shape.Dims(0);
  const int input_size = input_shape.FlatSize() / batch_size;

  auto scaling_factors_ptr = _input_scaling_factors.data();
  auto input_offsets_ptr = _input_offsets.data();

  for (int b = 0; b < batch_size; ++b)
  {
    const int offset = b * input_size;
    nnfw::cker::PortableAsymmetricQuantizeFloats(getBuffer<float>(_input) + offset, input_size,
                                                 _input_quantized.data() + offset,
                                                 &scaling_factors_ptr[b], &input_offsets_ptr[b]);
  }

  nnfw::cker::DepthwiseConvParams op_params;
  op_params.padding_values.width = _paddingLeft;
  op_params.padding_values.height = _paddingTop;
  op_params.depth_multiplier = _multiplier;
  op_params.stride_width = _strideWidth;
  op_params.stride_height = _strideHeight;
  op_params.dilation_width_factor = _dilationWidth;
  op_params.dilation_height_factor = _dilationHeight;
  op_params.float_activation_min = output_activation_min;
  op_params.float_activation_max = output_activation_max;

  nnfw::cker::reference_integer_ops::DepthwiseConvHybridPerChannel(
    op_params, _input_scaling_factors.data(), getShape(_input), _input_quantized.data(),
    getShape(_kernel), getBuffer<int8_t>(_kernel), getShape(_bias), getBuffer<float>(_bias),
    getShape(_output), getBuffer<float>(_output), _kernel->data_scales().data(),
    _input_offsets.data());
}
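The hybrid path quantizes each batch row of the float input on the fly: it maps the row's value range onto int8 and records a per-row scale and offset so the kernel can fold them back into the float output. A minimal illustrative re-implementation of that per-row step, in the spirit of PortableAsymmetricQuantizeFloats (not the exact cker routine):

#include <algorithm>
#include <cmath>
#include <cstdint>

void asymmetricQuantize(const float *values, int size, int8_t *quantized,
                        float *scaling_factor, int32_t *offset)
{
  // Range of the row, always including 0 so that real zero stays representable.
  const auto [min_it, max_it] = std::minmax_element(values, values + size);
  const float rmin = std::min(0.0f, *min_it);
  const float rmax = std::max(0.0f, *max_it);
  if (rmin == rmax) // all-zero row: nothing to scale
  {
    std::fill(quantized, quantized + size, static_cast<int8_t>(0));
    *scaling_factor = 1.0f;
    *offset = 0;
    return;
  }
  // Map [rmin, rmax] onto the int8 range [-128, 127].
  *scaling_factor = (rmax - rmin) / 255.0f;
  *offset = static_cast<int32_t>(std::lround(-128.0f - rmin / *scaling_factor));
  for (int i = 0; i < size; ++i)
  {
    const int32_t q = static_cast<int32_t>(std::lround(values[i] / *scaling_factor)) + *offset;
    quantized[i] = static_cast<int8_t>(std::clamp(q, -128, 127));
  }
}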

void DepthwiseConvolutionLayer::prepareQ8i()
{
  GetQuantizedConvolutionMultipliersAndShifts(
    _input->data_scale(), _output->data_scale(), _kernel->data_scales().data(),
    _kernel->data_scales().size(), getShape(_kernel).Dims(3), _per_channel_output_multiplier,
    _per_channel_output_shift);
}

void DepthwiseConvolutionLayer::prepareQ8uPerChannel()
{
  GetQuantizedConvolutionMultipliersAndShifts(
    _input->data_scale(), _output->data_scale(), _kernel->data_scales().data(),
    _kernel->data_scales().size(), getShape(_kernel).Dims(3), _per_channel_output_multiplier,
    _per_channel_output_shift);
}
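Both helpers above compute one effective rescale factor per output channel, input_scale * filter_scale[c] / output_scale, and convert each to a fixed-point multiplier/shift pair. A minimal sketch of that loop, reusing the quantizeMultiplier sketch shown after convQ8uPerTensor (illustrative, not the project's helper):

#include <cstdint>
#include <vector>

void quantizeMultiplier(double real_multiplier, int32_t *quantized, int *shift); // earlier sketch

void perChannelMultipliersAndShifts(float input_scale, float output_scale,
                                    const std::vector<float> &filter_scales,
                                    std::vector<int32_t> &multipliers, std::vector<int> &shifts)
{
  multipliers.resize(filter_scales.size());
  shifts.resize(filter_scales.size());
  for (size_t c = 0; c < filter_scales.size(); ++c)
  {
    // One effective rescale factor per output channel.
    const double effective = static_cast<double>(input_scale) * filter_scales[c] / output_scale;
    quantizeMultiplier(effective, &multipliers[c], &shifts[c]);
  }
}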

void DepthwiseConvolutionLayer::prepareQ8iHybridPerChannel()
{
  // Allocate memory for activation quantization:
  // - quantized values (int8_t, same shape as the original input)
  // - quantization params (= scale/zero point for each input)
  auto input_shape = getShape(_input);
  const int batch_size = input_shape.Dims(0);
  const int input_size = input_shape.FlatSize() / batch_size;
  _input_quantized.resize(input_size);
  // TODO: Optimize the case of batch_size = 1
  _input_scaling_factors.resize(batch_size);
  _input_offsets.resize(batch_size);
}

void DepthwiseConvolutionLayer::ensureQ8iHybridPerChannel()
{
  // ensure weight is per-channel quantized.
  int32_t kernel_input_channel = getShape(_kernel).Dims(3);
  // zero_points comes from flatbuffer vector. Its size is within uint32_t range.
  size_t kernel_zerop_cnt = _kernel->data_scales().size();
  // promote to int64_t to compare int32_t and uint32_t
  if ((int64_t)kernel_input_channel != (int64_t)kernel_zerop_cnt)
    throw std::runtime_error{"DConv2D hybrid supports only per-channel quantized weight."};
}

void DepthwiseConvolutionLayer::configure(
  const IPortableTensor *input, const IPortableTensor *kernel, const IPortableTensor *bias,
  const uint32_t paddingLeft, const uint32_t paddingRight, const uint32_t paddingTop,
  const uint32_t paddingBottom, const uint32_t strideWidth, const uint32_t strideHeight,
  const uint32_t multiplier, const uint32_t dilationWidth, const uint32_t dilationHeight,
  const ir::Activation activation, IPortableTensor *output,
  const std::shared_ptr<ExternalContext> &external_context)
{
  _input = input;
  _kernel = kernel;
  _bias = bias;
  _paddingLeft = paddingLeft;
  _paddingRight = paddingRight;
  _paddingTop = paddingTop;
  _paddingBottom = paddingBottom;
  _strideWidth = strideWidth;
  _strideHeight = strideHeight;
  _multiplier = multiplier;
  _dilationWidth = dilationWidth;
  _dilationHeight = dilationHeight;
  _activation = activation;
  _output = output;
  _external_context = external_context;
  _is_hybrid = _input->data_type() == OperandType::FLOAT32 &&
               _kernel->data_type() == OperandType::QUANT_INT8_SYMM;

  if (_is_hybrid)
  {
    ensureQ8iHybridPerChannel();
    prepareQ8iHybridPerChannel();
    _prepared = true;
  }
  else if (_input->data_type() == OperandType::FLOAT32)
  {
    prepareF32();
  }
  else if (_input->data_type() == OperandType::QUANT_INT8_ASYMM)
  {
    if (_kernel->is_constant() && !_input->is_dynamic() && !_output->is_dynamic())
    {
      prepareQ8i();
      _prepared = true;
    }
  }
  else if (_input->data_type() == OperandType::QUANT_UINT8_ASYMM && _kernel->is_constant() &&
           !_input->is_dynamic() && !_output->is_dynamic())
  {
    const bool per_channel_quantized = _kernel->data_scales().size() > 1;
    if (per_channel_quantized)
    {
      prepareQ8uPerChannel();
      _prepared = true;
    }
  }
}

void DepthwiseConvolutionLayer::run()
{
  if (_is_hybrid)
  {
    convQ8iHybridPerChannel();
  }
  else if (_input->data_type() == OperandType::FLOAT32)
  {
    convFloat32();
  }
  else if (_input->data_type() == OperandType::QUANT_UINT8_ASYMM)
  {
    const bool per_channel_quantized = _kernel->data_scales().size() > 1;
    if (per_channel_quantized)
      convQ8uPerChannel();
    else
      convQ8uPerTensor();
  }
  else if (_input->data_type() == OperandType::QUANT_INT8_ASYMM)
  {
    convQ8i();
  }
  else
  {
    throw std::runtime_error{"DepthwiseConv: unsupported data type"};
  }
}

} // namespace ops
} // namespace cpu
} // namespace backend
} // namespace onert