ONE - On-device Neural Engine
Loading...
Searching...
No Matches
Conv.h
Go to the documentation of this file.
1/*
2 * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
3 * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18#ifndef __NNFW_CKER_OPTIMIZED_CONV_H__
19#define __NNFW_CKER_OPTIMIZED_CONV_H__
20
21#include "OptimizedUtils.h"
22
24#include "cker/eigen/Utils.h"
28#include "cker/Shape.h"
29#include "cker/Types.h"
30
31#include <public/gemmlowp.h>
32#include <public/map.h>
33#include <fixedpoint/fixedpoint.h>
34
35#include <vector>
36#include <tuple>
37
38namespace nnfw
39{
40namespace cker
41{
42namespace optimized
43{
44
// Serializes access to the shared gemmlowp context used by the quantized
// Conv below (see the lock_guard there).
// 'inline' (C++17, already used in this file via [[maybe_unused]]) ensures a
// single definition when this header is included from multiple translation
// units; a plain global definition in a header is an ODR violation.
inline std::mutex _gemmlowp_mutex;
46
48{
49 typedef gemmlowp::VectorMap<const int32_t, gemmlowp::VectorShape::Col> ColVectorMap;
50 typedef std::tuple<gemmlowp::OutputStageBiasAddition<ColVectorMap>,
51 gemmlowp::OutputStageScaleInt32ByFixedPointAndExponent,
52 gemmlowp::OutputStageClamp, gemmlowp::OutputStageSaturatingCastToUint8>
54 static Pipeline MakeExp(const int32_t *bias_data, int output_rows, int32_t output_offset,
55 int32_t output_multiplier, int output_left_shift,
56 int32_t output_activation_min, int32_t output_activation_max)
57 {
58 ColVectorMap bias_vector(bias_data, output_rows);
59 gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_addition_stage;
60 bias_addition_stage.bias_vector = bias_vector;
61 gemmlowp::OutputStageScaleInt32ByFixedPointAndExponent quantize_down_stage;
62 quantize_down_stage.result_offset_after_shift = output_offset;
63 quantize_down_stage.result_fixedpoint_multiplier = output_multiplier;
64 quantize_down_stage.result_exponent = output_left_shift;
65 gemmlowp::OutputStageClamp clamp_stage;
66 clamp_stage.min = output_activation_min;
67 clamp_stage.max = output_activation_max;
68 gemmlowp::OutputStageSaturatingCastToUint8 saturating_cast_stage;
69 return std::make_tuple(bias_addition_stage, quantize_down_stage, clamp_stage,
70 saturating_cast_stage);
71 }
72};
73
74inline void AddBiasAndEvalActivationFunction(float output_activation_min,
75 float output_activation_max, const Shape &bias_shape,
76 const float *bias_data, const Shape &array_shape,
77 float *array_data)
78{
79 BiasAndClamp(output_activation_min, output_activation_max, bias_shape.FlatSize(), bias_data,
80 array_shape.FlatSize(), array_data);
81}
82
// Quantized uint8 2-D convolution, lowered to one gemmlowp GEMM.
//
// When dilation, non-unit strides, or a non-1x1 kernel is involved, the input
// is first rewritten into column form ((dilated) im2col) into the
// caller-provided im2col_data buffer; otherwise the input feeds the GEMM
// directly. The GEMM output pipeline fuses bias addition, fixed-point
// requantization, clamping to the quantized activation range, and a
// saturating cast back to uint8 (see GemmlowpOutputPipeline::MakeExp).
//
// bias_shape is only read by an assert() below, hence [[maybe_unused]].
inline void Conv(const ConvParams &params, const Shape &input_shape, const uint8_t *input_data,
                 const Shape &filter_shape, const uint8_t *filter_data,
                 [[maybe_unused]] const Shape &bias_shape, const int32_t *bias_data,
                 const Shape &output_shape, uint8_t *output_data, const Shape &im2col_shape,
                 uint8_t *im2col_data)
{
  gemmlowp::GemmContext *gemm_context = gemm_support::GetGemmLowpContext();

  const int stride_width = params.stride_width;
  const int stride_height = params.stride_height;
  const int dilation_width_factor = params.dilation_width_factor;
  const int dilation_height_factor = params.dilation_height_factor;
  const int32_t input_offset = params.input_offset;
  const int32_t filter_offset = params.weights_offset;
  const int32_t output_offset = params.output_offset;
  const int32_t output_multiplier = params.output_multiplier;
  const int output_shift = params.output_shift;
  const int32_t output_activation_min = params.quantized_activation_min;
  const int32_t output_activation_max = params.quantized_activation_max;
  // All tensors are expected in 4-D (NHWC-style) layout.
  assert(input_shape.DimensionsCount() == 4);
  assert(filter_shape.DimensionsCount() == 4);
  assert(output_shape.DimensionsCount() == 4);

  // Select the matrix that feeds the GEMM: an im2col buffer, or the input
  // tensor itself when the convolution is already a plain matmul.
  const uint8_t *gemm_input_data = nullptr;
  const Shape *gemm_input_shape = nullptr;
  const int filter_width = filter_shape.Dims(2);
  const int filter_height = filter_shape.Dims(1);
  const bool need_dilated_im2col = dilation_width_factor != 1 || dilation_height_factor != 1;
  const bool need_im2col =
    stride_width != 1 || stride_height != 1 || filter_width != 1 || filter_height != 1;
  if (need_dilated_im2col)
  {
    assert(im2col_data);
    // The negated input_offset is used as the byte that fills padding; it
    // must be a valid uint8 value (asserted below).
    const int input_zero_point = -input_offset;
    assert(input_zero_point >= 0);
    assert(input_zero_point <= 255);
    DilatedIm2col(params, input_zero_point, input_shape, input_data, filter_shape, output_shape,
                  im2col_data);
    gemm_input_data = im2col_data;
    gemm_input_shape = &im2col_shape;
  }
  else if (need_im2col)
  {
    assert(im2col_data);
    const int input_zero_point = -input_offset;
    assert(input_zero_point >= 0);
    assert(input_zero_point <= 255);
    Im2col(params, filter_height, filter_width, input_zero_point, input_shape, input_data,
           im2col_shape, im2col_data);
    gemm_input_data = im2col_data;
    gemm_input_shape = &im2col_shape;
  }
  else
  {
    // 1x1 kernel, unit strides, no dilation: no data rearrangement needed.
    gemm_input_data = input_data;
    gemm_input_shape = &input_shape;
  }

  // GEMM dimensions: rows = channels (innermost dim), cols = batch*spatial.
  const int gemm_input_rows = gemm_input_shape->Dims(3);
  // Using FlatSizeSkipDim causes segfault in some contexts (see b/79927784).
  // The root cause has not yet been identified though. Same applies below for
  // the other calls commented out. This is a partial rollback of cl/196819423.
  // const int gemm_input_cols = FlatSizeSkipDim(*gemm_input_shape, 3);
  const int gemm_input_cols =
    gemm_input_shape->Dims(0) * gemm_input_shape->Dims(1) * gemm_input_shape->Dims(2);
  const int filter_rows = filter_shape.Dims(0);
  // See b/79927784.
  // const int filter_cols = FlatSizeSkipDim(filter_shape, 0);
  const int filter_cols = filter_shape.Dims(1) * filter_shape.Dims(2) * filter_shape.Dims(3);
  const int output_rows = output_shape.Dims(3);
  // See b/79927784.
  // const int output_cols = FlatSizeSkipDim(output_shape, 3);
  const int output_cols = output_shape.Dims(0) * output_shape.Dims(1) * output_shape.Dims(2);
  assert(output_rows == filter_rows);
  assert(output_cols == gemm_input_cols);
  assert(filter_cols == gemm_input_rows);
  assert(bias_shape.FlatSize() == output_rows);
  gemmlowp::MatrixMap<const uint8_t, gemmlowp::MapOrder::RowMajor> filter_matrix(
    filter_data, filter_rows, filter_cols);
  gemmlowp::MatrixMap<const uint8_t, gemmlowp::MapOrder::ColMajor> input_matrix(
    gemm_input_data, gemm_input_rows, gemm_input_cols);
  gemmlowp::MatrixMap<uint8_t, gemmlowp::MapOrder::ColMajor> output_matrix(output_data, output_rows,
                                                                           output_cols);
  // Fused epilogue: bias add -> requantize -> clamp -> saturating uint8 cast.
  const auto &output_pipeline =
    GemmlowpOutputPipeline::MakeExp(bias_data, output_rows, output_offset, output_multiplier,
                                    output_shift, output_activation_min, output_activation_max);

  // Serialize use of the shared gemmlowp context across concurrent Conv
  // invocations (see _gemmlowp_mutex above).
  std::lock_guard<std::mutex> lock_guard(_gemmlowp_mutex);
  gemmlowp::GemmWithOutputPipeline<uint8_t, uint8_t, gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
    gemm_context, filter_matrix, input_matrix, &output_matrix, filter_offset, input_offset,
    output_pipeline);
}
175
176} // namespace optimized
177
178namespace multithreaded
179{
180namespace
181{
182template <class T> class EigenTensorConvFunctor
183{
184private:
185 Eigen::PaddingType RuntimePadding2EigenPadding(PaddingType padding)
186 {
187 switch (padding)
188 {
190 return Eigen::PADDING_VALID;
192 return Eigen::PADDING_SAME;
194 assert(false); // should never get here.
195 return Eigen::PADDING_VALID;
196 }
197 return Eigen::PADDING_SAME; // Prevent compiler warning about missing
198 // return
199 }
200
201public:
202 void operator()(const Eigen::ThreadPoolDevice &device, const T *input_data, int input_batches,
203 int input_height, int input_width, int input_depth, const T *filter_data,
204 int filter_height, int filter_width, int filter_count, int stride_rows,
205 int stride_cols, int pad_height, int pad_width, nnfw::cker::PaddingType padding,
206 T *output_data, int output_height, int output_width)
207 {
208 const bool is_1x1_kernel =
209 (filter_height == 1 && filter_width == 1 && stride_rows == 1 && stride_cols == 1);
210 const bool is_same_height_width =
211 (filter_height == input_height && filter_width == input_width && pad_width == 0 &&
212 pad_height == 0);
213 if (is_1x1_kernel || is_same_height_width)
214 {
215 // is_1x1_kernel: For 1x1 kernel, the 2D convolution is reduced to matrix multiplication.
216 // - output (input_batches * conv_width, filter_count)
217 // - input (input_batches * conv_width, input_depth)
218 // - filter (input_depth, filter_count)
219 // is_same_height_width: If the input data and filter have the same height/width, the 2D
220 // convolution is reduced to matrix multiplication.
221 // - output (input_batches, filter_count)
222 // - input (input_batches, filter_width * filter_height * input_depth)
223 // - filter (filter_width * filter_height * input_depth, filter_count)
224 const int conv_width = output_height * output_width;
225 int io_col = input_batches;
226 int filter_col = input_depth * filter_width * filter_height;
227 if (is_1x1_kernel)
228 {
229 io_col *= conv_width;
230 }
231 Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair;
232 dim_pair[0] = Eigen::IndexPair<Eigen::DenseIndex>(1, 0);
233 eigen_support::EigenMatrix output(output_data, io_col, filter_count);
234 eigen_support::ConstEigenMatrix input(input_data, io_col, filter_col);
235 eigen_support::ConstEigenMatrix filter(filter_data, filter_col, filter_count);
237 dim_pair);
238 }
239 else
240 {
241 eigen_support::EigenTensor output(output_data, input_batches, output_height, output_width,
242 filter_count);
243 eigen_support::ConstEigenTensor input(input_data, input_batches, input_height, input_width,
244 input_depth);
245 eigen_support::ConstEigenTensor filter(filter_data, filter_height, filter_width, input_depth,
246 filter_count);
247 output.device(device) = Eigen::SpatialConvolution(input, filter, stride_cols, stride_rows,
248 RuntimePadding2EigenPadding(padding));
249 }
250 }
251};
252} // namespace
253
254inline void Conv(const ConvParams &params, const Shape &input_shape, const float *input_data,
255 const Shape &filter_shape, const float *filter_data, const Shape &bias_shape,
256 const float *bias_data, const Shape &output_shape, float *output_data)
257{
258 const Eigen::ThreadPoolDevice &device = *eigen_support::GetThreadPoolDevice();
259
260 const int stride_width = params.stride_width;
261 const int stride_height = params.stride_height;
262 const PaddingType padding = params.padding_type;
263 const int pad_width = params.padding_values.width;
264 const int pad_height = params.padding_values.height;
265 const float output_activation_min = params.float_activation_min;
266 const float output_activation_max = params.float_activation_max;
267 assert(input_shape.DimensionsCount() == 4);
268 assert(filter_shape.DimensionsCount() == 4);
269 assert(output_shape.DimensionsCount() == 4);
270
271 const int batches = MatchingDim(input_shape, 0, output_shape, 0);
272 const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
273 const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
274 const int input_height = input_shape.Dims(1);
275 const int input_width = input_shape.Dims(2);
276 const int filter_height = filter_shape.Dims(1);
277 const int filter_width = filter_shape.Dims(2);
278 const int output_height = output_shape.Dims(1);
279 const int output_width = output_shape.Dims(2);
280
281 EigenTensorConvFunctor<float> conv_functor;
282 conv_functor(device, input_data, batches, input_height, input_width, input_depth, filter_data,
283 filter_height, filter_width, output_depth, stride_height, stride_width, pad_height,
284 pad_width, padding, output_data, output_height, output_width);
285
286 optimized::AddBiasAndEvalActivationFunction(output_activation_min, output_activation_max,
287 bias_shape, bias_data, output_shape, output_data);
288}
289
290} // namespace multithreaded
291} // namespace cker
292} // namespace nnfw
293
294#endif // __NNFW_CKER_OPTIMIZED_CONV_H__
int32_t DimensionsCount() const
Definition Shape.h:91
int32_t Dims(int i) const
Definition Shape.h:92
int FlatSize() const
Definition Shape.h:181
const luci_interpreter::RuntimeShape output_shape
Eigen::TensorMap< Eigen::Tensor< const float, 2, Eigen::RowMajor, Eigen::DenseIndex >, Eigen::Aligned > ConstEigenMatrix
Eigen::TensorMap< Eigen::Tensor< float, 4, Eigen::RowMajor, Eigen::DenseIndex >, Eigen::Aligned > EigenTensor
Eigen::TensorMap< Eigen::Tensor< float, 2, Eigen::RowMajor, Eigen::DenseIndex >, Eigen::Aligned > EigenMatrix
const Eigen::ThreadPoolDevice * GetThreadPoolDevice()
Eigen::TensorMap< Eigen::Tensor< const float, 4, Eigen::RowMajor, Eigen::DenseIndex >, Eigen::Aligned > ConstEigenTensor
gemmlowp::GemmContext * GetGemmLowpContext()
Definition GEMMSupport.h:57
void DilatedIm2col(const ConvParams &params, const Shape &input_shape, const T *input_data, const Shape &filter_shape, const Shape &output_shape, T *im2col_data, const int32_t *zero_bytes, const int zero_bytes_len)
std::mutex _gemmlowp_mutex
Definition Conv.h:45
void Im2col(const ConvParams &params, int kheight, int kwidth, uint8_t zero_byte, const Shape &input_shape, const T *input_data, const Shape &output_shape, T *output_data)
void AddBiasAndEvalActivationFunction(float output_activation_min, float output_activation_max, const Shape &bias_shape, const float *bias_data, const Shape &array_shape, float *array_data)
Definition Conv.h:74
int MatchingDim(const Shape &shape1, int index1, const Shape &shape2, int index2)
Definition Shape.h:220
void BiasAndClamp(float clamp_min, float clamp_max, int bias_size, const float *bias_data, int array_size, float *array_data)
Definition Common.h:29
PaddingType
Definition Types.h:41
Definition topk_v2.h:30
int16_t stride_height
Definition Types.h:146
PaddingValues padding_values
Definition Types.h:143
float float_activation_max
Definition Types.h:161
int32_t output_multiplier
Definition Types.h:154
int32_t weights_offset
Definition Types.h:152
int32_t output_offset
Definition Types.h:153
int16_t dilation_width_factor
Definition Types.h:147
float float_activation_min
Definition Types.h:160
int32_t quantized_activation_max
Definition Types.h:158
PaddingType padding_type
Definition Types.h:142
int16_t dilation_height_factor
Definition Types.h:148
int32_t quantized_activation_min
Definition Types.h:157
std::tuple< gemmlowp::OutputStageBiasAddition< ColVectorMap >, gemmlowp::OutputStageScaleInt32ByFixedPointAndExponent, gemmlowp::OutputStageClamp, gemmlowp::OutputStageSaturatingCastToUint8 > Pipeline
Definition Conv.h:53
gemmlowp::VectorMap< const int32_t, gemmlowp::VectorShape::Col > ColVectorMap
Definition Conv.h:49
static Pipeline MakeExp(const int32_t *bias_data, int output_rows, int32_t output_offset, int32_t output_multiplier, int output_left_shift, int32_t output_activation_min, int32_t output_activation_max)
Definition Conv.h:54