ONE - On-device Neural Engine
Reduce.h
/*
 * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
 * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef __NNFW_CKER_REDUCE_H__
#define __NNFW_CKER_REDUCE_H__

21#include "cker/Shape.h"
22#include "cker/Types.h"
23#include "cker/Utils.h"
25
namespace nnfw
{
namespace cker
{

// A generic reduce method that can be used for reduce_sum, reduce_mean, etc.
// This method iterates through the input data and reduces elements along the
// dimensions given in axis.

#ifdef USE_NEON
inline void OptimizedReduceSum(const float *input_data, const Shape &input_shape,
                               float *output_data)
{
  const auto input_dims = input_shape.DimsData();
  const auto input_num_dims = input_shape.DimensionsCount();

  int input_size = 1;
  int reduce_size = 0;
  for (int idx = 0; idx < input_num_dims - 1; idx++)
  {
    input_size *= input_dims[idx];
  }
  reduce_size = input_dims[input_num_dims - 1];
  int offset = 0;
  for (int idx = 0; idx < input_size; idx++)
  {
    int r_idx = 0;
    float tmp_data[4] = {
      0,
    };
    float32x4_t tmp_data_32x4 = vld1q_f32(tmp_data);
    for (; r_idx <= reduce_size - 32; r_idx += 32)
    {
      float32x4_t a10 = vld1q_f32(input_data + offset + r_idx);
      float32x4_t a11 = vld1q_f32(input_data + offset + r_idx + 4);
      float32x4_t a12 = vld1q_f32(input_data + offset + r_idx + 8);
      float32x4_t a13 = vld1q_f32(input_data + offset + r_idx + 12);
      float32x4_t a20 = vld1q_f32(input_data + offset + r_idx + 16);
      float32x4_t a21 = vld1q_f32(input_data + offset + r_idx + 20);
      float32x4_t a22 = vld1q_f32(input_data + offset + r_idx + 24);
      float32x4_t a23 = vld1q_f32(input_data + offset + r_idx + 28);

      float32x4_t x0 = vaddq_f32(a10, a20);
      float32x4_t x1 = vaddq_f32(a11, a21);
      float32x4_t x2 = vaddq_f32(a12, a22);
      float32x4_t x3 = vaddq_f32(a13, a23);

      float32x4_t y0 = vaddq_f32(x0, x1);
      float32x4_t y1 = vaddq_f32(x2, x3);
      float32x4_t y2 = vaddq_f32(y0, y1);
      tmp_data_32x4 = vaddq_f32(tmp_data_32x4, y2);
    }
    for (; r_idx <= reduce_size - 16; r_idx += 16)
    {
      float32x4_t a10 = vld1q_f32(input_data + offset + r_idx);
      float32x4_t a11 = vld1q_f32(input_data + offset + r_idx + 4);
      float32x4_t a12 = vld1q_f32(input_data + offset + r_idx + 8);
      float32x4_t a13 = vld1q_f32(input_data + offset + r_idx + 12);

      float32x4_t x0 = vaddq_f32(a10, a11);
      float32x4_t x1 = vaddq_f32(a12, a13);

      float32x4_t y0 = vaddq_f32(x0, x1);
      tmp_data_32x4 = vaddq_f32(tmp_data_32x4, y0);
    }
    for (; r_idx <= reduce_size - 8; r_idx += 8)
    {
      float32x4_t a1 = vld1q_f32(input_data + offset + r_idx);
      float32x4_t a2 = vld1q_f32(input_data + offset + r_idx + 4);
      float32x4_t x = vaddq_f32(a1, a2);
      tmp_data_32x4 = vaddq_f32(tmp_data_32x4, x);
    }
    vst1q_f32(tmp_data, tmp_data_32x4);
    output_data[idx] = tmp_data[0] + tmp_data[1] + tmp_data[2] + tmp_data[3];

    for (; r_idx < reduce_size; r_idx++)
    {
      if (r_idx == 0)
      {
        output_data[idx] = input_data[offset];
      }
      else
      {
        output_data[idx] += input_data[offset + r_idx];
      }
    }
    offset += reduce_size;
  }
}
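
// For reference, a scalar sketch of what OptimizedReduceSum computes, assuming the
// reduction runs over the innermost dimension (an illustrative restatement, not part of
// the library API):
//
//   for (int idx = 0; idx < input_size; ++idx)
//   {
//     float acc = 0.0f;
//     for (int r_idx = 0; r_idx < reduce_size; ++r_idx)
//       acc += input_data[idx * reduce_size + r_idx];
//     output_data[idx] = acc;
//   }
//
// The NEON version above unrolls this inner loop in blocks of 32, 16 and 8 floats and
// finishes the remaining elements with the scalar tail loop.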
#endif // USE_NEON

template <typename In, typename Out>
inline bool ReduceImpl(const In *input_data, const Shape &input_shape, const Shape &,
                       const int *axis, const int num_axis, int *input_iter,
                       Out reducer(const Out current, const In in), Out *output_data)
{
  const auto input_dims = input_shape.DimsData();
  const auto input_num_dims = input_shape.DimensionsCount();

  // Fast path: reduction over the innermost dimension only.
  if (num_axis == 1 && axis[0] == input_num_dims - 1)
  {
    int input_size = 1;
    int reduce_size = 0;
    for (int idx = 0; idx < input_num_dims - 1; idx++)
    {
      input_size *= input_dims[idx];
    }
    reduce_size = input_dims[input_num_dims - 1];
    for (int idx = 0; idx < input_size; idx++)
    {
      for (int r_idx = 0; r_idx < reduce_size; r_idx++)
      {
        if (r_idx == 0)
        {
          output_data[idx] = input_data[idx * reduce_size];
        }
        else
        {
          output_data[idx] = reducer(output_data[idx], input_data[idx * reduce_size + r_idx]);
        }
      }
    }
    return true;
  }

  // Reset input iterator.
  for (int idx = 0; idx < input_num_dims; ++idx)
  {
    input_iter[idx] = 0;
  }
  // Iterate through input_data.
  do
  {
    size_t input_offset = ReducedOutputOffset(input_num_dims, input_dims, input_iter, 0, nullptr);
    size_t output_offset =
      ReducedOutputOffset(input_num_dims, input_dims, input_iter, num_axis, axis);
    output_data[output_offset] = reducer(output_data[output_offset], input_data[input_offset]);
  } while (NextIndex(input_num_dims, input_dims, input_iter));
  return true;
}
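
// Illustrative usage of ReduceImpl (comment-only sketch; the names below are
// hypothetical and not part of this header). The output buffer must already hold the
// reduction's init value, e.g. via InitTensorDataForReduce below:
//
//   float SumReducer(const float current, const float in) { return current + in; }
//   ...
//   std::vector<int> input_iter(input_shape.DimensionsCount());
//   int axis[] = {1};
//   ReduceImpl<float, float>(input_data, input_shape, output_shape, axis, /*num_axis=*/1,
//                            input_iter.data(), SumReducer, output_data);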

// This method parses the input 'axis' to remove duplicates and handle negative
// values, and returns a valid 'out_axis'.
inline bool ResolveAxis(const int num_dims, const std::vector<int> &axes, int *out_axis,
                        int *out_num_axis)
{
  auto num_axis = axes.size();
  auto axis = axes.data();

  *out_num_axis = 0; // Just in case.
  // Short-circuit axis resolution for scalars; the axis will go unused.
  if (num_dims == 0)
  {
    return true;
  }
  // O(n^2) is fine since out_num_axis should be really small, mostly <= 4
  for (size_t idx = 0; idx < num_axis; ++idx)
  {
    // Handle negative index. A positive index 'p_idx' can be represented as a
    // negative index 'n_idx' as: n_idx = p_idx - num_dims
    // e.g. for num_dims=3, [0, 1, 2] is the same as [-3, -2, -1]
    int current = axis[idx] < 0 ? (axis[idx] + num_dims) : axis[idx];
    assert(current >= 0 && current < num_dims);
    bool is_dup = false;
    for (int j = 0; j < *out_num_axis; ++j)
    {
      if (out_axis[j] == current)
      {
        is_dup = true;
        break;
      }
    }
    if (!is_dup)
    {
      out_axis[*out_num_axis] = current;
      *out_num_axis += 1;
    }
  }
  return true;
}
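
// Example (illustrative): with num_dims = 3 and axes = {-1, 2, 0}, the negative index -1
// resolves to 3 + (-1) = 2, the duplicate 2 is then dropped, and the call yields
// out_axis = {2, 0} with *out_num_axis = 2.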

template <typename T>
inline bool InitTensorDataForReduce(const Shape &shape, const T init_value, T *data)
{
  const auto dims = shape.DimsData();
  const auto num_dims = shape.DimensionsCount();
  size_t num_elements = 1;
  for (int idx = 0; idx < num_dims; ++idx)
  {
    size_t current = static_cast<size_t>(dims[idx]);
    // Overflow prevention.
    if (num_elements > std::numeric_limits<size_t>::max() / current)
    {
      return false;
    }
    num_elements *= current;
  }
  for (size_t idx = 0; idx < num_elements; ++idx)
  {
    data[idx] = init_value;
  }
  return true;
}
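
// Example (illustrative): for a shape of {2, 3, 4}, num_elements is 2 * 3 * 4 = 24, so
// the first 24 entries of 'data' are set to init_value; the overflow check only rejects
// shapes whose element count would not fit in size_t.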

class Reduce
{
public:
  Reduce() : _temp_index(), _resolved_axis(), _prepared(false) {}

  void prepare(size_t temp_index_size, size_t resolved_axis_size)
  {
    if (_prepared)
      return;

    // prepare space for temp_index and resolved_axis
    if (temp_index_size > kMaxSmallSize)
      _temp_index.resize(temp_index_size);
    if (resolved_axis_size > kMaxSmallSize)
      _resolved_axis.resize(resolved_axis_size);
    _prepared = true;
  }
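
  // Design note: when the requested sizes do not exceed kMaxSmallSize (4), no heap
  // allocation happens and the fixed-size members _temp_index_small/_resolved_axis_small
  // are used instead; resolved_axis_data() and temp_index_data() below return whichever
  // buffer is active.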

  // Computes the generic value (i.e., sum/max/min/prod) of elements across
  // dimensions given in axis. The caller passes in init_value and a reducer.
  template <typename T>
  inline bool ReduceGeneric(const Shape &input_shape, const T *input_data,
                            const Shape &output_shape, T *output_data, const std::vector<int> &axes,
                            bool, T init_value, T reducer(const T current, const T in))
  {
    // Reset output data.
    if (!InitTensorDataForReduce(output_shape, init_value, output_data))
    {
      return false;
    }

    // Resolve axis.
    int num_resolved_axis = 0;
    if (!ResolveAxis(input_shape.DimensionsCount(), axes, resolved_axis_data(), &num_resolved_axis))
    {
      return false;
    }

    return ReduceImpl<T, T>(input_data, input_shape, output_shape, resolved_axis_data(),
                            num_resolved_axis, temp_index_data(), reducer, output_data);
  }
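
  // Illustrative usage (comment-only sketch; the variable names are hypothetical, not
  // part of this header): summing a float tensor over axis 1 with a captureless lambda
  // as the reducer:
  //
  //   const std::vector<int> axes{1};
  //   nnfw::cker::Reduce reduce_op;
  //   reduce_op.prepare(input_shape.DimensionsCount(), axes.size());
  //   bool ok = reduce_op.ReduceGeneric<float>(
  //     input_shape, input_data, output_shape, output_data, axes,
  //     /*keep_dims=*/false, /*init_value=*/0.0f,
  //     [](const float current, const float in) { return current + in; });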

  // Computes the mean of elements across dimensions given in axis.
  // It does so in two stages: first it calculates the sum of elements along the axis,
  // then it divides that sum by the number of elements in the axis, for quantized values.
  template <typename T, typename U>
  inline bool QuantizedMeanOrSum(const T *input_data, int32_t input_zero_point, float input_scale,
                                 const Shape &input_shape, T *output_data,
                                 int32_t output_zero_point, float output_scale,
                                 const Shape &output_shape, const std::vector<int> &axes,
                                 bool /*keep_dims*/, U *temp_sum, bool compute_sum,
                                 U reducer(const U current, const T in))
  {
    // Reset output data.
    size_t num_outputs = 1;
    for (int idx = 0; idx < output_shape.DimensionsCount(); ++idx)
    {
      size_t current = static_cast<size_t>(output_shape.Dims(idx));
      // Overflow prevention.
      if (num_outputs > std::numeric_limits<size_t>::max() / current)
      {
        return false;
      }
      num_outputs *= current;
    }
    for (size_t idx = 0; idx < num_outputs; ++idx)
    {
      output_data[idx] = T();
      temp_sum[idx] = U();
    }

    // Resolve axis.
    int num_resolved_axis = 0;
    if (!ResolveAxis(input_shape.DimensionsCount(), axes, resolved_axis_data(), &num_resolved_axis))
    {
      return false;
    }

    if (!ReduceImpl<T, U>(input_data, input_shape, output_shape, resolved_axis_data(),
                          num_resolved_axis, temp_index_data(), reducer, temp_sum))
    {
      return false;
    }

    // Calculate the mean by dividing output_data by the number of aggregated elements.
    size_t num_elements_in_axis = 1;
    for (int idx = 0; idx < num_resolved_axis; ++idx)
    {
      size_t current = static_cast<size_t>(input_shape.Dims(resolved_axis_data()[idx]));
      // Overflow prevention.
      if (current > static_cast<size_t>(std::numeric_limits<size_t>::max() / num_elements_in_axis))
      {
        return false;
      }
      num_elements_in_axis *= current;
    }

    if (num_elements_in_axis > 0)
    {
      const float scale = input_scale / output_scale;
      if (compute_sum)
      {
        // TODO(b/116341117): Eliminate float and do this completely in 8bit.
        const float bias = -input_zero_point * scale * num_elements_in_axis;
        for (size_t idx = 0; idx < num_outputs; ++idx)
        {
          const U value =
            static_cast<U>(std::round(temp_sum[idx] * scale + bias)) + output_zero_point;
          output_data[idx] = static_cast<T>(value);
        }
      }
      else
      {
        const float bias = -input_zero_point * scale;
        for (size_t idx = 0; idx < num_outputs; ++idx)
        {
          float float_mean =
            static_cast<float>(temp_sum[idx]) / static_cast<float>(num_elements_in_axis);
          float result = std::min(std::round(float_mean * scale + bias) + output_zero_point,
                                  static_cast<float>(std::numeric_limits<T>::max()));
          result = std::max(result, static_cast<float>(std::numeric_limits<T>::min()));
          output_data[idx] = static_cast<T>(result);
        }
      }
    }
    return true;
  }
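
  // Worked example (illustrative numbers, not from the source): with input_scale = 0.5f,
  // output_scale = 1.0f, input_zero_point = 0, output_zero_point = 0, a reduction over
  // 4 elements and temp_sum[idx] = 24, the mean path computes float_mean = 24 / 4 = 6 and
  // stores round(6 * (0.5f / 1.0f) + 0) + 0 = 3, clamped to the representable range of T.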

  inline int32_t *resolved_axis_data(void)
  {
    return _resolved_axis.size() ? _resolved_axis.data() : _resolved_axis_small;
  }
  inline int32_t *temp_index_data(void)
  {
    return _temp_index.size() ? _temp_index.data() : _temp_index_small;
  }

private:
  std::vector<int> _temp_index;
  std::vector<int> _resolved_axis;
  bool _prepared;
  static constexpr int kMaxSmallSize = 4;
  int _temp_index_small[kMaxSmallSize];
  int _resolved_axis_small[kMaxSmallSize];
};

} // namespace cker
} // namespace nnfw

#endif // __NNFW_CKER_REDUCE_H__