#ifndef __NNFW_CKER_REDUCE_H__
#define __NNFW_CKER_REDUCE_H__

#include "cker/Shape.h"
#include "cker/Utils.h" // ReducedOutputOffset, NextIndex
#include "cker/neon/neon_check.h"

#include <cassert>
#include <cmath>
#include <cstdint>
#include <limits>
#include <vector>

namespace nnfw
{
namespace cker
{
#ifdef USE_NEON
// Fast path for summing a float tensor over its innermost dimension.
inline void OptimizedReduceSum(const float *input_data, const Shape &input_shape,
                               float *output_data)
{
  const auto input_dims = input_shape.DimsData();
  const auto input_num_dims = input_shape.DimensionsCount();
  int input_size = 1;
  int reduce_size = 0;
  for (int idx = 0; idx < input_num_dims - 1; idx++)
  {
    input_size *= input_dims[idx];
  }
  reduce_size = input_dims[input_num_dims - 1];
  for (int idx = 0; idx < input_size; idx++)
  {
    const int offset = idx * reduce_size;
    int r_idx = 0;
    float tmp_data[4] = {0.f, 0.f, 0.f, 0.f};
    float32x4_t tmp_data_32x4 = vld1q_f32(tmp_data);
    // Sum 32 floats per iteration using independent accumulator chains.
    for (; r_idx <= reduce_size - 32; r_idx += 32)
    {
      float32x4_t a10 = vld1q_f32(input_data + offset + r_idx);
      float32x4_t a11 = vld1q_f32(input_data + offset + r_idx + 4);
      float32x4_t a12 = vld1q_f32(input_data + offset + r_idx + 8);
      float32x4_t a13 = vld1q_f32(input_data + offset + r_idx + 12);
      float32x4_t a20 = vld1q_f32(input_data + offset + r_idx + 16);
      float32x4_t a21 = vld1q_f32(input_data + offset + r_idx + 20);
      float32x4_t a22 = vld1q_f32(input_data + offset + r_idx + 24);
      float32x4_t a23 = vld1q_f32(input_data + offset + r_idx + 28);

      float32x4_t x0 = vaddq_f32(a10, a20);
      float32x4_t x1 = vaddq_f32(a11, a21);
      float32x4_t x2 = vaddq_f32(a12, a22);
      float32x4_t x3 = vaddq_f32(a13, a23);

      float32x4_t y0 = vaddq_f32(x0, x1);
      float32x4_t y1 = vaddq_f32(x2, x3);
      float32x4_t y2 = vaddq_f32(y0, y1);
      tmp_data_32x4 = vaddq_f32(tmp_data_32x4, y2);
    }
    for (; r_idx <= reduce_size - 16; r_idx += 16)
    {
      float32x4_t a10 = vld1q_f32(input_data + offset + r_idx);
      float32x4_t a11 = vld1q_f32(input_data + offset + r_idx + 4);
      float32x4_t a12 = vld1q_f32(input_data + offset + r_idx + 8);
      float32x4_t a13 = vld1q_f32(input_data + offset + r_idx + 12);

      float32x4_t x0 = vaddq_f32(a10, a11);
      float32x4_t x1 = vaddq_f32(a12, a13);

      float32x4_t y0 = vaddq_f32(x0, x1);
      tmp_data_32x4 = vaddq_f32(tmp_data_32x4, y0);
    }
    for (; r_idx <= reduce_size - 8; r_idx += 8)
    {
      float32x4_t a1 = vld1q_f32(input_data + offset + r_idx);
      float32x4_t a2 = vld1q_f32(input_data + offset + r_idx + 4);
      float32x4_t x = vaddq_f32(a1, a2);
      tmp_data_32x4 = vaddq_f32(tmp_data_32x4, x);
    }
    vst1q_f32(tmp_data, tmp_data_32x4);
    output_data[idx] = tmp_data[0] + tmp_data[1] + tmp_data[2] + tmp_data[3];
    // Accumulate the remaining (reduce_size % 8) elements one by one.
    for (; r_idx < reduce_size; r_idx++)
    {
      output_data[idx] += input_data[offset + r_idx];
    }
  }
}
#endif // USE_NEON
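
#ifdef USE_NEON
// A minimal usage sketch (not part of the upstream API surface): sum a [2, 4] float tensor
// over its innermost axis with the fast path above. The Shape(dim_count, dims) constructor
// used here is assumed from cker::Shape mirroring TFLite's RuntimeShape.
inline void ExampleOptimizedReduceSum()
{
  const int32_t dims[2] = {2, 4};
  const Shape input_shape(2, dims);
  const float input[8] = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f};
  float output[2] = {0.f, 0.f};
  OptimizedReduceSum(input, input_shape, output); // output becomes {10.f, 26.f}
}
#endif // USE_NEON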
template <typename In, typename Out>
inline bool ReduceImpl(const In *input_data, const Shape &input_shape, const Shape &,
                       const int *axis, const int num_axis, int *input_iter,
                       Out reducer(const Out current, const In in), Out *output_data)
{
  const auto input_dims = input_shape.DimsData();
  const auto input_num_dims = input_shape.DimensionsCount();
  // Fast path: a single reduction axis that is also the innermost dimension.
  if (num_axis == 1 && axis[0] == input_num_dims - 1)
  {
    int input_size = 1;
    int reduce_size = 0;
    for (int idx = 0; idx < input_num_dims - 1; idx++)
    {
      input_size *= input_dims[idx];
    }
    reduce_size = input_dims[input_num_dims - 1];
    for (int idx = 0; idx < input_size; idx++)
    {
      for (int r_idx = 0; r_idx < reduce_size; r_idx++)
      {
        if (r_idx == 0)
        {
          output_data[idx] = input_data[idx * reduce_size];
        }
        else
        {
          output_data[idx] = reducer(output_data[idx], input_data[idx * reduce_size + r_idx]);
        }
      }
    }
    return true;
  }

  // General path: reset the input iterator, then visit every input element and fold it
  // into the output element it maps to.
  for (int idx = 0; idx < input_num_dims; ++idx)
  {
    input_iter[idx] = 0;
  }
  do
  {
    size_t input_offset = ReducedOutputOffset(input_num_dims, input_dims, input_iter, 0, nullptr);
    size_t output_offset =
      ReducedOutputOffset(input_num_dims, input_dims, input_iter, num_axis, axis);
    output_data[output_offset] = reducer(output_data[output_offset], input_data[input_offset]);
  } while (NextIndex(input_num_dims, input_dims, input_iter));
  return true;
}
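
// A minimal sketch of calling ReduceImpl directly with a max reducer over axis 0 of a
// [2, 3] tensor. In the general path the output must already hold the reduction's identity
// (here the lowest float) and 'input_iter' must provide one slot per input dimension; the
// names and shapes below are illustrative assumptions.
inline void ExampleReduceImplMax()
{
  const int32_t in_dims[2] = {2, 3};
  const int32_t out_dims[1] = {3};
  const Shape input_shape(2, in_dims);
  const Shape output_shape(1, out_dims);
  const float input[6] = {1.f, 5.f, 2.f, 4.f, 0.f, 9.f};
  float output[3] = {std::numeric_limits<float>::lowest(), std::numeric_limits<float>::lowest(),
                     std::numeric_limits<float>::lowest()};
  int input_iter[2] = {0, 0};
  const int axis[1] = {0};
  ReduceImpl<float, float>(
    input, input_shape, output_shape, axis, 1, input_iter,
    [](const float current, const float in) -> float { return in > current ? in : current; },
    output); // output becomes {4.f, 5.f, 9.f}
}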
// Parses the requested 'axes' into 'out_axis': negative values are wrapped into [0, num_dims)
// and duplicates are dropped.
inline bool ResolveAxis(const int num_dims, const std::vector<int> &axes, int *out_axis,
                        int *out_num_axis)
{
  auto num_axis = axes.size();
  auto axis = axes.data();
  *out_num_axis = 0;
  // O(n^2) is fine here since the number of axes is small, typically <= 4.
  for (size_t idx = 0; idx < num_axis; ++idx)
  {
    // Handle negative index.
    int current = axis[idx] < 0 ? (axis[idx] + num_dims) : axis[idx];
    assert(current >= 0 && current < num_dims);
    // Skip duplicate axes.
    bool is_dup = false;
    for (int j = 0; !is_dup && j < *out_num_axis; ++j)
      is_dup = (out_axis[j] == current);
    if (!is_dup)
    {
      out_axis[*out_num_axis] = current;
      *out_num_axis += 1;
    }
  }
  return true;
}
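
// A small sketch of how ResolveAxis normalizes axes for a rank-4 tensor: the negative
// axis -1 wraps to 3 and the duplicate 3 is dropped, so {1, -1, 3} resolves to {1, 3}.
inline void ExampleResolveAxis()
{
  const std::vector<int> axes{1, -1, 3};
  int resolved[4] = {};
  int num_resolved = 0;
  ResolveAxis(4, axes, resolved, &num_resolved); // num_resolved == 2, resolved holds {1, 3}
}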
template <typename T>
inline bool InitTensorDataForReduce(const Shape &shape, const T init_value, T *data)
{
  const auto dims = shape.DimsData();
  const auto num_dims = shape.DimensionsCount();

  size_t num_elements = 1;
  for (int idx = 0; idx < num_dims; ++idx)
  {
    size_t current = static_cast<size_t>(dims[idx]);
    // Overflow prevention.
    if (num_elements > std::numeric_limits<size_t>::max() / current)
      return false;
    num_elements *= current;
  }
  for (size_t idx = 0; idx < num_elements; ++idx)
    data[idx] = init_value;
  return true;
}
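
// Illustrative: the init_value parameter is what lets the same helper seed a Sum output with
// zeros or a Max output with the lowest representable value, as sketched here.
template <typename T>
inline bool ExampleInitForMax(const Shape &output_shape, T *output_data)
{
  return InitTensorDataForReduce(output_shape, std::numeric_limits<T>::lowest(), output_data);
}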
class Reduce
{
public:
  Reduce() : _temp_index(), _resolved_axis(), _prepared(false) {}

  void prepare(size_t temp_index_size, size_t resolved_axis_size)
  {
    if (_prepared)
      return;

    // Heap storage is only allocated when the small inline buffers are too small.
    if (temp_index_size > kMaxSmallSize)
      _temp_index.resize(temp_index_size);
    if (resolved_axis_size > kMaxSmallSize)
      _resolved_axis.resize(resolved_axis_size);
    _prepared = true;
  }

  // Computes a generic reduction (e.g. sum/max/min/prod) of 'input_data' over the axes in
  // 'axes', seeding the output with 'init_value' and folding elements with 'reducer'.
  template <typename T>
  inline bool ReduceGeneric(const Shape &input_shape, const T *input_data,
                            const Shape &output_shape, T *output_data,
                            const std::vector<int> &axes, bool, T init_value,
                            T reducer(const T current, const T in))
  {
    // Reset the output to the reduction's identity value.
    if (!InitTensorDataForReduce(output_shape, init_value, output_data))
      return false;

    // Resolve axis.
    int num_resolved_axis = 0;
    if (!ResolveAxis(input_shape.DimensionsCount(), axes, resolved_axis_data(), &num_resolved_axis))
      return false;

    return ReduceImpl<T, T>(input_data, input_shape, output_shape, resolved_axis_data(),
                            num_resolved_axis, temp_index_data(), reducer, output_data);
  }

  // Computes the quantized mean or sum of elements across the dimensions given in 'axes'.
  template <typename T, typename U>
  inline bool QuantizedMeanOrSum(const T *input_data, int32_t input_zero_point, float input_scale,
                                 const Shape &input_shape, T *output_data,
                                 int32_t output_zero_point, float output_scale,
                                 const Shape &output_shape, const std::vector<int> &axes,
                                 bool, U *temp_sum, bool compute_sum,
                                 U reducer(const U current, const T in))
  {
    // Reset output data.
    size_t num_outputs = 1;
    for (int idx = 0; idx < output_shape.DimensionsCount(); ++idx)
    {
      size_t current = static_cast<size_t>(output_shape.Dims(idx));
      // Overflow prevention.
      if (num_outputs > std::numeric_limits<size_t>::max() / current)
        return false;
      num_outputs *= current;
    }
    for (size_t idx = 0; idx < num_outputs; ++idx)
    {
      output_data[idx] = T();
      temp_sum[idx] = U();
    }

    // Resolve axis, then accumulate the raw sums into temp_sum.
    int num_resolved_axis = 0;
    if (!ResolveAxis(input_shape.DimensionsCount(), axes, resolved_axis_data(), &num_resolved_axis))
      return false;
    if (!ReduceImpl<T, U>(input_data, input_shape, output_shape, resolved_axis_data(),
                          num_resolved_axis, temp_index_data(), reducer, temp_sum))
      return false;

    // Number of input elements folded into each output value.
    size_t num_elements_in_axis = 1;
    for (int idx = 0; idx < num_resolved_axis; ++idx)
    {
      size_t current = static_cast<size_t>(input_shape.Dims(resolved_axis_data()[idx]));
      // Overflow prevention.
      if (current > static_cast<size_t>(std::numeric_limits<size_t>::max() / num_elements_in_axis))
        return false;
      num_elements_in_axis *= current;
    }
    if (num_elements_in_axis > 0)
    {
      const float scale = input_scale / output_scale;
      if (compute_sum)
      {
        const float bias = -input_zero_point * scale * num_elements_in_axis;
        for (size_t idx = 0; idx < num_outputs; ++idx)
        {
          const U value =
            static_cast<U>(std::round(temp_sum[idx] * scale + bias)) + output_zero_point;
          output_data[idx] = static_cast<T>(value);
        }
      }
      else
      {
        const float bias = -input_zero_point * scale;
        for (size_t idx = 0; idx < num_outputs; ++idx)
        {
          float float_mean =
            static_cast<float>(temp_sum[idx]) / static_cast<float>(num_elements_in_axis);
          float result = std::min(std::round(float_mean * scale + bias) + output_zero_point,
                                  static_cast<float>(std::numeric_limits<T>::max()));
          result = std::max(result, static_cast<float>(std::numeric_limits<T>::min()));
          output_data[idx] = static_cast<T>(result);
        }
      }
    }
    return true;
  }
  // Prefer the heap buffers when prepare() resized them, else fall back to the inline arrays.
  inline int32_t *resolved_axis_data(void)
  {
    return _resolved_axis.size() ? _resolved_axis.data() : _resolved_axis_small;
  }
  inline int32_t *temp_index_data(void)
  {
    return _temp_index.size() ? _temp_index.data() : _temp_index_small;
  }

private:
  std::vector<int> _temp_index;
  std::vector<int> _resolved_axis;
  bool _prepared;
  static constexpr int kMaxSmallSize = 4;
  int _temp_index_small[kMaxSmallSize];
  int _resolved_axis_small[kMaxSmallSize];
};

} // namespace cker
} // namespace nnfw

#endif // __NNFW_CKER_REDUCE_H__
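
// Caller-side sketch, as it might appear in a separate translation unit: summing a [2, 3]
// float tensor over axis 1 with Reduce::ReduceGeneric. The shapes, buffer sizes and the
// 'AddFn' reducer below are illustrative assumptions, not part of this header.
//
//   #include <cker/operation/Reduce.h>
//
//   inline float AddFn(const float current, const float in) { return current + in; }
//
//   void ExampleReduceGenericSum()
//   {
//     const int32_t in_dims[2] = {2, 3};
//     const int32_t out_dims[1] = {2};
//     const nnfw::cker::Shape input_shape(2, in_dims);
//     const nnfw::cker::Shape output_shape(1, out_dims);
//     const float input[6] = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f};
//     float output[2] = {0.f, 0.f};
//
//     nnfw::cker::Reduce reduce_op;
//     // One temp-index slot per input dimension, one resolved-axis slot per requested axis.
//     reduce_op.prepare(input_shape.DimensionsCount(), 1);
//     reduce_op.ReduceGeneric<float>(input_shape, input, output_shape, output, {1},
//                                    /*keep_dims=*/false, 0.f, AddFn);
//     // output becomes {6.f, 15.f}
//   }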