#ifndef __NNFW_CKER_UTILS_H__
#define __NNFW_CKER_UTILS_H__

#include "neon/neon_check.h"

#include <fixedpoint/fixedpoint.h>

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <limits>
#include <numeric>
#include <ostream>
#include <string>
#include <type_traits>
/// Type trait: true iff T is an 8-bit quantized element type (uint8_t or int8_t).
/// NOTE(review): enclosing struct header was truncated in this chunk; name reconstructed
/// from the trailing signature index ("static constexpr bool value") — confirm against upstream.
template <typename T> struct is_quant8
{
  static constexpr bool value = std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value;
};
/// Clamp @p x into the inclusive activation range [min, max].
/// Works for any type with std::min/std::max support (int, float, ...).
template <typename T>
inline T ActivationFunctionWithMinMax(T x, T output_activation_min, T output_activation_max)
{
  return std::min<T>(std::max<T>(x, output_activation_min), output_activation_max);
}
/// Decompose a positive real multiplier into a Q0.31 fixed-point value and a
/// power-of-two exponent so that double_multiplier ~= quantized_multiplier * 2^(shift-31).
/// @param double_multiplier  real multiplier (0 maps to quantized 0 / shift 0)
/// @param quantized_multiplier [out] significand in [2^30, 2^31) as int32
/// @param shift [out] base-2 exponent (positive = left shift)
inline void QuantizeMultiplier(double double_multiplier, int32_t *quantized_multiplier, int *shift)
{
  if (double_multiplier == 0.)
  {
    *quantized_multiplier = 0;
    *shift = 0;
    return;
  }

  const double q = std::frexp(double_multiplier, shift);
  auto q_fixed = static_cast<int64_t>(round(q * (1ll << 31)));

  assert(q_fixed <= (1ll << 31));
  if (q_fixed == (1ll << 31))
  {
    // frexp returned exactly 0.5 rounded up to 1.0: renormalize.
    q_fixed /= 2;
    ++*shift;
  }
  assert(q_fixed <= std::numeric_limits<int32_t>::max());
  // A shift smaller than -31 would shift every significand bit out, so the
  // result is exactly zero; represent that as q_fixed == 0 / shift == 0 to
  // avoid undefined right shifts by more than 31 downstream.
  if (*shift < -31)
  {
    *shift = 0;
    q_fixed = 0;
  }
  *quantized_multiplier = static_cast<int32_t>(q_fixed);
}
86 int32_t *quantized_multiplier,
int *left_shift)
88 assert(double_multiplier < 1.0);
89 assert(double_multiplier > 0.0);
98 int left_shift = shift > 0 ? shift : 0;
99 int right_shift = shift > 0 ? 0 : -shift;
100 return gemmlowp::RoundingDivideByPOT(
101 gemmlowp::SaturatingRoundingDoublingHighMul(x * (1 << left_shift), quantized_multiplier),
108 return gemmlowp::SaturatingRoundingDoublingHighMul(x * (1 << left_shift), quantized_multiplier);
112 int32_t quantized_multiplier,
115 return gemmlowp::RoundingDivideByPOT(
116 gemmlowp::SaturatingRoundingDoublingHighMul(x, quantized_multiplier), -left_shift);
120inline int32x4x4_t MultiplyByQuantizedMultiplier4Rows(int32x4x4_t input_val,
121 int32_t quantized_multiplier, int32_t shift)
123 const int left_shift = std::max(shift, 0);
124 const int right_shift = std::min(shift, 0);
127 int32x4_t multiplier_dup = vdupq_n_s32(quantized_multiplier);
128 int32x4_t left_shift_dup = vdupq_n_s32(left_shift);
129 int32x4_t right_shift_dup = vdupq_n_s32(right_shift);
131 result.val[0] = vrshlq_s32(
132 vqrdmulhq_s32(vshlq_s32(input_val.val[0], left_shift_dup), multiplier_dup), right_shift_dup);
134 result.val[1] = vrshlq_s32(
135 vqrdmulhq_s32(vshlq_s32(input_val.val[1], left_shift_dup), multiplier_dup), right_shift_dup);
137 result.val[2] = vrshlq_s32(
138 vqrdmulhq_s32(vshlq_s32(input_val.val[2], left_shift_dup), multiplier_dup), right_shift_dup);
140 result.val[3] = vrshlq_s32(
141 vqrdmulhq_s32(vshlq_s32(input_val.val[3], left_shift_dup), multiplier_dup), right_shift_dup);
/// Flat offset of element (b, h, w) in a batch-major [batch][height][width] layout.
inline int NodeOffset(int b, int h, int w, int height, int width)
{
  const int row = b * height + h;
  return row * width + w;
}
/// Count leading zero bits of a 32-bit value.
/// Returns 32 for an input of 0 (the shift-until-MSB loop below would
/// otherwise never terminate, since 0 << 1 stays 0).
inline int CountLeadingZeros(uint32_t integer_input)
{
  if (integer_input == 0)
  {
    return 32;
  }
  const uint32_t one_in_leading_positive = 1U << 31;
  int leading_zeros = 0;
  while (integer_input < one_in_leading_positive)
  {
    integer_input <<= 1;
    ++leading_zeros;
  }
  return leading_zeros;
}
165 int32_t *output_inv_sqrt,
int *output_shift)
175 *output_inv_sqrt = std::numeric_limits<std::int32_t>::max();
181 while (input >= (1 << 29))
186 const unsigned max_left_shift_bits =
CountLeadingZeros(
static_cast<uint32_t
>(input)) - 1;
187 const unsigned max_left_shift_bit_pairs = max_left_shift_bits / 2;
188 const unsigned left_shift_bit_pairs = max_left_shift_bit_pairs - 1;
189 *output_shift -= left_shift_bit_pairs;
190 input <<= 2 * left_shift_bit_pairs;
191 assert(input >= (1 << 27));
192 assert(input < (1 << 29));
193 using gemmlowp::FixedPoint;
194 using gemmlowp::Rescale;
195 using gemmlowp::SaturatingRoundingMultiplyByPOT;
198 using F3 = FixedPoint<int32_t, 3>;
199 using F0 = FixedPoint<int32_t, 0>;
200 const F3 fixedpoint_input = F3::FromRaw(input >> 1);
201 const F3 fixedpoint_half_input = SaturatingRoundingMultiplyByPOT<-1>(fixedpoint_input);
202 const F3 fixedpoint_half_three =
203 GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F3, (1 << 28) + (1 << 27), 1.5);
208 for (
int i = 0; i < 5; i++)
210 const F3 x3 = Rescale<3>(x * x * x);
211 x = Rescale<3>(fixedpoint_half_three * x - fixedpoint_half_input * x3);
213 const F0 fixedpoint_half_sqrt_2 =
214 GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F0, 1518500250, std::sqrt(2.) / 2.);
215 x = x * fixedpoint_half_sqrt_2;
216 *output_inv_sqrt = x.raw();
217 if (*output_shift < 0)
219 *output_inv_sqrt <<= -*output_shift;
223 *output_shift *= reverse_shift;
257 assert(i0 >= 0 && i0 < desc.
extents[0]);
258 assert(i1 >= 0 && i1 < desc.
extents[1]);
259 assert(i2 >= 0 && i2 < desc.
extents[2]);
260 assert(i3 >= 0 && i3 < desc.
extents[3]);
267 for (
size_t idx = 0; idx < static_cast<size_t>(N); idx++)
269 assert(iter[idx] >= 0 && iter[idx] < desc->
extents[idx]);
270 ret_indx += iter[idx] * desc->
strides[idx];
280 for (
int i = N - 1; i >= 0; --i)
283 desc_out->
strides[i] = desc_stride;
284 desc_stride *= input_shape.
Dims(i);
293 assert(desc0_out !=
nullptr);
294 assert(desc1_out !=
nullptr);
296 auto extended_input0_shape = Shape::ExtendedShape(N, input0_shape);
297 auto extended_input1_shape = Shape::ExtendedShape(N, input1_shape);
300 CopyDimsToDesc<N>(extended_input0_shape, desc0_out);
301 CopyDimsToDesc<N>(extended_input1_shape, desc1_out);
306 for (
int i = 0; i < N; ++i)
308 const int extent0 = extended_input0_shape.Dims(i);
309 const int extent1 = extended_input1_shape.Dims(i);
310 if (extent0 != extent1)
315 desc0_out->
extents[i] = extent1;
319 assert(extent1 == 1);
321 desc1_out->
extents[i] = extent0;
333 assert(desc0_out !=
nullptr);
334 assert(desc1_out !=
nullptr);
335 assert(desc2_out !=
nullptr);
337 auto extended_input0_shape = Shape::ExtendedShape(N, input0_shape);
338 auto extended_input1_shape = Shape::ExtendedShape(N, input1_shape);
339 auto extended_input2_shape = Shape::ExtendedShape(N, input2_shape);
342 CopyDimsToDesc<N>(extended_input0_shape, desc0_out);
343 CopyDimsToDesc<N>(extended_input1_shape, desc1_out);
344 CopyDimsToDesc<N>(extended_input2_shape, desc2_out);
349 for (
int i = 0; i < N; ++i)
351 const int extent0 = extended_input0_shape.Dims(i);
352 const int extent1 = extended_input1_shape.Dims(i);
353 const int extent2 = extended_input2_shape.Dims(i);
355 int extent = extent0;
361 assert(extent0 == 1 || extent0 == extent);
362 assert(extent1 == 1 || extent1 == extent);
363 assert(extent2 == 1 || extent2 == extent);
365 if (!(extent0 == extent1 && extent1 == extent2))
370 desc0_out->
extents[i] = extent;
375 desc1_out->
extents[i] = extent;
380 desc2_out->
extents[i] = extent;
/// Advance a multi-dimensional index @p current (odometer-style, last dimension
/// fastest) within the bounds @p dims.
/// @return true if @p current now holds the next valid index; false when the
///         iteration has wrapped past the last index (current is reset to all
///         zeros) or when num_dims == 0.
inline bool NextIndex(const int num_dims, const int *dims, int *current)
{
  if (num_dims == 0)
  {
    return false;
  }
  assert(dims != nullptr);
  assert(current != nullptr);
  int carry = 1;
  for (int idx = num_dims - 1; idx >= 0; --idx)
  {
    int current_val = current[idx] + carry;
    assert(dims[idx] >= current_val);
    if (dims[idx] == current_val)
    {
      // This digit overflows: reset it and keep carrying.
      current[idx] = 0;
    }
    else
    {
      current[idx] = current_val;
      carry = 0;
      break;
    }
  }
  return (carry == 0);
}
/// Flat offset of @p index into an output tensor from which the dimensions
/// listed in @p axis have been reduced away (those dimensions are skipped when
/// accumulating the row-major offset).
/// @param axis may be nullptr when num_axis == 0 (no reduction).
inline size_t ReducedOutputOffset(const int num_dims, const int *dims, const int *index,
                                  const int num_axis, const int *axis)
{
  if (num_dims == 0)
  {
    return 0;
  }
  assert(dims != nullptr);
  assert(index != nullptr);
  size_t offset = 0;
  for (int idx = 0; idx < num_dims; ++idx)
  {
    // Skip dimensions that are being reduced.
    bool is_axis = false;
    if (axis != nullptr)
    {
      for (int axis_idx = 0; axis_idx < num_axis; ++axis_idx)
      {
        if (idx == axis[axis_idx])
        {
          is_axis = true;
          break;
        }
      }
    }
    if (!is_axis)
    {
      offset = offset * static_cast<size_t>(dims[idx]) + static_cast<size_t>(index[idx]);
    }
  }
  return offset;
}
/// Hint the CPU to prefetch the cache line at @p ptr for reading with high
/// temporal locality (keep in L1). No-op on compilers without the GCC builtin.
template <typename T> void optimized_ops_preload_l1_keep(const T *ptr)
{
#ifdef __GNUC__
  // builtin offered by GCC-compatible compilers including clang
  __builtin_prefetch(ptr, /*rw=*/0, /*locality=*/3);
#else
  (void)ptr;
#endif
}
/// Writes elements picked from a read-only input buffer to an output buffer in
/// strictly sequential order (the output cursor only ever advances).
/// NOTE(review): class shell reconstructed around truncated fragments; the
/// original WriteN fragment did not show the cursor advance, which is required
/// for subsequent writes to land after the copied range.
template <typename T> class SequentialTensorWriter
{
public:
  SequentialTensorWriter(const T *input_data, T *output_data)
    : input_data_(input_data), output_ptr_(output_data)
  {
  }

  /// Copy one element at @p position to the current output cursor.
  void Write(int position) { *output_ptr_++ = input_data_[position]; }

  /// Copy @p len contiguous elements starting at @p position, then advance the cursor.
  void WriteN(int position, int len)
  {
    memcpy(output_ptr_, &input_data_[position], sizeof(T) * len);
    output_ptr_ += len;
  }

private:
  const T *input_data_; // source buffer (not owned)
  T *output_ptr_;       // write cursor into destination buffer (not owned)
};
491 std::string formatted =
492 std::accumulate(
begin(shape),
end(shape), std::string{
"["},
494 return std::move(joined).append(std::to_string(dim)).append(
",");
497 if (formatted.back() ==
'[')
499 formatted.push_back(
']');
503 formatted.back() =
']';
SequentialTensorWriter(const T *input_data, T *output_data)
void WriteN(int position, int len)
int32_t Dims(int i) const
__global uchar * offset(const Image *img, int x, int y)
void CopyDimsToDesc(const Shape &input_shape, NdArrayDesc< N > *desc_out)
int SubscriptToIndexGeneric(const NdArrayDesc< N > *desc, int *iter)
int NodeOffset(int b, int h, int w, int height, int width)
ShapeIterator end(const Shape &s)
void NdArrayDescsForElementwiseBroadcast(const Shape &input0_shape, const Shape &input1_shape, NdArrayDesc< N > *desc0_out, NdArrayDesc< N > *desc1_out)
bool NextIndex(const int num_dims, const int *dims, int *current)
std::ostream & operator<<(std::ostream &os, const Shape &shape)
void QuantizeMultiplierSmallerThanOneExp(double double_multiplier, int32_t *quantized_multiplier, int *left_shift)
size_t ReducedOutputOffset(const int num_dims, const int *dims, const int *index, const int num_axis, const int *axis)
void GetInvSqrtQuantizedMultiplierExp(int32_t input, int reverse_shift, int32_t *output_inv_sqrt, int *output_shift)
T ActivationFunctionWithMinMax(T x, T output_activation_min, T output_activation_max)
int CountLeadingZeros(uint32_t integer_input)
void QuantizeMultiplier(double double_multiplier, int32_t *quantized_multiplier, int *shift)
int32_t MultiplyByQuantizedMultiplierGreaterThanOne(int32_t x, int32_t quantized_multiplier, int left_shift)
int SubscriptToIndex(const NdArrayDesc< 4 > &desc, int i0, int i1, int i2, int i3)
void optimized_ops_preload_l1_keep(const T *ptr)
int32_t MultiplyByQuantizedMultiplier(int32_t x, int32_t quantized_multiplier, int shift)
int32_t MultiplyByQuantizedMultiplierSmallerThanOneExp(int32_t x, int32_t quantized_multiplier, int left_shift)
decltype(std::declval< Shape >().Dims(0)) value_type
Definition of this iterator's traits that can be accessed by std::iterator_traits<It>
static constexpr bool value