ONE - On-device Neural Engine
Utils.h
/*
 * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
 * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef __NNFW_CKER_UTILS_H__
#define __NNFW_CKER_UTILS_H__

#include "Shape.h"
#include "ShapeIterator.h"

#include "neon/neon_check.h"

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <limits>
#include <numeric>
#include <ostream>
#include <string>
#include <type_traits>
#include <fixedpoint/fixedpoint.h>

namespace nnfw
{
namespace cker
{

template <typename T> struct is_quant8
{
  static constexpr bool value = std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value;
};

template <typename T>
inline T ActivationFunctionWithMinMax(T x, T output_activation_min, T output_activation_max)
{
  return std::min<T>(std::max<T>(x, output_activation_min), output_activation_max);
}
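// Illustrative note: this is a plain clamp, e.g.
// ActivationFunctionWithMinMax<float>(1.5f, 0.0f, 1.0f) == 1.0f and
// ActivationFunctionWithMinMax<float>(-2.0f, 0.0f, 1.0f) == 0.0f; the value is
// restricted to [output_activation_min, output_activation_max].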

inline void QuantizeMultiplier(double double_multiplier, int32_t *quantized_multiplier, int *shift)
{
  if (double_multiplier == 0.)
  {
    *quantized_multiplier = 0;
    *shift = 0;
    return;
  }

  const double q = std::frexp(double_multiplier, shift);
  auto q_fixed = static_cast<int64_t>(round(q * (1ll << 31)));

  assert(q_fixed <= (1ll << 31));
  if (q_fixed == (1ll << 31))
  {
    q_fixed /= 2;
    ++*shift;
  }
  assert(q_fixed <= std::numeric_limits<int32_t>::max());
  // A shift amount smaller than -31 would cause all bits to be shifted out
  // and thus all results would be zero. We implement that instead with
  // q_fixed==0, so as to avoid hitting issues with right-shift
  // operations with shift amounts greater than 31. Note that this happens
  // roughly when abs(double_multiplier) < 2^-31 and the present handling means
  // that we're effectively flushing tiny double_multiplier's to zero.
  // We could conceivably handle values in the range (roughly) [32, 63]
  // as 'denormals' i.e. (shift==0, q_fixed < 2^30). In that point of view
  // the present handling is just doing 'flush denormals to zero'. We could
  // reconsider and actually generate nonzero denormals if a need arises.
  if (*shift < -31)
  {
    *shift = 0;
    q_fixed = 0;
  }
  *quantized_multiplier = static_cast<int32_t>(q_fixed);
}
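// Illustrative example: QuantizeMultiplier(0.75, &m, &s) yields m == 1610612736
// (round(0.75 * 2^31)) and s == 0, since std::frexp decomposes 0.75 as 0.75 * 2^0;
// the original multiplier is recovered as m * 2^(s - 31).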

inline void QuantizeMultiplierSmallerThanOneExp(double double_multiplier,
                                                int32_t *quantized_multiplier, int *left_shift)
{
  assert(double_multiplier < 1.0);
  assert(double_multiplier > 0.0);
  int shift;
  QuantizeMultiplier(double_multiplier, quantized_multiplier, &shift);
  assert(shift <= 0);
  *left_shift = shift;
}

inline int32_t MultiplyByQuantizedMultiplier(int32_t x, int32_t quantized_multiplier, int shift)
{
  int left_shift = shift > 0 ? shift : 0;
  int right_shift = shift > 0 ? 0 : -shift;
  return gemmlowp::RoundingDivideByPOT(
    gemmlowp::SaturatingRoundingDoublingHighMul(x * (1 << left_shift), quantized_multiplier),
    right_shift);
}
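// Illustrative example: with (m, s) from QuantizeMultiplier(0.75, &m, &s) above,
// MultiplyByQuantizedMultiplier(100, m, s) == 75. In general the result
// approximates round(x * m * 2^(s - 31)), i.e. rescaling x by the real-valued
// multiplier using only integer arithmetic.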

inline int32_t MultiplyByQuantizedMultiplierGreaterThanOne(int32_t x, int32_t quantized_multiplier,
                                                           int left_shift)
{
  return gemmlowp::SaturatingRoundingDoublingHighMul(x * (1 << left_shift), quantized_multiplier);
}

inline int32_t MultiplyByQuantizedMultiplierSmallerThanOneExp(int32_t x,
                                                              int32_t quantized_multiplier,
                                                              int left_shift)
{
  return gemmlowp::RoundingDivideByPOT(
    gemmlowp::SaturatingRoundingDoublingHighMul(x, quantized_multiplier), -left_shift);
}

#ifdef USE_NEON
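// Vectorized counterpart of MultiplyByQuantizedMultiplier above: for each of the
// four int32x4 lanes, vqrdmulhq_s32 performs the saturating rounding doubling
// high multiply and vrshlq_s32 applies the rounding shift (a negative shift
// count shifts right), so each lane is requantized the same way as the scalar
// path.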
inline int32x4x4_t MultiplyByQuantizedMultiplier4Rows(int32x4x4_t input_val,
                                                      int32_t quantized_multiplier, int32_t shift)
{
  const int left_shift = std::max(shift, 0);
  const int right_shift = std::min(shift, 0);
  int32x4x4_t result;

  int32x4_t multiplier_dup = vdupq_n_s32(quantized_multiplier);
  int32x4_t left_shift_dup = vdupq_n_s32(left_shift);
  int32x4_t right_shift_dup = vdupq_n_s32(right_shift);

  result.val[0] = vrshlq_s32(
    vqrdmulhq_s32(vshlq_s32(input_val.val[0], left_shift_dup), multiplier_dup), right_shift_dup);

  result.val[1] = vrshlq_s32(
    vqrdmulhq_s32(vshlq_s32(input_val.val[1], left_shift_dup), multiplier_dup), right_shift_dup);

  result.val[2] = vrshlq_s32(
    vqrdmulhq_s32(vshlq_s32(input_val.val[2], left_shift_dup), multiplier_dup), right_shift_dup);

  result.val[3] = vrshlq_s32(
    vqrdmulhq_s32(vshlq_s32(input_val.val[3], left_shift_dup), multiplier_dup), right_shift_dup);

  return result;
}
#endif

inline int NodeOffset(int b, int h, int w, int height, int width)
{
  return (b * height + h) * width + w;
}
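// For example, NodeOffset(1, 2, 3, /*height=*/4, /*width=*/5) == (1 * 4 + 2) * 5 + 3 == 33,
// the flat offset of (batch 1, row 2, col 3) in a [batch, 4, 5] layout.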

inline int CountLeadingZeros(uint32_t integer_input)
{
  const uint32_t one_in_leading_positive = 1U << 31;
  int leading_zeros = 0;
  while (integer_input < one_in_leading_positive)
  {
    integer_input <<= 1;
    ++leading_zeros;
  }
  return leading_zeros;
}
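// For example, CountLeadingZeros(1u) == 31 and CountLeadingZeros(0x80000000u) == 0.
// Note that the loop above does not terminate for an input of 0, so callers must
// pass a nonzero value.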

inline void GetInvSqrtQuantizedMultiplierExp(int32_t input, int reverse_shift,
                                             int32_t *output_inv_sqrt, int *output_shift)
{
  assert(input >= 0);
  if (input <= 1)
  {
    // Handle the input value 1 separately to avoid overflow in that case
    // in the general computation below (b/143972021). Also handle 0 as if it
    // were a 1. 0 is an invalid input here (divide by zero) and 1 is a valid
    // but rare/unrealistic input value. We can expect both to occur in some
    // incompletely trained models, but probably not in fully trained models.
    *output_inv_sqrt = std::numeric_limits<std::int32_t>::max();
    *output_shift = 0;
    return;
  }
  assert(input > 1);
  *output_shift = 11;
  while (input >= (1 << 29))
  {
    input /= 4;
    ++*output_shift;
  }
  const unsigned max_left_shift_bits = CountLeadingZeros(static_cast<uint32_t>(input)) - 1;
  const unsigned max_left_shift_bit_pairs = max_left_shift_bits / 2;
  const unsigned left_shift_bit_pairs = max_left_shift_bit_pairs - 1;
  *output_shift -= left_shift_bit_pairs;
  input <<= 2 * left_shift_bit_pairs;
  assert(input >= (1 << 27));
  assert(input < (1 << 29));
  using gemmlowp::FixedPoint;
  using gemmlowp::Rescale;
  using gemmlowp::SaturatingRoundingMultiplyByPOT;
  // Using 3 integer bits gives us enough room for the internal arithmetic in
  // this Newton-Raphson iteration.
  using F3 = FixedPoint<int32_t, 3>;
  using F0 = FixedPoint<int32_t, 0>;
  const F3 fixedpoint_input = F3::FromRaw(input >> 1);
  const F3 fixedpoint_half_input = SaturatingRoundingMultiplyByPOT<-1>(fixedpoint_input);
  const F3 fixedpoint_half_three =
    GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F3, (1 << 28) + (1 << 27), 1.5);
  // Newton-Raphson iteration
  // Naive unoptimized starting guess: x = 1
  F3 x = F3::One();
  // Naive unoptimized number of iterations: 5
  for (int i = 0; i < 5; i++)
  {
    const F3 x3 = Rescale<3>(x * x * x);
    x = Rescale<3>(fixedpoint_half_three * x - fixedpoint_half_input * x3);
  }
  const F0 fixedpoint_half_sqrt_2 =
    GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F0, 1518500250, std::sqrt(2.) / 2.);
  x = x * fixedpoint_half_sqrt_2;
  *output_inv_sqrt = x.raw();
  if (*output_shift < 0)
  {
    *output_inv_sqrt <<= -*output_shift;
    *output_shift = 0;
  }
  // Convert right shift (right is positive) to left shift.
  *output_shift *= reverse_shift;
}
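// Note on the iteration above: each step applies x <- x * (1.5 - 0.5 * a * x^2),
// the Newton-Raphson update that converges to 1/sqrt(a), carried out in the F3
// (Q3.28) fixed-point format. The final multiply by sqrt(2)/2 accounts for the
// input having been halved when it was converted to fixed point
// (F3::FromRaw(input >> 1)).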

// Comment from tensorflow lite:
//
// DO NOT USE THIS STRUCT FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING
// BROADCASTING.
//
// NdArrayDesc<N> describes the shape and memory layout of an N-dimensional
// rectangular array of numbers.
//
// NdArrayDesc<N> is basically identical to Dims<N> defined in types.h.
// However, as Dims<N> is to be deprecated, this class exists as an adaptor
// to enable simple unoptimized implementations of element-wise broadcasting
// operations.
template <int N> struct NdArrayDesc
{
  // The "extent" of each dimension. Indices along dimension d must be in the
  // half-open interval [0, extents[d]).
  int extents[N];

  // The number of *elements* (not bytes) between consecutive indices of each
  // dimension.
  int strides[N];
};

// Comment from tensorflow lite:
//
// DO NOT USE THIS FUNCTION FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING
// BROADCASTING.
//
// Same as Offset(), except it takes an NdArrayDesc<N> instead of Dims<N>.
inline int SubscriptToIndex(const NdArrayDesc<4> &desc, int i0, int i1, int i2, int i3)
{
  assert(i0 >= 0 && i0 < desc.extents[0]);
  assert(i1 >= 0 && i1 < desc.extents[1]);
  assert(i2 >= 0 && i2 < desc.extents[2]);
  assert(i3 >= 0 && i3 < desc.extents[3]);
  return i0 * desc.strides[0] + i1 * desc.strides[1] + i2 * desc.strides[2] + i3 * desc.strides[3];
}
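// For example, with extents {2, 3, 4, 5} and the corresponding row-major strides
// {60, 20, 5, 1}, SubscriptToIndex(desc, 1, 2, 3, 4) == 1*60 + 2*20 + 3*5 + 4 == 119,
// the usual flattened index.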

inline int SubscriptToIndex(const NdArrayDesc<5> &desc, int i0, int i1, int i2, int i3, int i4)
{
  assert(i0 >= 0 && i0 < desc.extents[0]);
  assert(i1 >= 0 && i1 < desc.extents[1]);
  assert(i2 >= 0 && i2 < desc.extents[2]);
  assert(i3 >= 0 && i3 < desc.extents[3]);
  assert(i4 >= 0 && i4 < desc.extents[4]);
  return i0 * desc.strides[0] + i1 * desc.strides[1] + i2 * desc.strides[2] + i3 * desc.strides[3] +
         i4 * desc.strides[4];
}

inline int SubscriptToIndex(const NdArrayDesc<6> &desc, int i0, int i1, int i2, int i3, int i4,
                            int i5)
{
  assert(i0 >= 0 && i0 < desc.extents[0]);
  assert(i1 >= 0 && i1 < desc.extents[1]);
  assert(i2 >= 0 && i2 < desc.extents[2]);
  assert(i3 >= 0 && i3 < desc.extents[3]);
  assert(i4 >= 0 && i4 < desc.extents[4]);
  assert(i5 >= 0 && i5 < desc.extents[5]);
  return i0 * desc.strides[0] + i1 * desc.strides[1] + i2 * desc.strides[2] + i3 * desc.strides[3] +
         i4 * desc.strides[4] + i5 * desc.strides[5];
}

template <int N> inline int SubscriptToIndexGeneric(const NdArrayDesc<N> *desc, int *iter)
{
  int ret_indx = 0;
  for (size_t idx = 0; idx < static_cast<size_t>(N); idx++)
  {
    assert(iter[idx] >= 0 && iter[idx] < desc->extents[idx]);
    ret_indx += iter[idx] * desc->strides[idx];
  }

  return ret_indx;
}

// Copies dims to desc, calculating strides.
template <int N> inline void CopyDimsToDesc(const Shape &input_shape, NdArrayDesc<N> *desc_out)
{
  int desc_stride = 1;
  for (int i = N - 1; i >= 0; --i)
  {
    desc_out->extents[i] = input_shape.Dims(i);
    desc_out->strides[i] = desc_stride;
    desc_stride *= input_shape.Dims(i);
  }
}
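// For example, a shape of {2, 3, 4} yields extents {2, 3, 4} and row-major
// strides {12, 4, 1}.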

template <int N>
inline void
NdArrayDescsForElementwiseBroadcast(const Shape &input0_shape, const Shape &input1_shape,
                                    NdArrayDesc<N> *desc0_out, NdArrayDesc<N> *desc1_out)
{
  assert(desc0_out != nullptr);
  assert(desc1_out != nullptr);

  auto extended_input0_shape = Shape::ExtendedShape(N, input0_shape);
  auto extended_input1_shape = Shape::ExtendedShape(N, input1_shape);

  // Copy dims to desc, calculating strides.
  CopyDimsToDesc<N>(extended_input0_shape, desc0_out);
  CopyDimsToDesc<N>(extended_input1_shape, desc1_out);

  // Walk over each dimension. If the extents are equal do nothing.
  // Otherwise, set the desc with extent 1 to have extent equal to the other and
  // stride 0.
  for (int i = 0; i < N; ++i)
  {
    const int extent0 = extended_input0_shape.Dims(i);
    const int extent1 = extended_input1_shape.Dims(i);
    if (extent0 != extent1)
    {
      if (extent0 == 1)
      {
        desc0_out->strides[i] = 0;
        desc0_out->extents[i] = extent1;
      }
      else
      {
        assert(extent1 == 1);
        desc1_out->strides[i] = 0;
        desc1_out->extents[i] = extent0;
      }
    }
  }
}
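// Illustrative example: for extended input shapes {1, 2, 1, 3} and {1, 1, 4, 3},
// both descriptors end up with extents {1, 2, 4, 3}; desc1 gets stride 0 on
// dimension 1 and desc0 gets stride 0 on dimension 2 (0-based), so the broadcast
// dimensions simply re-read the same elements.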

template <int N>
inline void
NdArrayDescsForElementwiseBroadcast(const Shape &input0_shape, const Shape &input1_shape,
                                    const Shape &input2_shape, NdArrayDesc<N> *desc0_out,
                                    NdArrayDesc<N> *desc1_out, NdArrayDesc<N> *desc2_out)
{
  assert(desc0_out != nullptr);
  assert(desc1_out != nullptr);
  assert(desc2_out != nullptr);

  auto extended_input0_shape = Shape::ExtendedShape(N, input0_shape);
  auto extended_input1_shape = Shape::ExtendedShape(N, input1_shape);
  auto extended_input2_shape = Shape::ExtendedShape(N, input2_shape);

  // Copy dims to desc, calculating strides.
  CopyDimsToDesc<N>(extended_input0_shape, desc0_out);
  CopyDimsToDesc<N>(extended_input1_shape, desc1_out);
  CopyDimsToDesc<N>(extended_input2_shape, desc2_out);

  // Walk over each dimension. If the extents are equal do nothing.
  // Otherwise, set the desc with extent 1 to have extent equal to the other and
  // stride 0.
  for (int i = 0; i < N; ++i)
  {
    const int extent0 = extended_input0_shape.Dims(i);
    const int extent1 = extended_input1_shape.Dims(i);
    const int extent2 = extended_input2_shape.Dims(i);

    int extent = extent0;
    if (extent1 != 1)
      extent = extent1;
    if (extent2 != 1)
      extent = extent2;

    assert(extent0 == 1 || extent0 == extent);
    assert(extent1 == 1 || extent1 == extent);
    assert(extent2 == 1 || extent2 == extent);

    if (!(extent0 == extent1 && extent1 == extent2))
    {
      if (extent0 == 1)
      {
        desc0_out->strides[i] = 0;
        desc0_out->extents[i] = extent;
      }
      if (extent1 == 1)
      {
        desc1_out->strides[i] = 0;
        desc1_out->extents[i] = extent;
      }
      if (extent2 == 1)
      {
        desc2_out->strides[i] = 0;
        desc2_out->extents[i] = extent;
      }
    }
  }
}

// Gets next index to iterate through a multidimensional array.
inline bool NextIndex(const int num_dims, const int *dims, int *current)
{
  if (num_dims == 0)
  {
    return false;
  }
  assert(dims != nullptr);
  assert(current != nullptr);
  int carry = 1;
  for (int idx = num_dims - 1; idx >= 0; --idx)
  {
    int current_val = current[idx] + carry;
    assert(dims[idx] >= current_val);
    if (dims[idx] == current_val)
    {
      current[idx] = 0;
    }
    else
    {
      current[idx] = current_val;
      carry = 0;
      break;
    }
  }
  return (carry == 0);
}
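// For example, with dims {2, 3} the index sequence is {0,0} -> {0,1} -> {0,2} ->
// {1,0} -> {1,1} -> {1,2}; the next call wraps back to {0,0} and returns false.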

// Gets the offset of an index when reducing on an axis. When reducing, the
// flattened offset does not change if the input index changes only on a reduced
// axis. For example, if you have a 3D tensor and you are reducing to 2D by
// eliminating axis 0, then index (0, 1, 2) and index (1, 1, 2) map to the same
// flattened offset.
// TODO(kanlig): use Dims to represent dimensions.
inline size_t ReducedOutputOffset(const int num_dims, const int *dims, const int *index,
                                  const int num_axis, const int *axis)
{
  if (num_dims == 0)
  {
    return 0;
  }

  assert(dims != nullptr);
  assert(index != nullptr);

  size_t offset = 0;
  for (int idx = 0; idx < num_dims; ++idx)
  {
    // if we need to skip this axis
    bool is_axis = false;
    if (axis != nullptr)
    {
      for (int axis_idx = 0; axis_idx < num_axis; ++axis_idx)
      {
        if (idx == axis[axis_idx])
        {
          is_axis = true;
          break;
        }
      }
    }
    if (!is_axis)
    {
      offset = offset * static_cast<size_t>(dims[idx]) + static_cast<size_t>(index[idx]);
    }
  }
  return offset;
}
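// For example, with dims {2, 3, 4} and axis = {0}, both index {0, 2, 3} and
// index {1, 2, 3} map to offset 2 * 4 + 3 == 11, since the reduced axis is
// skipped when flattening.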

template <typename T> void optimized_ops_preload_l1_keep(const T *ptr)
{
#ifdef __GNUC__
  // builtin offered by GCC-compatible compilers including clang
  __builtin_prefetch(ptr, /* 0 means read */ 0, /* 3 means high locality */ 3);
#else
  (void)ptr;
#endif
}

// Writes randomly accessed values from `input` sequentially into `output`.
template <typename T> class SequentialTensorWriter
{
public:
  SequentialTensorWriter(const T *input_data, T *output_data)
    : input_data_(input_data), output_ptr_(output_data)
  {
  }

  void Write(int position) { *output_ptr_++ = input_data_[position]; }
  void WriteN(int position, int len)
  {
    memcpy(output_ptr_, &input_data_[position], sizeof(T) * len);
    output_ptr_ += len;
  }

private:
  const T *input_data_;
  T *output_ptr_;
};
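// Illustrative usage (a gather-style copy):
//   float in[4] = {10, 20, 30, 40}, out[3];
//   SequentialTensorWriter<float> w(in, out);
//   w.Write(2);     // out[0] = 30
//   w.WriteN(0, 2); // out[1] = 10, out[2] = 20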

inline std::ostream &operator<<(std::ostream &os, const Shape &shape)
{
  using std::begin;
  using std::end;

  std::string formatted =
    std::accumulate(begin(shape), end(shape), std::string{"["},
                    [](std::string joined, ShapeIterator::value_type dim) {
                      return std::move(joined).append(std::to_string(dim)).append(",");
                    });

  if (formatted.back() == '[')
  {
    formatted.push_back(']');
  }
  else
  {
    formatted.back() = ']';
  }

  os << formatted;
  return os;
}
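// For example, a shape with dimensions {2, 3, 4} streams as "[2,3,4]", and a
// rank-0 (empty) shape streams as "[]".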

} // namespace cker
} // namespace nnfw

#endif // __NNFW_CKER_UTILS_H__