ONE - On-device Neural Engine
Utils.h
/*
 * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
 * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef __NNFW_CKER_UTILS_H__
#define __NNFW_CKER_UTILS_H__

#include "Shape.h"
#include "ShapeIterator.h"

#include "neon/neon_check.h"

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <limits>
#include <numeric>
#include <ostream>
#include <string>
#include <type_traits>
#include <fixedpoint/fixedpoint.h>

namespace nnfw
{
namespace cker
{

template <typename T> struct is_quant8
{
  static constexpr bool value = std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value;
};

template <typename T>
inline T ActivationFunctionWithMinMax(T x, T output_activation_min, T output_activation_max)
{
  return std::min<T>(std::max<T>(x, output_activation_min), output_activation_max);
}

inline void QuantizeMultiplier(double double_multiplier, int32_t *quantized_multiplier, int *shift)
{
  if (double_multiplier == 0.)
  {
    *quantized_multiplier = 0;
    *shift = 0;
    return;
  }

  const double q = std::frexp(double_multiplier, shift);
  auto q_fixed = static_cast<int64_t>(std::round(q * (1ll << 31)));

  assert(q_fixed <= (1ll << 31));
  if (q_fixed == (1ll << 31))
  {
    q_fixed /= 2;
    ++*shift;
  }
  assert(q_fixed <= std::numeric_limits<int32_t>::max());
  // A shift amount smaller than -31 would cause all bits to be shifted out
  // and thus all results would be zero. We implement that instead with
  // q_fixed==0, so as to avoid hitting issues with right-shift
  // operations with shift amounts greater than 31. Note that this happens
  // roughly when abs(double_multiplier) < 2^-31 and the present handling means
  // that we're effectively flushing tiny double_multiplier's to zero.
  // We could conceivably handle values in the range (roughly) [32, 63]
  // as 'denormals', i.e. (shift==0, q_fixed < 2^30). From that point of view
  // the present handling is just doing 'flush denormals to zero'. We could
  // reconsider and actually generate nonzero denormals if a need arises.
  if (*shift < -31)
  {
    *shift = 0;
    q_fixed = 0;
  }
  *quantized_multiplier = static_cast<int32_t>(q_fixed);
}
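
// Worked example (illustrative, not part of the original header): for
// double_multiplier = 0.55, std::frexp returns q = 0.55 with *shift = 0,
// so q_fixed = round(0.55 * 2^31) = 1181116006 and the function yields
// quantized_multiplier = 1181116006, shift = 0. The resulting pair
// represents 0.55 as quantized_multiplier * 2^(shift - 31).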

inline void QuantizeMultiplierSmallerThanOneExp(double double_multiplier,
                                                int32_t *quantized_multiplier, int *left_shift)
{
  assert(double_multiplier < 1.0);
  assert(double_multiplier > 0.0);
  int shift;
  QuantizeMultiplier(double_multiplier, quantized_multiplier, &shift);
  assert(shift <= 0);
  *left_shift = shift;
}

inline int32_t MultiplyByQuantizedMultiplier(int32_t x, int32_t quantized_multiplier, int shift)
{
  int left_shift = shift > 0 ? shift : 0;
  int right_shift = shift > 0 ? 0 : -shift;
  return gemmlowp::RoundingDivideByPOT(
    gemmlowp::SaturatingRoundingDoublingHighMul(x * (1 << left_shift), quantized_multiplier),
    right_shift);
}
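
// Note (added for clarity): combined with QuantizeMultiplier above, the call
// MultiplyByQuantizedMultiplier(x, qm, shift) computes, with rounding and
// saturation, approximately x * qm * 2^(shift - 31), i.e. x scaled by the
// original double multiplier.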

inline int32_t MultiplyByQuantizedMultiplierGreaterThanOne(int32_t x, int32_t quantized_multiplier,
                                                           int left_shift)
{
  return gemmlowp::SaturatingRoundingDoublingHighMul(x * (1 << left_shift), quantized_multiplier);
}

inline int32_t MultiplyByQuantizedMultiplierSmallerThanOneExp(int32_t x,
                                                              int32_t quantized_multiplier,
                                                              int left_shift)
{
  return gemmlowp::RoundingDivideByPOT(
    gemmlowp::SaturatingRoundingDoublingHighMul(x, quantized_multiplier), -left_shift);
}
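
// Note (added for clarity): this is MultiplyByQuantizedMultiplier specialized
// for the shift <= 0 case produced by QuantizeMultiplierSmallerThanOneExp:
// the multiplier is applied first and the result is then rounded right by
// -left_shift bits.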

#ifdef USE_NEON
inline int32x4x4_t MultiplyByQuantizedMultiplier4Rows(int32x4x4_t input_val,
                                                      int32_t quantized_multiplier, int32_t shift)
{
  const int left_shift = std::max(shift, 0);
  const int right_shift = std::min(shift, 0);
  int32x4x4_t result;

  int32x4_t multiplier_dup = vdupq_n_s32(quantized_multiplier);
  int32x4_t left_shift_dup = vdupq_n_s32(left_shift);
  int32x4_t right_shift_dup = vdupq_n_s32(right_shift);

  result.val[0] = vrshlq_s32(
    vqrdmulhq_s32(vshlq_s32(input_val.val[0], left_shift_dup), multiplier_dup), right_shift_dup);

  result.val[1] = vrshlq_s32(
    vqrdmulhq_s32(vshlq_s32(input_val.val[1], left_shift_dup), multiplier_dup), right_shift_dup);

  result.val[2] = vrshlq_s32(
    vqrdmulhq_s32(vshlq_s32(input_val.val[2], left_shift_dup), multiplier_dup), right_shift_dup);

  result.val[3] = vrshlq_s32(
    vqrdmulhq_s32(vshlq_s32(input_val.val[3], left_shift_dup), multiplier_dup), right_shift_dup);

  return result;
}
#endif
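
// Note (added for clarity): vqrdmulhq_s32 (saturating rounding doubling
// multiply-high) and vrshlq_s32 with a negative shift (rounding shift right)
// are the NEON counterparts of gemmlowp's SaturatingRoundingDoublingHighMul
// and RoundingDivideByPOT, so each lane matches the scalar
// MultiplyByQuantizedMultiplier above.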

inline int NodeOffset(int b, int h, int w, int height, int width)
{
  return (b * height + h) * width + w;
}

inline int CountLeadingZeros(uint32_t integer_input)
{
  // Note: callers must pass a nonzero value; for 0 the loop below would never
  // terminate, since 0 shifted left stays 0.
  const uint32_t one_in_leading_positive = 1U << 31;
  int leading_zeros = 0;
  while (integer_input < one_in_leading_positive)
  {
    integer_input <<= 1;
    ++leading_zeros;
  }
  return leading_zeros;
}
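
// Illustration (added, not in the original header): CountLeadingZeros(1) == 31
// and CountLeadingZeros(0x80000000u) == 0; the loop shifts left until the top
// bit is set, counting the shifts.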

inline void GetInvSqrtQuantizedMultiplierExp(int32_t input, int reverse_shift,
                                             int32_t *output_inv_sqrt, int *output_shift)
{
  assert(input >= 0);
  if (input <= 1)
  {
    // Handle the input value 1 separately to avoid overflow in that case
    // in the general computation below (b/143972021). Also handle 0 as if it
    // were a 1. 0 is an invalid input here (divide by zero) and 1 is a valid
    // but rare/unrealistic input value. We can expect both to occur in some
    // incompletely trained models, but probably not in fully trained models.
    *output_inv_sqrt = std::numeric_limits<std::int32_t>::max();
    *output_shift = 0;
    return;
  }
  assert(input > 1);
  *output_shift = 11;
  while (input >= (1 << 29))
  {
    input /= 4;
    ++*output_shift;
  }
  const unsigned max_left_shift_bits = CountLeadingZeros(static_cast<uint32_t>(input)) - 1;
  const unsigned max_left_shift_bit_pairs = max_left_shift_bits / 2;
  const unsigned left_shift_bit_pairs = max_left_shift_bit_pairs - 1;
  *output_shift -= left_shift_bit_pairs;
  input <<= 2 * left_shift_bit_pairs;
  assert(input >= (1 << 27));
  assert(input < (1 << 29));
  using gemmlowp::FixedPoint;
  using gemmlowp::Rescale;
  using gemmlowp::SaturatingRoundingMultiplyByPOT;
  // Using 3 integer bits gives us enough room for the internal arithmetic in
  // this Newton-Raphson iteration.
  using F3 = FixedPoint<int32_t, 3>;
  using F0 = FixedPoint<int32_t, 0>;
  const F3 fixedpoint_input = F3::FromRaw(input >> 1);
  const F3 fixedpoint_half_input = SaturatingRoundingMultiplyByPOT<-1>(fixedpoint_input);
  const F3 fixedpoint_half_three =
    GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F3, (1 << 28) + (1 << 27), 1.5);
  // Newton-Raphson iteration
  // Naive unoptimized starting guess: x = 1
  F3 x = F3::One();
  // Naive unoptimized number of iterations: 5
  for (int i = 0; i < 5; i++)
  {
    const F3 x3 = Rescale<3>(x * x * x);
    x = Rescale<3>(fixedpoint_half_three * x - fixedpoint_half_input * x3);
  }
  const F0 fixedpoint_half_sqrt_2 =
    GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F0, 1518500250, std::sqrt(2.) / 2.);
  x = x * fixedpoint_half_sqrt_2;
  *output_inv_sqrt = x.raw();
  if (*output_shift < 0)
  {
    *output_inv_sqrt <<= -*output_shift;
    *output_shift = 0;
  }
  // Convert right shift (right is positive) to left shift.
  *output_shift *= reverse_shift;
}
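
// Background (added for clarity): the loop above is the standard
// Newton-Raphson recurrence for the inverse square root,
//   x_{n+1} = 1.5 * x_n - (a / 2) * x_n^3,
// which converges to 1/sqrt(a); here a is the normalized input, and the final
// multiplication by sqrt(2)/2 folds in the halving applied when the raw value
// was loaded via FromRaw(input >> 1).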

// Comment from tensorflow lite:
//
// DO NOT USE THIS STRUCT FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING
// BROADCASTING.
//
// NdArrayDesc<N> describes the shape and memory layout of an N-dimensional
// rectangular array of numbers.
//
// NdArrayDesc<N> is basically identical to Dims<N> defined in types.h.
// However, as Dims<N> is to be deprecated, this class exists as an adaptor
// to enable simple unoptimized implementations of element-wise broadcasting
// operations.
template <int N> struct NdArrayDesc
{
  // The "extent" of each dimension. Indices along dimension d must be in the
  // half-open interval [0, extents[d]).
  int extents[N];

  // The number of *elements* (not bytes) between consecutive indices of each
  // dimension.
  int strides[N];
};

// Comment from tensorflow lite:
//
// DO NOT USE THIS FUNCTION FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING
// BROADCASTING.
//
// Same as Offset(), except it takes an NdArrayDesc<N> instead of Dims<N>.
inline int SubscriptToIndex(const NdArrayDesc<4> &desc, int i0, int i1, int i2, int i3)
{
  assert(i0 >= 0 && i0 < desc.extents[0]);
  assert(i1 >= 0 && i1 < desc.extents[1]);
  assert(i2 >= 0 && i2 < desc.extents[2]);
  assert(i3 >= 0 && i3 < desc.extents[3]);
  return i0 * desc.strides[0] + i1 * desc.strides[1] + i2 * desc.strides[2] + i3 * desc.strides[3];
}
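
// Illustration (added, not in the original header): for a contiguous 4-D array
// with extents {2, 3, 4, 5}, CopyDimsToDesc below yields strides {60, 20, 5, 1},
// so SubscriptToIndex(desc, 1, 2, 3, 4) == 60 + 40 + 15 + 4 == 119, the usual
// row-major flattening.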

template <int N> inline int SubscriptToIndexGeneric(const NdArrayDesc<N> *desc, int *iter)
{
  int ret_indx = 0;
  for (size_t idx = 0; idx < static_cast<size_t>(N); idx++)
  {
    assert(iter[idx] >= 0 && iter[idx] < desc->extents[idx]);
    ret_indx += iter[idx] * desc->strides[idx];
  }

  return ret_indx;
}

// Copies dims to desc, calculating strides.
template <int N> inline void CopyDimsToDesc(const Shape &input_shape, NdArrayDesc<N> *desc_out)
{
  int desc_stride = 1;
  for (int i = N - 1; i >= 0; --i)
  {
    desc_out->extents[i] = input_shape.Dims(i);
    desc_out->strides[i] = desc_stride;
    desc_stride *= input_shape.Dims(i);
  }
}

template <int N>
inline void
NdArrayDescsForElementwiseBroadcast(const Shape &input0_shape, const Shape &input1_shape,
                                    NdArrayDesc<N> *desc0_out, NdArrayDesc<N> *desc1_out)
{
  assert(desc0_out != nullptr);
  assert(desc1_out != nullptr);

  auto extended_input0_shape = Shape::ExtendedShape(N, input0_shape);
  auto extended_input1_shape = Shape::ExtendedShape(N, input1_shape);

  // Copy dims to desc, calculating strides.
  CopyDimsToDesc<N>(extended_input0_shape, desc0_out);
  CopyDimsToDesc<N>(extended_input1_shape, desc1_out);

  // Walk over each dimension. If the extents are equal do nothing.
  // Otherwise, set the desc with extent 1 to have extent equal to the other and
  // stride 0.
  for (int i = 0; i < N; ++i)
  {
    const int extent0 = extended_input0_shape.Dims(i);
    const int extent1 = extended_input1_shape.Dims(i);
    if (extent0 != extent1)
    {
      if (extent0 == 1)
      {
        desc0_out->strides[i] = 0;
        desc0_out->extents[i] = extent1;
      }
      else
      {
        assert(extent1 == 1);
        desc1_out->strides[i] = 0;
        desc1_out->extents[i] = extent0;
      }
    }
  }
}
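
// Illustration (added, not in the original header): broadcasting shapes
// [1, 3] and [2, 1] with N = 2 gives desc0 extents {2, 3} with strides {0, 1}
// and desc1 extents {2, 3} with strides {1, 0}; a stride of 0 makes every
// index along that dimension read the same element, which is exactly
// numpy-style broadcasting.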

template <int N>
inline void
NdArrayDescsForElementwiseBroadcast(const Shape &input0_shape, const Shape &input1_shape,
                                    const Shape &input2_shape, NdArrayDesc<N> *desc0_out,
                                    NdArrayDesc<N> *desc1_out, NdArrayDesc<N> *desc2_out)
{
  assert(desc0_out != nullptr);
  assert(desc1_out != nullptr);
  assert(desc2_out != nullptr);

  auto extended_input0_shape = Shape::ExtendedShape(N, input0_shape);
  auto extended_input1_shape = Shape::ExtendedShape(N, input1_shape);
  auto extended_input2_shape = Shape::ExtendedShape(N, input2_shape);

  // Copy dims to desc, calculating strides.
  CopyDimsToDesc<N>(extended_input0_shape, desc0_out);
  CopyDimsToDesc<N>(extended_input1_shape, desc1_out);
  CopyDimsToDesc<N>(extended_input2_shape, desc2_out);

  // Walk over each dimension. If the extents are equal do nothing.
  // Otherwise, set the desc with extent 1 to have extent equal to the other and
  // stride 0.
  for (int i = 0; i < N; ++i)
  {
    const int extent0 = extended_input0_shape.Dims(i);
    const int extent1 = extended_input1_shape.Dims(i);
    const int extent2 = extended_input2_shape.Dims(i);

    int extent = extent0;
    if (extent1 != 1)
      extent = extent1;
    if (extent2 != 1)
      extent = extent2;

    assert(extent0 == 1 || extent0 == extent);
    assert(extent1 == 1 || extent1 == extent);
    assert(extent2 == 1 || extent2 == extent);

    if (!(extent0 == extent1 && extent1 == extent2))
    {
      if (extent0 == 1)
      {
        desc0_out->strides[i] = 0;
        desc0_out->extents[i] = extent;
      }
      if (extent1 == 1)
      {
        desc1_out->strides[i] = 0;
        desc1_out->extents[i] = extent;
      }
      if (extent2 == 1)
      {
        desc2_out->strides[i] = 0;
        desc2_out->extents[i] = extent;
      }
    }
  }
}

// Gets next index to iterate through a multidimensional array.
inline bool NextIndex(const int num_dims, const int *dims, int *current)
{
  if (num_dims == 0)
  {
    return false;
  }
  assert(dims != nullptr);
  assert(current != nullptr);
  int carry = 1;
  for (int idx = num_dims - 1; idx >= 0; --idx)
  {
    int current_val = current[idx] + carry;
    assert(dims[idx] >= current_val);
    if (dims[idx] == current_val)
    {
      current[idx] = 0;
    }
    else
    {
      current[idx] = current_val;
      carry = 0;
      break;
    }
  }
  return (carry == 0);
}
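
// Illustration (added, not in the original header): with dims = {2, 3},
// repeatedly calling NextIndex visits {0,0} -> {0,1} -> {0,2} -> {1,0} ->
// {1,1} -> {1,2} and then returns false; the last dimension varies fastest,
// like an odometer.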

// Gets the offset of an index when reducing on an axis. When reducing, the
// flattened offset does not change when the input index changes along a
// reduced axis. For example, if you have a 3D tensor and you are reducing to
// 2D by eliminating axis 0, then index (0, 1, 2) and index (1, 1, 2) map to
// the same flattened offset.
// TODO(kanlig): use Dims to represent dimensions.
inline size_t ReducedOutputOffset(const int num_dims, const int *dims, const int *index,
                                  const int num_axis, const int *axis)
{
  if (num_dims == 0)
  {
    return 0;
  }

  assert(dims != nullptr);
  assert(index != nullptr);

  size_t offset = 0;
  for (int idx = 0; idx < num_dims; ++idx)
  {
    // if we need to skip this axis
    bool is_axis = false;
    if (axis != nullptr)
    {
      for (int axis_idx = 0; axis_idx < num_axis; ++axis_idx)
      {
        if (idx == axis[axis_idx])
        {
          is_axis = true;
          break;
        }
      }
    }
    if (!is_axis)
    {
      offset = offset * static_cast<size_t>(dims[idx]) + static_cast<size_t>(index[idx]);
    }
  }
  return offset;
}
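
// Worked example (illustrative, not from the original header): with
// dims = {2, 2, 3}, num_axis = 1 and axis = {0}, the loop skips dims[0], so
// ReducedOutputOffset maps both (0, 1, 2) and (1, 1, 2) to 1 * 3 + 2 = 5,
// matching the behaviour described in the comment above.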

template <typename T> void optimized_ops_preload_l1_keep(const T *ptr)
{
#ifdef __GNUC__
  // builtin offered by GCC-compatible compilers including clang
  __builtin_prefetch(ptr, /* 0 means read */ 0, /* 3 means high locality */ 3);
#else
  (void)ptr;
#endif
}

// Writes randomly accessed values from `input` sequentially into `output`.
template <typename T> class SequentialTensorWriter
{
public:
  SequentialTensorWriter(const T *input_data, T *output_data)
    : input_data_(input_data), output_ptr_(output_data)
  {
  }

  void Write(int position) { *output_ptr_++ = input_data_[position]; }
  void WriteN(int position, int len)
  {
    std::memcpy(output_ptr_, &input_data_[position], sizeof(T) * len);
    output_ptr_ += len;
  }

private:
  const T *input_data_;
  T *output_ptr_;
};

inline std::ostream &operator<<(std::ostream &os, const Shape &shape)
{
  using std::begin;
  using std::end;

  std::string formatted =
    std::accumulate(begin(shape), end(shape), std::string{"["},
                    [](std::string joined, ShapeIterator::value_type dim) {
                      return std::move(joined).append(std::to_string(dim)).append(",");
                    });

  if (formatted.back() == '[')
  {
    formatted.push_back(']');
  }
  else
  {
    formatted.back() = ']';
  }

  os << formatted;
  return os;
}

} // namespace cker
} // namespace nnfw

#endif // __NNFW_CKER_UTILS_H__