ONE - On-device Neural Engine
PALUtils.h
/*
 * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
 * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef ONERT_MICRO_EXECUTE_PAL_UTILS_H
#define ONERT_MICRO_EXECUTE_PAL_UTILS_H

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <limits>
#include <utility>

namespace onert_micro
{
namespace execute
{
namespace pal
{

inline std::pair<uint32_t, uint32_t> getUpLowerWeightTensorDepth(core::OpTrainableRankType rank,
                                                                 const uint32_t output_depth)
{
  std::pair<uint32_t, uint32_t> result(0u, output_depth);

  switch (rank)
  {
    case core::ALL:
      break;
    case core::UP_1_2_PART:
      result.second = static_cast<uint32_t>(static_cast<float>(output_depth) / 2.f);
      break;
    case core::LOWER_1_2_PART:
      result.first = static_cast<uint32_t>(static_cast<float>(output_depth) / 2.f);
      break;
    default:
      assert(false && "Unsupported type");
      break;
  }

  return result;
}
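
// Illustrative example (assumes the half-split rank cases above): for
// output_depth = 10, core::ALL keeps the full depth range {0, 10}, the
// upper-half split maps to {0, 5}, and the lower-half split maps to {5, 10};
// the returned pair bounds the slice of the weight tensor depth that the
// caller treats as trainable.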

// Table of sigmoid(i/24) at 0.16 format - 256 elements.
// We use a combined sigmoid and tanh look-up table, since
// tanh(x) = 2 * sigmoid(2 * x) - 1.
// Both functions are symmetric, so the LUT is only needed
// for the absolute value of the input.
static const uint16_t sigmoid_table_uint16[256] = {
  32768, 33451, 34133, 34813, 35493, 36169, 36843, 37513, 38180, 38841, 39498, 40149, 40794, 41432,
  42064, 42688, 43304, 43912, 44511, 45102, 45683, 46255, 46817, 47369, 47911, 48443, 48964, 49475,
  49975, 50464, 50942, 51409, 51865, 52311, 52745, 53169, 53581, 53983, 54374, 54755, 55125, 55485,
  55834, 56174, 56503, 56823, 57133, 57433, 57724, 58007, 58280, 58544, 58800, 59048, 59288, 59519,
  59743, 59959, 60168, 60370, 60565, 60753, 60935, 61110, 61279, 61441, 61599, 61750, 61896, 62036,
  62172, 62302, 62428, 62549, 62666, 62778, 62886, 62990, 63090, 63186, 63279, 63368, 63454, 63536,
  63615, 63691, 63765, 63835, 63903, 63968, 64030, 64090, 64148, 64204, 64257, 64308, 64357, 64405,
  64450, 64494, 64536, 64576, 64614, 64652, 64687, 64721, 64754, 64786, 64816, 64845, 64873, 64900,
  64926, 64950, 64974, 64997, 65019, 65039, 65060, 65079, 65097, 65115, 65132, 65149, 65164, 65179,
  65194, 65208, 65221, 65234, 65246, 65258, 65269, 65280, 65291, 65301, 65310, 65319, 65328, 65337,
  65345, 65352, 65360, 65367, 65374, 65381, 65387, 65393, 65399, 65404, 65410, 65415, 65420, 65425,
  65429, 65433, 65438, 65442, 65445, 65449, 65453, 65456, 65459, 65462, 65465, 65468, 65471, 65474,
  65476, 65479, 65481, 65483, 65485, 65488, 65489, 65491, 65493, 65495, 65497, 65498, 65500, 65501,
  65503, 65504, 65505, 65507, 65508, 65509, 65510, 65511, 65512, 65513, 65514, 65515, 65516, 65517,
  65517, 65518, 65519, 65520, 65520, 65521, 65522, 65522, 65523, 65523, 65524, 65524, 65525, 65525,
  65526, 65526, 65526, 65527, 65527, 65528, 65528, 65528, 65529, 65529, 65529, 65529, 65530, 65530,
  65530, 65530, 65531, 65531, 65531, 65531, 65531, 65532, 65532, 65532, 65532, 65532, 65532, 65533,
  65533, 65533, 65533, 65533, 65533, 65533, 65533, 65534, 65534, 65534, 65534, 65534, 65534, 65534,
  65534, 65534, 65534, 65535};
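
// Sanity check for the identity above (illustrative): sigmoid(0) = 0.5 is
// stored as sigmoid_table_uint16[0] = 32768 = 0.5 * 65536 in 0.16 format,
// and tanh(0) = 2 * sigmoid(0) - 1 = 0 falls out of the same entry.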

inline std::int32_t saturatingRoundingDoublingHighMul(std::int32_t a, std::int32_t b)
{
  bool overflow = a == b && a == std::numeric_limits<std::int32_t>::min();
  std::int64_t a_64(a);
  std::int64_t b_64(b);
  std::int64_t ab_64 = a_64 * b_64;
  std::int32_t nudge = ab_64 >= 0 ? (1 << 30) : (1 - (1 << 30));
  std::int32_t ab_x2_high32 = static_cast<std::int32_t>((ab_64 + nudge) / (1ll << 31));
  return overflow ? std::numeric_limits<std::int32_t>::max() : ab_x2_high32;
}
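
// Worked example (illustrative): for a = b = 1 << 30 (0.5 in Q31 format),
// ab_64 = 2^60 and (ab_64 + nudge) / 2^31 = 2^29, i.e. 0.25 in Q31 - the
// rounded high 32 bits of the doubled product. The single overflow case,
// a = b = INT32_MIN, would produce 2^31 and is saturated to INT32_MAX.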

// Correctly-rounded-to-nearest division by a power-of-two.
// Also known as a rounding arithmetic right shift.
inline int32_t roundingDivideByPOT(int32_t x, int32_t exponent)
{
  assert(exponent >= 0);
  assert(exponent <= 31);
  const int32_t mask = int32_t((1ll << exponent) - 1);
  const int32_t zero = int32_t(0);
  const int32_t one = int32_t(1);
  const int32_t remainder = x & mask;
  const int32_t threshold = (mask >> 1) + ((x < zero ? one : zero) & one);
  return (x >> exponent) + ((remainder > threshold ? one : zero) & one);
}
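
// Worked example (illustrative): roundingDivideByPOT(5, 1) has remainder = 1
// above threshold = 0, giving (5 >> 1) + 1 = 3; for x = -5 the threshold
// rises to 1, giving -5 >> 1 = -3. Ties therefore round away from zero.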

inline int32_t multiplyByQuantizedMultiplier(int32_t x, int32_t quantized_multiplier, int shift)
{
  int left_shift = shift > 0 ? shift : 0;
  int right_shift = shift > 0 ? 0 : -shift;
  return roundingDivideByPOT(
    saturatingRoundingDoublingHighMul(x * (1 << left_shift), quantized_multiplier), right_shift);
}
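
// Worked example (illustrative): a real multiplier of 0.375 = 0.75 * 2^-1 is
// encoded as quantized_multiplier = round(0.75 * 2^31) = 1610612736 with
// shift = -1, so multiplyByQuantizedMultiplier(100, 1610612736, -1) yields
// roundingDivideByPOT(75, 1) = 38 = round(100 * 0.375).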

inline int32_t multiplyByQuantizedMultiplierSmallerThanOneExp(int32_t x,
                                                              int32_t quantized_multiplier,
                                                              int left_shift)
{
  return roundingDivideByPOT(saturatingRoundingDoublingHighMul(x, quantized_multiplier),
                             -left_shift);
}
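
// Note (illustrative): this variant assumes the multiplier was normalized into
// [0.5, 1), so the accompanying exponent is zero or negative and only the
// rounding right shift by -left_shift remains after the high multiplication.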

template <typename P> inline void getActivationParams(const P &params, int32_t *min, int32_t *max)
{
  *min = params.int32_activation_min;
  *max = params.int32_activation_max;
}

template <typename P> inline void getActivationParams(const P &params, float *min, float *max)
{
  *min = params.float_activation_min;
  *max = params.float_activation_max;
}

template <typename P> inline void getActivationParams(const P &params, int64_t *min, int64_t *max)
{
  *min = params.int64_activation_min;
  *max = params.int64_activation_max;
}

// Gets the offset of an index when reducing over the given axes. When
// reducing, the flattened offset does not change if the input index changes
// along a reduced axis. For example, for a 3D tensor reduced to 2D by
// eliminating axis 0, index (0, 1, 2) and index (1, 1, 2) map to the same
// flattened offset.
inline size_t reducedOutputOffset(const int32_t num_dims, const int32_t *dims, const int32_t *index,
                                  const int32_t num_axis, const int32_t *axis)
{
  if (num_dims == 0)
  {
    return 0;
  }
  size_t offset = 0;
  for (int idx = 0; idx < num_dims; ++idx)
  {
    // Skip this dimension if it is one of the reduced axes.
    bool is_axis = false;
    if (axis != nullptr)
    {
      for (int axis_idx = 0; axis_idx < num_axis; ++axis_idx)
      {
        if (idx == axis[axis_idx])
        {
          is_axis = true;
          break;
        }
      }
    }
    if (!is_axis)
    {
      offset = offset * static_cast<size_t>(dims[idx]) + static_cast<size_t>(index[idx]);
    }
  }
  return offset;
}
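
// Worked example (illustrative): for dims = {2, 3, 4} reduced over axis 0,
// both index (0, 1, 2) and index (1, 1, 2) skip dims[0] and map to the same
// flattened offset 1 * 4 + 2 = 6.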

// Gets next index to iterate through a multidimensional array.
inline bool nextIndex(const int32_t num_dims, const int32_t *dims, int32_t *current)
{
  if (num_dims == 0)
  {
    return false;
  }
  int carry = 1;
  for (int idx = num_dims - 1; idx >= 0; --idx)
  {
    int current_val = current[idx] + carry;
    if (dims[idx] == current_val)
    {
      current[idx] = 0;
    }
    else
    {
      current[idx] = current_val;
      carry = 0;
      break;
    }
  }
  return (carry == 0);
}
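
// Usage sketch (illustrative): with dims = {2, 2} and current = {0, 0},
// successive calls step through {0, 1}, {1, 0}, {1, 1} and then return false
// once the index wraps around - a row-major odometer over tensor indices.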

// Get the common shape dimension; assert that the two shapes agree.
inline int MatchingDim(const core::OMRuntimeShape &shape1, int index1,
                       const core::OMRuntimeShape &shape2, int index2)
{
  assert(shape1.dims(index1) == shape2.dims(index2));
  return shape1.dims(index1);
}

// Data is required to be contiguous, so many operators can use either the
// full array flat size or the flat size with one dimension skipped (commonly
// the depth).
inline int flatSizeSkipDim(const int32_t *dims_data, int skip_dim, int num_dims)
{
  int flat_size = 1;
  for (int i = 0; i < num_dims; ++i)
  {
    flat_size *= (i == skip_dim) ? 1 : dims_data[i];
  }
  return flat_size;
}
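
// Example (illustrative): for dims_data = {2, 3, 4} and skip_dim = 2,
// flatSizeSkipDim returns 2 * 3 = 6, the number of depth-4 slices.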

inline int offset(const int32_t *dims_data, int i0, int i1, int i2, int i3)
{
  return ((i0 * dims_data[1] + i1) * dims_data[2] + i2) * dims_data[3] + i3;
}

inline int offset(const int32_t *dims_data, int i0, int i1, int i2, int i3, int i4)
{
  return (((i0 * dims_data[1] + i1) * dims_data[2] + i2) * dims_data[3] + i3) * dims_data[4] + i4;
}
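
// Example (illustrative): for dims_data = {2, 3, 4, 5}, the 4-D overload gives
// offset(dims_data, 1, 2, 3, 4) = ((1 * 3 + 2) * 4 + 3) * 5 + 4 = 119, the
// last element of the 120-element row-major array (dims_data[0] never enters
// the formula).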

template <typename T>
inline T activationFunctionWithMinMax(T x, T output_activation_min, T output_activation_max)
{
  using std::max;
  using std::min;
  return min(max(x, output_activation_min), output_activation_max);
}

// Reduces and compresses dimensions so that broadcast handling becomes more
// efficient. Returns true if the output shape is broadcastable, i.e. it
// contains no degenerate dimension (a dimension of size 0); returns false
// otherwise.
template <int MAX_DIM = 6>
inline bool ReduceDimensionsForBroadcast(const core::OMRuntimeShape &input1_shape,
                                         const core::OMRuntimeShape &input2_shape,
                                         size_t *compressed_input1_stride,
                                         size_t *compressed_input2_stride,
                                         size_t *compressed_output_shape)
{
  size_t num_compressed_dims = 0;
  size_t compressed_input1_shape[MAX_DIM];
  size_t compressed_input2_shape[MAX_DIM];
  std::fill(compressed_input1_shape, compressed_input1_shape + MAX_DIM, 1);
  std::fill(compressed_input2_shape, compressed_input2_shape + MAX_DIM, 1);
  std::fill(compressed_output_shape, compressed_output_shape + MAX_DIM, 1);
  bool broadcast_input1 = false;
  bool broadcast_input2 = false;
  bool first_nonunit = true;

  if (input1_shape.dimensionsCount() < 0 || input2_shape.dimensionsCount() < 0)
  {
    return false;
  }
  const size_t num_input1_dims = input1_shape.dimensionsCount();
  const size_t num_input2_dims = input2_shape.dimensionsCount();
  const int32_t *input1_dims = input1_shape.dimsData();
  const int32_t *input2_dims = input2_shape.dimsData();
  const size_t num_common_dims = std::min(num_input1_dims, num_input2_dims);
  for (size_t i = 1; i <= num_common_dims; i++)
  {
    if (input1_dims[num_input1_dims - i] < 0 || input2_dims[num_input2_dims - i] < 0)
    {
      return false;
    }
    const size_t input1_dim = input1_dims[num_input1_dims - i];
    const size_t input2_dim = input2_dims[num_input2_dims - i];
    if (input1_dim == 0 || input2_dim == 0)
    {
      return false;
    }
    if (input1_dim == 1 && input2_dim == 1)
    {
      continue;
    }
    assert(!broadcast_input1 || !broadcast_input2);

    if (input1_dim == 1)
    {
      if (!broadcast_input1)
      {
        broadcast_input1 = true;
        broadcast_input2 = false;
        num_compressed_dims++;
      }
      compressed_input2_shape[num_compressed_dims - 1] *= input2_dim;
      compressed_output_shape[num_compressed_dims - 1] *= input2_dim;
    }
    else if (input2_dim == 1)
    {
      if (!broadcast_input2)
      {
        broadcast_input1 = false;
        broadcast_input2 = true;
        num_compressed_dims++;
      }
      compressed_input1_shape[num_compressed_dims - 1] *= input1_dim;
      compressed_output_shape[num_compressed_dims - 1] *= input1_dim;
    }
    else
    {
      assert(input1_dim == input2_dim);
      if (broadcast_input1 || broadcast_input2 || first_nonunit)
      {
        broadcast_input1 = false;
        broadcast_input2 = false;
        num_compressed_dims++;
      }
      compressed_input1_shape[num_compressed_dims - 1] *= input1_dim;
      compressed_input2_shape[num_compressed_dims - 1] *= input1_dim;
      compressed_output_shape[num_compressed_dims - 1] *= input1_dim;
    }
    first_nonunit = false;
  }
  if (num_input1_dims > num_input2_dims)
  {
    if (!broadcast_input2)
    {
      num_compressed_dims++;
    }
    for (size_t i = 0; i < num_input1_dims - num_input2_dims; i++)
    {
      if (input1_dims[i] < 0)
        return false;
      const size_t input1_dim = input1_dims[i];
      if (input1_dim == 0)
      {
        return false;
      }
      compressed_input1_shape[num_compressed_dims - 1] *= input1_dim;
      compressed_output_shape[num_compressed_dims - 1] *= input1_dim;
    }
  }
  else if (num_input2_dims > num_input1_dims)
  {
    if (!broadcast_input1)
    {
      num_compressed_dims++;
    }
    for (size_t i = 0; i < num_input2_dims - num_input1_dims; i++)
    {
      if (input2_dims[i] < 0)
        return false;
      const size_t input2_dim = input2_dims[i];
      if (input2_dim == 0)
      {
        return false;
      }
      compressed_input2_shape[num_compressed_dims - 1] *= input2_dim;
      compressed_output_shape[num_compressed_dims - 1] *= input2_dim;
    }
  }
  num_compressed_dims = (num_compressed_dims > 1) ? num_compressed_dims : 1;

  int input1_stride = 1;
  int input2_stride = 1;
  for (int i = 0; i < MAX_DIM; ++i)
  {
    compressed_input1_stride[i] = input1_stride;
    input1_stride *= compressed_input1_shape[i];
    compressed_input2_stride[i] = input2_stride;
    input2_stride *= compressed_input2_shape[i];
  }
  for (int i = 0; i < MAX_DIM; ++i)
  {
    if (compressed_input1_shape[i] != compressed_input2_shape[i])
    {
      if (compressed_input1_shape[i] == 1)
      {
        compressed_input1_stride[i] = 0;
      }
      else
      {
        assert(compressed_input2_shape[i] == 1);
        compressed_input2_stride[i] = 0;
      }
    }
  }
  return true;
}
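
// Usage sketch (illustrative): for input shapes {4, 1} and {4, 3}, the
// compressed output shape becomes {3, 4, 1, ...} (fastest-varying dimension
// first), input1 gets stride 0 on the broadcast dimension, and an elementwise
// kernel can then walk both inputs with plain stride arithmetic instead of
// recomputing broadcast indices per element.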

} // namespace pal
} // namespace execute
} // namespace onert_micro

#endif // ONERT_MICRO_EXECUTE_PAL_UTILS_H