ONE - On-device Neural Engine
Loading...
Searching...
No Matches
PALArithmeticOpCommon.h
Go to the documentation of this file.
1/*
2 * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
3 * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18#ifndef ONERT_MICRO_EXECUTE_PAL_ARITHMETIC_OP_COMMON_H
19#define ONERT_MICRO_EXECUTE_PAL_ARITHMETIC_OP_COMMON_H
20
21#include "PALUtils.h"
23
24#include "core/OMKernelData.h"
25
26namespace onert_micro
27{
28namespace execute
29{
30namespace pal
31{
32
// Binary functor: element-wise addition. Supplied as the Fn template
// argument of the generic arithmetic kernels below.
template <typename T> struct AddFn
{
  T operator()(T a, T b) { return a + b; }
};
// Binary functor: element-wise subtraction (left minus right).
template <typename T> struct SubFn
{
  T operator()(T a, T b) { return a - b; }
};
// Binary functor: element-wise multiplication.
template <typename T> struct MulFn
{
  T operator()(T a, T b) { return a * b; }
};
// Binary functor: element-wise division (left by right). Integer T
// truncates toward zero; the caller is responsible for rhs != 0.
template <typename T> struct DivFn
{
  T operator()(T a, T b) { return a / b; }
};
// Binary functor: squared difference, (a - b)^2.
template <typename T> struct SquaredDifferenceFn
{
  T operator()(T a, T b)
  {
    const T diff = a - b;
    return diff * diff;
  }
};
53template <typename T, typename Fn>
55 const T *input1_data, const T *input2_data, T *output_data)
56{
57 T activation_min, activation_max;
58 getActivationParams(params, &activation_min, &activation_max);
59
60 Fn func;
61 for (int i = 0; i < flat_size; ++i)
62 output_data[i] =
63 std::min(std::max(func(input1_data[i], input2_data[i]), activation_min), activation_max);
64
65 return Ok;
66}
67
68template <typename T, typename Fn>
70 const core::BinaryArithmeticBroadcastParams &params, const int flat_size,
71 const onert_micro::core::QuantizationParams &input1_qparams, const T *input1_data,
72 const onert_micro::core::QuantizationParams &input2_qparams, const T *input2_data,
73 const onert_micro::core::QuantizationParams &output_qparams, T *output_data)
74{
75 float activation_min, activation_max;
76 getActivationParams(params, &activation_min, &activation_max);
77
78 Fn func;
79 for (int i = 0; i < flat_size; ++i)
80 {
81 // Dequantize input1
82 float input1 = static_cast<float>((input1_data[i] - static_cast<T>(input1_qparams.zero_point)) *
83 input1_qparams.scale);
84 // Dequantize input2
85 float input2 = static_cast<float>((input2_data[i] - static_cast<T>(input2_qparams.zero_point)) *
86 input2_qparams.scale);
87 float result = std::min(std::max(func(input1, input2), activation_min), activation_max);
88
89 // Quantize result
90 result = result / output_qparams.scale + output_qparams.zero_point;
91 result = std::max<float>(std::numeric_limits<T>::min(), result);
92 result = std::min<float>(std::numeric_limits<T>::max(), result);
93 output_data[i] = static_cast<T>(result);
94 }
95
96 return Ok;
97}
98
99template <typename T>
100void ElementWise(const uint32_t size, const core::ArithmeticQuantParams &params,
101 const T *input1_data, const T *input2_data, T *output_data,
102 T (*binary_func)(T, T, const core::ArithmeticQuantParams &))
103{
104 for (int i = 0; i < size; ++i)
105 {
106 output_data[i] = binary_func(input1_data[i], input2_data[i], params);
107 }
108}
109
110template <typename T, typename Fn>
112 const int flat_size, const T *input_data, const T scalar_value,
113 T *output_data)
114{
115 T activation_min, activation_max;
116 getActivationParams(params, &activation_min, &activation_max);
117
118 for (int i = 0; i < flat_size; ++i)
119 output_data[i] =
120 std::min(std::max(func(input_data[i], scalar_value), activation_min), activation_max);
121}
122
123template <typename T, typename Fn>
125 const core::OMRuntimeShape &input1_shape, const T *input1_data,
126 const core::OMRuntimeShape &input2_shape, const T *input2_data,
127 const core::OMRuntimeShape &output_shape, T *output_data)
128{
131 NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2);
132 const core::OMRuntimeShape extended_output_shape =
134
135 T activation_min, activation_max;
136 getActivationParams(params, &activation_min, &activation_max);
137
138 // In Tensorflow, the dimensions are canonically named (batch_number, row,
139 // col, channel), with extents (batches, height, width, depth), with the
140 // trailing dimension changing most rapidly (channels has the smallest stride,
141 // typically 1 element).
142 //
143 // In generated C code, we store arrays with the dimensions reversed. The
144 // first dimension has smallest stride.
145 //
146 // We name our variables by their Tensorflow convention, but generate C code
147 // nesting loops such that the innermost loop has the smallest stride for the
148 // best cache behavior.
149 Fn func;
150 for (int b = 0; b < extended_output_shape.dims(0); ++b)
151 {
152 for (int y = 0; y < extended_output_shape.dims(1); ++y)
153 {
154 for (int x = 0; x < extended_output_shape.dims(2); ++x)
155 {
156 for (int c = 0; c < extended_output_shape.dims(3); ++c)
157 {
158 const int output_data_offset =
159 ((b * extended_output_shape.dims(1) + y) * extended_output_shape.dims(2) + x) *
160 extended_output_shape.dims(3) +
161 c;
162
163 output_data[output_data_offset] =
164 std::min(std::max(func(input1_data[subscriptToIndex(desc1, b, y, x, c)],
165 input2_data[subscriptToIndex(desc2, b, y, x, c)]),
166 activation_min),
167 activation_max);
168 }
169 }
170 }
171 }
172 return Ok;
173}
174
175template <typename T, typename Fn>
177 const core::BinaryArithmeticBroadcastParams &params, const core::OMRuntimeShape &input1_shape,
178 const onert_micro::core::QuantizationParams &input1_qparams, const T *input1_data,
179 const core::OMRuntimeShape &input2_shape,
180 const onert_micro::core::QuantizationParams &input2_qparams, const T *input2_data,
182 const onert_micro::core::QuantizationParams &output_qparams, T *output_data)
183{
186 NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2);
187 const core::OMRuntimeShape extended_output_shape =
189
190 float activation_min, activation_max;
191 getActivationParams(params, &activation_min, &activation_max);
192
193 // In Tensorflow, the dimensions are canonically named (batch_number, row,
194 // col, channel), with extents (batches, height, width, depth), with the
195 // trailing dimension changing most rapidly (channels has the smallest stride,
196 // typically 1 element).
197 //
198 // In generated C code, we store arrays with the dimensions reversed. The
199 // first dimension has smallest stride.
200 //
201 // We name our variables by their Tensorflow convention, but generate C code
202 // nesting loops such that the innermost loop has the smallest stride for the
203 // best cache behavior.
204 Fn func;
205 for (int b = 0; b < extended_output_shape.dims(0); ++b)
206 {
207 for (int y = 0; y < extended_output_shape.dims(1); ++y)
208 {
209 for (int x = 0; x < extended_output_shape.dims(2); ++x)
210 {
211 for (int c = 0; c < extended_output_shape.dims(3); ++c)
212 {
213 // Dequantize input1
214 float input1 = static_cast<float>((input1_data[subscriptToIndex(desc1, b, y, x, c)] -
215 static_cast<T>(input1_qparams.zero_point)) *
216 input1_qparams.scale);
217 // Dequantize input2
218 float input2 = static_cast<float>((input2_data[subscriptToIndex(desc2, b, y, x, c)] -
219 static_cast<T>(input2_qparams.zero_point)) *
220 input2_qparams.scale);
221
222 float result = std::min(std::max(func(input1, input2), activation_min), activation_max);
223
224 // Quantize result
225 result = result / output_qparams.scale + output_qparams.zero_point;
226 result = std::max<float>(std::numeric_limits<T>::min(), result);
227 result = std::min<float>(std::numeric_limits<T>::max(), result);
228 const int output_data_offset =
229 ((b * extended_output_shape.dims(1) + y) * extended_output_shape.dims(2) + x) *
230 extended_output_shape.dims(3) +
231 c;
232 output_data[output_data_offset] = static_cast<T>(result);
233 }
234 }
235 }
236 }
237 return Ok;
238}
239
240template <typename T>
241void BroadcastInput1(int size, const core::ArithmeticQuantParams &params, const T *input1_data,
242 const T *input2_data, T *output_data,
243 T (*binary_func)(T, T, const core::ArithmeticQuantParams &))
244{
245 for (int i = 0; i < size; ++i)
246 {
247 output_data[i] = binary_func(input1_data[0], input2_data[i], params);
248 }
249}
250
251template <typename T>
252void BroadcastInput2(int size, const core::ArithmeticQuantParams &params, const T *input1_data,
253 const T *input2_data, T *output_data,
254 T (*binary_func)(T, T, const core::ArithmeticQuantParams &))
255{
256 for (int i = 0; i < size; ++i)
257 {
258 output_data[i] = binary_func(input1_data[i], input2_data[0], params);
259 }
260}
261
262template <typename T>
264 size_t *input1_offset_p, size_t *input2_offset_p,
265 size_t *output_offset, size_t *compressed_input1_stride,
266 size_t *compressed_input2_stride, size_t *compressed_output_shape,
267 const T *input1_data, const T *input2_data, T *output_data,
268 T (*binary_func)(T, T, const core::ArithmeticQuantParams &))
269{
270 if (dimension > 0)
271 {
272 for (size_t c = 0; c < compressed_output_shape[dimension]; ++c)
273 {
274 size_t input1_offset_c = *input1_offset_p;
275 size_t input2_offset_c = *input2_offset_p;
276 BroadcastRecursiveDimensions(params, dimension - 1, &input1_offset_c, &input2_offset_c,
277 output_offset, compressed_input1_stride,
278 compressed_input2_stride, compressed_output_shape, input1_data,
279 input2_data, output_data, binary_func);
280 *input1_offset_p += compressed_input1_stride[dimension];
281 *input2_offset_p += compressed_input2_stride[dimension];
282 }
283 }
284 else
285 {
286 assert(dimension == 0);
287 bool input1_is_broadcast = compressed_input1_stride[dimension] == 0;
288 bool input2_is_broadcast = compressed_input2_stride[dimension] == 0;
289 assert(!(input1_is_broadcast && input2_is_broadcast));
290 const T *input1_data_ptr = input1_data + *input1_offset_p;
291 const T *input2_data_ptr = input2_data + *input2_offset_p;
292 T *output_data_ptr = output_data + *output_offset;
293 if (input1_is_broadcast)
294 {
295 // input1 is broadcast.
296 BroadcastInput1<T>(compressed_output_shape[dimension], params, input1_data_ptr,
297 input2_data_ptr, output_data_ptr, binary_func);
298 *input2_offset_p += compressed_output_shape[dimension];
299 }
300 else if (input2_is_broadcast)
301 {
302 // input2 is broadcast.
303 BroadcastInput2<T>(compressed_output_shape[dimension], params, input1_data_ptr,
304 input2_data_ptr, output_data_ptr, binary_func);
305 *input1_offset_p += compressed_output_shape[dimension];
306 }
307 else
308 {
309 // Add element-wise.
310 ElementWise<T>(compressed_output_shape[dimension], params, input1_data_ptr, input2_data_ptr,
311 output_data_ptr, binary_func);
312 *input1_offset_p += compressed_output_shape[dimension];
313 *input2_offset_p += compressed_output_shape[dimension];
314 }
315 *output_offset += compressed_output_shape[dimension];
316 }
317}
318
319template <typename T>
321 const core::OMRuntimeShape &input1_shape, const T *input1_data,
322 const core::OMRuntimeShape &input2_shape, const T *input2_data,
323 const core::OMRuntimeShape &output_shape, T *output_data,
324 T (*binary_func)(T, T, const core::ArithmeticQuantParams &))
325{
326 constexpr int kMaxBroadcastDim = 6;
327
328 // In Tensorflow, the dimensions are canonically named (batch_number, row,
329 // col, channel), with extents (batches, height, width, depth), with the
330 // trailing dimension changing most rapidly (channels has the smallest stride,
331 // typically 1 element).
332 //
333 // In generated C code, we store arrays with the dimensions reversed. The
334 // first dimension has smallest stride.
335 //
336 // We name our variables by their Tensorflow convention, but generate C code
337 // nesting loops such that the innermost loop has the smallest stride for the
338 // best cache behavior.
339 size_t compressed_input1_stride[kMaxBroadcastDim];
340 size_t compressed_input2_stride[kMaxBroadcastDim];
341 size_t compressed_output_shape[kMaxBroadcastDim];
342 bool broadcastable_shape = ReduceDimensionsForBroadcast<kMaxBroadcastDim>(
343 input1_shape, input2_shape, compressed_input1_stride, compressed_input2_stride,
344 compressed_output_shape);
345 // Skip broadcasting for degenerate shapes.
346 if (!broadcastable_shape)
347 {
348 return;
349 }
350
351 size_t input1_offset = 0;
352 size_t input2_offset = 0;
353 size_t output_offset = 0;
354 BroadcastRecursiveDimensions(params, kMaxBroadcastDim - 1, &input1_offset, &input2_offset,
355 &output_offset, compressed_input1_stride, compressed_input2_stride,
356 compressed_output_shape, input1_data, input2_data, output_data,
357 binary_func);
358}
359
360} // namespace pal
361} // namespace execute
362} // namespace onert_micro
363
364#endif // ONERT_MICRO_EXECUTE_PAL_ARITHMETIC_OP_COMMON_H
static OMRuntimeShape extendedShape(size_t new_shape_size, const OMRuntimeShape &shape)
NdArrayDesc< 4 > desc1
const luci_interpreter::RuntimeShape output_shape
NdArrayDesc< 4 > desc2
void BroadcastInput2(int size, const core::ArithmeticQuantParams &params, const T *input1_data, const T *input2_data, T *output_data, T(*binary_func)(T, T, const core::ArithmeticQuantParams &))
OMStatus ArithmeticOp(const core::BinaryArithmeticBroadcastParams &params, const int flat_size, const T *input1_data, const T *input2_data, T *output_data)
void getActivationParams(const P &params, int32_t *min, int32_t *max)
Definition PALUtils.h:120
void BroadcastInput1(int size, const core::ArithmeticQuantParams &params, const T *input1_data, const T *input2_data, T *output_data, T(*binary_func)(T, T, const core::ArithmeticQuantParams &))
void NdArrayDescsForElementwiseBroadcast(const core::OMRuntimeShape &input0_shape, const core::OMRuntimeShape &input1_shape, NdArrayDesc< N > *desc0_out, NdArrayDesc< N > *desc1_out)
OMStatus BroadcastArithmeticOp4DSlow(const core::BinaryArithmeticBroadcastParams &params, const core::OMRuntimeShape &input1_shape, const T *input1_data, const core::OMRuntimeShape &input2_shape, const T *input2_data, const core::OMRuntimeShape &output_shape, T *output_data)
OMStatus QuantizedBroadcastArithmeticOp4DSlow(const core::BinaryArithmeticBroadcastParams &params, const core::OMRuntimeShape &input1_shape, const onert_micro::core::QuantizationParams &input1_qparams, const T *input1_data, const core::OMRuntimeShape &input2_shape, const onert_micro::core::QuantizationParams &input2_qparams, const T *input2_data, const core::OMRuntimeShape &output_shape, const onert_micro::core::QuantizationParams &output_qparams, T *output_data)
OMStatus QuantizedArithmeticOp(const core::BinaryArithmeticBroadcastParams &params, const int flat_size, const onert_micro::core::QuantizationParams &input1_qparams, const T *input1_data, const onert_micro::core::QuantizationParams &input2_qparams, const T *input2_data, const onert_micro::core::QuantizationParams &output_qparams, T *output_data)
void ArithmeticOpScalar(const core::BinaryArithmeticBroadcastParams &params, const int flat_size, const T *input_data, const T scalar_value, T *output_data)
int subscriptToIndex(const NdArrayDesc< 4 > &desc, int i0, int i1, int i2, int i3)
void BroadcastBinaryFunction6DSlow(const core::ArithmeticQuantParams &params, const core::OMRuntimeShape &input1_shape, const T *input1_data, const core::OMRuntimeShape &input2_shape, const T *input2_data, const core::OMRuntimeShape &output_shape, T *output_data, T(*binary_func)(T, T, const core::ArithmeticQuantParams &))
void ElementWise(const uint32_t size, const core::ArithmeticQuantParams &params, const T *input1_data, const T *input2_data, T *output_data, T(*binary_func)(T, T, const core::ArithmeticQuantParams &))
void BroadcastRecursiveDimensions(const core::ArithmeticQuantParams &params, int dimension, size_t *input1_offset_p, size_t *input2_offset_p, size_t *output_offset, size_t *compressed_input1_stride, size_t *compressed_input2_stride, size_t *compressed_output_shape, const T *input1_data, const T *input2_data, T *output_data, T(*binary_func)(T, T, const core::ArithmeticQuantParams &))
int32_t size[5]
Definition Slice.cpp:35