ONE - On-device Neural Engine
Loading...
Searching...
No Matches
PALArithmeticOpCommon.h
Go to the documentation of this file.
1/*
2 * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
3 * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18#ifndef ONERT_MICRO_EXECUTE_PAL_ARITHMETIC_OP_COMMON_H
19#define ONERT_MICRO_EXECUTE_PAL_ARITHMETIC_OP_COMMON_H
20
21#include "PALUtils.h"
23
24#include "core/OMKernelData.h"
25
26namespace onert_micro
27{
28namespace execute
29{
30namespace pal
31{
32
// Element-wise addition functor; plugged in as the Fn template argument
// of ArithmeticOp / BroadcastArithmeticOp4DSlow.
template <typename T> struct AddFn
{
  // const: the functor is stateless, so calling it never mutates it.
  T operator()(T lhs, T rhs) const { return lhs + rhs; }
};
// Element-wise subtraction functor; plugged in as the Fn template argument
// of ArithmeticOp / BroadcastArithmeticOp4DSlow.
template <typename T> struct SubFn
{
  // const: the functor is stateless, so calling it never mutates it.
  T operator()(T lhs, T rhs) const { return lhs - rhs; }
};
// Element-wise multiplication functor; plugged in as the Fn template
// argument of ArithmeticOp / BroadcastArithmeticOp4DSlow.
template <typename T> struct MulFn
{
  // const: the functor is stateless, so calling it never mutates it.
  T operator()(T lhs, T rhs) const { return lhs * rhs; }
};
// Element-wise division functor; plugged in as the Fn template argument
// of ArithmeticOp / BroadcastArithmeticOp4DSlow.
// NOTE: rhs == 0 is undefined behavior for integral T (and yields inf/NaN
// for floating T); the caller is responsible for the divisor's validity.
template <typename T> struct DivFn
{
  // const: the functor is stateless, so calling it never mutates it.
  T operator()(T lhs, T rhs) const { return lhs / rhs; }
};
// Computes (lhs - rhs)^2; plugged in as the Fn template argument of
// ArithmeticOp / BroadcastArithmeticOp4DSlow (SQUARED_DIFFERENCE kernel).
template <typename T> struct SquaredDifferenceFn
{
  // const: the functor is stateless, so calling it never mutates it.
  T operator()(T lhs, T rhs) const { return (lhs - rhs) * (lhs - rhs); }
};
53template <typename T, typename Fn>
55 const T *input1_data, const T *input2_data, T *output_data)
56{
57 T activation_min, activation_max;
58 getActivationParams(params, &activation_min, &activation_max);
59
60 Fn func;
61 for (int i = 0; i < flat_size; ++i)
62 output_data[i] =
63 std::min(std::max(func(input1_data[i], input2_data[i]), activation_min), activation_max);
64
65 return Ok;
66}
67
68template <typename T>
69void ElementWise(const uint32_t size, const core::ArithmeticQuantParams &params,
70 const T *input1_data, const T *input2_data, T *output_data,
71 T (*binary_func)(T, T, const core::ArithmeticQuantParams &))
72{
73 for (int i = 0; i < size; ++i)
74 {
75 output_data[i] = binary_func(input1_data[i], input2_data[i], params);
76 }
77}
78
79template <typename T, typename Fn>
81 const int flat_size, const T *input_data, const T scalar_value,
82 T *output_data)
83{
84 T activation_min, activation_max;
85 getActivationParams(params, &activation_min, &activation_max);
86
87 for (int i = 0; i < flat_size; ++i)
88 output_data[i] =
89 std::min(std::max(func(input_data[i], scalar_value), activation_min), activation_max);
90}
91
92template <typename T, typename Fn>
94 const core::OMRuntimeShape &input1_shape, const T *input1_data,
95 const core::OMRuntimeShape &input2_shape, const T *input2_data,
96 const core::OMRuntimeShape &output_shape, T *output_data)
97{
100 NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2);
101 const core::OMRuntimeShape extended_output_shape =
103
104 T activation_min, activation_max;
105 getActivationParams(params, &activation_min, &activation_max);
106
107 // In Tensorflow, the dimensions are canonically named (batch_number, row,
108 // col, channel), with extents (batches, height, width, depth), with the
109 // trailing dimension changing most rapidly (channels has the smallest stride,
110 // typically 1 element).
111 //
112 // In generated C code, we store arrays with the dimensions reversed. The
113 // first dimension has smallest stride.
114 //
115 // We name our variables by their Tensorflow convention, but generate C code
116 // nesting loops such that the innermost loop has the smallest stride for the
117 // best cache behavior.
118 Fn func;
119 for (int b = 0; b < extended_output_shape.dims(0); ++b)
120 {
121 for (int y = 0; y < extended_output_shape.dims(1); ++y)
122 {
123 for (int x = 0; x < extended_output_shape.dims(2); ++x)
124 {
125 for (int c = 0; c < extended_output_shape.dims(3); ++c)
126 {
127 const int output_data_offset =
128 ((b * extended_output_shape.dims(1) + y) * extended_output_shape.dims(2) + x) *
129 extended_output_shape.dims(3) +
130 c;
131
132 output_data[output_data_offset] =
133 std::min(std::max(func(input1_data[subscriptToIndex(desc1, b, y, x, c)],
134 input2_data[subscriptToIndex(desc2, b, y, x, c)]),
135 activation_min),
136 activation_max);
137 }
138 }
139 }
140 }
141 return Ok;
142}
143
144template <typename T>
145void BroadcastInput1(int size, const core::ArithmeticQuantParams &params, const T *input1_data,
146 const T *input2_data, T *output_data,
147 T (*binary_func)(T, T, const core::ArithmeticQuantParams &))
148{
149 for (int i = 0; i < size; ++i)
150 {
151 output_data[i] = binary_func(input1_data[0], input2_data[i], params);
152 }
153}
154
155template <typename T>
156void BroadcastInput2(int size, const core::ArithmeticQuantParams &params, const T *input1_data,
157 const T *input2_data, T *output_data,
158 T (*binary_func)(T, T, const core::ArithmeticQuantParams &))
159{
160 for (int i = 0; i < size; ++i)
161 {
162 output_data[i] = binary_func(input1_data[i], input2_data[0], params);
163 }
164}
165
166template <typename T>
168 size_t *input1_offset_p, size_t *input2_offset_p,
169 size_t *output_offset, size_t *compressed_input1_stride,
170 size_t *compressed_input2_stride, size_t *compressed_output_shape,
171 const T *input1_data, const T *input2_data, T *output_data,
172 T (*binary_func)(T, T, const core::ArithmeticQuantParams &))
173{
174 if (dimension > 0)
175 {
176 for (size_t c = 0; c < compressed_output_shape[dimension]; ++c)
177 {
178 size_t input1_offset_c = *input1_offset_p;
179 size_t input2_offset_c = *input2_offset_p;
180 BroadcastRecursiveDimensions(params, dimension - 1, &input1_offset_c, &input2_offset_c,
181 output_offset, compressed_input1_stride,
182 compressed_input2_stride, compressed_output_shape, input1_data,
183 input2_data, output_data, binary_func);
184 *input1_offset_p += compressed_input1_stride[dimension];
185 *input2_offset_p += compressed_input2_stride[dimension];
186 }
187 }
188 else
189 {
190 assert(dimension == 0);
191 bool input1_is_broadcast = compressed_input1_stride[dimension] == 0;
192 bool input2_is_broadcast = compressed_input2_stride[dimension] == 0;
193 assert(!(input1_is_broadcast && input2_is_broadcast));
194 const T *input1_data_ptr = input1_data + *input1_offset_p;
195 const T *input2_data_ptr = input2_data + *input2_offset_p;
196 T *output_data_ptr = output_data + *output_offset;
197 if (input1_is_broadcast)
198 {
199 // input1 is broadcast.
200 BroadcastInput1<T>(compressed_output_shape[dimension], params, input1_data_ptr,
201 input2_data_ptr, output_data_ptr, binary_func);
202 *input2_offset_p += compressed_output_shape[dimension];
203 }
204 else if (input2_is_broadcast)
205 {
206 // input2 is broadcast.
207 BroadcastInput2<T>(compressed_output_shape[dimension], params, input1_data_ptr,
208 input2_data_ptr, output_data_ptr, binary_func);
209 *input1_offset_p += compressed_output_shape[dimension];
210 }
211 else
212 {
213 // Add element-wise.
214 ElementWise<T>(compressed_output_shape[dimension], params, input1_data_ptr, input2_data_ptr,
215 output_data_ptr, binary_func);
216 *input1_offset_p += compressed_output_shape[dimension];
217 *input2_offset_p += compressed_output_shape[dimension];
218 }
219 *output_offset += compressed_output_shape[dimension];
220 }
221}
222
223template <typename T>
225 const core::OMRuntimeShape &input1_shape, const T *input1_data,
226 const core::OMRuntimeShape &input2_shape, const T *input2_data,
227 const core::OMRuntimeShape &output_shape, T *output_data,
228 T (*binary_func)(T, T, const core::ArithmeticQuantParams &))
229{
230 constexpr int kMaxBroadcastDim = 6;
231
232 // In Tensorflow, the dimensions are canonically named (batch_number, row,
233 // col, channel), with extents (batches, height, width, depth), with the
234 // trailing dimension changing most rapidly (channels has the smallest stride,
235 // typically 1 element).
236 //
237 // In generated C code, we store arrays with the dimensions reversed. The
238 // first dimension has smallest stride.
239 //
240 // We name our variables by their Tensorflow convention, but generate C code
241 // nesting loops such that the innermost loop has the smallest stride for the
242 // best cache behavior.
243 size_t compressed_input1_stride[kMaxBroadcastDim];
244 size_t compressed_input2_stride[kMaxBroadcastDim];
245 size_t compressed_output_shape[kMaxBroadcastDim];
246 bool broadcastable_shape = ReduceDimensionsForBroadcast<kMaxBroadcastDim>(
247 input1_shape, input2_shape, compressed_input1_stride, compressed_input2_stride,
248 compressed_output_shape);
249 // Skip broadcasting for degenerate shapes.
250 if (!broadcastable_shape)
251 {
252 return;
253 }
254
255 size_t input1_offset = 0;
256 size_t input2_offset = 0;
257 size_t output_offset = 0;
258 BroadcastRecursiveDimensions(params, kMaxBroadcastDim - 1, &input1_offset, &input2_offset,
259 &output_offset, compressed_input1_stride, compressed_input2_stride,
260 compressed_output_shape, input1_data, input2_data, output_data,
261 binary_func);
262}
263
264} // namespace pal
265} // namespace execute
266} // namespace onert_micro
267
268#endif // ONERT_MICRO_EXECUTE_PAL_ARITHMETIC_OP_COMMON_H
static OMRuntimeShape extendedShape(int new_shape_size, const OMRuntimeShape &shape)
NdArrayDesc< 4 > desc1
const luci_interpreter::RuntimeShape output_shape
NdArrayDesc< 4 > desc2
void BroadcastInput2(int size, const core::ArithmeticQuantParams &params, const T *input1_data, const T *input2_data, T *output_data, T(*binary_func)(T, T, const core::ArithmeticQuantParams &))
OMStatus ArithmeticOp(const core::BinaryArithmeticBroadcastParams &params, const int flat_size, const T *input1_data, const T *input2_data, T *output_data)
void getActivationParams(const P &params, int32_t *min, int32_t *max)
Definition PALUtils.h:120
void BroadcastInput1(int size, const core::ArithmeticQuantParams &params, const T *input1_data, const T *input2_data, T *output_data, T(*binary_func)(T, T, const core::ArithmeticQuantParams &))
void NdArrayDescsForElementwiseBroadcast(const core::OMRuntimeShape &input0_shape, const core::OMRuntimeShape &input1_shape, NdArrayDesc< N > *desc0_out, NdArrayDesc< N > *desc1_out)
OMStatus BroadcastArithmeticOp4DSlow(const core::BinaryArithmeticBroadcastParams &params, const core::OMRuntimeShape &input1_shape, const T *input1_data, const core::OMRuntimeShape &input2_shape, const T *input2_data, const core::OMRuntimeShape &output_shape, T *output_data)
void ArithmeticOpScalar(const core::BinaryArithmeticBroadcastParams &params, const int flat_size, const T *input_data, const T scalar_value, T *output_data)
int subscriptToIndex(const NdArrayDesc< 4 > &desc, int i0, int i1, int i2, int i3)
void BroadcastBinaryFunction6DSlow(const core::ArithmeticQuantParams &params, const core::OMRuntimeShape &input1_shape, const T *input1_data, const core::OMRuntimeShape &input2_shape, const T *input2_data, const core::OMRuntimeShape &output_shape, T *output_data, T(*binary_func)(T, T, const core::ArithmeticQuantParams &))
void ElementWise(const uint32_t size, const core::ArithmeticQuantParams &params, const T *input1_data, const T *input2_data, T *output_data, T(*binary_func)(T, T, const core::ArithmeticQuantParams &))
void BroadcastRecursiveDimensions(const core::ArithmeticQuantParams &params, int dimension, size_t *input1_offset_p, size_t *input2_offset_p, size_t *output_offset, size_t *compressed_input1_stride, size_t *compressed_input2_stride, size_t *compressed_output_shape, const T *input1_data, const T *input2_data, T *output_data, T(*binary_func)(T, T, const core::ArithmeticQuantParams &))
int32_t size[5]
Definition Slice.cpp:35