ONE - On-device Neural Engine
Loading...
Searching...
No Matches
PALArithmeticOpCommon.h
Go to the documentation of this file.
1/*
2 * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
3 * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18#ifndef LUCI_INTERPRETER_PAL_ARITHMETICOPCOMMON_H
19#define LUCI_INTERPRETER_PAL_ARITHMETICOPCOMMON_H
20
21#include "Params.h"
22#include "PALUtils.h"
24
26{
27
// Binary functor computing element-wise addition; passed as the Fn
// template argument of the arithmetic kernels below.
template <typename T> struct AddFn
{
  // const-qualified so the functor can also be invoked through a const object.
  T operator()(T lhs, T rhs) const { return lhs + rhs; }
};
// Binary functor computing element-wise subtraction (lhs - rhs); passed as
// the Fn template argument of the arithmetic kernels below.
template <typename T> struct SubFn
{
  // const-qualified so the functor can also be invoked through a const object.
  T operator()(T lhs, T rhs) const { return lhs - rhs; }
};
// Binary functor computing element-wise multiplication; passed as the Fn
// template argument of the arithmetic kernels below.
template <typename T> struct MulFn
{
  // const-qualified so the functor can also be invoked through a const object.
  T operator()(T lhs, T rhs) const { return lhs * rhs; }
};
// Binary functor computing element-wise division (lhs / rhs); passed as the
// Fn template argument of the arithmetic kernels below.
// NOTE: there is no division-by-zero guard here — for integral T a zero rhs
// is undefined behavior; callers are responsible for validating inputs.
template <typename T> struct DivFn
{
  // const-qualified so the functor can also be invoked through a const object.
  T operator()(T lhs, T rhs) const { return lhs / rhs; }
};
44
45// TODO: check if there real activation value
46template <typename T, typename Fn>
47inline void ArithmeticOp(const ArithmeticParams &params, const int flat_size, const T *input1_data,
48 const T *input2_data, T *output_data)
49{
50 T activation_min, activation_max;
51 getActivationParams(params, &activation_min, &activation_max);
52
53 Fn func;
54 for (int i = 0; i < flat_size; ++i)
55 output_data[i] =
56 std::min(std::max(func(input1_data[i], input2_data[i]), activation_min), activation_max);
57}
58
59template <typename T, typename Fn>
60inline void ArithmeticOpScalar(const ArithmeticParams &params, const int flat_size,
61 const T *input_data, const T scalar_value, T *output_data)
62{
63 T activation_min, activation_max;
64 getActivationParams(params, &activation_min, &activation_max);
65
66 for (int i = 0; i < flat_size; ++i)
67 output_data[i] =
68 std::min(std::max(func(input_data[i], scalar_value), activation_min), activation_max);
69}
70
71template <typename T, typename Fn>
73 const ArithmeticParams &params, const luci_interpreter::RuntimeShape &input1_shape,
74 const T *input1_data, const luci_interpreter::RuntimeShape &input2_shape, const T *input2_data,
75 const luci_interpreter::RuntimeShape &output_shape, T *output_data)
76{
79 NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2);
80 const luci_interpreter::RuntimeShape extended_output_shape =
82
83 T activation_min, activation_max;
84 getActivationParams(params, &activation_min, &activation_max);
85
86 // In Tensorflow, the dimensions are canonically named (batch_number, row,
87 // col, channel), with extents (batches, height, width, depth), with the
88 // trailing dimension changing most rapidly (channels has the smallest stride,
89 // typically 1 element).
90 //
91 // In generated C code, we store arrays with the dimensions reversed. The
92 // first dimension has smallest stride.
93 //
94 // We name our variables by their Tensorflow convention, but generate C code
95 // nesting loops such that the innermost loop has the smallest stride for the
96 // best cache behavior.
97 Fn func;
98 for (int b = 0; b < extended_output_shape.dims(0); ++b)
99 {
100 for (int y = 0; y < extended_output_shape.dims(1); ++y)
101 {
102 for (int x = 0; x < extended_output_shape.dims(2); ++x)
103 {
104 for (int c = 0; c < extended_output_shape.dims(3); ++c)
105 {
106 const int output_data_offset =
107 ((b * extended_output_shape.dims(1) + y) * extended_output_shape.dims(2) + x) *
108 extended_output_shape.dims(3) +
109 c;
110
111 output_data[output_data_offset] =
112 std::min(std::max(func(input1_data[subscriptToIndex(desc1, b, y, x, c)],
113 input2_data[subscriptToIndex(desc2, b, y, x, c)]),
114 activation_min),
115 activation_max);
116 }
117 }
118 }
119 }
120}
121
122} // namespace luci_interpreter_pal
123
124#endif // LUCI_INTERPRETER_PAL_ARITHMETICOPCOMMON_H
int32_t dims(int i) const
Definition Tensor.h:108
static RuntimeShape extendedShape(int new_shape_size, const RuntimeShape &shape)
Definition Tensor.h:95
NdArrayDesc< 4 > desc1
const luci_interpreter::RuntimeShape output_shape
NdArrayDesc< 4 > desc2
void ArithmeticOp(const ArithmeticParams &params, const int flat_size, const T *input1_data, const T *input2_data, T *output_data)
int subscriptToIndex(const NdArrayDesc< 4 > &desc, int i0, int i1, int i2, int i3)
void getActivationParams(const P &params, int32_t *min, int32_t *max)
Definition PALUtils.h:93
void NdArrayDescsForElementwiseBroadcast(const luci_interpreter::RuntimeShape &input0_shape, const luci_interpreter::RuntimeShape &input1_shape, NdArrayDesc< N > *desc0_out, NdArrayDesc< N > *desc1_out)
void ArithmeticOpScalar(const ArithmeticParams &params, const int flat_size, const T *input_data, const T scalar_value, T *output_data)
void BroadcastArithmeticOp4DSlow(const ArithmeticParams &params, const luci_interpreter::RuntimeShape &input1_shape, const T *input1_data, const luci_interpreter::RuntimeShape &input2_shape, const T *input2_data, const luci_interpreter::RuntimeShape &output_shape, T *output_data)