ONE - On-device Neural Engine
Loading...
Searching...
No Matches
Add.cpp
Go to the documentation of this file.
1/*
2 * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
3 * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18#include "kernels/Add.h"
19
20#include "kernels/BinaryOpCommon.h"
21#include "kernels/Utils.h"
22
23#include <tensorflow/lite/kernels/internal/reference/add.h>
24#include <tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h>
25
26#include <stdexcept>
27
28namespace luci_interpreter
29{
30namespace kernels
31{
32
33Add::Add(const Tensor *input1, const Tensor *input2, Tensor *output, const AddParams &params)
34 : KernelWithParams<AddParams>({input1, input2}, {output}, params)
35{
36}
37
39{
40 LUCI_INTERPRETER_CHECK(input1()->element_type() == input2()->element_type());
41 LUCI_INTERPRETER_CHECK(input1()->element_type() == output()->element_type());
42 if (input1()->element_type() == DataType::S16)
43 {
44 LUCI_INTERPRETER_CHECK(input1()->zero_points().size() == 1 &&
45 input2()->zero_points().size() == 1);
46 LUCI_INTERPRETER_CHECK(input1()->zero_point() == 0 && input2()->zero_point() == 0 &&
47 output()->zero_point() == 0);
48 }
49
50 output()->resize(calculateShapeForBroadcast(input1()->shape(), input2()->shape()));
51}
52
53void Add::execute() const
54{
55 switch (input1()->element_type())
56 {
57 case DataType::FLOAT32:
58 evalFloat();
59 break;
60 case DataType::S64:
61 evalInteger<int64_t>();
62 break;
63 case DataType::S32:
64 evalInteger<int32_t>();
65 break;
66 case DataType::U8:
67 evalQuantized();
68 break;
69 case DataType::S16:
70 evalQuantizedS16();
71 break;
72 default:
73 throw std::runtime_error("luci-intp Add Unsupported type.");
74 }
75}
76
77void Add::evalFloat() const
78{
79 tflite::ArithmeticParams params{};
80 fillArithmeticActivationRange<float>(params, _params.activation);
81
82 const bool need_broadcast = tflite::reference_ops::ProcessBroadcastShapes(
84
85 if (need_broadcast)
86 {
87 tflite::reference_ops::BroadcastAdd4DSlow(
88 params, getTensorShape(input1()), getTensorData<float>(input1()), getTensorShape(input2()),
89 getTensorData<float>(input2()), getTensorShape(output()), getTensorData<float>(output()));
90 }
91 else
92 {
93 tflite::reference_ops::Add(params, getTensorShape(input1()), getTensorData<float>(input1()),
94 getTensorShape(input2()), getTensorData<float>(input2()),
95 getTensorShape(output()), getTensorData<float>(output()));
96 }
97}
98
99template <typename T> void Add::evalInteger() const
100{
101 tflite::ArithmeticParams params{};
102 fillArithmeticActivationRange<T>(params, _params.activation);
103
104 const bool need_broadcast = tflite::reference_ops::ProcessBroadcastShapes(
106
107 if (need_broadcast)
108 {
109 tflite::reference_ops::BroadcastAdd4DSlow(
110 params, getTensorShape(input1()), getTensorData<T>(input1()), getTensorShape(input2()),
111 getTensorData<T>(input2()), getTensorShape(output()), getTensorData<T>(output()));
112 }
113 else
114 {
115 tflite::reference_ops::Add(params, getTensorShape(input1()), getTensorData<T>(input1()),
116 getTensorShape(input2()), getTensorData<T>(input2()),
117 getTensorShape(output()), getTensorData<T>(output()));
118 }
119}
120
121void Add::evalQuantized() const
122{
123 const auto input1_scale = static_cast<double>(input1()->scale());
124 const auto input2_scale = static_cast<double>(input2()->scale());
125 const auto output_scale = static_cast<double>(output()->scale());
126
127 const int left_shift = 20;
128 const double twice_max_input_scale = 2 * std::max(input1_scale, input2_scale);
129 const double real_input1_multiplier = input1_scale / twice_max_input_scale;
130 const double real_input2_multiplier = input2_scale / twice_max_input_scale;
131 const double real_output_multiplier = twice_max_input_scale / ((1 << left_shift) * output_scale);
132
133 int32_t input1_multiplier{}, input2_multiplier{}, output_multiplier{};
134 int input1_shift{}, input2_shift{}, output_shift{};
135 quantizeMultiplierSmallerThanOneExp(real_input1_multiplier, &input1_multiplier, &input1_shift);
136 quantizeMultiplierSmallerThanOneExp(real_input2_multiplier, &input2_multiplier, &input2_shift);
137 quantizeMultiplierSmallerThanOneExp(real_output_multiplier, &output_multiplier, &output_shift);
138
139 int32_t activation_min{};
140 int32_t activation_max{};
141 calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
142
143 tflite::ArithmeticParams params{};
144 params.left_shift = left_shift;
145 // The kernel expects inputs' zero points to be negated.
146 params.input1_offset = -input1()->zero_point(); // Note the '-'.
147 params.input1_multiplier = input1_multiplier;
148 params.input1_shift = input1_shift;
149 params.input2_offset = -input2()->zero_point(); // Note the '-'.
150 params.input2_multiplier = input2_multiplier;
151 params.input2_shift = input2_shift;
152 params.output_offset = output()->zero_point();
153 params.output_multiplier = output_multiplier;
154 params.output_shift = output_shift;
155 params.quantized_activation_min = activation_min;
156 params.quantized_activation_max = activation_max;
157
158 const bool need_broadcast = tflite::reference_ops::ProcessBroadcastShapes(
160
161 if (need_broadcast)
162 {
163 tflite::reference_ops::BroadcastAdd4DSlow(
164 params, getTensorShape(input1()), getTensorData<uint8_t>(input1()), getTensorShape(input2()),
165 getTensorData<uint8_t>(input2()), getTensorShape(output()), getTensorData<uint8_t>(output()));
166 }
167 else
168 {
169 tflite::reference_ops::Add(params, getTensorShape(input1()), getTensorData<uint8_t>(input1()),
170 getTensorShape(input2()), getTensorData<uint8_t>(input2()),
171 getTensorShape(output()), getTensorData<uint8_t>(output()));
172 }
173}
174
175void Add::evalQuantizedS16() const
176{
177 const auto input1_scale = static_cast<double>(input1()->scale());
178 const auto input2_scale = static_cast<double>(input2()->scale());
179 const auto output_scale = static_cast<double>(output()->scale());
180
181 constexpr int left_shift = 12;
182 const double twice_max_input_scale = 2 * std::max(input1_scale, input2_scale);
183 const double real_input1_multiplier = input1_scale / twice_max_input_scale;
184 const double real_input2_multiplier = input2_scale / twice_max_input_scale;
185 const double real_output_multiplier = twice_max_input_scale / ((1 << left_shift) * output_scale);
186
187 int32_t input1_multiplier{}, input2_multiplier{}, output_multiplier{};
188 int input1_shift{}, input2_shift{}, output_shift{};
189 quantizeMultiplierSmallerThanOneExp(real_input1_multiplier, &input1_multiplier, &input1_shift);
190 quantizeMultiplierSmallerThanOneExp(real_input2_multiplier, &input2_multiplier, &input2_shift);
191 quantizeMultiplierSmallerThanOneExp(real_output_multiplier, &output_multiplier, &output_shift);
192
193 int32_t activation_min{};
194 int32_t activation_max{};
195 calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
196
197 auto fn = [input1_multiplier, input1_shift, //
198 input2_multiplier, input2_shift, //
199 output_multiplier, output_shift, //
200 activation_min, activation_max](int16_t input1_val, int16_t input2_val) {
201 const int32_t shifted_input1_val = static_cast<int32_t>(input1_val) << left_shift;
202 const int32_t shifted_input2_val = static_cast<int32_t>(input2_val) << left_shift;
203 const int32_t scaled_input1_val = tflite::MultiplyByQuantizedMultiplierSmallerThanOneExp(
204 shifted_input1_val, input1_multiplier, input1_shift);
205 const int32_t scaled_input2_val = tflite::MultiplyByQuantizedMultiplierSmallerThanOneExp(
206 shifted_input2_val, input2_multiplier, input2_shift);
207 const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
208 const int32_t raw_output = tflite::MultiplyByQuantizedMultiplierSmallerThanOneExp(
209 raw_sum, output_multiplier, output_shift);
210 const int32_t clamped_output = std::min(activation_max, std::max(activation_min, raw_output));
211 return static_cast<int16_t>(clamped_output);
212 };
213
214 BinaryOpBroadcastSlow(getTensorShape(input1()), getTensorData<int16_t>(input1()),
215 getTensorShape(input2()), getTensorData<int16_t>(input2()),
216 getTensorShape(output()), getTensorData<int16_t>(output()), fn);
217}
218
219} // namespace kernels
220} // namespace luci_interpreter
void resize(const Shape &new_shape)
Definition Tensor.cpp:56
float scale() const
Definition Tensor.h:109
int32_t zero_point() const
Definition Tensor.h:115
const Tensor * input2() const
Definition Add.h:34
Add(const Tensor *input1, const Tensor *input2, Tensor *output, const AddParams &params)
Definition Add.cpp:33
void configure() override
Definition Add.cpp:38
Tensor * output() const
Definition Add.h:35
const Tensor * input1() const
Definition Add.h:33
void execute() const override
Definition Add.cpp:53
#define LUCI_INTERPRETER_CHECK(cond)
Definition Utils.h:36
Shape calculateShapeForBroadcast(const Shape &input1_shape, const Shape &input2_shape)
Definition Utils.cpp:204
tflite::RuntimeShape getTensorShape(const Tensor *tensor)
Definition Utils.h:194
void quantizeMultiplierSmallerThanOneExp(double double_multiplier, int32_t *quantized_multiplier, int *left_shift)
Definition Utils.cpp:193
void calculateActivationRangeQuantized(Activation activation, const Tensor *output, int32_t *activation_min, int32_t *activation_max)
Definition Utils.cpp:119
void BinaryOpBroadcastSlow(const tflite::RuntimeShape &unextended_input1_shape, const T *input1_data, const tflite::RuntimeShape &unextended_input2_shape, const T *input2_data, const tflite::RuntimeShape &unextended_output_shape, T *output_data, Op op)
int32_t size[5]
Definition Slice.cpp:35