ONE - On-device Neural Engine
Loading...
Searching...
No Matches
UnidirectionalSequenceLSTM.cpp
Go to the documentation of this file.
1/*
2 * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved
3 * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18#include "Builders.h"
19#include "kernels/Utils.h"
20
21#include "PALUnidirectionalSequenceLSTM.h"
22
23namespace luci_interpreter
24{
25namespace
26{
27
28#ifndef DIS_QUANT
29
// Returns true iff `x` is (to within 1e-3 of its exponent) an exact power
// of two; always stores round(log2(x)) into *log2_result.
//
// log2 is computed as ln(x) * (1 / ln(2)) instead of std::log2, mirroring
// the upstream TensorFlow workaround for toolchains where std::log2 (and
// TfLiteRound) were unavailable as of May 2018.
bool checkedLog2(const float x, int *log2_result)
{
  const float exponent = std::log(x) * (1.0f / std::log(2.0f));
  const float nearest_int = std::round(exponent);

  *log2_result = static_cast<int>(nearest_int);
  return std::abs(exponent - nearest_int) < 1e-3f;
}
42
// Create parameters for element wise multiplication that happens in a) cell
// state update ; b) hidden state update
// Note that all the output of gates are symmetrically quantized so only scales
// are required for input. However, during the hidden state update phase, the
// output is the updated hidden state, which is asymmetrically quantized. Thus
// output may require zero point
//
// NOTE(review): the extraction of this file dropped original lines 49 and 53:
// the function's return type and the declaration of `op_params` (presumably
// an ArithmeticParams-like struct -- confirm against version control) are
// missing below. Recover them before editing this function.
createInterGateParams(const float input1_scale, const float input2_scale, const float output_scale,
                      const DataType output_type, const int output_zp)
{
  // Saturation bounds of the element-wise product follow the output type.
  if (output_type == DataType::S16)
  {
    op_params.quantized_activation_min = std::numeric_limits<int16_t>::min();
    op_params.quantized_activation_max = std::numeric_limits<int16_t>::max();
  }
  else if (output_type == DataType::S8)
  {
    op_params.quantized_activation_min = std::numeric_limits<int8_t>::min();
    op_params.quantized_activation_max = std::numeric_limits<int8_t>::max();
  }

  // Gate outputs are symmetrically quantized, so both input offsets are 0;
  // only the output (hidden state) may carry a non-zero zero point.
  op_params.input1_offset = 0; // symmetric
  op_params.input2_offset = 0; // symmetric
  op_params.output_offset = output_zp;

  // effective_scale = (s1 * s2) / s_out, decomposed by quantizeMultiplier
  // into a fixed-point multiplier and a shift.
  const double input_product_scale =
    static_cast<double>(input1_scale) * static_cast<double>(input2_scale);
  double effective_scale = input_product_scale / static_cast<double>(output_scale);
  // The value read from op_params.output_shift here appears irrelevant --
  // quantizeMultiplier writes the shift output -- TODO confirm it never
  // leaves `output_shift` untouched.
  auto output_shift = static_cast<int>(op_params.output_shift);
  kernels::quantizeMultiplier(effective_scale, &op_params.output_multiplier, &output_shift);
  op_params.output_shift = output_shift;
  return op_params;
}
77
// Computes the integer fully-connected parameters for one LSTM gate:
// one set for the input FC (input x input_weight) and one for the
// recurrent FC (hidden_state x hidden_state_weight). Both FC outputs feed
// the gate nonlinearity, whose input scale is
// `nonlinear_activation_input_scale` (Q3.12 for the S16 path).
// NOTE(review): `input_bias` is not referenced in the visible code --
// possibly used only on lines dropped by extraction; verify.
void createGateParams(const circle::Tensor *input, const circle::Tensor *input_weight,
                      const circle::Tensor *input_bias, const circle::Tensor *hidden_state,
                      const circle::Tensor *hidden_state_weight,
                      const float nonlinear_activation_input_scale, const DataType cell_type,
                      lstm::GateParameters *gate_params)
{
  // Input CalculateOpDataFullyConnected
  {
    // NOTE(review): extraction dropped original lines 86, 92 and 95 here:
    // the declaration of `input_gate_params` and the heads of the
    // getQuantizedConvolutionMultipler(...) and activation-range calls
    // (compare with the intact recurrent branch below). Recover from
    // version control before editing.
    double real_multiplier = 0.0;
    int output_shift;
    int32_t output_activation_min;
    int32_t output_activation_max;
    int32_t output_multiplier;
      Tensor::scale(input), Tensor::scale(input_weight), nonlinear_activation_input_scale);
    kernels::quantizeMultiplier(real_multiplier, &output_multiplier, &output_shift);
      nonlinear_activation_input_scale, cell_type,
      &output_activation_min, &output_activation_max);

    // Integer FC convention: input/weight offsets are negated zero points;
    // the FC output feeding the nonlinearity is symmetric (offset 0).
    input_gate_params.output_shift = output_shift;
    input_gate_params.output_multiplier = output_multiplier;
    input_gate_params.quantized_activation_max = output_activation_max;
    input_gate_params.quantized_activation_min = output_activation_min;
    input_gate_params.input_offset = -Tensor::zero_point(input);
    input_gate_params.weights_offset = -Tensor::zero_point(input_weight);
    input_gate_params.output_offset = 0;

    gate_params->input_fc_params = input_gate_params;
  }

  // Recurrent CalculateOpDataFullyConnected
  {
    luci_interpreter_pal::FullyConnectedParams recurrent_gate_params;
    double real_multiplier = 0.0;
    int output_shift;
    int32_t output_activation_min;
    int32_t output_activation_max;
    int32_t output_multiplier;
    // effective multiplier = state_scale * weight_scale / output_scale
    real_multiplier = kernels::getQuantizedConvolutionMultipler(Tensor::scale(hidden_state),
                                                                Tensor::scale(hidden_state_weight),
                                                                nonlinear_activation_input_scale);
    kernels::quantizeMultiplier(real_multiplier, &output_multiplier, &output_shift);
    // NOTE(review): original line 122 (head of the activation-range call)
    // was dropped by extraction; only its trailing arguments remain.
      nonlinear_activation_input_scale, cell_type,
      &output_activation_min, &output_activation_max);

    recurrent_gate_params.output_shift = output_shift;
    recurrent_gate_params.output_multiplier = output_multiplier;
    recurrent_gate_params.quantized_activation_max = output_activation_max;
    recurrent_gate_params.quantized_activation_min = output_activation_min;
    recurrent_gate_params.input_offset = -Tensor::zero_point(hidden_state);
    recurrent_gate_params.weights_offset = -Tensor::zero_point(hidden_state_weight);
    recurrent_gate_params.output_offset = 0;

    gate_params->recurrent_fc_params = recurrent_gate_params;
  }
}
137
138void prepareGateParamsInteger(lstm::LSTMStruct *lstm_struct,
139 lstm::LSTMParameters *quant_lstm_params)
140{
141 float nonlinear_input_scale = 0.00024414062; // 2^-12 Q3.12 -> Q0.15
142
143 createGateParams(lstm_struct->input(), lstm_struct->input_to_forget_weights(),
144 lstm_struct->forget_gate_bias(), lstm_struct->output_state(),
145 lstm_struct->recurrent_to_forget_weights(), nonlinear_input_scale, DataType::S16,
146 &quant_lstm_params->forget_gate_parameters);
147
148 createGateParams(lstm_struct->input(), lstm_struct->input_to_input_weights(),
149 lstm_struct->input_gate_bias(), lstm_struct->output_state(),
150 lstm_struct->recurrent_to_input_weights(), nonlinear_input_scale, DataType::S16,
151 &quant_lstm_params->input_gate_parameters);
152
153 // lstm::GateParameters cell_gate_parameters;
154 createGateParams(lstm_struct->input(), lstm_struct->input_to_cell_weights(),
155 lstm_struct->cell_gate_bias(), lstm_struct->output_state(),
156 lstm_struct->recurrent_to_cell_weights(), nonlinear_input_scale, DataType::S16,
157 &quant_lstm_params->cell_gate_parameters);
158
159 // lstm::GateParameters output_gate_parameters;
160 createGateParams(lstm_struct->input(), lstm_struct->input_to_output_weights(),
161 lstm_struct->output_gate_bias(), lstm_struct->output_state(),
162 lstm_struct->recurrent_to_output_weights(), nonlinear_input_scale, DataType::S16,
163 &quant_lstm_params->output_gate_parameters);
164
165 // Inter gate multiplication parameters
166 float nonlinear_output_scale = 0.00003051757; // 2^-15 Q3.12 -> Q0.15
167 float cell_state_scale =
168 Tensor::scale(lstm_struct->cell_state()); // lstm_tensors.CellStateTensor()->params.scale;
169 // forget gate output (nonlinear output) x cell state -> cell state
170 quant_lstm_params->inter_gate_parameters.forget_cell_mul_params = createInterGateParams(
171 nonlinear_output_scale, cell_state_scale, cell_state_scale, DataType::S16, 0);
172
173 // input gate output x cell gate output -> cell state
174 quant_lstm_params->inter_gate_parameters.input_mul_params = createInterGateParams(
175 nonlinear_output_scale, nonlinear_output_scale, cell_state_scale, DataType::S16, 0);
176
177 // tanh output x output gate output -> hidden state (potentially asymmetric)
178 quant_lstm_params->inter_gate_parameters.output_mul_params = createInterGateParams(
179 nonlinear_output_scale, nonlinear_output_scale, Tensor::scale(lstm_struct->output_state()),
180 Tensor::element_type(lstm_struct->output_state()),
181 Tensor::zero_point(lstm_struct->output_state()));
182}
183
184// Create the additional information about the cell state, which include:
185// cell_state_scale_power: used in integer nonlinear function (e.g., tanh)
186// quantized_cell_clip: quantized cell clip range
187lstm::CellStateInfo createLstmCellStateInfo(const float cell_state_scale, const float cell_clip)
188{
189 lstm::CellStateInfo cell_state_info;
190 // cell_state_scale_power: 2^-cell_state_scale_power = cell state scale
191 int buffer;
192 checkedLog2(cell_state_scale, &buffer);
193 cell_state_info.cell_state_scale_power = buffer;
194 // Cell state specifics
195 cell_state_info.cell_clip = cell_clip;
196 cell_state_info.quantized_cell_clip = static_cast<int16_t>(std::min(
197 std::max(static_cast<double>(cell_clip) / static_cast<double>(cell_state_scale), -32768.0),
198 32767.0));
199 return cell_state_info;
200}
201
// Integer (int8 activations, int16 cell state) evaluation of
// UnidirectionalSequenceLSTM: precomputes quantized parameters, allocates
// scratch and zero-filled state working buffers, then runs the PAL kernel.
// The unnamed bool (is_inplace at the call site) is accepted but unused.
void evalInt8(const circle::Operator *cur_op, BaseRuntimeGraph *runtime_graph, bool)
{
  lstm::LSTMStruct lstm_struct(cur_op, runtime_graph);

  lstm::LSTMParameters quant_lstm_params;
  prepareGateParamsInteger(&lstm_struct, &quant_lstm_params);

  lstm::CellStateInfo cell_state_info = createLstmCellStateInfo(
    luci_interpreter::Tensor::scale(lstm_struct.cell_state()), lstm_struct.options->cell_clip());

  // Which input dimension holds the batch depends on the time_major layout.
  const bool time_major = lstm_struct.options->time_major();
  const auto batch_size =
    time_major ? Tensor::dim(lstm_struct.input(), 1) : Tensor::dim(lstm_struct.input(), 0);
  const auto state_dimension = Tensor::dim(lstm_struct.output_state(), 1);
  const auto cell_state_type_size = getDataTypeSize(Tensor::element_type(lstm_struct.cell_state()));

  // Four scratch buffers, each batch x state elements, sized in bytes of
  // the cell-state element type (int16 here).
  auto scratch_0_data =
    std::make_unique<uint8_t[]>(batch_size * state_dimension * cell_state_type_size);
  auto scratch_1_data =
    std::make_unique<uint8_t[]>(batch_size * state_dimension * cell_state_type_size);
  auto scratch_2_data =
    std::make_unique<uint8_t[]>(batch_size * state_dimension * cell_state_type_size);
  auto scratch_3_data =
    std::make_unique<uint8_t[]>(batch_size * state_dimension * cell_state_type_size);

  // Create and fill with 0 output state tensor
  auto output_state_data =
    std::make_unique<int8_t[]>(Tensor::num_elements(lstm_struct.output_state()));
  std::fill_n(output_state_data.get(), Tensor::num_elements(lstm_struct.output_state()), 0);

  // Create and fill with 0 cell state tensor
  auto cell_state_data =
    std::make_unique<int16_t[]>(Tensor::num_elements(lstm_struct.cell_state()));
  std::fill_n(cell_state_data.get(), Tensor::num_elements(lstm_struct.cell_state()), 0);

  // NOTE(review): original line 237 -- the head of the
  // luci_interpreter_pal::evalLSTM<int8_t, int8_t, int16_t, int32_t>(...)
  // call (see its declaration in PALUnidirectionalSequenceLSTM.h) -- was
  // dropped by extraction; only its argument list remains below.
    &lstm_struct, &quant_lstm_params, &cell_state_info, output_state_data.get(),
    cell_state_data.get(), kernels::getTensorData<int16_t>(scratch_0_data.get()),
    kernels::getTensorData<int16_t>(scratch_1_data.get()),
    kernels::getTensorData<int16_t>(scratch_2_data.get()),
    kernels::getTensorData<int16_t>(scratch_3_data.get()), runtime_graph);
}
244
245#endif // DIS_QUANT
246
247#ifndef DIS_FLOAT
// NOTE(review): extraction dropped most of this function (original lines
// 248 and 250-254): the signature of createFcParamsFloat(), the declaration
// of `op_params`, and the head of the activation-range call are missing --
// only fragments of the body remain below. Recover the full text from
// version control before editing.
{
    &op_params.float_activation_max);
  return op_params;
}
257
258lstm::GateParameters createGateParamsFloat()
259{
260 lstm::GateParameters gate_params;
261
262 gate_params.input_fc_params = createFcParamsFloat();
263 gate_params.recurrent_fc_params = createFcParamsFloat();
264
265 return gate_params;
266}
267
268lstm::CellStateInfo createLstmCellStateInfoFloat(const float cell_clip)
269{
270 lstm::CellStateInfo cell_state_info;
271 cell_state_info.cell_clip = cell_clip;
272 cell_state_info.cell_state_scale_power = 0; // no quantization
273 cell_state_info.quantized_cell_clip = 0; // no quantization
274 return cell_state_info;
275}
276
// Fills all float-kernel parameters: identical default FC parameters for
// the four gates, and one shared set of element-wise multiplication
// parameters reused by all three inter-gate multiplications.
void prepareGateParamsFloat(lstm::LSTMParameters *float_lstm_params)
{
  // Gate Parameters
  float_lstm_params->forget_gate_parameters = createGateParamsFloat();
  float_lstm_params->input_gate_parameters = createGateParamsFloat();
  float_lstm_params->cell_gate_parameters = createGateParamsFloat();
  float_lstm_params->output_gate_parameters = createGateParamsFloat();

  // Inter gate multiplication parameters
  // NOTE(review): extraction dropped original lines 285-290 here -- the
  // declaration of `op_params` and the head of the activation-range call
  // whose trailing argument remains below. Recover from version control.
    &op_params.float_activation_max);
  float_lstm_params->inter_gate_parameters.forget_cell_mul_params = op_params;
  float_lstm_params->inter_gate_parameters.input_mul_params = op_params;
  float_lstm_params->inter_gate_parameters.output_mul_params = op_params;
}
295
296void evalFloat(const circle::Operator *cur_op, BaseRuntimeGraph *runtime_graph, bool)
297{
298 lstm::LSTMStruct lstm_struct(cur_op, runtime_graph);
299
300 lstm::CellStateInfo cell_state_info =
301 createLstmCellStateInfoFloat(lstm_struct.options->cell_clip());
302
303 lstm::LSTMParameters lstm_params;
304 prepareGateParamsFloat(&lstm_params);
305
306 const bool time_major = lstm_struct.options->time_major();
307 const auto batch_size =
308 time_major ? Tensor::dim(lstm_struct.input(), 1) : Tensor::dim(lstm_struct.input(), 0);
309 const auto state_dimension = Tensor::dim(lstm_struct.output_state(), 1);
310 const auto cell_state_type_size = getDataTypeSize(Tensor::element_type(lstm_struct.cell_state()));
311
312 auto scratch_0_data =
313 std::make_unique<uint8_t[]>(batch_size * state_dimension * cell_state_type_size);
314 auto scratch_1_data =
315 std::make_unique<uint8_t[]>(batch_size * state_dimension * cell_state_type_size);
316 auto scratch_2_data =
317 std::make_unique<uint8_t[]>(batch_size * state_dimension * cell_state_type_size);
318 auto scratch_3_data =
319 std::make_unique<uint8_t[]>(batch_size * state_dimension * cell_state_type_size);
320
321 // Create and fill with 0 output state tensor
322 auto output_state_data =
323 std::make_unique<float[]>(Tensor::num_elements(lstm_struct.output_state()));
324 std::fill_n(output_state_data.get(), Tensor::num_elements(lstm_struct.output_state()), 0);
325
326 // Create and fill with 0 cell state tensor
327 auto cell_state_data = std::make_unique<float[]>(Tensor::num_elements(lstm_struct.cell_state()));
328 std::fill_n(cell_state_data.get(), Tensor::num_elements(lstm_struct.cell_state()), 0);
329
330 luci_interpreter_pal::evalLSTM<float, float, float, float>(
331 &lstm_struct, &lstm_params, &cell_state_info, output_state_data.get(), cell_state_data.get(),
332 kernels::getTensorData<float>(scratch_0_data.get()),
333 kernels::getTensorData<float>(scratch_1_data.get()),
334 kernels::getTensorData<float>(scratch_2_data.get()),
335 kernels::getTensorData<float>(scratch_3_data.get()), runtime_graph);
336}
337#endif // DIS_FLOAT
338
339void validateWeightTensorSize(const circle::Tensor *weight_tensor, int dim1_size, int dim2_size)
340{
341 LUCI_INTERPRETER_CHECK(Tensor::num_dims(weight_tensor) == 2);
342 LUCI_INTERPRETER_CHECK(Tensor::dim(weight_tensor, 0) == dim1_size);
343 LUCI_INTERPRETER_CHECK(Tensor::dim(weight_tensor, 1) == dim2_size);
344}
345
346void validateTensorsSize(lstm::LSTMStruct *lstm_struct, const bool time_major)
347{
348 const auto batch_size =
349 time_major ? Tensor::dim(lstm_struct->input(), 1) : Tensor::dim(lstm_struct->input(), 0);
350
351 const auto input_dimension = Tensor::dim(lstm_struct->input(), 2);
352 const auto state_dimension = Tensor::dim(lstm_struct->output_state(), 1);
353
354 // Input FC weights
355 for (int32_t i = 1; i < 5; i++)
356 {
357 validateWeightTensorSize(lstm_struct->get_internal_tensor(i), state_dimension, input_dimension);
358 }
359
360 // Recurrent FC weights
361 for (int32_t i = 5; i < 9; i++)
362 {
363 validateWeightTensorSize(lstm_struct->get_internal_tensor(i), state_dimension, state_dimension);
364 }
365
366 // Biases
367 for (int32_t i = 12; i < 16; i++)
368 {
369 LUCI_INTERPRETER_CHECK(Tensor::num_dims(lstm_struct->get_internal_tensor(i)) == 1);
370 LUCI_INTERPRETER_CHECK(Tensor::dim(lstm_struct->get_internal_tensor(i), 0) == state_dimension);
371 }
372
373 // Check the shape of input state tensors.
374 // These tensor may be 1D or 2D. It's fine as long as the total size is
375 // correct.
376 LUCI_INTERPRETER_CHECK(Tensor::num_elements(lstm_struct->output_state()) ==
377 batch_size * state_dimension);
378 LUCI_INTERPRETER_CHECK(Tensor::num_elements(lstm_struct->cell_state()) ==
379 batch_size * state_dimension);
380
381 // Check the shape of output tensor against that of input tensor
382 LUCI_INTERPRETER_CHECK(Tensor::num_dims(lstm_struct->output()) == 3);
383 LUCI_INTERPRETER_CHECK(Tensor::dim(lstm_struct->input(), 0) ==
384 Tensor::dim(lstm_struct->output(), 0));
385 LUCI_INTERPRETER_CHECK(Tensor::dim(lstm_struct->input(), 1) ==
386 Tensor::dim(lstm_struct->output(), 1));
387 LUCI_INTERPRETER_CHECK(Tensor::dim(lstm_struct->output(), 2) == state_dimension);
388}
389
390} // namespace
391
392void configure_kernel_CircleUnidirectionalSequenceLSTM(const circle::Operator *cur_op,
393 BaseRuntimeGraph *runtime_graph)
394{
395 lstm::LSTMStruct lstm_struct(cur_op, runtime_graph);
396
397 LUCI_INTERPRETER_CHECK(Tensor::element_type(lstm_struct.input()) == DataType::FLOAT32 or
398 Tensor::element_type(lstm_struct.input()) == DataType::S8);
399
400 lstm_struct.validateTensorTypes();
401
402 const bool time_major = lstm_struct.options->time_major();
403
404 validateTensorsSize(&lstm_struct, time_major);
405
406 // No peephole
407 for (int32_t i = 9; i < 12; ++i)
408 LUCI_INTERPRETER_CHECK(lstm_struct.get_internal_tensor(i) == nullptr);
409
410 // No projection
411 for (int32_t i = 16; i < 18; ++i)
412 LUCI_INTERPRETER_CHECK(lstm_struct.get_internal_tensor(i) == nullptr);
413
414 // No internal layer norm
415 for (int32_t i = 20; i < 24; ++i)
416 LUCI_INTERPRETER_CHECK(lstm_struct.get_internal_tensor(i) == nullptr);
417}
418
419void execute_kernel_CircleUnidirectionalSequenceLSTM(const circle::Operator *cur_op,
420 BaseRuntimeGraph *runtime_graph)
421{
422 const auto input_index = cur_op->inputs()->operator[](0);
423 assert(input_index != -1);
424
425 bool is_inplace = runtime_graph->is_inplace_op(cur_op);
426
427 const auto input = runtime_graph->getCircleTensorByIndex(input_index);
428
429 switch (Tensor::element_type(input))
430 {
431#ifndef DIS_FLOAT
432 case DataType::FLOAT32:
433 evalFloat(cur_op, runtime_graph, is_inplace);
434 break;
435#endif // DIS_FLOAT
436#ifndef DIS_QUANT
437 case DataType::S8:
438 evalInt8(cur_op, runtime_graph, is_inplace);
439 break;
440#endif // DIS_QUANT
441 default:
442 assert(false && "Unsupported type.");
443 }
444}
445
446} // namespace luci_interpreter
const circle::Tensor * getCircleTensorByIndex(int32_t index)
bool is_inplace_op(const circle::Operator *op)
#define LUCI_INTERPRETER_CHECK(cond)
Definition Utils.h:36
DataType
"scalar" value type
Definition DataType.h:27
bool checkedLog2(const float x, int *log2_result)
Definition Utils.cpp:113
void calculateActivationRange(Activation activation, T *activation_min, T *activation_max)
Definition Utils.cpp:52
void calculateActivationRangeQuantized(Activation activation, const Tensor *output, int32_t *activation_min, int32_t *activation_max)
Definition Utils.cpp:119
double getQuantizedConvolutionMultipler(float input_scale, float filter_scale, float output_scale)
Definition Utils.h:137
void quantizeMultiplier(double double_multiplier, int32_t *quantized_multiplier, int *shift)
Definition Utils.cpp:157
void evalLSTM< int8_t, int8_t, int16_t, int32_t >(luci_interpreter::lstm::LSTMStruct *lstm_struct, luci_interpreter::lstm::LSTMParameters *lstm_params, luci_interpreter::lstm::CellStateInfo *cell_state_info, int8_t *output_state_data, int16_t *cell_state_data, int16_t *scratch0, int16_t *scratch1, int16_t *scratch2, int16_t *scratch3, luci_interpreter::BaseRuntimeGraph *runtime_graph)
RuntimeGraph BaseRuntimeGraph
void configure_kernel_CircleUnidirectionalSequenceLSTM(const circle::Operator *cur_op, BaseRuntimeGraph *runtime_graph)
size_t getDataTypeSize(DataType data_type)
Definition DataType.h:33
void execute_kernel_CircleUnidirectionalSequenceLSTM(const circle::Operator *cur_op, BaseRuntimeGraph *runtime_graph)
const loco::Dimension & dim(uint32_t axis) const
Definition Tensor.h:44
const circle::UnidirectionalSequenceLSTMOptions * options
const circle::Tensor * get_internal_tensor(int i)