ONE - On-device Neural Engine
PALreference_ops.h
1/*
2 * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
17
18Licensed under the Apache License, Version 2.0 (the "License");
19you may not use this file except in compliance with the License.
20You may obtain a copy of the License at
21
22 http://www.apache.org/licenses/LICENSE-2.0
23
24Unless required by applicable law or agreed to in writing, software
25distributed under the License is distributed on an "AS IS" BASIS,
26WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
27See the License for the specific language governing permissions and
28limitations under the License.
29==============================================================================*/
30#ifndef LUCI_INTERPRETER_PAL_REFERENCE_OPS_H
31#define LUCI_INTERPRETER_PAL_REFERENCE_OPS_H
32
33#include <stdint.h>
34#include <sys/types.h>
35
36#include <algorithm>
37#include <cmath>
38#include <cstring>
39#include <functional>
40#include <limits>
41#include <memory>
42#include <type_traits>
43
44#include "third_party/eigen3/Eigen/Core"
45#include "fixedpoint/fixedpoint.h"
46#include "ruy/profiler/instrumentation.h" // from @ruy
47#include "tensorflow/lite/c/common.h"
48#include "tensorflow/lite/kernels/internal/common.h"
49#include "tensorflow/lite/kernels/internal/quantization_util.h"
50#include "tensorflow/lite/kernels/internal/reference/add.h"
51#include "tensorflow/lite/kernels/internal/reference/add_n.h"
52#include "tensorflow/lite/kernels/internal/reference/arg_min_max.h"
53#include "tensorflow/lite/kernels/internal/reference/batch_matmul.h"
54#include "tensorflow/lite/kernels/internal/reference/batch_to_space_nd.h"
55#include "tensorflow/lite/kernels/internal/reference/binary_function.h"
56#include "tensorflow/lite/kernels/internal/reference/cast.h"
57#include "tensorflow/lite/kernels/internal/reference/ceil.h"
58#include "tensorflow/lite/kernels/internal/reference/comparisons.h"
59#include "tensorflow/lite/kernels/internal/reference/concatenation.h"
60#include "tensorflow/lite/kernels/internal/reference/conv.h"
61#include "tensorflow/lite/kernels/internal/reference/depth_to_space.h"
62#include "tensorflow/lite/kernels/internal/reference/dequantize.h"
63#include "tensorflow/lite/kernels/internal/reference/div.h"
64#include "tensorflow/lite/kernels/internal/reference/elu.h"
65#include "tensorflow/lite/kernels/internal/reference/exp.h"
66#include "tensorflow/lite/kernels/internal/reference/fill.h"
67#include "tensorflow/lite/kernels/internal/reference/floor.h"
68#include "tensorflow/lite/kernels/internal/reference/floor_div.h"
69#include "tensorflow/lite/kernels/internal/reference/floor_mod.h"
70#include "tensorflow/lite/kernels/internal/reference/fully_connected.h"
71#include "tensorflow/lite/kernels/internal/reference/gather.h"
72#include "tensorflow/lite/kernels/internal/reference/hard_swish.h"
73#include "tensorflow/lite/kernels/internal/reference/l2normalization.h"
74#include "tensorflow/lite/kernels/internal/reference/leaky_relu.h"
75#include "tensorflow/lite/kernels/internal/reference/log_softmax.h"
76#include "tensorflow/lite/kernels/internal/reference/logistic.h"
77#include "tensorflow/lite/kernels/internal/reference/maximum_minimum.h"
78#include "tensorflow/lite/kernels/internal/reference/mul.h"
79#include "tensorflow/lite/kernels/internal/reference/neg.h"
80#include "tensorflow/lite/kernels/internal/reference/pad.h"
81#include "tensorflow/lite/kernels/internal/reference/pooling.h"
82#include "tensorflow/lite/kernels/internal/reference/prelu.h"
83#include "tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h"
84#include "tensorflow/lite/kernels/internal/reference/quantize.h"
85#include "tensorflow/lite/kernels/internal/reference/reduce.h"
86#include "tensorflow/lite/kernels/internal/reference/requantize.h"
87#include "tensorflow/lite/kernels/internal/reference/resize_bilinear.h"
88#include "tensorflow/lite/kernels/internal/reference/resize_nearest_neighbor.h"
89#include "tensorflow/lite/kernels/internal/reference/round.h"
90#include "tensorflow/lite/kernels/internal/reference/softmax.h"
91#include "tensorflow/lite/kernels/internal/reference/space_to_batch_nd.h"
92#include "tensorflow/lite/kernels/internal/reference/space_to_depth.h"
93#include "tensorflow/lite/kernels/internal/reference/strided_slice.h"
94#include "tensorflow/lite/kernels/internal/reference/string_comparisons.h"
95#include "tensorflow/lite/kernels/internal/reference/sub.h"
96#include "tensorflow/lite/kernels/internal/reference/tanh.h"
97#include "tensorflow/lite/kernels/internal/reference/transpose.h"
98#include "tensorflow/lite/kernels/internal/reference/transpose_conv.h"
99#include "tensorflow/lite/kernels/internal/strided_slice_logic.h"
100#include "tensorflow/lite/kernels/internal/tensor.h"
101#include "tensorflow/lite/kernels/internal/types.h"
102namespace tflite
103{
104
105namespace reference_ops
106{
107
108template <typename T>
109inline void Relu(const RuntimeShape &input_shape, const T *input_data,
110 const RuntimeShape &output_shape, T *output_data)
111{
112 const int flat_size = MatchingFlatSize(input_shape, output_shape);
113 for (int i = 0; i < flat_size; ++i)
114 {
115 const T val = input_data[i];
116 const T lower = 0;
117 const T clamped = val < lower ? lower : val;
118 output_data[i] = clamped;
119 }
120}
121
122template <typename T>
123inline void Relu1(const RuntimeShape &input_shape, const T *input_data,
124 const RuntimeShape &output_shape, T *output_data)
125{
126 ruy::profiler::ScopeLabel label("Relu1 (not fused)");
127 const int flat_size = MatchingFlatSize(input_shape, output_shape);
128 for (int i = 0; i < flat_size; ++i)
129 {
130 const T val = input_data[i];
131 const T upper = 1;
132 const T lower = -1;
133 const T clamped = val > upper ? upper : val < lower ? lower : val;
134 output_data[i] = clamped;
135 }
136}
137
138inline void Relu6(const RuntimeShape &input_shape, const float *input_data,
139 const RuntimeShape &output_shape, float *output_data)
140{
141 ruy::profiler::ScopeLabel label("Relu6 (not fused)");
142 const int flat_size = MatchingFlatSize(input_shape, output_shape);
143 for (int i = 0; i < flat_size; ++i)
144 {
145 const float val = input_data[i];
146 const float upper = 6;
147 const float lower = 0;
148 const float clamped = val > upper ? upper : val < lower ? lower : val;
149 output_data[i] = clamped;
150 }
151}
152
153template <typename T>
154inline void ReluX(const tflite::ReluParams &params, const RuntimeShape &input_shape,
155 const T *input_data, const RuntimeShape &output_shape, T *output_data)
156{
157 ruy::profiler::ScopeLabel label("Quantized ReluX (not fused)");
158 const int flat_size = MatchingFlatSize(input_shape, output_shape);
159 for (int i = 0; i < flat_size; ++i)
160 {
161 const int32 val = static_cast<int32_t>(input_data[i]);
162 int32 clamped = params.output_offset + MultiplyByQuantizedMultiplier(val - params.input_offset,
163 params.output_multiplier,
164 params.output_shift);
165 clamped = std::max(params.quantized_activation_min, clamped);
166 clamped = std::min(params.quantized_activation_max, clamped);
167 output_data[i] = static_cast<T>(clamped);
168 }
169}
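// Note (illustrative, not part of the upstream source): the quantized ReluX
// above first requantizes and then clamps. Assuming the usual TFLite
// convention that (output_multiplier, output_shift) encode the ratio
// input_scale / output_scale as a Q31 multiplier plus a power-of-two shift,
// the loop body computes approximately
//
//   q_out ~= output_offset + (q_in - input_offset) * input_scale / output_scale
//
// and the clamp to [quantized_activation_min, quantized_activation_max] is
// where the actual ReLU bound (0, or 6 in real units for Relu6) is applied.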
170
171template <typename T>
172inline void ReluX(const tflite::ActivationParams &params, const RuntimeShape &input_shape,
173 const T *input_data, const RuntimeShape &output_shape, T *output_data)
174{
175 ruy::profiler::ScopeLabel label("Quantized ReluX (not fused)");
176 const int flat_size = MatchingFlatSize(input_shape, output_shape);
177 const T max_value = params.quantized_activation_max;
178 const T min_value = params.quantized_activation_min;
179 for (int i = 0; i < flat_size; ++i)
180 {
181 const T val = input_data[i];
182 const T clamped = val > max_value ? max_value : val < min_value ? min_value : val;
183 output_data[i] = clamped;
184 }
185}
186
187// TODO(jiawen): We can implement BroadcastMul on buffers of arbitrary
188// dimensionality if the runtime code does a single loop over one dimension
189// that handles broadcasting as the base case. The code generator would then
190// generate max(D1, D2) nested for loops.
191inline void BroadcastMulFivefold(const ArithmeticParams &unswitched_params,
192 const RuntimeShape &unswitched_input1_shape,
193 const uint8 *unswitched_input1_data,
194 const RuntimeShape &unswitched_input2_shape,
195 const uint8 *unswitched_input2_data,
196 const RuntimeShape &output_shape, uint8 *output_data)
197{
198 ArithmeticParams switched_params = unswitched_params;
199 switched_params.input1_offset = unswitched_params.input2_offset;
200 switched_params.input2_offset = unswitched_params.input1_offset;
201
202 const bool use_unswitched = unswitched_params.broadcast_category ==
203 tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast;
204
205 const ArithmeticParams &params = use_unswitched ? unswitched_params : switched_params;
206 const uint8 *input1_data = use_unswitched ? unswitched_input1_data : unswitched_input2_data;
207 const uint8 *input2_data = use_unswitched ? unswitched_input2_data : unswitched_input1_data;
208
209 // Fivefold nested loops. The second input resets its position for each
210 // iteration of the second loop. The first input resets its position at the
211 // beginning of the fourth loop. The innermost loop is an elementwise Mul of
212 // sections of the arrays.
213 uint8 *output_data_ptr = output_data;
214 const uint8 *input1_data_ptr = input1_data;
215 const uint8 *input2_data_reset = input2_data;
216 int y0 = params.broadcast_shape[0];
217 int y1 = params.broadcast_shape[1];
218 int y2 = params.broadcast_shape[2];
219 int y3 = params.broadcast_shape[3];
220 int y4 = params.broadcast_shape[4];
221 for (int i0 = 0; i0 < y0; ++i0)
222 {
223 const uint8 *input2_data_ptr;
224 for (int i1 = 0; i1 < y1; ++i1)
225 {
226 input2_data_ptr = input2_data_reset;
227 for (int i2 = 0; i2 < y2; ++i2)
228 {
229 for (int i3 = 0; i3 < y3; ++i3)
230 {
231 MulElementwise(y4, params, input1_data_ptr, input2_data_ptr, output_data_ptr);
232 input2_data_ptr += y4;
233 output_data_ptr += y4;
234 }
235 input1_data_ptr += y4;
236 }
237 }
238 input2_data_reset = input2_data_ptr;
239 }
240}
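// Reading the loop nest above (editorial note): the output consumes
// y0 * y1 * y2 * y3 * y4 elements, input1 advances by one y4-block per
// (i0, i1, i2) and is reused across the i3 loop (i.e. broadcast along y3),
// while input2 is rewound to input2_data_reset on every i1 iteration (i.e.
// broadcast along y1). Input1 therefore supplies y0 * y1 * y2 * y4 elements
// and input2 supplies y0 * y2 * y3 * y4; the fivefold broadcast_shape
// (typically produced by ProcessBroadcastShapes) factors a general broadcast
// into exactly this pattern.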
241
242inline void Mul(const ArithmeticParams &params, const RuntimeShape &input1_shape,
243 const int16 *input1_data, const RuntimeShape &input2_shape,
244 const int16 *input2_data, const RuntimeShape &output_shape, int16 *output_data)
245{
246 ruy::profiler::ScopeLabel label("Mul/Int16");
247
248 const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape);
249
250 for (int i = 0; i < flat_size; i++)
251 {
252 // F0 uses 0 integer bits, range [-1, 1].
253 using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
254
255 F0 unclamped_result = F0::FromRaw(input1_data[i]) * F0::FromRaw(input2_data[i]);
256 output_data[i] = unclamped_result.raw();
257 }
258}
259
260inline void Mul(const ArithmeticParams &params, const RuntimeShape &input1_shape,
261 const int16 *input1_data, const RuntimeShape &input2_shape,
262 const int16 *input2_data, const RuntimeShape &output_shape, uint8 *output_data)
263{
264 ruy::profiler::ScopeLabel label("Mul/Int16Uint8");
265 int32 output_offset = params.output_offset;
266 int32 output_activation_min = params.quantized_activation_min;
267 int32 output_activation_max = params.quantized_activation_max;
268 TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
269
270 const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape);
271
272 for (int i = 0; i < flat_size; i++)
273 {
274 // F0 uses 0 integer bits, range [-1, 1].
275 using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
276
277 F0 unclamped_result = F0::FromRaw(input1_data[i]) * F0::FromRaw(input2_data[i]);
278 int16 rescaled_result = gemmlowp::RoundingDivideByPOT(unclamped_result.raw(), 8);
279 int16 clamped_result = std::min<int16>(output_activation_max - output_offset, rescaled_result);
280 clamped_result = std::max<int16>(output_activation_min - output_offset, clamped_result);
281 output_data[i] = output_offset + clamped_result;
282 }
283}
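// Illustrative note on the fixed-point arithmetic above: F0 is a Q0.15
// format, so a raw int16 value r represents r / 32768 and covers [-1, 1).
// The product of two F0 values is again F0, and RoundingDivideByPOT(raw, 8)
// divides the raw product by 2^8, mapping the int16 raw range down to roughly
// [-128, 128), i.e. an 8-bit quantization of [-1, 1) with scale 1/128, which
// output_offset then recenters onto the uint8 zero point. For example, raw
// inputs 16384 and 16384 (both 0.5) give a product of 0.25 (raw 8192), which
// rescales to 32 before the offset and clamping are applied.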
284
285inline void Sub16(const ArithmeticParams &params, const RuntimeShape &input1_shape,
286 const int16_t *input1_data, const RuntimeShape &input2_shape,
287 const int16_t *input2_data, const RuntimeShape &output_shape,
288 int16_t *output_data)
289{
290 ruy::profiler::ScopeLabel label("Sub/Int16");
291 const int input1_shift = params.input1_shift;
292 const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape);
293 const int16 output_activation_min = params.quantized_activation_min;
294 const int16 output_activation_max = params.quantized_activation_max;
295
296 TFLITE_DCHECK(input1_shift == 0 || params.input2_shift == 0);
297 TFLITE_DCHECK_LE(input1_shift, 0);
298 TFLITE_DCHECK_LE(params.input2_shift, 0);
299 const int16 *not_shift_input = input1_shift == 0 ? input1_data : input2_data;
300 const int16 *shift_input = input1_shift == 0 ? input2_data : input1_data;
301 const int input_right_shift = input1_shift == 0 ? -params.input2_shift : -input1_shift;
302
303 if (input1_shift == 0)
304 {
305 // F0 uses 0 integer bits, range [-1, 1].
306 using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
307 for (int i = 0; i < flat_size; ++i)
308 {
309 F0 input_ready_scaled = F0::FromRaw(not_shift_input[i]);
310 F0 scaled_input =
311 F0::FromRaw(gemmlowp::RoundingDivideByPOT(shift_input[i], input_right_shift));
312 F0 result = SaturatingSub(input_ready_scaled, scaled_input);
313 const int16 raw_output = result.raw();
314 const int16 clamped_output =
315 std::min(output_activation_max, std::max(output_activation_min, raw_output));
316 output_data[i] = clamped_output;
317 }
318 }
319 else
320 {
321 // F0 uses 0 integer bits, range [-1, 1].
322 using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
323 for (int i = 0; i < flat_size; ++i)
324 {
325 F0 input_ready_scaled = F0::FromRaw(not_shift_input[i]);
326 F0 scaled_input =
327 F0::FromRaw(gemmlowp::RoundingDivideByPOT(shift_input[i], input_right_shift));
328 F0 result = SaturatingSub(scaled_input, input_ready_scaled);
329 const int16 raw_output = result.raw();
330 const int16 clamped_output =
331 std::min(output_activation_max, std::max(output_activation_min, raw_output));
332 output_data[i] = clamped_output;
333 }
334 }
335}
336
337template <typename Scalar>
338void Pack(const PackParams &params, const RuntimeShape *const *input_shapes,
339 const Scalar *const *input_data, const RuntimeShape &output_shape, Scalar *output_data)
340{
341 ruy::profiler::ScopeLabel label("Pack");
342 const int dimensions = output_shape.DimensionsCount();
343 int axis = params.axis;
344 int inputs_count = params.inputs_count;
345
346 int outer_size = 1;
347 for (int i = 0; i < axis; i++)
348 {
349 outer_size *= output_shape.Dims(i);
350 }
351 int copy_size = 1;
352 for (int i = params.axis + 1; i < dimensions; i++)
353 {
354 copy_size *= output_shape.Dims(i);
355 }
356 TFLITE_DCHECK_EQ((**input_shapes).FlatSize(), copy_size * outer_size);
357
358 for (int i = 0; i < inputs_count; ++i)
359 {
360 for (int k = 0; k < outer_size; k++)
361 {
362 const Scalar *input_ptr = input_data[i] + copy_size * k;
363 int loc = k * inputs_count * copy_size + i * copy_size;
364 memcpy(output_data + loc, input_ptr, copy_size * sizeof(Scalar));
365 }
366 }
367}
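// Worked example (illustrative): packing inputs_count = 3 tensors of shape
// [2, 4] along axis = 1 produces an output of shape [2, 3, 4]. Then
// outer_size = 2 (product of output dims before the axis) and copy_size = 4
// (product of output dims after the axis), so for each outer index k the loop
// copies one length-4 row from each input into consecutive slots of the
// output, interleaving them as input0[k], input1[k], input2[k].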
368
369template <typename Scalar>
370void Unpack(const UnpackParams &params, const RuntimeShape &input_shape, const Scalar *input_data,
371 const RuntimeShape &output_shape, Scalar *const *output_datas)
372{
373 ruy::profiler::ScopeLabel label("Unpack");
374 const int dimensions = input_shape.DimensionsCount();
375 const int outputs_count = params.num_split;
376
377 int outer_size = 1;
378 int axis = params.axis;
379 if (axis < 0)
380 {
381 axis += dimensions;
382 }
383 TFLITE_DCHECK_GE(axis, 0);
384 TFLITE_DCHECK_LT(axis, dimensions);
385 for (int i = 0; i < axis; ++i)
386 {
387 outer_size *= input_shape.Dims(i);
388 }
389 int copy_size = 1;
390 for (int i = axis + 1; i < dimensions; ++i)
391 {
392 copy_size *= input_shape.Dims(i);
393 }
394 TFLITE_DCHECK_EQ(output_shape.FlatSize(), copy_size * outer_size);
395
396 for (int i = 0; i < outputs_count; ++i)
397 {
398 for (int k = 0; k < outer_size; k++)
399 {
400 Scalar *output_ptr = output_datas[i] + copy_size * k;
401 int loc = k * outputs_count * copy_size + i * copy_size;
402 memcpy(output_ptr, input_data + loc, copy_size * sizeof(Scalar));
403 }
404 }
405}
406
407template <typename Scalar>
408void PackWithScaling(const PackParams &params, const RuntimeShape *const *input_shapes,
409 const uint8 *const *input_data, const RuntimeShape &output_shape,
410 uint8 *output_data)
411{
412 ruy::profiler::ScopeLabel label("PackWithScaling");
413 const int dimensions = output_shape.DimensionsCount();
414 int axis = params.axis;
415 const int32 *input_zeropoint = params.input_zeropoint;
416 const float *input_scale = params.input_scale;
417 int inputs_count = params.inputs_count;
418 const int32 output_zeropoint = params.output_zeropoint;
419 const float output_scale = params.output_scale;
420
421 int outer_size = 1;
422 for (int i = 0; i < axis; i++)
423 {
424 outer_size *= output_shape.Dims(i);
425 }
426 int copy_size = 1;
427 for (int i = axis + 1; i < dimensions; i++)
428 {
429 copy_size *= output_shape.Dims(i);
430 }
431 TFLITE_DCHECK_EQ((**input_shapes).FlatSize(), copy_size * outer_size);
432
433 Scalar *output_ptr = output_data;
434 const float inverse_output_scale = 1.f / output_scale;
435 for (int k = 0; k < outer_size; k++)
436 {
437 for (int i = 0; i < inputs_count; ++i)
438 {
439 if (input_zeropoint[i] == output_zeropoint && input_scale[i] == output_scale)
440 {
441 memcpy(output_ptr, input_data[i] + k * copy_size, copy_size * sizeof(Scalar));
442 }
443 else
444 {
445 assert(false);
446 const float scale = input_scale[i] * inverse_output_scale;
447 const float bias = -input_zeropoint[i] * scale;
448 auto input_ptr = input_data[i];
449 for (int j = 0; j < copy_size; ++j)
450 {
451 const int value =
452 static_cast<int32_t>(std::round(input_ptr[j] * scale + bias)) + output_zeropoint;
453 output_ptr[j] = static_cast<uint8_t>(std::max(std::min(255, value), 0));
454 }
455 }
456 output_ptr += copy_size;
457 }
458 }
459}
460
461template <typename Scalar>
462void DepthConcatenation(const ConcatenationParams &params, const RuntimeShape *const *input_shapes,
463 const Scalar *const *input_data, const RuntimeShape &output_shape,
464 Scalar *output_data)
465{
466 ruy::profiler::ScopeLabel label("DepthConcatenation");
467 auto params_copy = params;
468 params_copy.axis = 3;
469 Concatenation(params_copy, input_shapes, input_data, output_shape, output_data);
470}
471
472inline void LstmCell(const LstmCellParams &params, const RuntimeShape &unextended_input_shape,
473 const float *input_data, const RuntimeShape &unextended_prev_activ_shape,
474 const float *prev_activ_data, const RuntimeShape &weights_shape,
475 const float *weights_data, const RuntimeShape &unextended_bias_shape,
476 const float *bias_data, const RuntimeShape &unextended_prev_state_shape,
477 const float *prev_state_data,
478 const RuntimeShape &unextended_output_state_shape, float *output_state_data,
479 const RuntimeShape &unextended_output_activ_shape, float *output_activ_data,
480 const RuntimeShape &unextended_concat_temp_shape, float *concat_temp_data,
481 const RuntimeShape &unextended_activ_temp_shape, float *activ_temp_data)
482{
483 TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
484 TFLITE_DCHECK_LE(unextended_prev_activ_shape.DimensionsCount(), 4);
485 TFLITE_DCHECK_LE(unextended_bias_shape.DimensionsCount(), 4);
486 TFLITE_DCHECK_LE(unextended_prev_state_shape.DimensionsCount(), 4);
487 TFLITE_DCHECK_LE(unextended_output_state_shape.DimensionsCount(), 4);
488 TFLITE_DCHECK_LE(unextended_output_activ_shape.DimensionsCount(), 4);
489 TFLITE_DCHECK_LE(unextended_concat_temp_shape.DimensionsCount(), 4);
490 TFLITE_DCHECK_LE(unextended_activ_temp_shape.DimensionsCount(), 4);
491 const RuntimeShape input_shape = RuntimeShape::ExtendedShape(4, unextended_input_shape);
492 const RuntimeShape prev_activ_shape = RuntimeShape::ExtendedShape(4, unextended_prev_activ_shape);
493 const RuntimeShape bias_shape = RuntimeShape::ExtendedShape(4, unextended_bias_shape);
494 const RuntimeShape prev_state_shape = RuntimeShape::ExtendedShape(4, unextended_prev_state_shape);
495 const RuntimeShape output_state_shape =
496 RuntimeShape::ExtendedShape(4, unextended_output_state_shape);
497 const RuntimeShape output_activ_shape =
498 RuntimeShape::ExtendedShape(4, unextended_output_activ_shape);
499 const RuntimeShape concat_temp_shape =
500 RuntimeShape::ExtendedShape(4, unextended_concat_temp_shape);
501 const RuntimeShape activ_temp_shape = RuntimeShape::ExtendedShape(4, unextended_activ_temp_shape);
502 TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2);
503
504 const int weights_dim_count = weights_shape.DimensionsCount();
505 const int batches = MatchingDim(input_shape, 0, prev_activ_shape, 0, prev_state_shape, 0,
506 output_state_shape, 0, output_activ_shape, 0);
507 const int height = MatchingDim(input_shape, 1, prev_activ_shape, 1, prev_state_shape, 1,
508 output_state_shape, 1, output_activ_shape, 1);
509 const int width = MatchingDim(input_shape, 2, prev_activ_shape, 2, prev_state_shape, 2,
510 output_state_shape, 2, output_activ_shape, 2);
511 const int input_depth = input_shape.Dims(3);
512 const int prev_activ_depth = prev_activ_shape.Dims(3);
513 const int total_input_depth = prev_activ_depth + input_depth;
514 TFLITE_DCHECK_EQ(weights_shape.Dims(weights_dim_count - 1), total_input_depth);
515 TFLITE_DCHECK_EQ(FlatSizeSkipDim(bias_shape, 3), 1);
516 const int intern_activ_depth = MatchingDim(weights_shape, weights_dim_count - 2, bias_shape, 3);
517 TFLITE_DCHECK_EQ(weights_shape.FlatSize(), intern_activ_depth * total_input_depth);
518 TFLITE_DCHECK_EQ(intern_activ_depth % 4, 0);
519 const int output_depth = MatchingDim(prev_state_shape, 3, prev_activ_shape, 3, output_state_shape,
520 3, output_activ_shape, 3);
521 TFLITE_DCHECK_EQ(output_depth, intern_activ_depth / 4);
522
523 // Concatenate prev_activ and input data together
524 std::vector<float const *> concat_input_arrays_data;
525 std::vector<RuntimeShape const *> concat_input_arrays_shapes;
526 concat_input_arrays_data.push_back(input_data);
527 concat_input_arrays_data.push_back(prev_activ_data);
528 concat_input_arrays_shapes.push_back(&input_shape);
529 concat_input_arrays_shapes.push_back(&prev_activ_shape);
530 tflite::ConcatenationParams concat_params;
531 concat_params.axis = 3;
532 concat_params.inputs_count = concat_input_arrays_data.size();
533 Concatenation(concat_params, &(concat_input_arrays_shapes[0]), &(concat_input_arrays_data[0]),
534 concat_temp_shape, concat_temp_data);
535
536 // Fully connected
537 tflite::FullyConnectedParams fc_params;
538 fc_params.float_activation_min = std::numeric_limits<float>::lowest();
539 fc_params.float_activation_max = std::numeric_limits<float>::max();
540 FullyConnected(fc_params, concat_temp_shape, concat_temp_data, weights_shape, weights_data,
541 bias_shape, bias_data, activ_temp_shape, activ_temp_data);
542
543 // Memory state update (the LSTM "guts")
544 for (int b = 0; b < batches; ++b)
545 {
546 for (int w = 0; w < width; ++w)
547 {
548 for (int h = 0; h < height; ++h)
549 {
550 for (int c = 0; c < output_depth; ++c)
551 {
552 const float input_gate =
553 1.f /
554 (1.f +
555 std::exp(-activ_temp_data[Offset(activ_temp_shape, b, h, w, 0 * output_depth + c)]));
556 const float new_input =
557 std::tanh(activ_temp_data[Offset(activ_temp_shape, b, h, w, 1 * output_depth + c)]);
558 const float forget_gate =
559 1.f /
560 (1.f +
561 std::exp(-activ_temp_data[Offset(activ_temp_shape, b, h, w, 2 * output_depth + c)]));
562 const float output_gate =
563 1.f /
564 (1.f +
565 std::exp(-activ_temp_data[Offset(activ_temp_shape, b, h, w, 3 * output_depth + c)]));
566 const float new_state =
567 input_gate * new_input +
568 forget_gate * prev_state_data[Offset(prev_state_shape, b, h, w, c)];
569 output_state_data[Offset(output_state_shape, b, h, w, c)] = new_state;
570 output_activ_data[Offset(output_activ_shape, b, h, w, c)] =
571 output_gate * std::tanh(new_state);
572 }
573 }
574 }
575 }
576}
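// Summary of the update implemented above (standard LSTM cell, per element):
// the fully connected output holds four contiguous chunks of output_depth
// values per position, interpreted as
//   i = sigmoid(chunk 0)   (input gate)
//   g = tanh(chunk 1)      (input modulation, i.e. the candidate input)
//   f = sigmoid(chunk 2)   (forget gate)
//   o = sigmoid(chunk 3)   (output gate)
// with new_state = i * g + f * prev_state and output activation
// o * tanh(new_state).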
577
578// Quantized LSTM cell implementation.
579// The quantization of the input, output arrays is as follows:
580// - The input activations are quantized as uint8 on the interval
581// [-1, 127/128].
582// The rationale for that is that this is the natural interval for output
583// activations (see next point) and these need to be concatenated together.
584// We could accommodate different ranges by re-scaling, but we empirically
585// found that setting the input activations range to be [-1, 127/128] in the
586// first place, removing the need for re-scaling, greatly improves accuracy.
587// - The output activations are quantized as uint8 on the interval
588// [-1, 127/128].
589// The rationale for that is that the definition of a LSTM cell makes them
590// intrinsically constrained in [-1, 1]; tweaking that to [-1, 127/128]
591// makes for simpler, more accurate fixed-point arithmetic.
592// - The output-at-previous-timestep state array is obviously quantized as
593// the output activations.
594// - The internal LSTM memory (not the output-at-previous-timestep, the other
595// internal state array) is int16-quantized and may use any power-of-two,
596// symmetric range i.e. [-2^N, 2^N * 32767/32768] for any N, which we call
597// StateIntegerBits below, see the below discussion of that template
598// parameter ("The StateIntegerBits template parameter").
599// - The output of the internal fully-connected node is int16-quantized
600// on the interval [-8, 8 * 32767/32768], the rationale for which is
601// explained just below ("Why [-8, 8] for fully-connected output?").
602//
603//
604// === The StateIntegerBits template parameter ===
605//
606// The StateIntegerBits template parameter controls the fixed-point format used
607// to represent the internal memory of the LSTM cell (not the
608// output-at-previous-timestep, the other internal state array). It's currently
609// a template parameter so that the model can control that. The most typical
610// value for StateIntegerBits is 4. Other plausible values are anywhere between
611// 3 and 5. We might eventually standardize on a single supported value, e.g. 4,
612// and drop that template parameter. The reason why it can't be a runtime
613// parameter is that this controls the fixed-point format used, i.e. we need to
614// generate actually different code based on it. In particular, we generate code
615// for a fixed-point tanh() implementation for that format, which internally
616// uses a fixed-point exp() implementation, which internally uses a
617// barrel-shifter with a number of steps that depends on StateIntegerBits.
618// Another consequence of that is that a higher value of StateIntegerBits
619// results in a more expensive implementation (more barrel shifter steps
620// needed).
621//
622//
623// === Why [-8, 8] for fully-connected output? ===
624//
625// This array is only fed to Logistic and Tanh functions, for which
626// the quantized implementation will want to use fixed-point arithmetic,
627// requiring a power-of-two representation interval. Thus, we should right
628// away quantize this array to a power-of-two interval; otherwise,
629// the implementation will need to rescale it, losing any benefit that a tighter
630// representation interval might otherwise yield, while introducing some
631// numerical error and computational overhead.
632//
633// Now, Logistic and Tanh
634// are nearly constant (nearly equal to their horizontal asymptotes)
635// outside of a small bounded interval around 0:
636//
637// Logistic(4) = 1 - 1.8e-2 Tanh(4) = 1 - 6.7e-4
638// Logistic(8) = 1 - 3.4e-4 Tanh(8) = 1 - 2.3e-7
639// Logistic(16) = 1 - 1.1e-7 Tanh(16) = 1 - 2.5e-14
640//
641// From this, we see that clamping to [-4, 4] would be too inaccurate
642// (the error of 1.8e-2 on Logistic would be felt even in 8bit precision)
643// while clamping to [-16, 16] would make no difference even in float32.
644// However, for a fixed-point implementation in 16-bit integers, using 5
645// integer bits to represent the [-16, 16] range would leave only 11
646// fractional bits, giving an increment of 2^-11 = 4.9e-4 between consecutive
647// representable values. Notice that this is higher than the
648// worst-case clamping error with clamping to [-8, 8]: 3.4e-4 for Logistic.
649// Using [-8, 8] thus seems like the better compromise overall, enjoying
650// an increment of 2.4e-4 between representable values and a worst-case
651// clamping error of 3.4e-4, both better than the increment of 4.9e-4 with
652// [-16, 16].
653//
654// Moreover, all other things being equal, it is nice to choose the narrower
655// representation range, as that makes the implementation of fixed-point
656// math functions a little cheaper (each integer bit requires an additional
657// barrel-shifter step in the implementation of exp(-x)). That is further
658// reason to prefer [-8, 8] over [-16, 16]. The choice of [-16, 16] would make
659// sense for 32-bit float or 32-bit fixed-point quantization, but we are
660// aiming for 16-bit fixed-point quantization of these internal nodes here.
661//
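// Illustrative recap of the fixed-point formats used below (editorial note):
// a gemmlowp::FixedPoint<int16_t, N> raw value r represents r * 2^N / 32768.
// For F3 (N = 3) raw 4096 is 1.0 and the int16 range spans [-8, 8); for F0
// (N = 0) the range is [-1, 1) with a step of 2^-15; and for FS with
// StateIntegerBits = 4 (a typical choice per the comment above) raw 2048 is
// 1.0 and the range spans [-16, 16). The accuracy discussion above is exactly
// this resolution-versus-range trade-off: fewer integer bits give finer steps
// over a narrower representable interval.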
662template <int StateIntegerBits>
663inline void
664LstmCell(const LstmCellParams &params, const RuntimeShape &unextended_input_shape,
665 const uint8 *input_data_uint8, const RuntimeShape &unextended_prev_activ_shape,
666 const uint8 *prev_activ_data_uint8, const RuntimeShape &weights_shape,
667 const uint8 *weights_data_uint8, const RuntimeShape &unextended_bias_shape,
668 const int32 *bias_data_int32, const RuntimeShape &unextended_prev_state_shape,
669 const int16 *prev_state_data_int16, const RuntimeShape &unextended_output_state_shape,
670 int16 *output_state_data_int16, const RuntimeShape &unextended_output_activ_shape,
671 uint8 *output_activ_data_uint8, const RuntimeShape &unextended_concat_temp_shape,
672 uint8 *concat_temp_data_uint8, const RuntimeShape &unextended_activ_temp_shape,
673 int16 *activ_temp_data_int16, void *gemmlowp_context)
674{
675 (void)gemmlowp_context; // only used in optimized code.
676 int32 weights_zero_point = params.weights_zero_point;
677 int32 accum_multiplier = params.accum_multiplier;
678 int accum_shift = params.accum_shift;
679 TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
680 TFLITE_DCHECK_LE(unextended_prev_activ_shape.DimensionsCount(), 4);
681 TFLITE_DCHECK_LE(unextended_bias_shape.DimensionsCount(), 4);
682 TFLITE_DCHECK_LE(unextended_prev_state_shape.DimensionsCount(), 4);
683 TFLITE_DCHECK_LE(unextended_output_state_shape.DimensionsCount(), 4);
684 TFLITE_DCHECK_LE(unextended_output_activ_shape.DimensionsCount(), 4);
685 TFLITE_DCHECK_LE(unextended_concat_temp_shape.DimensionsCount(), 4);
686 TFLITE_DCHECK_LE(unextended_activ_temp_shape.DimensionsCount(), 4);
687 const RuntimeShape input_shape = RuntimeShape::ExtendedShape(4, unextended_input_shape);
688 const RuntimeShape prev_activ_shape = RuntimeShape::ExtendedShape(4, unextended_prev_activ_shape);
689 const RuntimeShape bias_shape = RuntimeShape::ExtendedShape(4, unextended_bias_shape);
690 const RuntimeShape prev_state_shape = RuntimeShape::ExtendedShape(4, unextended_prev_state_shape);
691 const RuntimeShape output_state_shape =
692 RuntimeShape::ExtendedShape(4, unextended_output_state_shape);
693 const RuntimeShape output_activ_shape =
694 RuntimeShape::ExtendedShape(4, unextended_output_activ_shape);
695 const RuntimeShape concat_temp_shape =
696 RuntimeShape::ExtendedShape(4, unextended_concat_temp_shape);
697 const RuntimeShape activ_temp_shape = RuntimeShape::ExtendedShape(4, unextended_activ_temp_shape);
698 TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2);
699
700 // Gather dimensions information, and perform consistency checks.
701 const int weights_dim_count = weights_shape.DimensionsCount();
702 const int outer_size = MatchingFlatSizeSkipDim(input_shape, 3, prev_activ_shape, prev_state_shape,
703 output_state_shape, output_activ_shape);
704 const int input_depth = input_shape.Dims(3);
705 const int prev_activ_depth = prev_activ_shape.Dims(3);
706 const int total_input_depth = prev_activ_depth + input_depth;
707 TFLITE_DCHECK_EQ(weights_shape.Dims(weights_dim_count - 1), total_input_depth);
708 const int intern_activ_depth = MatchingDim(weights_shape, weights_dim_count - 2, bias_shape, 3);
709 TFLITE_DCHECK_EQ(weights_shape.FlatSize(), intern_activ_depth * total_input_depth);
710 TFLITE_DCHECK_EQ(FlatSizeSkipDim(bias_shape, 3), 1);
711 TFLITE_DCHECK_EQ(intern_activ_depth % 4, 0);
712 const int output_depth = MatchingDim(prev_state_shape, 3, prev_activ_shape, 3, output_state_shape,
713 3, output_activ_shape, 3);
714 TFLITE_DCHECK_EQ(output_depth, intern_activ_depth / 4);
715 const int fc_batches = FlatSizeSkipDim(activ_temp_shape, 3);
716 const int fc_output_depth =
717 MatchingDim(weights_shape, weights_dim_count - 2, activ_temp_shape, 3);
718 const int fc_accum_depth = total_input_depth;
719 TFLITE_DCHECK_EQ(fc_output_depth, 4 * output_depth);
720
721 // Depth-concatenate prev_activ and input data together.
722 uint8 const *concat_input_arrays_data[2] = {input_data_uint8, prev_activ_data_uint8};
723 const RuntimeShape *concat_input_arrays_shapes[2] = {&input_shape, &prev_activ_shape};
724 tflite::ConcatenationParams concat_params;
725 concat_params.axis = 3;
726 concat_params.inputs_count = 2;
727 Concatenation(concat_params, concat_input_arrays_shapes, concat_input_arrays_data,
728 concat_temp_shape, concat_temp_data_uint8);
729
730 // Implementation of the fully connected node inside the LSTM cell.
731 // The operands are 8-bit integers, the accumulators are internally 32bit
732 // integers, and the output is 16-bit fixed-point with 3 integer bits so
733 // the output range is [-2^3, 2^3] == [-8, 8]. The rationale for that
734 // is explained in the function comment above.
735 for (int b = 0; b < fc_batches; ++b)
736 {
737 for (int out_c = 0; out_c < fc_output_depth; ++out_c)
738 {
739 // Internal accumulation.
740 // Initialize accumulator with the bias-value.
741 int32 accum = bias_data_int32[out_c];
742 // Accumulation loop.
743 for (int d = 0; d < fc_accum_depth; ++d)
744 {
745 int16 input_val = concat_temp_data_uint8[b * fc_accum_depth + d] - 128;
746 int16 weights_val = weights_data_uint8[out_c * fc_accum_depth + d] - weights_zero_point;
747 accum += input_val * weights_val;
748 }
749 // Down-scale the final int32 accumulator to the scale used by our
750 // (16-bit, using 3 integer bits) fixed-point format. The quantized
751 // multiplier and shift here have been pre-computed offline
752 // (e.g. by toco).
753 accum = MultiplyByQuantizedMultiplier(accum, accum_multiplier, accum_shift);
754 // Saturate, cast to int16, and store to the temporary activations array.
755 accum = std::max(-32768, std::min(32767, static_cast<int>(accum)));
756 activ_temp_data_int16[out_c + fc_output_depth * b] = accum;
757 }
758 }
759
760 // Rest of the LSTM cell: tanh and logistic math functions, and some adds
761 // and muls, all done in 16-bit fixed-point.
762 for (int b = 0; b < outer_size; ++b)
763 {
764 for (int c = 0; c < output_depth; ++c)
765 {
766 // Define the fixed-point data types that we will use here. All use
767 // int16 as the underlying integer type i.e. all are 16-bit fixed-point.
768 // They only differ by the number of integral vs. fractional bits,
769 // determining the range of values that they can represent.
770 //
771 // F0 uses 0 integer bits, range [-1, 1].
772 // This is the return type of math functions such as tanh, logistic,
773 // whose range is in [-1, 1].
774 using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
775 // F3 uses 3 integer bits, range [-8, 8].
776 // This is the range of the previous fully-connected node's output,
777 // which is our input here.
778 using F3 = gemmlowp::FixedPoint<std::int16_t, 3>;
779 // FS uses StateIntegerBits integer bits, range [-2^StateIntegerBits,
780 // 2^StateIntegerBits]. It's used to represent the internal state, whose
781 // number of integer bits is currently dictated by the model. See comment
782 // on the StateIntegerBits template parameter above.
783 using FS = gemmlowp::FixedPoint<std::int16_t, StateIntegerBits>;
784 // Implementation of input gate, using fixed-point logistic function.
785 F3 input_gate_input =
786 F3::FromRaw(activ_temp_data_int16[b * fc_output_depth + 0 * output_depth + c]);
787 F0 input_gate_output = gemmlowp::logistic(input_gate_input);
788 // Implementation of input modulation gate, using fixed-point tanh
789 // function.
790 F3 input_modulation_gate_input =
791 F3::FromRaw(activ_temp_data_int16[b * fc_output_depth + 1 * output_depth + c]);
792 F0 input_modulation_gate_output = gemmlowp::tanh(input_modulation_gate_input);
793 // Implementation of forget gate, using fixed-point logistic function.
794 F3 forget_gate_input =
795 F3::FromRaw(activ_temp_data_int16[b * fc_output_depth + 2 * output_depth + c]);
796 F0 forget_gate_output = gemmlowp::logistic(forget_gate_input);
797 // Implementation of output gate, using fixed-point logistic function.
798 F3 output_gate_input =
799 F3::FromRaw(activ_temp_data_int16[b * fc_output_depth + 3 * output_depth + c]);
800 F0 output_gate_output = gemmlowp::logistic(output_gate_input);
801 // Implementation of internal multiplication nodes, still in fixed-point.
802 F0 input_times_input_modulation = input_gate_output * input_modulation_gate_output;
803 FS prev_state = FS::FromRaw(prev_state_data_int16[b * output_depth + c]);
804 FS prev_state_times_forget_state = forget_gate_output * prev_state;
805 // Implementation of internal addition node, saturating.
806 FS new_state =
807 gemmlowp::SaturatingAdd(gemmlowp::Rescale<StateIntegerBits>(input_times_input_modulation),
808 prev_state_times_forget_state);
809 // Implementation of last internal Tanh node, still in fixed-point.
810 // Since a Tanh fixed-point implementation is specialized for a given
811 // number of integer bits, and each specialization can have a substantial
812 // code size, and we already used above a Tanh on an input with 3 integer
813 // bits, and per the table in the above function comment there is no
814 // significant accuracy to be lost by clamping to [-8, +8] for a
815 // 3-integer-bits representation, let us just do that. This helps people
816 // porting this to targets where code footprint must be minimized.
817 F3 new_state_f3 = gemmlowp::Rescale<3>(new_state);
818 F0 output_activ_int16 = output_gate_output * gemmlowp::tanh(new_state_f3);
819 // Store the new internal state back to memory, as 16-bit integers.
820 // Note: here we store the original value with StateIntegerBits, not
821 // the rescaled 3-integer-bits value fed to tanh.
822 output_state_data_int16[b * output_depth + c] = new_state.raw();
823 // Down-scale the output activations to 8-bit integers, saturating,
824 // and store back to memory.
825 int16 rescaled_output_activ = gemmlowp::RoundingDivideByPOT(output_activ_int16.raw(), 8);
826 int16 clamped_output_activ =
827 std::max<int16>(-128, std::min<int16>(127, rescaled_output_activ));
828 output_activ_data_uint8[b * output_depth + c] = 128 + clamped_output_activ;
829 }
830 }
831}
832
833template <typename Scalar>
834void Split(const SplitParams &params, const RuntimeShape &input_shape, const Scalar *input_data,
835 const RuntimeShape *const *output_shapes, Scalar *const *output_data)
836{
837 ruy::profiler::ScopeLabel label("Split");
838 const int split_dimensions = input_shape.DimensionsCount();
839 int axis = params.axis < 0 ? params.axis + split_dimensions : params.axis;
840 int outputs_count = params.num_split;
841 TFLITE_DCHECK_LT(axis, split_dimensions);
842
843 int64_t split_size = 0;
844 for (int i = 0; i < outputs_count; i++)
845 {
846 TFLITE_DCHECK_EQ(output_shapes[i]->DimensionsCount(), split_dimensions);
847 for (int j = 0; j < split_dimensions; j++)
848 {
849 if (j != axis)
850 {
851 MatchingDim(*output_shapes[i], j, input_shape, j);
852 }
853 }
854 split_size += output_shapes[i]->Dims(axis);
855 }
856 TFLITE_DCHECK_EQ(split_size, input_shape.Dims(axis));
857 int64_t outer_size = 1;
858 for (int i = 0; i < axis; ++i)
859 {
860 outer_size *= input_shape.Dims(i);
861 }
862 // For all output arrays,
863 // FlatSize() = outer_size * Dims(axis) * base_inner_size;
864 int64_t base_inner_size = 1;
865 for (int i = axis + 1; i < split_dimensions; ++i)
866 {
867 base_inner_size *= input_shape.Dims(i);
868 }
869
870 const Scalar *input_ptr = input_data;
871 for (int k = 0; k < outer_size; k++)
872 {
873 for (int i = 0; i < outputs_count; ++i)
874 {
875 const int copy_size = output_shapes[i]->Dims(axis) * base_inner_size;
876 memcpy(output_data[i] + k * copy_size, input_ptr, copy_size * sizeof(Scalar));
877 input_ptr += copy_size;
878 }
879 }
880}
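// Worked example (illustrative): splitting an input of shape [2, 6, 3] along
// axis = 1 into two outputs of shape [2, 3, 3] gives outer_size = 2,
// base_inner_size = 3 and a per-output copy_size of 3 * 3 = 9. For each of
// the two outer slices the loop copies 9 contiguous input values into
// output 0 and the next 9 into output 1, so the input is consumed strictly
// in order.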
881
882inline int NodeOffset(int b, int h, int w, int height, int width)
883{
884 return (b * height + h) * width + w;
885}
886
887inline void LocalResponseNormalization(const tflite::LocalResponseNormalizationParams &op_params,
888 const RuntimeShape &input_shape, const float *input_data,
889 const RuntimeShape &output_shape, float *output_data)
890{
891 const int trailing_dim = input_shape.DimensionsCount() - 1;
892 const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
893 const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
894
895 for (int i = 0; i < outer_size; ++i)
896 {
897 for (int c = 0; c < depth; ++c)
898 {
899 const int begin_input_c = std::max(0, static_cast<int>(c - op_params.range));
900 const int end_input_c = std::min(depth, static_cast<int>(c + op_params.range));
901 float accum = 0.f;
902 for (int input_c = begin_input_c; input_c < end_input_c; ++input_c)
903 {
904 const float input_val = input_data[i * depth + input_c];
905 accum += input_val * input_val;
906 }
907 const float multiplier = std::pow(op_params.bias + op_params.alpha * accum, -op_params.beta);
908 output_data[i * depth + c] = input_data[i * depth + c] * multiplier;
909 }
910 }
911}
912
913inline void Dequantize(const RuntimeShape &input_shape, const Eigen::half *input_data,
914 const RuntimeShape &output_shape, float *output_data)
915{
916 const int flat_size = MatchingFlatSize(input_shape, output_shape);
917 for (int i = 0; i < flat_size; i++)
918 {
919 output_data[i] = static_cast<float>(input_data[i]);
920 }
921}
922
923inline void FakeQuant(const tflite::FakeQuantParams &op_params, const RuntimeShape &input_shape,
924 const float *input_data, const RuntimeShape &output_shape, float *output_data)
925{
926 ruy::profiler::ScopeLabel label("FakeQuant");
927 float rmin = op_params.minmax.min;
928 float rmax = op_params.minmax.max;
929 int num_bits = op_params.num_bits;
930 // 0 should always be a representable value. Let's assume that the initial
931 // min,max range contains 0.
932 TFLITE_DCHECK_LE(rmin, 0.0f);
933 TFLITE_DCHECK_GE(rmax, 0.0f);
934 TFLITE_DCHECK_LT(rmin, rmax);
935
936 // Code matches tensorflow's FakeQuantWithMinMaxArgsFunctor.
937 int quant_min = 0;
938 int quant_max = (1 << num_bits) - 1;
939 float nudged_min, nudged_max, nudged_scale;
940 NudgeQuantizationRange(rmin, rmax, quant_min, quant_max, &nudged_min, &nudged_max, &nudged_scale);
941 const int flat_size = MatchingFlatSize(input_shape, output_shape);
942 FakeQuantizeArray(nudged_scale, nudged_min, nudged_max, input_data, output_data, flat_size);
943}
944
945// Common subroutine for both `GatherNd` and `GatherNdString`.
946struct GatherNdHelperResult
947{
948 int n_slices;
949 int slice_size;
950 int indices_nd;
951 std::vector<int> dims_to_count;
952};
953
954// Returns common values being used on both `GatherNd` and `GatherNdString`.
955inline GatherNdHelperResult GatherNdHelper(const RuntimeShape &params_shape,
956 const RuntimeShape &indices_shape)
957{
958 GatherNdHelperResult ret;
959 ret.n_slices = 1;
960 ret.slice_size = 1;
961 const int indices_dims = indices_shape.DimensionsCount();
962 ret.indices_nd = indices_shape.Dims(indices_dims - 1);
963 const int params_dims = params_shape.DimensionsCount();
964 for (int i = 0; i < indices_dims - 1; ++i)
965 {
966 ret.n_slices *= indices_shape.Dims(i);
967 }
968 for (int i = ret.indices_nd; i < params_dims; ++i)
969 {
970 ret.slice_size *= params_shape.Dims(i);
971 }
972
973 int remain_flat_size = params_shape.FlatSize();
974 ret.dims_to_count = std::vector<int>(ret.indices_nd, 0);
975 for (int i = 0; i < ret.indices_nd; ++i)
976 {
977 ret.dims_to_count[i] = remain_flat_size / params_shape.Dims(i);
978 remain_flat_size = ret.dims_to_count[i];
979 }
980
981 return ret;
982}
983
984template <typename ParamsT, typename IndicesT = int32>
985inline void GatherNd(const RuntimeShape &params_shape, const ParamsT *params_data,
986 const RuntimeShape &indices_shape, const IndicesT *indices_data,
987 const RuntimeShape &output_shape, ParamsT *output_data)
988{
989 ruy::profiler::ScopeLabel label("GatherNd");
990
991 const GatherNdHelperResult res = GatherNdHelper(params_shape, indices_shape);
992 for (int i = 0; i < res.n_slices; ++i)
993 {
994 int from_pos = 0;
995 for (int j = 0; j < res.indices_nd; ++j)
996 {
997 from_pos += indices_data[i * res.indices_nd + j] * res.dims_to_count[j];
998 }
999 std::memcpy(output_data + i * res.slice_size, params_data + from_pos,
1000 sizeof(ParamsT) * res.slice_size);
1001 }
1002}
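// Worked example (illustrative): with params_shape = [3, 4, 5] and
// indices_shape = [2, 2], GatherNdHelper yields indices_nd = 2, n_slices = 2,
// slice_size = 5 and dims_to_count = {20, 5}. An index row (1, 2) therefore
// reads from from_pos = 1 * 20 + 2 * 5 = 30, and the memcpy copies the
// 5-element innermost slice params_data[30..34] into the output.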
1003
1004#ifndef TF_LITE_STATIC_MEMORY
1005template <typename IndicesT = int32>
1006inline void GatherNdString(const RuntimeShape &params_shape, const TfLiteTensor *params_data,
1007 const RuntimeShape &indices_shape, const IndicesT *indices_data,
1008 const RuntimeShape &output_shape, TfLiteTensor *output_data)
1009{
1010 ruy::profiler::ScopeLabel label("GatherNdString");
1011
1012 const GatherNdHelperResult res = GatherNdHelper(params_shape, indices_shape);
1013 DynamicBuffer buffer;
1014 for (int i = 0; i < res.n_slices; ++i)
1015 {
1016 int from_pos = 0;
1017 for (int j = 0; j < res.indices_nd; ++j)
1018 {
1019 from_pos += indices_data[i * res.indices_nd + j] * res.dims_to_count[j];
1020 }
1021 for (int j = 0; j < res.slice_size; ++j)
1022 {
1023 buffer.AddString(GetString(params_data, from_pos + j));
1024 }
1025 }
1026 buffer.WriteToTensor(output_data, /*new_shape=*/nullptr);
1027}
1028#endif
1029
1030template <typename IndicesT, typename UpdatesT>
1031inline void ScatterNd(const RuntimeShape &indices_shape, const IndicesT *indices_data,
1032 const RuntimeShape &updates_shape, const UpdatesT *updates_data,
1033 const RuntimeShape &output_shape, UpdatesT *output_data)
1034{
1035 ruy::profiler::ScopeLabel label("ScatterNd");
1036
1037 int n_slices = 1;
1038 int slice_size = 1;
1039 const int outer_dims = indices_shape.DimensionsCount() - 1;
1040 const int indices_nd = indices_shape.Dims(outer_dims);
1041 const int updates_dims = updates_shape.DimensionsCount();
1042 for (int i = 0; i < outer_dims; ++i)
1043 {
1044 n_slices *= indices_shape.Dims(i);
1045 }
1046 for (int i = outer_dims; i < updates_dims; ++i)
1047 {
1048 slice_size *= updates_shape.Dims(i);
1049 }
1050
1051 int output_flat_size = output_shape.FlatSize();
1052 int remain_flat_size = output_flat_size;
1053 std::vector<int> dims_to_count(indices_nd, 0);
1054 for (int i = 0; i < indices_nd; ++i)
1055 {
1056 dims_to_count[i] = remain_flat_size / output_shape.Dims(i);
1057 remain_flat_size = dims_to_count[i];
1058 }
1059
1060 memset(output_data, 0, sizeof(UpdatesT) * output_flat_size);
1061 for (int i = 0; i < n_slices; ++i)
1062 {
1063 int to_pos = 0;
1064 for (int j = 0; j < indices_nd; ++j)
1065 {
1066 IndicesT idx = indices_data[i * indices_nd + j];
1067 TFLITE_DCHECK(0 <= idx && idx < output_shape.Dims(j));
1068 to_pos += idx * dims_to_count[j];
1069 }
1070 for (int j = 0; j < slice_size; j++)
1071 {
1072 output_data[to_pos + j] += updates_data[i * slice_size + j];
1073 }
1074 }
1075}
1076
1077template <typename T>
1078inline void Slice(const tflite::SliceParams &op_params, const RuntimeShape &input_shape,
1079 const RuntimeShape &output_shape, SequentialTensorWriter<T> *writer)
1080{
1081 const RuntimeShape ext_shape = RuntimeShape::ExtendedShape(5, input_shape);
1082 TFLITE_DCHECK_LE(op_params.begin_count, 5);
1083 TFLITE_DCHECK_LE(op_params.size_count, 5);
1084 const int begin_count = op_params.begin_count;
1085 const int size_count = op_params.size_count;
1086 // We front-pad the begin and size vectors.
1087 std::array<int, 5> start;
1088 std::array<int, 5> stop;
1089 for (int i = 0; i < 5; ++i)
1090 {
1091 int padded_i = 5 - i;
1092 start[i] = begin_count < padded_i ? 0 : op_params.begin[begin_count - padded_i];
1093 stop[i] = (size_count < padded_i || op_params.size[size_count - padded_i] == -1)
1094 ? ext_shape.Dims(i)
1095 : start[i] + op_params.size[size_count - padded_i];
1096 }
1097
1098 for (int i0 = start[0]; i0 < stop[0]; ++i0)
1099 {
1100 for (int i1 = start[1]; i1 < stop[1]; ++i1)
1101 {
1102 for (int i2 = start[2]; i2 < stop[2]; ++i2)
1103 {
1104 for (int i3 = start[3]; i3 < stop[3]; ++i3)
1105 {
1106 for (int i4 = start[4]; i4 < stop[4]; ++i4)
1107 {
1108 writer->Write(Offset(ext_shape, i0, i1, i2, i3, i4));
1109 }
1110 }
1111 }
1112 }
1113 }
1114}
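// Worked example of the front-padding (illustrative): for a 2-D input of
// shape [4, 6] with begin = {1, 2} and size = {2, -1}, the shape is extended
// to [1, 1, 1, 4, 6], start becomes {0, 0, 0, 1, 2} and stop becomes
// {1, 1, 1, 3, 6}. The padded leading dimensions are traversed in full, the
// -1 size expands to the end of its dimension, and the writer visits the
// 2 x 4 selected elements in row-major order.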
1115
1116template <typename T>
1117inline void Slice(const tflite::SliceParams &op_params, const RuntimeShape &input_shape,
1118 const T *input_data, const RuntimeShape &output_shape, T *output_data)
1119{
1120 SequentialTensorWriter<T> writer(input_data, output_data);
1121 return Slice(op_params, input_shape, output_shape, &writer);
1122}
1123
1124template <typename T>
1125inline void Slice(const tflite::SliceParams &op_params, const RuntimeShape &input_shape,
1126 const TfLiteTensor *input, const RuntimeShape &output_shape, TfLiteTensor *output)
1127{
1128 SequentialTensorWriter<T> writer(input, output);
1129 return Slice(op_params, input_shape, output_shape, &writer);
1130}
1131
1132template <typename T>
1133void Minimum(const RuntimeShape &input1_shape, const T *input1_data, const T *input2_data,
1134 const RuntimeShape &output_shape, T *output_data)
1135{
1136 const int flat_size = MatchingFlatSize(input1_shape, output_shape);
1137
1138 auto min_value = input2_data[0];
1139 for (int i = 0; i < flat_size; i++)
1140 {
1141 output_data[i] = input1_data[i] > min_value ? min_value : input1_data[i];
1142 }
1143}
1144
1145// Convenience version that allows, for example, generated-code calls to be
1146// the same as other binary ops.
1147template <typename T>
1148inline void Minimum(const RuntimeShape &input1_shape, const T *input1_data, const RuntimeShape &,
1149 const T *input2_data, const RuntimeShape &output_shape, T *output_data)
1150{
1151 // Drop shape of second input: not needed.
1152 Minimum(input1_shape, input1_data, input2_data, output_shape, output_data);
1153}
1154
1155template <typename T>
1156void Maximum(const RuntimeShape &input1_shape, const T *input1_data, const T *input2_data,
1157 const RuntimeShape &output_shape, T *output_data)
1158{
1159 const int flat_size = MatchingFlatSize(input1_shape, output_shape);
1160
1161 auto max_value = input2_data[0];
1162 for (int i = 0; i < flat_size; i++)
1163 {
1164 output_data[i] = input1_data[i] < max_value ? max_value : input1_data[i];
1165 }
1166}
1167
1168// Convenience version that allows, for example, generated-code calls to be
1169// the same as other binary ops.
1170template <typename T>
1171inline void Maximum(const RuntimeShape &input1_shape, const T *input1_data, const RuntimeShape &,
1172 const T *input2_data, const RuntimeShape &output_shape, T *output_data)
1173{
1174 // Drop shape of second input: not needed.
1175 Maximum(input1_shape, input1_data, input2_data, output_shape, output_data);
1176}
1177
1178template <typename T1, typename T2, typename T3>
1179void ArgMax(const RuntimeShape &input1_shape, const T1 *input1_data, const T3 *input2_data,
1180 const RuntimeShape &output_shape, T2 *output_data)
1181{
1182 ArgMinMax(input1_shape, input1_data, input2_data, output_shape, output_data, std::greater<T1>());
1183}
1184
1185// Convenience version that allows, for example, generated-code calls to be
1186// the same as other binary ops.
1187template <typename T1, typename T2, typename T3>
1188inline void ArgMax(const RuntimeShape &input1_shape, const T1 *input1_data,
1189 const RuntimeShape &input2_shape, const T3 *input2_data,
1190 const RuntimeShape &output_shape, T2 *output_data)
1191{
1192 // Drop shape of second input: not needed.
1193 ArgMax(input1_shape, input1_data, input2_data, output_shape, output_data);
1194}
1195
1196template <typename D, typename T>
1197void Select(const RuntimeShape &input_condition_shape, const D *input_condition_data,
1198 const RuntimeShape &input_x_shape, const T *input_x_data,
1199 const RuntimeShape &input_y_shape, const T *input_y_data,
1200 const RuntimeShape &output_shape, T *output_data)
1201{
1202 int64_t flatsize;
1203 // Allow select operator executions on mixed scalar tensors and one element
1204 // tensors.
1205 if (input_condition_shape.FlatSize() == 1 && input_x_shape.FlatSize() == 1 &&
1206 input_y_shape.FlatSize() == 1 && output_shape.FlatSize() == 1)
1207 {
1208 flatsize = 1;
1209 }
1210 else
1211 {
1212 flatsize = MatchingFlatSize(input_condition_shape, input_x_shape, input_y_shape, output_shape);
1213 }
1214 for (int64_t i = 0; i < flatsize; ++i)
1215 {
1216 output_data[i] = input_condition_data[i] ? input_x_data[i] : input_y_data[i];
1217 }
1218}
1219
1220template <typename D, typename T>
1221void RankOneSelect(const RuntimeShape &input_condition_shape, const D *input_condition_data,
1222 const RuntimeShape &input_x_shape, const T *input_x_data,
1223 const RuntimeShape &input_y_shape, const T *input_y_data,
1224 const RuntimeShape &output_shape, T *output_data)
1225{
1226 const int64_t outer_size = input_condition_shape.FlatSize();
1227 int64_t inner_size;
1228 if (input_condition_shape.DimensionsCount() == 0)
1229 {
1230 inner_size = MatchingFlatSize(input_x_shape, input_y_shape, output_shape);
1231 }
1232 else
1233 {
1234 TFLITE_DCHECK_EQ(MatchingDim(input_x_shape, 0, input_y_shape, 0, output_shape, 0), outer_size);
1235 inner_size = MatchingFlatSizeSkipDim(input_x_shape, 0, input_y_shape, output_shape);
1236 }
1237
1238 int64_t offset = 0;
1239 for (int64_t i = 0; i < outer_size; i++)
1240 {
1241 const T *input_data = input_condition_data[i] ? input_x_data : input_y_data;
1242 memcpy(output_data + offset, input_data + offset, inner_size * sizeof(T));
1243 offset += inner_size;
1244 }
1245}
1246
1247template <typename D, typename T>
1248void BroadcastSelect4DSlow(const RuntimeShape &input_condition_shape, const D *input_condition_data,
1249 const RuntimeShape &input_x_shape, const T *input_x_data,
1250 const RuntimeShape &input_y_shape, const T *input_y_data,
1251 const RuntimeShape &output_shape, T *output_data)
1252{
1253 TFLITE_DCHECK_LE(input_condition_shape.DimensionsCount(), 4);
1254 TFLITE_DCHECK_LE(input_x_shape.DimensionsCount(), 4);
1255 TFLITE_DCHECK_LE(input_y_shape.DimensionsCount(), 4);
1256 TFLITE_DCHECK_LE(output_shape.DimensionsCount(), 4);
1257
1258 const RuntimeShape extended_output_shape = RuntimeShape::ExtendedShape(4, output_shape);
1259
1260 NdArrayDesc<4> desc_condition;
1261 NdArrayDesc<4> desc_x;
1262 NdArrayDesc<4> desc_y;
1263 NdArrayDescsForElementwiseBroadcast(input_condition_shape, input_x_shape, input_y_shape,
1264 &desc_condition, &desc_x, &desc_y);
1265
1266 // In Tensorflow, the dimensions are canonically named (batch_number, row,
1267 // col, channel), with extents (batches, height, width, depth), with the
1268 // trailing dimension changing most rapidly (channels has the smallest
1269 // stride, typically 1 element).
1270 //
1271 // In generated C code, we store arrays with the dimensions reversed. The
1272 // first dimension has smallest stride.
1273 //
1274 // We name our variables by their Tensorflow convention, but generate C code
1275 // nesting loops such that the innermost loop has the smallest stride for
1276 // the best cache behavior.
1277 for (int b = 0; b < extended_output_shape.Dims(0); ++b)
1278 {
1279 for (int y = 0; y < extended_output_shape.Dims(1); ++y)
1280 {
1281 for (int x = 0; x < extended_output_shape.Dims(2); ++x)
1282 {
1283 for (int c = 0; c < extended_output_shape.Dims(3); ++c)
1284 {
1285 const int condition_index = SubscriptToIndex(desc_condition, b, y, x, c);
1286 const int x_index = SubscriptToIndex(desc_x, b, y, x, c);
1287 const int y_index = SubscriptToIndex(desc_y, b, y, x, c);
1288 output_data[Offset(extended_output_shape, b, y, x, c)] =
1289 input_condition_data[condition_index] ? input_x_data[x_index] : input_y_data[y_index];
1290 }
1291 }
1292 }
1293 }
1294}
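// Usage sketch (illustrative; shapes chosen to exercise broadcasting): condition,
// x, and y may each have rank <= 4 and are broadcast against the output shape.
// For example, a per-channel condition of shape {1, 1, 1, 2} applied to
// {1, 1, 2, 2} inputs picks from x or y channel by channel:
//
//   const bool cond[2] = {true, false};
//   const float x[4] = {1, 2, 3, 4};
//   const float y[4] = {5, 6, 7, 8};
//   float out[4];
//   BroadcastSelect4DSlow(RuntimeShape({1, 1, 1, 2}), cond,
//                         RuntimeShape({1, 1, 2, 2}), x,
//                         RuntimeShape({1, 1, 2, 2}), y,
//                         RuntimeShape({1, 1, 2, 2}), out);
//   // out == {1, 6, 3, 8}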
1295
1296template <typename D, typename T>
1297void SelectTrueCoords(const RuntimeShape &input_condition_shape, const D *input_condition_data,
1298 T *output_data)
1299{
1300 const size_t size = input_condition_shape.FlatSize();
1301 if (size == 0)
1302 {
1303 // The condition tensor is empty, so there is nothing to output.
1304 return;
1305 }
1306 const size_t cond_rank = input_condition_shape.DimensionsCount();
1307
1308 std::vector<int> dims_to_count(cond_rank, 0);
1309 int cur_flat_size = size;
1310 for (int i = 0; i < cond_rank; ++i)
1311 {
1312 dims_to_count[i] = cur_flat_size / input_condition_shape.Dims(i);
1313 cur_flat_size = dims_to_count[i];
1314 }
1315
1316 int output_index = 0;
1317 for (int i = 0; i < size; ++i)
1318 {
1319 if (input_condition_data[i])
1320 {
1321 // Insert the coordinate of the current item (row major) into output.
1322 int flat_index = i;
1323 for (int j = 0; j < cond_rank; ++j)
1324 {
1325 int coord_j = flat_index / dims_to_count[j];
1326 output_data[output_index * cond_rank + j] = coord_j;
1327 flat_index %= dims_to_count[j];
1328 }
1329 output_index++;
1330 }
1331 }
1332}
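// Usage sketch (illustrative): SelectTrueCoords emits the row-major coordinates of
// every true element, one cond_rank-sized tuple per hit, so the caller must size
// output_data for (number of true elements) * cond_rank entries.
//
//   const bool cond[4] = {false, true, false, true}; // shape {2, 2}
//   int64_t coords[4];                               // 2 hits * rank 2
//   SelectTrueCoords(RuntimeShape({2, 2}), cond, coords);
//   // coords == {0, 1, 1, 1}, i.e. coordinates (0,1) and (1,1)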
1333
1334// For ease of implementation, the indices are always a vector of size-4 vectors.
1335template <typename T, typename TI>
1336inline void SparseToDense(const std::vector<std::vector<TI>> &indices, const T *values,
1337 T default_value, bool value_is_scalar,
1338 const RuntimeShape &unextended_output_shape, T *output_data)
1339{
1340 TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
1341 const RuntimeShape output_shape = RuntimeShape::ExtendedShape(4, unextended_output_shape);
1342 const int value_count = indices.size();
1343
1344 // First, fill output_data with the default value.
1345 const int num_elements = output_shape.FlatSize();
1346 for (int i = 0; i < num_elements; ++i)
1347 {
1348 output_data[i] = default_value;
1349 }
1350
1351 // Handle the scalar-value case separately to avoid re-checking the boolean
1352 // flag on every loop iteration.
1353 if (value_is_scalar)
1354 {
1355 for (int i = 0; i < value_count; ++i)
1356 {
1357 const std::vector<TI> &index = indices[i];
1358 TFLITE_DCHECK_EQ(index.size(), 4);
1359 const T value = *values; // just use the first value.
1360 output_data[Offset(output_shape, index[0], index[1], index[2], index[3])] = value;
1361 }
1362 return;
1363 }
1364
1365 // Go through the values and indices to fill the sparse values.
1366 for (int i = 0; i < value_count; ++i)
1367 {
1368 const std::vector<TI> &index = indices[i];
1369 TFLITE_DCHECK_EQ(index.size(), 4);
1370 const T value = values[i];
1371 output_data[Offset(output_shape, index[0], index[1], index[2], index[3])] = value;
1372 }
1373}
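// Usage sketch (illustrative): each index is a 4-D coordinate into the extended
// output shape; positions not listed keep default_value. With
// value_is_scalar == false, values[i] lands at indices[i]:
//
//   const std::vector<std::vector<int>> indices = {{0, 0, 0, 1}, {0, 0, 1, 0}};
//   const float values[2] = {5.f, 7.f};
//   float dense[4];
//   SparseToDense(indices, values, /*default_value=*/0.f,
//                 /*value_is_scalar=*/false, RuntimeShape({2, 2}), dense);
//   // dense == {0, 5, 7, 0}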
1374
1375template <typename T>
1376inline void Pow(const RuntimeShape &input1_shape, const T *input1_data,
1377 const RuntimeShape &input2_shape, const T *input2_data,
1378 const RuntimeShape &output_shape, T *output_data)
1379{
1380 const int flat_size = MatchingFlatSize(input1_shape, input2_shape, output_shape);
1381 for (int i = 0; i < flat_size; ++i)
1382 {
1383 output_data[i] = std::pow(input1_data[i], input2_data[i]);
1384 }
1385}
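// Usage sketch (illustrative): element-wise power over tensors of matching flat
// size; e.g. with 1-D inputs {2, 3} and exponents {3, 2} the result is {8, 9}.
// Broadcasting is handled by BroadcastPow4DSlow below.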
1386
1387template <typename T>
1388inline void BroadcastPow4DSlow(const RuntimeShape &unextended_input1_shape, const T *input1_data,
1389 const RuntimeShape &unextended_input2_shape, const T *input2_data,
1390 const RuntimeShape &unextended_output_shape, T *output_data)
1391{
1392 TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
1393 TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4);
1394 TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
1395 const RuntimeShape output_shape = RuntimeShape::ExtendedShape(4, unextended_output_shape);
1396
1397 NdArrayDesc<4> desc1;
1398 NdArrayDesc<4> desc2;
1399 NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, unextended_input2_shape, &desc1,
1400 &desc2);
1401
1402 for (int b = 0; b < output_shape.Dims(0); ++b)
1403 {
1404 for (int y = 0; y < output_shape.Dims(1); ++y)
1405 {
1406 for (int x = 0; x < output_shape.Dims(2); ++x)
1407 {
1408 for (int c = 0; c < output_shape.Dims(3); ++c)
1409 {
1410 auto out_idx = Offset(output_shape, b, y, x, c);
1411 auto in1_idx = SubscriptToIndex(desc1, b, y, x, c);
1412 auto in2_idx = SubscriptToIndex(desc2, b, y, x, c);
1413 auto in1_val = input1_data[in1_idx];
1414 auto in2_val = input2_data[in2_idx];
1415 output_data[out_idx] = std::pow(in1_val, in2_val);
1416 }
1417 }
1418 }
1419 }
1420}
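// Usage sketch (illustrative): like Pow, but the two inputs are broadcast to a
// common 4-D output shape, e.g. a scalar exponent applied to every element:
//
//   const float base[4] = {1, 2, 3, 4};   // shape {1, 1, 2, 2}
//   const float exponent[1] = {2.f};      // shape {1, 1, 1, 1}
//   float out[4];
//   BroadcastPow4DSlow(RuntimeShape({1, 1, 2, 2}), base,
//                      RuntimeShape({1, 1, 1, 1}), exponent,
//                      RuntimeShape({1, 1, 2, 2}), out);
//   // out == {1, 4, 9, 16}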
1421
1422template <typename Scalar>
1423void Reverse(int axis, const RuntimeShape &input_shape, const Scalar *input_data,
1424 const RuntimeShape &output_shape, Scalar *output_data)
1425{
1426 ruy::profiler::ScopeLabel label("Reverse");
1427
1428 int outer_size = 1;
1429 for (int i = 0; i < axis; ++i)
1430 {
1431 outer_size *= input_shape.Dims(i);
1432 }
1433
1434 int copy_size = 1;
1435 for (int i = axis + 1; i < input_shape.DimensionsCount(); ++i)
1436 {
1437 copy_size *= input_shape.Dims(i);
1438 }
1439
1440 const int dims_at_axis = input_shape.Dims(axis);
1441 for (int i = 0; i < outer_size; ++i)
1442 {
1443 for (int j = 0; j < dims_at_axis; ++j)
1444 {
1445 const int start_pos = (i * dims_at_axis + j) * copy_size;
1446 Scalar *output_ptr = output_data + start_pos;
1447 int loc = (i * dims_at_axis + dims_at_axis - j - 1) * copy_size;
1448 memcpy(output_ptr, input_data + loc, copy_size * sizeof(Scalar));
1449 }
1450 }
1451}
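// Usage sketch (illustrative): Reverse flips the input along a single axis by
// copying copy_size-element blocks in reverse order, e.g. axis 1 of a {2, 3}
// tensor reverses each row:
//
//   const int input[6] = {1, 2, 3, 4, 5, 6};
//   int output[6];
//   Reverse(/*axis=*/1, RuntimeShape({2, 3}), input, RuntimeShape({2, 3}), output);
//   // output == {3, 2, 1, 6, 5, 4}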
1452
1453template <typename Scalar, typename TS>
1454void ReverseSequence(const TS *seq_lengths, const int seq_dim, const int batch_dim,
1455 const RuntimeShape &input_shape, const Scalar *input_data,
1456 const RuntimeShape &output_shape, Scalar *output_data)
1457{
1458 ruy::profiler::ScopeLabel label("ReverseSequence");
1459
1460 int outer_size = 1;
1461 int outer_dim = std::min(batch_dim, seq_dim);
1462 int medium_dim = std::max(batch_dim, seq_dim);
1463 for (int i = 0; i < outer_dim; ++i)
1464 {
1465 outer_size *= input_shape.Dims(i);
1466 }
1467
1468 int medium_size = 1;
1469 for (int i = outer_dim + 1; i < medium_dim; ++i)
1470 {
1471 medium_size *= input_shape.Dims(i);
1472 }
1473
1474 int copy_size = 1;
1475 for (int i = medium_dim + 1; i < input_shape.DimensionsCount(); ++i)
1476 {
1477 copy_size *= input_shape.Dims(i);
1478 }
1479
1480 const int dims_at_outer_dim = input_shape.Dims(outer_dim);
1481 const int dims_at_medium_dim = input_shape.Dims(medium_dim);
1482
1483 Scalar *output_ptr;
1484 if (batch_dim > seq_dim)
1485 {
1486 for (int i = 0; i < outer_size; ++i)
1487 {
1488 for (int j = 0; j < dims_at_outer_dim; ++j)
1489 {
1490 const int in_pos_base = (i * dims_at_outer_dim + j) * medium_size;
1491 for (int p = 0; p < medium_size; ++p)
1492 {
1493 for (int q = 0; q < dims_at_medium_dim; ++q)
1494 {
1495 const int in_pos = ((in_pos_base + p) * dims_at_medium_dim + q) * copy_size;
1496 const Scalar *in_ptr = input_data + in_pos;
1497 int sl = seq_lengths[q] - 1;
1498 if (j > sl)
1499 {
1500 output_ptr = output_data + in_pos;
1501 }
1502 else
1503 {
1504 const int out_pos_base = (i * dims_at_outer_dim + sl - j) * medium_size;
1505 const int out_pos = ((out_pos_base + p) * dims_at_medium_dim + q) * copy_size;
1506 output_ptr = output_data + out_pos;
1507 }
1508 memcpy(output_ptr, in_ptr, copy_size * sizeof(Scalar));
1509 }
1510 }
1511 }
1512 }
1513 }
1514 else if (batch_dim < seq_dim)
1515 {
1516 for (int i = 0; i < outer_size; ++i)
1517 {
1518 for (int j = 0; j < dims_at_outer_dim; ++j)
1519 {
1520 const int in_pos_base = (i * dims_at_outer_dim + j) * medium_size;
1521 int sl = seq_lengths[j] - 1;
1522 const int out_pos_base = (i * dims_at_outer_dim + j) * medium_size;
1523 for (int p = 0; p < medium_size; ++p)
1524 {
1525 for (int q = 0; q < dims_at_medium_dim; ++q)
1526 {
1527 const int in_pos = ((in_pos_base + p) * dims_at_medium_dim + q) * copy_size;
1528 const Scalar *in_ptr = input_data + in_pos;
1529 if (q > sl)
1530 {
1531 output_ptr = output_data + in_pos;
1532 }
1533 else
1534 {
1535 const int out_pos = ((out_pos_base + p) * dims_at_medium_dim + sl - q) * copy_size;
1536 output_ptr = output_data + out_pos;
1537 }
1538 memcpy(output_ptr, in_ptr, copy_size * sizeof(Scalar));
1539 }
1540 }
1541 }
1542 }
1543 }
1544}
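// Usage sketch (illustrative): ReverseSequence reverses only the first
// seq_lengths[b] elements along seq_dim for each index b along batch_dim and
// copies the remaining tail through unchanged. For a {2, 3} input with
// batch_dim 0 and seq_dim 1:
//
//   const int32_t seq_lengths[2] = {2, 3};
//   const int input[6] = {1, 2, 3, 4, 5, 6};
//   int output[6];
//   ReverseSequence(seq_lengths, /*seq_dim=*/1, /*batch_dim=*/0,
//                   RuntimeShape({2, 3}), input, RuntimeShape({2, 3}), output);
//   // output == {2, 1, 3, 6, 5, 4}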
1545
1546template <typename T>
1547inline void SegmentSum(const RuntimeShape &input_shape, const T *input_data,
1548 const RuntimeShape &segment_ids_shape, const int32_t *segment_ids_data,
1549 const RuntimeShape &output_shape, T *output_data)
1550{
1551 const int segment_flat_size = MatchingFlatSizeSkipDim(input_shape, 0, output_shape);
1552
1553 memset(output_data, 0, sizeof(T) * output_shape.FlatSize());
1554
1555 for (int i = 0; i < input_shape.Dims(0); i++)
1556 {
1557 int output_index = segment_ids_data[i];
1558 for (int j = 0; j < segment_flat_size; ++j)
1559 {
1560 output_data[output_index * segment_flat_size + j] += input_data[i * segment_flat_size + j];
1561 }
1562 }
1563}
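// Usage sketch (illustrative): segment_ids maps each outer row of the input to an
// output row, and rows sharing an id are accumulated. Ids must be non-negative
// and smaller than the output's leading dimension.
//
//   const float input[6] = {1, 2, 3, 4, 5, 6};   // shape {3, 2}
//   const int32_t segment_ids[3] = {0, 0, 1};
//   float output[4];                             // shape {2, 2}
//   SegmentSum(RuntimeShape({3, 2}), input, RuntimeShape({3}), segment_ids,
//              RuntimeShape({2, 2}), output);
//   // output == {4, 6, 5, 6}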
1564
1565} // namespace reference_ops
1566} // namespace tflite
1567
1568#endif // LUCI_INTERPRETER_PAL_REFERENCE_OPS_H