ONE - On-device Neural Engine
luci_interpreter_pal Namespace Reference

Namespaces

namespace  lstm
 
namespace  lstm_internal
 

Data Structures

struct  AddFn
 
struct  ArithmeticParams
 
struct  ComparisonParams
 
struct  ConcatenationParams
 
struct  ConvParams
 
struct  DivFn
 
struct  FloorDivFn
 
struct  FloorModFn
 
struct  FullyConnectedParams
 
struct  MaximumFn
 
struct  MeanParams
 
struct  MinimumFn
 
struct  MulFn
 
struct  NdArrayDesc
 
struct  PaddingValues
 
struct  PadParams
 
struct  PoolParams
 
struct  PreluParams
 
struct  QuantizationParams
 
struct  ResizeNearestNeighborParams
 
struct  SoftmaxParams
 
struct  StridedSliceParams
 
struct  SubFn
 
struct  TransposeParams
 

Enumerations

enum class  PaddingType : uint8_t { None , Same , Valid }
 
enum class  BroadcastableOpCategory : uint8_t {
  kNone , kNonBroadcast , kFirstInputBroadcastsFast , kSecondInputBroadcastsFast ,
  kGenericBroadcast , kScalarFirstBroadcast , kScalarSecondBroadcast
}
 
enum class  FusedActivationFunctionType : uint8_t { kNone , kRelu6 , kRelu1 , kRelu }
 

Functions

template<>
void AveragePool< int8_t > (const tflite::PoolParams &params, const tflite::RuntimeShape &input_shape, const int8_t *input_data, const tflite::RuntimeShape &output_shape, int8_t *output_data, const tflite::RuntimeShape &scratchpad_shape, int8_t *scratchpad_data)
 
template<>
void DepthwiseConvPerChannel< int8_t > (const tflite::DepthwiseParams &params, const int32_t *output_multiplier, const int32_t *output_shift, const tflite::RuntimeShape &input_shape, const int8_t *input_data, const tflite::RuntimeShape &filter_shape, const int8_t *filter_data, const tflite::RuntimeShape &bias_shape, const int32_t *bias_data, const tflite::RuntimeShape &output_shape, int8_t *output_data, const tflite::RuntimeShape &scratchpad_shape, int8_t *scratchpad_data)
 
template<>
void FullyConnected< int8_t > (const tflite::FullyConnectedParams &params, const tflite::RuntimeShape &input_shape, const int8_t *input_data, const tflite::RuntimeShape &filter_shape, const int8_t *filter_data, const tflite::RuntimeShape &bias_shape, const int32_t *bias_data, const tflite::RuntimeShape &output_shape, int8_t *output_data)
 
template<>
void Softmax< int8_t > (const tflite::SoftmaxParams &params, const tflite::RuntimeShape &input_shape, const int8_t *input_data, const tflite::RuntimeShape &output_shape, int8_t *output_data)
 
template<>
void AveragePool< int8_t > (const tflite::PoolParams &params, const tflite::RuntimeShape &input_shape, const int8_t *input_data, const tflite::RuntimeShape &output_shape, int8_t *output_data, const tflite::RuntimeShape &scratchpad_shape, int8_t *scratchpad_data)
 
void BatchMatMul (const tflite::RuntimeShape &lhs_shape, const float *lhs_data, const tflite::RuntimeShape &rhs_shape, const float *rhs_data, const tflite::RuntimeShape &output_shape, float *output_data)
 
template<>
void DepthwiseConvPerChannel< int8_t > (const tflite::DepthwiseParams &params, const int32_t *output_multiplier, const int32_t *output_shift, const tflite::RuntimeShape &input_shape, const int8_t *input_data, const tflite::RuntimeShape &filter_shape, const int8_t *filter_data, const tflite::RuntimeShape &bias_shape, const int32_t *bias_data, const tflite::RuntimeShape &output_shape, int8_t *output_data, const tflite::RuntimeShape &scratchpad_shape, int8_t *scratchpad_data)
 
template<>
void FullyConnected< int8_t > (const tflite::FullyConnectedParams &params, const tflite::RuntimeShape &input_shape, const int8_t *input_data, const tflite::RuntimeShape &filter_shape, const int8_t *filter_data, const tflite::RuntimeShape &bias_shape, const int32_t *bias_data, const tflite::RuntimeShape &output_shape, int8_t *output_data)
 
void Logistic (const int flat_size, const float *input_data, float *output_data)
 
void calculateGRU (const float *input_data, const float *weight_input_data, const float *weight_hidden_data, const float *bias_input_data, const float *bias_hidden_data, float *output_data, const tflite::RuntimeShape &input_shape, const tflite::RuntimeShape &output_shape, const tflite::RuntimeShape &weight_input_shape, const tflite::RuntimeShape &weight_hidden_shape, float *output_input_data, float *output_hidden_data, const tflite::RuntimeShape &output_shape_fc)
 
void GRU (const float *input_data, const float *weight_input_data, const float *weight_hidden_data, const float *bias_input_data, const float *bias_hidden_data, const float *hidden_state_data, float *output_data, float *output_input_data, float *output_hidden_data, const tflite::RuntimeShape &input_shape, const tflite::RuntimeShape &output_shape, const tflite::RuntimeShape &weight_input_shape, const tflite::RuntimeShape &weight_hidden_shape)
 
template<>
void Mul (tflite::ArithmeticParams &params, const tflite::RuntimeShape &input1_shape, const int64_t *input1_data, const tflite::RuntimeShape &input2_shape, const int64_t *input2_data, const tflite::RuntimeShape &output_shape, int64_t *output_data)
 
template<>
void AveragePool< int8_t > (const tflite::PoolParams &params, const tflite::RuntimeShape &input_shape, const int8_t *input_data, const tflite::RuntimeShape &output_shape, int8_t *output_data, const tflite::RuntimeShape &scratchpad_shape, int8_t *scratchpad_data)
 
template<>
void DepthwiseConvPerChannel< int8_t > (const tflite::DepthwiseParams &params, const int32_t *output_multiplier, const int32_t *output_shift, const tflite::RuntimeShape &input_shape, const int8_t *input_data, const tflite::RuntimeShape &filter_shape, const int8_t *filter_data, const tflite::RuntimeShape &bias_shape, const int32_t *bias_data, const tflite::RuntimeShape &output_shape, int8_t *output_data, const tflite::RuntimeShape &scratchpad_shape, int8_t *scratchpad_data)
 
template<>
void FullyConnected< int8_t > (const tflite::FullyConnectedParams &params, const tflite::RuntimeShape &input_shape, const int8_t *input_data, const tflite::RuntimeShape &filter_shape, const int8_t *filter_data, const tflite::RuntimeShape &bias_shape, const int32_t *bias_data, const tflite::RuntimeShape &output_shape, int8_t *output_data)
 
template<>
void Add< int8_t > (const ArithmeticParams &params, const int flat_size, const int8_t *input1_data, const int8_t *input2_data, int8_t *output_data)
 
template<>
void Add< int16_t > (const ArithmeticParams &params, const int flat_size, const int16_t *input1_data, const int16_t *input2_data, int16_t *output_data)
 
void AveragePool (const PoolParams &params, const luci_interpreter::RuntimeShape &input_shape, const uint8_t *input_data, const luci_interpreter::RuntimeShape &output_shape, uint8_t *output_data, luci_interpreter::DataType data_type)
 
template<>
void FullyConnected< int8_t > (const luci_interpreter_pal::FullyConnectedParams &params, const int32_t *, const int8_t *input_data, const int32_t *filter_shape, const int8_t *filter_data, const int32_t *bias_data, const int32_t *output_shape, int8_t *output_data, uint32_t output_dims_count, uint32_t weights_dims_count)
 
template<>
void FullyConnected (const luci_interpreter_pal::FullyConnectedParams &params, const int32_t *, const int16_t *input_data, const int32_t *filter_shape, const int8_t *filter_data, const int64_t *bias_data, const int32_t *output_shape, int16_t *output_data, uint32_t output_dims_count, uint32_t weights_dims_count)
 
void MaxPool (const PoolParams &params, const luci_interpreter::RuntimeShape &input_shape, const uint8_t *input_data, const luci_interpreter::RuntimeShape &output_shape, uint8_t *output_data, luci_interpreter::DataType data_type)
 
template<>
void Mul< int8_t > (const ArithmeticParams &params, const int flat_size, const int8_t *input1_data, const int8_t *input2_data, int8_t *output_data)
 
template<>
void Mul< int16_t > (const ArithmeticParams &params, const int flat_size, const int16_t *input1_data, const int16_t *input2_data, int16_t *output_data)
 
void Softmax (const SoftmaxParams &params, const int8_t *input_data, int8_t *output_data)
 
void Softmax (const SoftmaxParams &params, const int8_t *input_data, int16_t *output_data)
 
void Softmax (const SoftmaxParams &params, const int16_t *input_data, int16_t *output_data)
 
void eval_integer_8x8_16_lstm (const luci_interpreter::Tensor *input, const luci_interpreter::Tensor *input_to_input_weights, const luci_interpreter::Tensor *input_to_forget_weights, const luci_interpreter::Tensor *input_to_cell_weights, const luci_interpreter::Tensor *input_to_output_weights, const luci_interpreter::Tensor *recurrent_to_input_weights, const luci_interpreter::Tensor *recurrent_to_forget_weights, const luci_interpreter::Tensor *recurrent_to_cell_weights, const luci_interpreter::Tensor *recurrent_to_output_weights, const luci_interpreter::Tensor *cell_to_input_weights, const luci_interpreter::Tensor *cell_to_forget_weights, const luci_interpreter::Tensor *cell_to_output_weights, const luci_interpreter::Tensor *input_layer_norm_coefficients, const luci_interpreter::Tensor *forget_layer_norm_coefficients, const luci_interpreter::Tensor *cell_layer_norm_coefficients, const luci_interpreter::Tensor *output_layer_norm_coefficients, const luci_interpreter::Tensor *input_gate_bias, const luci_interpreter::Tensor *forget_gate_bias, const luci_interpreter::Tensor *cell_gate_bias, const luci_interpreter::Tensor *output_gate_bias, const luci_interpreter::Tensor *projection_weights, const luci_interpreter::Tensor *projection_bias, const luci_interpreter::UnidirectionalSequenceLSTMParams &params, bool forward_sequence, bool time_major, const luci_interpreter::IntegerLSTMParams &integer_lstm_param, int32_t output_state_zp, luci_interpreter::Tensor *output_state, luci_interpreter::Tensor *cell_state, luci_interpreter::Tensor *output, int16_t *scratch0, int16_t *scratch1, int16_t *scratch2, int16_t *scratch3, int8_t *scratch4, int32_t *scratch5)
 
template<typename T >
void BroadcastTISO4DSlow (const luci_interpreter::RuntimeShape &input1_shape, const T *input1_data, const luci_interpreter::RuntimeShape &input2_shape, const T *input2_data, const luci_interpreter::RuntimeShape &output_shape, T *output_data, std::function< const T &(const T &, const T &)> func)
 
void Abs (const int flat_size, const float *input_data, float *output_data)
 
template<typename T >
void Add (const ArithmeticParams &params, const int flat_size, const T *input1_data, const T *input2_data, T *output_data)
 
template<typename T >
void BroadcastAdd4DSlow (const ArithmeticParams &params, const luci_interpreter::RuntimeShape &input1_shape, const T *input1_data, const luci_interpreter::RuntimeShape &input2_shape, const T *input2_data, const luci_interpreter::RuntimeShape &output_shape, T *output_data)
 
template<typename T >
void AddN (const size_t flat_size, const size_t num_inputs, const T *const *input_data, T *output_data)
 
template<typename T1 , typename T2 , typename T3 , typename Cmp >
void ArgMinMax (const luci_interpreter::RuntimeShape &input1_shape, const T1 *input1_data, const T3 *input2_data, const luci_interpreter::RuntimeShape &output_shape, T2 *output_data, const Cmp &cmp)
 
template<typename T , typename Fn >
void ArithmeticOp (const ArithmeticParams &params, const int flat_size, const T *input1_data, const T *input2_data, T *output_data)
 
template<typename T , typename Fn >
void ArithmeticOpScalar (const ArithmeticParams &params, const int flat_size, const T *input_data, const T scalar_value, T *output_data)
 
template<typename T , typename Fn >
void BroadcastArithmeticOp4DSlow (const ArithmeticParams &params, const luci_interpreter::RuntimeShape &input1_shape, const T *input1_data, const luci_interpreter::RuntimeShape &input2_shape, const T *input2_data, const luci_interpreter::RuntimeShape &output_shape, T *output_data)
 
void AveragePool (const PoolParams &params, const luci_interpreter::RuntimeShape &input_shape, const float *input_data, const luci_interpreter::RuntimeShape &output_shape, float *output_data)
 
template<typename T >
void BatchToSpaceND (const luci_interpreter::RuntimeShape &unextended_input1_shape, const T *input1_data, const luci_interpreter::RuntimeShape &unextended_input2_shape, const int32_t *block_shape_data, const luci_interpreter::RuntimeShape &unextended_input3_shape, const int32_t *crops_data, const luci_interpreter::RuntimeShape &unextended_output_shape, T *output_data)
 
template<typename T , typename Fn >
void BinaryOp (const int flat_size, const T *input1_data, const T *input2_data, T *output_data)
 
template<typename T , typename Fn >
void BroadcastBinaryOp4DSlow (const luci_interpreter::RuntimeShape &input1_shape, const float *input1_data, const luci_interpreter::RuntimeShape &input2_shape, const float *input2_data, const luci_interpreter::RuntimeShape &output_shape, float *output_data)
 
template<int N>
void BroadcastImpl (const NdArrayDesc< N > &input_desc, const uint8_t *input_data, const NdArrayDesc< N > &output_desc, uint8_t *output_data, int indexes[N], int dim, const int last_broadcasting_dim, const uint32_t type_size)
 
template<int N>
void BroadcastTo (const luci_interpreter::RuntimeShape &unextended_input_shape, const uint8_t *input_data, const luci_interpreter::RuntimeShape &unextended_output_shape, uint8_t *output_data, luci_interpreter::DataType data_type)
 
void Ceil (const int32_t flat_size, const float *input_data, float *output_data)
 
template<typename T >
bool LessFn (T lhs, T rhs)
 
template<typename T >
bool LessEqualFn (T lhs, T rhs)
 
template<typename T >
bool EqualFn (T lhs, T rhs)
 
template<typename T >
bool GreaterFn (T lhs, T rhs)
 
template<typename T >
bool GreaterEqualFn (T lhs, T rhs)
 
template<typename T >
bool NotEqualFn (T lhs, T rhs)
 
template<typename T >
void ComparisonNoScaling (const int64_t flat_size, const T *input1_data, const T *input2_data, bool *output_data, bool F(T, T))
 
template<typename T >
void BroadcastComparison4DSlowWithScaling (const ComparisonParams &op_params, const luci_interpreter::RuntimeShape &unextended_input1_shape, const T *input1_data, const luci_interpreter::RuntimeShape &unextended_input2_shape, const T *input2_data, const luci_interpreter::RuntimeShape &unextended_output_shape, bool *output_data, bool F(T, T))
 
template<typename T >
void ComparisonWithScaling (const ComparisonParams &op_params, const int64_t flat_size, const T *input1_data, const T *input2_data, bool *output_data, bool F(T, T))
 
template<typename T >
void BroadcastComparison4DSlowNoScaling (const ComparisonParams &op_params, const luci_interpreter::RuntimeShape &unextended_input1_shape, const T *input1_data, const luci_interpreter::RuntimeShape &unextended_input2_shape, const T *input2_data, const luci_interpreter::RuntimeShape &unextended_output_shape, bool *output_data, bool F(T, T))
 
template<typename Scalar >
void Concatenation (const ConcatenationParams &params, const luci_interpreter::RuntimeShape *const *input_shapes, const Scalar *const *input_data, const luci_interpreter::RuntimeShape &output_shape, Scalar *output_data)
 
void Cos (const int flat_size, const float *input_data, float *output_data)
 
template<typename T >
void DepthToSpace (const int32_t block_size, const luci_interpreter::RuntimeShape &unextended_input_shape, const T *input_data, const luci_interpreter::RuntimeShape &unextended_output_shape, T *output_data)
 
template<typename InputT , typename OutputT >
void Dequantize (const QuantizationParams &op_params, const int flat_size, const InputT *input_data, OutputT *output_data)
 
template<typename T >
void Div (const ArithmeticParams &params, const int flat_size, const T *input1_data, const T *input2_data, T *output_data)
 
template<typename T >
void DivScalar (const ArithmeticParams &params, const int flat_size, const T *input_data, const T scalar_value, T *output_data)
 
template<typename T >
void BroadcastDiv4DSlow (const ArithmeticParams &params, const luci_interpreter::RuntimeShape &input1_shape, const T *input1_data, const luci_interpreter::RuntimeShape &input2_shape, const T *input2_data, const luci_interpreter::RuntimeShape &output_shape, T *output_data)
 
void Elu (const int flat_size, const float *input_data, float *output_data)
 
void Exp (const int flat_size, const float *input_data, float *output_data)
 
void Floor (const luci_interpreter::RuntimeShape &input_shape, const float *input_data, const luci_interpreter::RuntimeShape &output_shape, float *output_data)
 
void FloorDiv (const int flat_size, const float *input1_data, const float *input2_data, float *output_data)
 
void BroadcastFloorDiv4DSlow (const luci_interpreter::RuntimeShape &input1_shape, const float *input1_data, const luci_interpreter::RuntimeShape &input2_shape, const float *input2_data, const luci_interpreter::RuntimeShape &output_shape, float *output_data)
 
void FloorMod (const int flat_size, const float *input1_data, const float *input2_data, float *output_data)
 
void BroadcastFloorMod4DSlow (const luci_interpreter::RuntimeShape &input1_shape, const float *input1_data, const luci_interpreter::RuntimeShape &input2_shape, const float *input2_data, const luci_interpreter::RuntimeShape &output_shape, float *output_data)
 
template<typename InputType , typename WeightType , typename OutputType , typename BiasType >
void FullyConnected (const FullyConnectedParams &params, const int32_t *input_shape, const InputType *input_data, const int32_t *filter_shape, const WeightType *filter_data, const BiasType *bias_data, const int32_t *output_shape, OutputType *output_data, uint32_t output_dims_count, uint32_t weights_dims_count)
 
template<typename WeightType >
void FullyConnected (const FullyConnectedParams &params, const int32_t *input_shape, const float *input_data, const int32_t *filter_shape, const WeightType *filter_data, const float *bias_data, const int32_t *output_shape, float *output_data, uint32_t output_dims_count, uint32_t weights_dims_count)
 
template<typename ParamsT , typename IndicesT >
void GatherND (luci_interpreter::RuntimeShape params_shape, const ParamsT *param_data, luci_interpreter::RuntimeShape indices_shape, const IndicesT *index_data, ParamsT *output_data)
 
void L2Normalization (const luci_interpreter::RuntimeShape &input_shape, const float *input_data, const luci_interpreter::RuntimeShape &output_shape, float *output_data, float epsilon=1e-6)
 
void L2Pool (const PoolParams &params, const luci_interpreter::RuntimeShape &input_shape, const float *input_data, const luci_interpreter::RuntimeShape &output_shape, float *output_data)
 
void Log (const int flat_size, const float *input_data, float *output_data)
 
void LogicalCommon (const int flat_size, const bool *input1_data, const bool *input2_data, bool *output_data, bool(*f)(bool, bool))
 
void LogicalNot (const int flat_size, const bool *input_data, bool *output_data)
 
void Logistic (const int flat_size, const int8_t *input_data, float input_scale, int input_zero_point, int8_t *output_data, float output_scale, int output_zero_point)
 
void Logistic (int32_t input_multiplier, int32_t input_left_shift, int32_t input_size, const int16_t *ptr_input_data, int16_t *ptr_output_data)
 
void LogSoftmax (const luci_interpreter::RuntimeShape &input_shape, const float *input_data, const luci_interpreter::RuntimeShape &output_shape, float *output_data)
 
void Maximum (const int flat_size, const float *input1_data, const float *input2_data, float *output_data)
 
void BroadcastMaximum4DSlow (const luci_interpreter::RuntimeShape &input1_shape, const float *input1_data, const luci_interpreter::RuntimeShape &input2_shape, const float *input2_data, const luci_interpreter::RuntimeShape &output_shape, float *output_data)
 
void MaxPool (const PoolParams &params, const luci_interpreter::RuntimeShape &input_shape, const float *input_data, const luci_interpreter::RuntimeShape &output_shape, float *output_data)
 
template<typename T , typename U >
bool Mean (const T *input_data, const int *input_dims, const int input_num_dims, T *output_data, const int *output_dims, const int output_num_dims, const int *axis, const int num_axis_dimensions, bool, int *temp_index, int *resolved_axis, U *temp_sum)
 
void Mean (const MeanParams &op_params, const luci_interpreter::RuntimeShape &unextended_input_shape, const float *input_data, const luci_interpreter::RuntimeShape &unextended_output_shape, float *output_data)
 
void Minimum (const int flat_size, const float *input1_data, const float *input2_data, float *output_data)
 
template<typename T >
void BroadcastMinimum4DSlow (const luci_interpreter::RuntimeShape &input1_shape, const T *input1_data, const luci_interpreter::RuntimeShape &input2_shape, const T *input2_data, const luci_interpreter::RuntimeShape &output_shape, T *output_data)
 
template<typename T >
void MirrorPad (const luci_interpreter::DataType padding_matrix_type, const uint8_t *padding_matrix_data, const int32_t *input_dims, int *output_dims_num_elements, int *input_dims_num_elements, const T *input_data, T *output_data, const int offset, const int num_dims, const int output_size)
 
template<typename T >
void Mul (const ArithmeticParams &params, const int flat_size, const T *input1_data, const T *input2_data, T *output_data)
 
template<typename T >
void MulScalar (const ArithmeticParams &params, const int flat_size, const T *input_data, const T scalar_value, T *output_data)
 
template<typename T >
void BroadcastMul4DSlow (const ArithmeticParams &params, const luci_interpreter::RuntimeShape &input1_shape, const T *input1_data, const luci_interpreter::RuntimeShape &input2_shape, const T *input2_data, const luci_interpreter::RuntimeShape &output_shape, T *output_data)
 
template<typename T >
void Negate (const luci_interpreter::RuntimeShape &input_shape, const T *input_data, const luci_interpreter::RuntimeShape &output_shape, T *output_data)
 
constexpr int PadKernelMaxDimensionCount ()
 
void Pad (const PadParams &op_params, const luci_interpreter::RuntimeShape &input_shape, const float *input_data, const float *pad_value_ptr, const luci_interpreter::RuntimeShape &output_shape, float *output_data)
 
void BroadcastPrelu4DSlowFloat (const luci_interpreter::RuntimeShape &unextended_input1_shape, const float *input1_data, const luci_interpreter::RuntimeShape &unextended_input2_shape, const float *input2_data, const luci_interpreter::RuntimeShape &unextended_output_shape, float *output_data)
 
template<typename InputT , typename OutputT >
void Quantize (const QuantizationParams &op_params, const int flat_size, const InputT *input_data, OutputT *output_data)
 
template<typename T >
void ReduceGeneric (const T *input_data, const int *input_dims, const int input_num_dims, T *output_data, const int *axis, const int64_t num_axis_dimensions, T init_value, const int output_flat_size, T reducer(const T, const T))
 
void ReLUCommon (const int flat_size, const float *input_data, float *output_data, const float alpha, const bool is_relu_6)
 
int Offset (const luci_interpreter::RuntimeShape &shape, int i0, int i1, int i2, int i3)
 
void ComputeInterpolationValues (const float value, const float scale, const bool half_pixel_centers, int32_t input_size, float *scaled_value, int32_t *lower_bound, int32_t *upper_bound)
 
int32_t getNearestNeighbor (const int input_value, const int32_t input_size, const int32_t output_size, const bool align_corners, const bool half_pixel_centers)
 
template<typename T >
void ResizeNearestNeighbor (const ResizeNearestNeighborParams &op_params, const luci_interpreter::RuntimeShape &unextended_input_shape, const T *input_data, const luci_interpreter::RuntimeShape &output_size_shape, const int32_t *output_size_data, const luci_interpreter::RuntimeShape &unextended_output_shape, T *output_data)
 
float RoundToNearest (float value)
 
void Round (const int32_t flat_size, const float *input_data, float *output_data)
 
void Rsqrt (const int flat_size, const float *input_data, float *output_data)
 
template<typename D , typename T >
void Select (const luci_interpreter::RuntimeShape &input_condition_shape, const D *input_condition_data, const luci_interpreter::RuntimeShape &input_x_shape, const T *input_x_data, const luci_interpreter::RuntimeShape &input_y_shape, const T *input_y_data, const luci_interpreter::RuntimeShape &output_shape, T *output_data)
 
void Sin (const int flat_size, const float *input_data, float *output_data)
 
void Softmax (const SoftmaxParams &params, const float *input_data, float *output_data)
 
template<typename T >
void SpaceToBatchND (const int32_t pad_value, const luci_interpreter::RuntimeShape &unextended_input1_shape, const T *input1_data, const luci_interpreter::RuntimeShape &unextended_input2_shape, const int32_t *block_shape_data, const luci_interpreter::RuntimeShape &unextended_input3_shape, const int32_t *paddings_data, const luci_interpreter::RuntimeShape &unextended_output_shape, T *output_data)
 
template<typename T >
void SpaceToDepth (const int32_t block_size, const luci_interpreter::RuntimeShape &unextended_input_shape, const T *input_data, const luci_interpreter::RuntimeShape &unextended_output_shape, T *output_data)
 
void Sqrt (const int flat_size, const float *input_data, float *output_data)
 
void Square (const int flat_size, const float *input_data, float *output_data)
 
void SquaredDifference (const int flat_size, const float *input_data_1, const float *input_data_2, float *output_data)
 
template<typename T >
void StridedSlice (StridedSliceParams &op_params, const luci_interpreter::RuntimeShape &unextended_input_shape, const T *input_data, T *output_data)
 
template<typename T >
void BroadcastSub4DSlow (const ArithmeticParams &params, const luci_interpreter::RuntimeShape &input1_shape, const T *input1_data, const luci_interpreter::RuntimeShape &input2_shape, const T *input2_data, const luci_interpreter::RuntimeShape &output_shape, T *output_data)
 
void SVDF (const float *input_data, const float *weights_feature_data, const float *weights_time_data, const float *bias_data, float *state_data, float *scratch_data, float *output_data, const int rank, const int input_size, const int batch_size, const int num_filters, const int num_units, const int memory_size, const circle::ActivationFunctionType activation)
 
void Tanh (const int flat_size, const float *input_data, float *output_data)
 
void Tanh (int32_t input_multiplier, int32_t input_left_shift, const int flat_size, const int16_t *ptr_input_data, int16_t *ptr_output_data)
 
template<typename T , int N>
void TransposeImpl (const TransposeParams &params, const luci_interpreter::RuntimeShape &unextended_input_shape, const T *input_data, const luci_interpreter::RuntimeShape &unextended_output_shape, T *output_data)
 
template<typename T , int N = 5>
void Transpose (const TransposeParams &params, const luci_interpreter::RuntimeShape &unextended_input_shape, const T *input_data, const luci_interpreter::RuntimeShape &unextended_output_shape, T *output_data)
 
void TransposeConv (const ConvParams &params, const luci_interpreter::RuntimeShape &input_shape, const float *input_data, const luci_interpreter::RuntimeShape &filter_shape, const float *filter_data, const luci_interpreter::RuntimeShape &bias_shape, const float *bias_data, const luci_interpreter::RuntimeShape &output_shape, float *output_data)
 
template<typename ActivationType , typename WeightType , typename CellType , typename BiasType >
void evalLSTM (luci_interpreter::lstm::LSTMStruct *lstm_struct, luci_interpreter::lstm::LSTMParameters *lstm_params, luci_interpreter::lstm::CellStateInfo *cell_state_info, ActivationType *output_state_data, CellType *cell_state_data, CellType *scratch0, CellType *scratch1, CellType *scratch2, CellType *scratch3, luci_interpreter::BaseRuntimeGraph *runtime_graph)
 
std::int32_t saturatingRoundingDoublingHighMul (std::int32_t a, std::int32_t b)
 
int32_t roundingDivideByPOT (int32_t x, int32_t exponent)
 
int32_t multiplyByQuantizedMultiplier (int32_t x, int32_t quantized_multiplier, int shift)
 
int32_t multiplyByQuantizedMultiplierSmallerThanOneExp (int32_t x, int32_t quantized_multiplier, int left_shift)
 
template<typename P >
void getActivationParams (const P &params, int32_t *min, int32_t *max)
 
template<typename P >
void getActivationParams (const P &params, float *min, float *max)
 
template<typename P >
void getActivationParams (const P &params, int64_t *min, int64_t *max)
 
size_t reducedOutputOffset (const int num_dims, const int *dims, const int *index, const int num_axis, const int *axis)
 
bool nextIndex (const int num_dims, const int *dims, int *current)
 
int MatchingDim (const luci_interpreter::RuntimeShape &shape1, int index1, const luci_interpreter::RuntimeShape &shape2, int index2)
 
int flatSizeSkipDim (const int32_t *dims_data, int skip_dim, int num_dims)
 
int offset (const int32_t *dims_data, int i0, int i1, int i2, int i3)
 
int offset (const int32_t *dims_data, int i0, int i1, int i2, int i3, int i4)
 
template<typename T >
T activationFunctionWithMinMax (T x, T output_activation_min, T output_activation_max)
 
template<int N>
void copyDimsToDesc (const luci_interpreter::RuntimeShape &input_shape, NdArrayDesc< N > *desc_out)
 
template<int N, int DIM, typename Calc >
std::enable_if< DIM==N-1, void >::type NDOpsHelperImpl (const NdArrayDesc< N > &output, const Calc &calc, int indexes[N])
 
template<int N, int DIM, typename Calc >
std::enable_if< DIM!=N-1, void >::type NDOpsHelperImpl (const NdArrayDesc< N > &output, const Calc &calc, int indexes[N])
 
template<int N, typename Calc >
void NDOpsHelper (const NdArrayDesc< N > &output, const Calc &calc)
 
template<int N>
void NdArrayDescsForElementwiseBroadcast (const luci_interpreter::RuntimeShape &input0_shape, const luci_interpreter::RuntimeShape &input1_shape, NdArrayDesc< N > *desc0_out, NdArrayDesc< N > *desc1_out)
 
int subscriptToIndex (const NdArrayDesc< 4 > &desc, int i0, int i1, int i2, int i3)
 
int subscriptToIndex (const NdArrayDesc< 5 > &desc, int indexes[5])
 
bool ProcessBroadcastShapes (const luci_interpreter::RuntimeShape &shape0, const luci_interpreter::RuntimeShape &shape1, luci_interpreter_pal::ArithmeticParams *params)
 
template<>
void Add< int8_t > (const ArithmeticParams &, const int, const int8_t *, const int8_t *, int8_t *)
 
template<>
void Add< int16_t > (const ArithmeticParams &, const int, const int16_t *, const int16_t *, int16_t *)
 
template<>
void FullyConnected (const luci_interpreter_pal::FullyConnectedParams &params, const int32_t *input_shape, const int8_t *input_data, const int32_t *filter_shape, const int8_t *filter_data, const int32_t *bias_data, const int32_t *output_shape, int8_t *output_data, uint32_t, uint32_t)
 
template<>
void FullyConnected (const luci_interpreter_pal::FullyConnectedParams &, const int32_t *, const int16_t *, const int32_t *, const int8_t *, const int64_t *, const int32_t *, int16_t *, uint32_t, uint32_t)
 
template<>
void Mul< int8_t > (const ArithmeticParams &, const int, const int8_t *, const int8_t *, int8_t *)
 
template<>
void Mul< int16_t > (const ArithmeticParams &, const int, const int16_t *, const int16_t *, int16_t *)
 
template<>
void evalLSTM< int8_t, int8_t, int16_t, int32_t > (luci_interpreter::lstm::LSTMStruct *lstm_struct, luci_interpreter::lstm::LSTMParameters *lstm_params, luci_interpreter::lstm::CellStateInfo *cell_state_info, int8_t *output_state_data, int16_t *cell_state_data, int16_t *scratch0, int16_t *scratch1, int16_t *scratch2, int16_t *scratch3, luci_interpreter::BaseRuntimeGraph *runtime_graph)
 

Variables

constexpr int MAX_INDICES_ND = 5
 

Enumeration Type Documentation

◆ BroadcastableOpCategory

enum class luci_interpreter_pal::BroadcastableOpCategory : uint8_t
strong
Enumerator
kNone 
kNonBroadcast 
kFirstInputBroadcastsFast 
kSecondInputBroadcastsFast 
kGenericBroadcast 
kScalarFirstBroadcast 
kScalarSecondBroadcast 

Definition at line 108 of file Params.h.

109{
110 kNone,
111 kNonBroadcast, // Matching input shapes.
112 kFirstInputBroadcastsFast, // Fivefold nested loops.
113 kSecondInputBroadcastsFast, // Fivefold nested loops.
114 kGenericBroadcast, // Fall-back.
115 kScalarFirstBroadcast, // Scalar
116 kScalarSecondBroadcast, // Scalar
117};

◆ FusedActivationFunctionType

enum class luci_interpreter_pal::FusedActivationFunctionType : uint8_t
strong
Enumerator
kNone 
kRelu6 
kRelu1 
kRelu 

Definition at line 215 of file Params.h.

◆ PaddingType

enum class luci_interpreter_pal::PaddingType : uint8_t
strong
Enumerator
None 
Same 
Valid 

Definition at line 64 of file Params.h.

Function Documentation

◆ Abs()

void luci_interpreter_pal::Abs ( const int  flat_size,
const float *  input_data,
float *  output_data 
)
inline

Definition at line 25 of file PALAbs.h.

26{
27 for (int i = 0; i < flat_size; ++i)
28 {
29 output_data[i] = std::abs(input_data[i]);
30 }
31}

Referenced by luci_interpreter::execute_kernel_CircleAbs().
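Usage sketch (not part of the generated reference): a minimal call of Abs over a flat float buffer. The include path "PALAbs.h" is an assumption; it depends on how the PAL headers are exposed in a particular build.

#include <cassert>
#include "PALAbs.h" // assumed include path; adjust to your build setup

void abs_example()
{
  const float input[4] = {-1.5f, 0.0f, 2.0f, -3.25f};
  float output[4] = {};
  // flat_size is the total number of elements in the buffer
  luci_interpreter_pal::Abs(4, input, output);
  assert(output[0] == 1.5f && output[3] == 3.25f);
}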

◆ activationFunctionWithMinMax()

template<typename T >
T luci_interpreter_pal::activationFunctionWithMinMax ( T  x,
T  output_activation_min,
T  output_activation_max 
)
inline

Definition at line 204 of file PALUtils.h.

205{
206 using std::max;
207 using std::min;
208 return min(max(x, output_activation_min), output_activation_max);
209}

Referenced by L2Pool(), and TransposeConv().
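Usage sketch: clamping a value to a fused-activation range, equivalent to std::min(std::max(x, min), max).

// minimal sketch: clamp a value to a ReLU6-like activation range
float relu6_like(float x)
{
  return luci_interpreter_pal::activationFunctionWithMinMax(x, 0.0f, 6.0f);
}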

◆ Add()

template<typename T >
void luci_interpreter_pal::Add ( const ArithmeticParams &  params,
const int  flat_size,
const T *  input1_data,
const T *  input2_data,
T *  output_data 
)
inline

Definition at line 28 of file PALAddCommon.h.

30{
31 ArithmeticOp<T, AddFn<T>>(params, flat_size, input1_data, input2_data, output_data);
32}
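A hedged usage sketch for the float instantiation. The float_activation_min/float_activation_max member names are an assumption inferred from the float overload of getActivationParams(); check Params.h for the exact fields of ArithmeticParams.

#include <limits>

void add_example(const float *a, const float *b, float *out, int flat_size)
{
  luci_interpreter_pal::ArithmeticParams params{};
  // assumed field names; an effectively unbounded activation range
  params.float_activation_min = std::numeric_limits<float>::lowest();
  params.float_activation_max = std::numeric_limits<float>::max();
  luci_interpreter_pal::Add<float>(params, flat_size, a, b, out);
}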

◆ Add< int16_t >() [1/2]

template<>
void luci_interpreter_pal::Add< int16_t > ( const ArithmeticParams & ,
const int  ,
const int16_t *  ,
const int16_t *  ,
int16_t *   
)
inline

Definition at line 33 of file PALAdd.h.

35{
36 assert(false && "Not IMPL yet");
37}

◆ Add< int16_t >() [2/2]

template<>
void luci_interpreter_pal::Add< int16_t > ( const ArithmeticParams &  params,
const int  flat_size,
const int16_t *  input1_data,
const int16_t *  input2_data,
int16_t *  output_data 
)
inline

Definition at line 39 of file PALAdd.h.

42{
43 auto status = arm_elementwise_add_s16(
44 input1_data, input2_data, params.input1_offset, params.input1_multiplier, params.input1_shift,
45 params.input2_offset, params.input2_multiplier, params.input2_shift, params.left_shift,
46 output_data, params.output_offset, params.output_multiplier, params.output_shift,
47 params.quantized_activation_min, params.quantized_activation_max, flat_size);
48 assert(status == ARM_CMSIS_NN_SUCCESS);
49}

References luci_interpreter_pal::ArithmeticParams::input1_multiplier, luci_interpreter_pal::ArithmeticParams::input1_offset, luci_interpreter_pal::ArithmeticParams::input1_shift, luci_interpreter_pal::ArithmeticParams::input2_multiplier, luci_interpreter_pal::ArithmeticParams::input2_offset, luci_interpreter_pal::ArithmeticParams::input2_shift, luci_interpreter_pal::ArithmeticParams::left_shift, luci_interpreter_pal::ArithmeticParams::output_multiplier, luci_interpreter_pal::ArithmeticParams::output_offset, luci_interpreter_pal::ArithmeticParams::output_shift, luci_interpreter_pal::ArithmeticParams::quantized_activation_max, and luci_interpreter_pal::ArithmeticParams::quantized_activation_min.

◆ Add< int8_t >() [1/2]

template<>
void luci_interpreter_pal::Add< int8_t > ( const ArithmeticParams & ,
const int  ,
const int8_t *  ,
const int8_t *  ,
int8_t *   
)
inline

Definition at line 26 of file PALAdd.h.

28{
29 assert(false && "Not IMPL yet");
30}

◆ Add< int8_t >() [2/2]

◆ AddN()

template<typename T >
void luci_interpreter_pal::AddN ( const size_t  flat_size,
const size_t  num_inputs,
const T *const *  input_data,
T *  output_data 
)
inline

Definition at line 29 of file PALAddN.h.

31{
32 // All inputs and output should have the same shape, this is checked during
33 // Prepare stage.
34 for (size_t i = 0; i < flat_size; ++i)
35 {
36 T x = 0;
37 for (size_t j = 0; j < num_inputs; ++j)
38 {
39 x += input_data[j][i];
40 }
41 output_data[i] = x;
42 }
43}
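Usage sketch: summing two same-shaped float buffers element-wise.

void addn_example()
{
  const float in0[3] = {1.f, 2.f, 3.f};
  const float in1[3] = {10.f, 20.f, 30.f};
  const float *inputs[2] = {in0, in1};
  float out[3] = {};
  // all inputs must share the same flat size (checked during Prepare)
  luci_interpreter_pal::AddN<float>(3, 2, inputs, out); // out == {11, 22, 33}
}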

◆ ArgMinMax()

template<typename T1 , typename T2 , typename T3 , typename Cmp >
void luci_interpreter_pal::ArgMinMax ( const luci_interpreter::RuntimeShape &  input1_shape,
const T1 *  input1_data,
const T3 *  input2_data,
const luci_interpreter::RuntimeShape &  output_shape,
T2 *  output_data,
const Cmp &  cmp 
)

Definition at line 28 of file PALArgMinMax.h.

31{
32 int axis = input2_data[0];
33 if (axis < 0)
34 {
35 axis += input1_shape.dimensionsCount();
36 }
37 const int axis_size = input1_shape.dims(axis);
38
39 int outer_size = 1;
40 for (int i = 0; i < axis; ++i)
41 {
42 outer_size *= input1_shape.dims(i);
43 }
44
45 int inner_size = 1;
46 const int dims_count = input1_shape.dimensionsCount();
47 for (int i = axis + 1; i < dims_count; ++i)
48 {
49 inner_size *= input1_shape.dims(i);
50 }
51 for (int outer = 0; outer < outer_size; ++outer)
52 {
53 for (int inner = 0; inner < inner_size; ++inner)
54 {
55 auto min_max_value = input1_data[outer * axis_size * inner_size + inner];
56 T2 min_max_index = 0;
57 for (int i = 1; i < axis_size; ++i)
58 {
59 const auto &curr_value = input1_data[(outer * axis_size + i) * inner_size + inner];
60 if (cmp(curr_value, min_max_value))
61 {
62 min_max_value = curr_value;
63 min_max_index = static_cast<T2>(i);
64 }
65 }
66 output_data[outer * inner_size + inner] = min_max_index;
67 }
68 }
69}
int32_t dimensionsCount() const
Definition Tensor.h:106
int32_t dims(int i) const
Definition Tensor.h:108

References luci_interpreter::RuntimeShape::dimensionsCount(), and luci_interpreter::RuntimeShape::dims().
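A hedged sketch of an argmax over the last axis of a 2x3 tensor, using a greater-than comparator. The RuntimeShape(rank, dims) constructor is an assumption; see luci_interpreter's Tensor.h for how shapes are actually constructed.

void argmax_example()
{
  const int32_t in_dims[2] = {2, 3};
  const int32_t out_dims[1] = {2};
  luci_interpreter::RuntimeShape input_shape(2, in_dims);   // assumed constructor
  luci_interpreter::RuntimeShape output_shape(1, out_dims); // assumed constructor

  const float input[6] = {0.1f, 0.9f, 0.3f, 0.7f, 0.2f, 0.5f};
  const int32_t axis[1] = {1}; // reduce over the last dimension
  int32_t output[2] = {};

  luci_interpreter_pal::ArgMinMax(input_shape, input, axis, output_shape, output,
                                  [](float a, float b) { return a > b; });
  // output == {1, 0}
}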

◆ ArithmeticOp()

template<typename T , typename Fn >
void luci_interpreter_pal::ArithmeticOp ( const ArithmeticParams &  params,
const int  flat_size,
const T *  input1_data,
const T *  input2_data,
T *  output_data 
)
inline

Definition at line 47 of file PALArithmeticOpCommon.h.

49{
50 T activation_min, activation_max;
51 getActivationParams(params, &activation_min, &activation_max);
52
53 Fn func;
54 for (int i = 0; i < flat_size; ++i)
55 output_data[i] =
56 std::min(std::max(func(input1_data[i], input2_data[i]), activation_min), activation_max);
57}
void getActivationParams(const P &params, int32_t *min, int32_t *max)
Definition PALUtils.h:93

References getActivationParams().
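ArithmeticOp is the common kernel behind Add/Sub/Mul/Div; a sketch of instantiating it with a user-defined functor (the functor type and names below are hypothetical). Fn only needs operator()(T, T); the activation bounds come from getActivationParams(params).

struct SquaredDiffFn
{
  float operator()(float a, float b) const { return (a - b) * (a - b); }
};

void squared_diff_example(const float *a, const float *b, float *out, int n,
                          const luci_interpreter_pal::ArithmeticParams &params)
{
  luci_interpreter_pal::ArithmeticOp<float, SquaredDiffFn>(params, n, a, b, out);
}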

◆ ArithmeticOpScalar()

template<typename T , typename Fn >
void luci_interpreter_pal::ArithmeticOpScalar ( const ArithmeticParams &  params,
const int  flat_size,
const T *  input_data,
const T  scalar_value,
T *  output_data 
)
inline

Definition at line 60 of file PALArithmeticOpCommon.h.

62{
63 T activation_min, activation_max;
64 getActivationParams(params, &activation_min, &activation_max);
65
66 for (int i = 0; i < flat_size; ++i)
67 output_data[i] =
68 std::min(std::max(func(input_data[i], scalar_value), activation_min), activation_max);
69}

References getActivationParams().

◆ AveragePool() [1/2]

void luci_interpreter_pal::AveragePool ( const PoolParams &  params,
const luci_interpreter::RuntimeShape &  input_shape,
const float *  input_data,
const luci_interpreter::RuntimeShape &  output_shape,
float *  output_data 
)
inline

Definition at line 28 of file PALAveragePool2DCommon.h.

31{
32 const int batches = input_shape.dims(0);
33 const int depth = output_shape.dims(3);
34 const int input_height = input_shape.dims(1);
35 const int input_width = input_shape.dims(2);
36 const int output_height = output_shape.dims(1);
37 const int output_width = output_shape.dims(2);
38 const int stride_height = params.stride_height;
39 const int stride_width = params.stride_width;
40 for (int batch = 0; batch < batches; ++batch)
41 {
42 for (int out_y = 0; out_y < output_height; ++out_y)
43 {
44 for (int out_x = 0; out_x < output_width; ++out_x)
45 {
46 for (int channel = 0; channel < depth; ++channel)
47 {
48 const int in_x_origin = (out_x * stride_width) - params.padding_values.width;
49 const int in_y_origin = (out_y * stride_height) - params.padding_values.height;
50 // Compute the boundaries of the filter region clamped so as to
51 // ensure that the filter window fits in the input array.
52 const int filter_x_start = std::max(0, -in_x_origin);
53 const int filter_x_end = std::min(params.filter_width, input_width - in_x_origin);
54 const int filter_y_start = std::max(0, -in_y_origin);
55 const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin);
56
57 float total = 0.f;
58 float filter_count = 0;
59
60 for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y)
61 {
62 for (int filter_x = filter_x_start; filter_x < filter_x_end; ++filter_x)
63 {
64 const int in_x = in_x_origin + filter_x;
65 const int in_y = in_y_origin + filter_y;
66
67 const int input_data_offset =
68 ((batch * input_shape.dims(1) + in_y) * input_shape.dims(2) + in_x) *
69 input_shape.dims(3) +
70 channel;
71
72 total += input_data[input_data_offset];
73 filter_count++;
74 }
75 }
76 const int output_data_offset =
77 ((batch * output_shape.dims(1) + out_y) * output_shape.dims(2) + out_x) *
 78 output_shape.dims(3) +
 79 channel;
80
81 assert(filter_count != 0);
82 const float average = total / filter_count;
83
84 output_data[output_data_offset] =
85 std::min(std::max(average, params.float_activation_min), params.float_activation_max);
86 }
87 }
88 }
89 }
90}
const luci_interpreter::RuntimeShape output_shape

References luci_interpreter::RuntimeShape::dims(), luci_interpreter_pal::PoolParams::filter_height, luci_interpreter_pal::PoolParams::filter_width, luci_interpreter_pal::PoolParams::float_activation_max, luci_interpreter_pal::PoolParams::float_activation_min, luci_interpreter_pal::PaddingValues::height, output_shape, luci_interpreter_pal::PoolParams::padding_values, luci_interpreter_pal::PoolParams::stride_height, luci_interpreter_pal::PoolParams::stride_width, and luci_interpreter_pal::PaddingValues::width.
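A hedged sketch: 2x2 average pooling over a 1x2x2x1 NHWC tensor. The PoolParams/PaddingValues members used below are the ones listed in the References above; the RuntimeShape(rank, dims) constructor is an assumption.

void avgpool_example()
{
  const int32_t in_dims[4] = {1, 2, 2, 1};
  const int32_t out_dims[4] = {1, 1, 1, 1};
  luci_interpreter::RuntimeShape input_shape(4, in_dims);   // assumed constructor
  luci_interpreter::RuntimeShape output_shape(4, out_dims); // assumed constructor

  luci_interpreter_pal::PoolParams params{};
  params.stride_height = 1;
  params.stride_width = 1;
  params.filter_height = 2;
  params.filter_width = 2;
  params.padding_values.height = 0;
  params.padding_values.width = 0;
  params.float_activation_min = -1e30f;
  params.float_activation_max = 1e30f;

  const float input[4] = {1.f, 2.f, 3.f, 4.f};
  float output[1] = {};
  luci_interpreter_pal::AveragePool(params, input_shape, input, output_shape, output);
  // output[0] == 2.5f
}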

◆ AveragePool() [2/2]

void luci_interpreter_pal::AveragePool ( const PoolParams &  params,
const luci_interpreter::RuntimeShape &  input_shape,
const uint8_t *  input_data,
const luci_interpreter::RuntimeShape &  output_shape,
uint8_t *  output_data,
luci_interpreter::DataType  data_type 
)
inline

Definition at line 27 of file PALAveragePool2D.h.

31{
32 cmsis_nn_dims input_dims;
33 cmsis_nn_dims output_dims;
34 cmsis_nn_pool_params pool_params;
35 cmsis_nn_dims filter_dims;
36 cmsis_nn_context ctx;
37
38 const int depth = input_shape.dims(3);
39 const int output_width = output_shape.dims(2);
40
41 input_dims.n = 1;
42 input_dims.h = input_shape.dims(1);
43 input_dims.w = input_shape.dims(2);
44 input_dims.c = depth;
45
46 output_dims.n = 1;
47 output_dims.h = output_shape.dims(1);
48 output_dims.w = output_width;
49 output_dims.c = depth;
50
51 pool_params.stride.h = params.stride_height;
52 pool_params.stride.w = params.stride_width;
53 pool_params.padding.h = params.padding_values.height;
54 pool_params.padding.w = params.padding_values.width;
55 pool_params.activation.min = params.quantized_activation_min;
56 pool_params.activation.max = params.quantized_activation_max;
57
58 filter_dims.n = 1;
59 filter_dims.h = params.filter_height;
60 filter_dims.w = params.filter_width;
61 filter_dims.c = 1;
62
63 const int32_t buffer_size = data_type == luci_interpreter::DataType::S16
64 ? arm_avgpool_s16_get_buffer_size(output_width, depth)
65 : arm_avgpool_s8_get_buffer_size(output_width, depth);
66 int8_t *buffer = nullptr;
67 if (buffer_size > 0)
68 {
69 buffer = new int8_t[buffer_size];
70 }
71
72 ctx.buf = buffer;
73 ctx.size = buffer_size;
74
75 if (data_type == luci_interpreter::DataType::S8)
76 {
77 arm_avgpool_s8(&ctx, &pool_params, &input_dims,
78 luci_interpreter::kernels::getTensorData<int8_t>(input_data), &filter_dims,
79 &output_dims, luci_interpreter::kernels::getTensorData<int8_t>(output_data));
80 }
81 else
82 {
83 arm_avgpool_s16(&ctx, &pool_params, &input_dims,
84 luci_interpreter::kernels::getTensorData<int16_t>(input_data), &filter_dims,
85 &output_dims, luci_interpreter::kernels::getTensorData<int16_t>(output_data));
86 }
87
88 if (buffer_size > 0)
89 delete[] buffer;
90}

References luci_interpreter::RuntimeShape::dims(), luci_interpreter_pal::PoolParams::filter_height, luci_interpreter_pal::PoolParams::filter_width, luci_interpreter_pal::PaddingValues::height, output_shape, luci_interpreter_pal::PoolParams::padding_values, luci_interpreter_pal::PoolParams::quantized_activation_max, luci_interpreter_pal::PoolParams::quantized_activation_min, luci_interpreter_pal::PoolParams::stride_height, luci_interpreter_pal::PoolParams::stride_width, and luci_interpreter_pal::PaddingValues::width.

◆ AveragePool< int8_t >() [1/3]

template<>
void luci_interpreter_pal::AveragePool< int8_t > ( const tflite::PoolParams &  params,
const tflite::RuntimeShape &  input_shape,
const int8_t *  input_data,
const tflite::RuntimeShape &  output_shape,
int8_t *  output_data,
const tflite::RuntimeShape &  scratchpad_shape,
int8_t *  scratchpad_data 
)
inline

Definition at line 47 of file PALAveragePool2d.h.

52{
53 assert(input_shape.DimensionsCount() == 4);
54 assert(output_shape.DimensionsCount() == 4);
55 assert(scratchpad_data != nullptr);
56
57 const int32_t batches = tflite::MatchingDim(input_shape, 0, output_shape, 0);
58 assert(batches == 1);
59
60 const int depth = tflite::MatchingDim(input_shape, 3, output_shape, 3);
61
62 cmsis_nn_dims input_dims;
63 input_dims.n = 1;
64 input_dims.h = input_shape.Dims(1);
65 input_dims.w = input_shape.Dims(2);
66 input_dims.c = depth;
67
68 cmsis_nn_dims output_dims;
69 output_dims.n = 1;
70 output_dims.h = output_shape.Dims(1);
71 output_dims.w = output_shape.Dims(2);
72 output_dims.c = depth;
73
74 cmsis_nn_pool_params pool_params;
75 pool_params.stride.h = params.stride_height;
76 pool_params.stride.w = params.stride_width;
77 pool_params.padding.h = params.padding_values.height;
78 pool_params.padding.w = params.padding_values.width;
79 pool_params.activation.min = params.quantized_activation_min;
80 pool_params.activation.max = params.quantized_activation_max;
81
82 cmsis_nn_dims filter_dims;
83 filter_dims.n = 1;
84 filter_dims.h = params.filter_height;
85 filter_dims.w = params.filter_width;
86 filter_dims.c = 1;
87
88 cmsis_nn_context ctx;
89 ctx.buf = scratchpad_data;
90 ctx.size = scratchpad_shape.Dims(0);
91 auto res = arm_avgpool_s8(&ctx, &pool_params, &input_dims, input_data, &filter_dims, &output_dims,
92 output_data);
93 assert(res == ARM_MATH_SUCCESS);
94}

References output_shape.

◆ AveragePool< int8_t >() [2/3]

template<>
void luci_interpreter_pal::AveragePool< int8_t > ( const tflite::PoolParams &  params,
const tflite::RuntimeShape &  input_shape,
const int8_t *  input_data,
const tflite::RuntimeShape &  output_shape,
int8_t *  output_data,
const tflite::RuntimeShape &  scratchpad_shape,
int8_t *  scratchpad_data 
)
inline

Definition at line 45 of file PALAveragePool2d.h.

50{
51 (void)scratchpad_shape;
52 (void)scratchpad_data;
53
54 tflite::reference_integer_ops::AveragePool(params, input_shape, input_data, output_shape,
55 output_data);
56}

References output_shape.

◆ AveragePool< int8_t >() [3/3]

template<>
void luci_interpreter_pal::AveragePool< int8_t > ( const tflite::PoolParams &  params,
const tflite::RuntimeShape &  input_shape,
const int8_t *  input_data,
const tflite::RuntimeShape &  output_shape,
int8_t *  output_data,
const tflite::RuntimeShape &  scratchpad_shape,
int8_t *  scratchpad_data 
)
inline

Definition at line 45 of file PALAveragePool2d.h.

50{
51 (void)scratchpad_shape;
52 (void)scratchpad_data;
53
54 tflite::reference_integer_ops::AveragePool(params, input_shape, input_data, output_shape,
55 output_data);
56}

References output_shape.

◆ BatchMatMul()

void luci_interpreter_pal::BatchMatMul ( const tflite::RuntimeShape &  lhs_shape,
const float *  lhs_data,
const tflite::RuntimeShape &  rhs_shape,
const float *  rhs_data,
const tflite::RuntimeShape &  output_shape,
float *  output_data 
)
inline

Definition at line 24 of file PALBatchMatMul.h.

27{
28 tflite::reference_ops::BatchMatMul(lhs_shape, lhs_data, rhs_shape, rhs_data, output_shape,
29 output_data);
30}

References output_shape.

Referenced by luci_interpreter::kernels::BatchMatMul::execute().

◆ BatchToSpaceND()

template<typename T >
void luci_interpreter_pal::BatchToSpaceND ( const luci_interpreter::RuntimeShape &  unextended_input1_shape,
const T *  input1_data,
const luci_interpreter::RuntimeShape &  unextended_input2_shape,
const int32_t *  block_shape_data,
const luci_interpreter::RuntimeShape &  unextended_input3_shape,
const int32_t *  crops_data,
const luci_interpreter::RuntimeShape &  unextended_output_shape,
T *  output_data 
)
inline

Definition at line 46 of file PALBatchToSpaceND.h.

51{
52 const luci_interpreter::RuntimeShape input1_shape =
53 extendShapeBatchToSpace(unextended_input1_shape);
 54 const luci_interpreter::RuntimeShape output_shape =
 55 extendShapeBatchToSpace(unextended_output_shape);
56
57 const int output_width = output_shape.dims(2);
58 const int output_height = output_shape.dims(1);
59 const int output_batch_size = output_shape.dims(0);
60
61 const int depth = input1_shape.dims(3);
62 const int input_width = input1_shape.dims(2);
63 const int input_height = input1_shape.dims(1);
64 const int input_batch_size = input1_shape.dims(0);
65
66 const int block_shape_height = block_shape_data[0];
67 const int block_shape_width =
68 unextended_input1_shape.dimensionsCount() == 4 ? block_shape_data[1] : 1;
69 const int crops_top = crops_data[0];
70 const int crops_left = unextended_input1_shape.dimensionsCount() == 4 ? crops_data[2] : 0;
71 for (int in_batch = 0; in_batch < input_batch_size; ++in_batch)
72 {
73 const int out_batch = in_batch % output_batch_size;
74 const int spatial_offset = in_batch / output_batch_size;
75 for (int in_h = 0; in_h < input_height; ++in_h)
76 {
77 const int out_h = in_h * block_shape_height + spatial_offset / block_shape_width - crops_top;
78 if (out_h < 0 || out_h >= output_height)
79 {
80 continue;
81 }
82 for (int in_w = 0; in_w < input_width; ++in_w)
83 {
84 const int out_w =
85 in_w * block_shape_width + spatial_offset % block_shape_width - crops_left;
86
87 if (out_w < 0 || out_w >= output_width)
88 {
89 continue;
90 }
91 T *out = output_data + offset(output_shape.dimsData(), out_batch, out_h, out_w, 0);
92 const T *in = input1_data + offset(input1_shape.dimsData(), in_batch, in_h, in_w, 0);
93 memcpy(out, in, depth * sizeof(T));
94 }
95 }
96 }
97}
int offset(const int32_t *dims_data, int i0, int i1, int i2, int i3)

References luci_interpreter::RuntimeShape::dimensionsCount(), luci_interpreter::RuntimeShape::dims(), luci_interpreter::RuntimeShape::dimsData(), offset(), and output_shape.

◆ BinaryOp()

template<typename T , typename Fn >
void luci_interpreter_pal::BinaryOp ( const int  flat_size,
const T *  input1_data,
const T *  input2_data,
T *  output_data 
)
inline

Definition at line 56 of file PALBinaryOpCommon.h.

58{
59 Fn func;
60 for (int i = 0; i < flat_size; ++i)
61 {
62 output_data[i] = func(input1_data[i], input2_data[i]);
63 }
64}
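Usage sketch: BinaryOp with a minimal functor (element-wise maximum). The functor name is hypothetical; Fn only needs operator()(T, T).

struct MaxOfTwoFn
{
  float operator()(float a, float b) const { return a > b ? a : b; }
};

void elementwise_max_example(const float *a, const float *b, float *out, int n)
{
  luci_interpreter_pal::BinaryOp<float, MaxOfTwoFn>(n, a, b, out);
}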

◆ BroadcastAdd4DSlow()

template<typename T >
void luci_interpreter_pal::BroadcastAdd4DSlow ( const ArithmeticParams &  params,
const luci_interpreter::RuntimeShape &  input1_shape,
const T *  input1_data,
const luci_interpreter::RuntimeShape &  input2_shape,
const T *  input2_data,
const luci_interpreter::RuntimeShape &  output_shape,
T *  output_data 
)
inline

Definition at line 36 of file PALAddCommon.h.

40{
41 BroadcastArithmeticOp4DSlow<T, AddFn<T>>(params, input1_shape, input1_data, input2_shape,
42 input2_data, output_shape, output_data);
43}

References output_shape.
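A hedged sketch: broadcast-adding a 1x1x1x2 "bias" over a 1x1x2x2 tensor. Both the RuntimeShape(rank, dims) constructor and the float_activation_* members are assumptions; see Tensor.h and Params.h for the exact definitions.

#include <limits>

void broadcast_add_example()
{
  const int32_t a_dims[4] = {1, 1, 2, 2};
  const int32_t b_dims[4] = {1, 1, 1, 2};
  luci_interpreter::RuntimeShape a_shape(4, a_dims);   // assumed constructor
  luci_interpreter::RuntimeShape b_shape(4, b_dims);   // assumed constructor
  luci_interpreter::RuntimeShape out_shape(4, a_dims); // assumed constructor

  luci_interpreter_pal::ArithmeticParams params{};
  params.float_activation_min = std::numeric_limits<float>::lowest(); // assumed field
  params.float_activation_max = std::numeric_limits<float>::max();    // assumed field

  const float a[4] = {1.f, 2.f, 3.f, 4.f};
  const float b[2] = {10.f, 20.f};
  float out[4] = {};
  luci_interpreter_pal::BroadcastAdd4DSlow<float>(params, a_shape, a, b_shape, b, out_shape, out);
  // out == {11, 22, 13, 24}
}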

◆ BroadcastArithmeticOp4DSlow()

template<typename T , typename Fn >
void luci_interpreter_pal::BroadcastArithmeticOp4DSlow ( const ArithmeticParams &  params,
const luci_interpreter::RuntimeShape &  input1_shape,
const T *  input1_data,
const luci_interpreter::RuntimeShape &  input2_shape,
const T *  input2_data,
const luci_interpreter::RuntimeShape &  output_shape,
T *  output_data 
)
inline

Definition at line 72 of file PALArithmeticOpCommon.h.

 76{
 77 NdArrayDesc<4> desc1;
 78 NdArrayDesc<4> desc2;
 79 NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2);
 80 const luci_interpreter::RuntimeShape extended_output_shape =
 81 luci_interpreter::RuntimeShape::extendedShape(4, output_shape);
83 T activation_min, activation_max;
84 getActivationParams(params, &activation_min, &activation_max);
85
86 // In Tensorflow, the dimensions are canonically named (batch_number, row,
87 // col, channel), with extents (batches, height, width, depth), with the
88 // trailing dimension changing most rapidly (channels has the smallest stride,
89 // typically 1 element).
90 //
91 // In generated C code, we store arrays with the dimensions reversed. The
92 // first dimension has smallest stride.
93 //
94 // We name our variables by their Tensorflow convention, but generate C code
95 // nesting loops such that the innermost loop has the smallest stride for the
96 // best cache behavior.
97 Fn func;
98 for (int b = 0; b < extended_output_shape.dims(0); ++b)
99 {
100 for (int y = 0; y < extended_output_shape.dims(1); ++y)
101 {
102 for (int x = 0; x < extended_output_shape.dims(2); ++x)
103 {
104 for (int c = 0; c < extended_output_shape.dims(3); ++c)
105 {
106 const int output_data_offset =
107 ((b * extended_output_shape.dims(1) + y) * extended_output_shape.dims(2) + x) *
108 extended_output_shape.dims(3) +
109 c;
110
111 output_data[output_data_offset] =
112 std::min(std::max(func(input1_data[subscriptToIndex(desc1, b, y, x, c)],
113 input2_data[subscriptToIndex(desc2, b, y, x, c)]),
114 activation_min),
115 activation_max);
116 }
117 }
118 }
119 }
120}
void NdArrayDescsForElementwiseBroadcast(const Dims< N > &input0_dims, const Dims< N > &input1_dims, NdArrayDesc< N > *desc0_out, NdArrayDesc< N > *desc1_out)
Definition NDArray.h:89
static RuntimeShape extendedShape(int new_shape_size, const RuntimeShape &shape)
Definition Tensor.h:95
NdArrayDesc< 4 > desc1
NdArrayDesc< 4 > desc2
int subscriptToIndex(const NdArrayDesc< 4 > &desc, int i0, int i1, int i2, int i3)

References desc1, desc2, luci_interpreter::RuntimeShape::dims(), luci_interpreter::RuntimeShape::extendedShape(), getActivationParams(), NdArrayDescsForElementwiseBroadcast(), output_shape, and subscriptToIndex().

◆ BroadcastBinaryOp4DSlow()

template<typename T , typename Fn >
void luci_interpreter_pal::BroadcastBinaryOp4DSlow ( const luci_interpreter::RuntimeShape &  input1_shape,
const float *  input1_data,
const luci_interpreter::RuntimeShape &  input2_shape,
const float *  input2_data,
const luci_interpreter::RuntimeShape &  output_shape,
float *  output_data 
)
inline

Definition at line 67 of file PALBinaryOpCommon.h.

 73{
 74 NdArrayDesc<4> desc1;
 75 NdArrayDesc<4> desc2;
 76 NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2);
 77
 78 const luci_interpreter::RuntimeShape extended_output_shape =
 79 luci_interpreter::RuntimeShape::extendedShape(4, output_shape);
81 // In Tensorflow, the dimensions are canonically named (batch_number, row,
82 // col, channel), with extents (batches, height, width, depth), with the
83 // trailing dimension changing most rapidly (channels has the smallest stride,
84 // typically 1 element).
85 //
86 // In generated C code, we store arrays with the dimensions reversed. The
87 // first dimension has smallest stride.
88 //
89 // We name our variables by their Tensorflow convention, but generate C code
90 // nesting loops such that the innermost loop has the smallest stride for the
91 // best cache behavior.
92
93 Fn func;
94 for (int b = 0; b < extended_output_shape.dims(0); ++b)
95 {
96 for (int y = 0; y < extended_output_shape.dims(1); ++y)
97 {
98 for (int x = 0; x < extended_output_shape.dims(2); ++x)
99 {
100 for (int c = 0; c < extended_output_shape.dims(3); ++c)
101 {
102 const int output_data_offset =
103 ((b * extended_output_shape.dims(1) + y) * extended_output_shape.dims(2) + x) *
104 extended_output_shape.dims(3) +
105 c;
106
107 output_data[output_data_offset] = func(input1_data[subscriptToIndex(desc1, b, y, x, c)],
108 input2_data[subscriptToIndex(desc2, b, y, x, c)]);
109 }
110 }
111 }
112 }
113}

References desc1, desc2, luci_interpreter::RuntimeShape::dims(), luci_interpreter::RuntimeShape::extendedShape(), NdArrayDescsForElementwiseBroadcast(), output_shape, and subscriptToIndex().

◆ BroadcastComparison4DSlowNoScaling()

template<typename T >
void luci_interpreter_pal::BroadcastComparison4DSlowNoScaling ( const ComparisonParams &  op_params,
const luci_interpreter::RuntimeShape &  unextended_input1_shape,
const T *  input1_data,
const luci_interpreter::RuntimeShape &  unextended_input2_shape,
const T *  input2_data,
const luci_interpreter::RuntimeShape &  unextended_output_shape,
bool *  output_data,
bool   F(T, T) 
)
inline

Definition at line 144 of file PALComparisons.h.

149{
150 const BroadcastComparison4DSlowCommon dims = BroadcastComparison4DSlowPreprocess(
151 unextended_input1_shape, unextended_input2_shape, unextended_output_shape);
152
153 for (int b = 0; b < dims.output_shape.dims(0); ++b)
154 {
155 for (int y = 0; y < dims.output_shape.dims(1); ++y)
156 {
157 for (int x = 0; x < dims.output_shape.dims(2); ++x)
158 {
159 for (int c = 0; c < dims.output_shape.dims(3); ++c)
160 {
161 const int output_data_offset =
162 ((b * dims.output_shape.dims(1) + y) * dims.output_shape.dims(2) + x) *
163 dims.output_shape.dims(3) +
164 c;
165 output_data[output_data_offset] =
166 F(input1_data[subscriptToIndex(dims.desc1, b, y, x, c)],
167 input2_data[subscriptToIndex(dims.desc2, b, y, x, c)]);
168 }
169 }
170 }
171 }
172}

References subscriptToIndex().
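
A sketch of how the trailing bool F(T, T) parameter is used (not from the source): any free function with that signature can be passed; op_params is not read on the no-scaling path, so a value-initialized ComparisonParams suffices. The (rank, dims) RuntimeShape constructor and the include of the declaring PAL header are assumptions.

// Hypothetical usage sketch.
#include <cstdint>
static bool less_than(float lhs, float rhs) { return lhs < rhs; }
void example_broadcast_less()
{
  const int32_t lhs_dims[4] = {1, 1, 2, 2};
  const int32_t rhs_dims[4] = {1, 1, 1, 1}; // scalar, broadcast against every element
  luci_interpreter::RuntimeShape lhs_shape(4, lhs_dims), rhs_shape(4, rhs_dims),
    out_shape(4, lhs_dims);

  const float lhs[4] = {0.f, 1.f, 2.f, 3.f};
  const float rhs[1] = {1.5f};
  bool out[4]; // expected: true, true, false, false

  luci_interpreter_pal::ComparisonParams op_params{}; // unused on this path
  luci_interpreter_pal::BroadcastComparison4DSlowNoScaling<float>(
    op_params, lhs_shape, lhs, rhs_shape, rhs, out_shape, out, less_than);
}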

◆ BroadcastComparison4DSlowWithScaling()

template<typename T >
void luci_interpreter_pal::BroadcastComparison4DSlowWithScaling ( const ComparisonParams op_params,
const luci_interpreter::RuntimeShape unextended_input1_shape,
const T *  input1_data,
const luci_interpreter::RuntimeShape unextended_input2_shape,
const T *  input2_data,
const luci_interpreter::RuntimeShape unextended_output_shape,
bool *  output_data,
bool  F(T, T) 
)
inline

Definition at line 69 of file PALComparisons.h.

74{
75 const BroadcastComparison4DSlowCommon dims = BroadcastComparison4DSlowPreprocess(
76 unextended_input1_shape, unextended_input2_shape, unextended_output_shape);
77
78 int left_shift = op_params.left_shift;
79 int32_t input1_offset = op_params.input1_offset;
80 int32_t input1_multiplier = op_params.input1_multiplier;
81 int input1_shift = op_params.input1_shift;
82 int32_t input2_offset = op_params.input2_offset;
83 int32_t input2_multiplier = op_params.input2_multiplier;
84 int input2_shift = op_params.input2_shift;
85
86 for (int b = 0; b < dims.output_shape.dims(0); ++b)
87 {
88 for (int y = 0; y < dims.output_shape.dims(1); ++y)
89 {
90 for (int x = 0; x < dims.output_shape.dims(2); ++x)
91 {
92 for (int c = 0; c < dims.output_shape.dims(3); ++c)
93 {
94 const int32_t input1_val =
95 input1_offset + input1_data[subscriptToIndex(dims.desc1, b, y, x, c)];
96 const int32_t input2_val =
97 input2_offset + input2_data[subscriptToIndex(dims.desc2, b, y, x, c)];
98 const int32_t shifted_input1_val = input1_val * (1 << left_shift);
99 const int32_t shifted_input2_val = input2_val * (1 << left_shift);
100 const int32_t scaled_input1_val = multiplyByQuantizedMultiplierSmallerThanOneExp(
101 shifted_input1_val, input1_multiplier, input1_shift);
102 const int32_t scaled_input2_val = multiplyByQuantizedMultiplierSmallerThanOneExp(
103 shifted_input2_val, input2_multiplier, input2_shift);
104
105 const int output_data_offset =
106 ((b * dims.output_shape.dims(1) + y) * dims.output_shape.dims(2) + x) *
107 dims.output_shape.dims(3) +
108 c;
109 output_data[output_data_offset] = F(scaled_input1_val, scaled_input2_val);
110 }
111 }
112 }
113 }
114}
int32_t multiplyByQuantizedMultiplierSmallerThanOneExp(int32_t x, int32_t quantized_multiplier, int left_shift)
Definition PALUtils.h:85

References luci_interpreter_pal::ComparisonParams::input1_multiplier, luci_interpreter_pal::ComparisonParams::input1_offset, luci_interpreter_pal::ComparisonParams::input1_shift, luci_interpreter_pal::ComparisonParams::input2_multiplier, luci_interpreter_pal::ComparisonParams::input2_offset, luci_interpreter_pal::ComparisonParams::input2_shift, luci_interpreter_pal::ComparisonParams::left_shift, multiplyByQuantizedMultiplierSmallerThanOneExp(), and subscriptToIndex().

◆ BroadcastDiv4DSlow()

template<typename T >
void luci_interpreter_pal::BroadcastDiv4DSlow ( const ArithmeticParams params,
const luci_interpreter::RuntimeShape input1_shape,
const T *  input1_data,
const luci_interpreter::RuntimeShape input2_shape,
const T *  input2_data,
const luci_interpreter::RuntimeShape output_shape,
T *  output_data 
)
inline

Definition at line 41 of file PALDiv.h.

45{
46 BroadcastArithmeticOp4DSlow<T, DivFn<T>>(params, input1_shape, input1_data, input2_shape,
47 input2_data, output_shape, output_data);
48}

References output_shape.

◆ BroadcastFloorDiv4DSlow()

void luci_interpreter_pal::BroadcastFloorDiv4DSlow ( const luci_interpreter::RuntimeShape input1_shape,
const float *  input1_data,
const luci_interpreter::RuntimeShape input2_shape,
const float *  input2_data,
const luci_interpreter::RuntimeShape output_shape,
float *  output_data 
)
inline

Definition at line 33 of file PALFloorDivCommon.h.

38{
39 BroadcastBinaryOp4DSlow<float, FloorDivFn<float>>(input1_shape, input1_data, input2_shape,
40 input2_data, output_shape, output_data);
41}

References output_shape.

Referenced by luci_interpreter::execute_kernel_CircleFloorDiv().
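
A small worked example, under the same assumptions as the sketches above (the (rank, dims) RuntimeShape constructor, the declaring PAL header included, and FloorDivFn computing std::floor(lhs / rhs)):

// Hypothetical usage sketch: divide a [1,1,1,4] tensor by a broadcast scalar.
#include <cstdint>
void example_broadcast_floor_div()
{
  const int32_t num_dims[4] = {1, 1, 1, 4};
  const int32_t den_dims[4] = {1, 1, 1, 1};
  luci_interpreter::RuntimeShape num_shape(4, num_dims), den_shape(4, den_dims),
    out_shape(4, num_dims);

  const float num[4] = {7.f, -7.f, 9.f, 10.f};
  const float den[1] = {4.f};
  float out[4]; // expected: 1, -2, 2, 2 (floor of 1.75, -1.75, 2.25, 2.5)

  luci_interpreter_pal::BroadcastFloorDiv4DSlow(num_shape, num, den_shape, den, out_shape, out);
}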

◆ BroadcastFloorMod4DSlow()

void luci_interpreter_pal::BroadcastFloorMod4DSlow ( const luci_interpreter::RuntimeShape input1_shape,
const float *  input1_data,
const luci_interpreter::RuntimeShape input2_shape,
const float *  input2_data,
const luci_interpreter::RuntimeShape output_shape,
float *  output_data 
)
inline

Definition at line 33 of file PALFloorModCommon.h.

38{
39 BroadcastBinaryOp4DSlow<float, FloorModFn<float>>(input1_shape, input1_data, input2_shape,
40 input2_data, output_shape, output_data);
41}

References output_shape.

Referenced by luci_interpreter::execute_kernel_CircleFloorMod().

◆ BroadcastImpl()

template<int N>
void luci_interpreter_pal::BroadcastImpl ( const NdArrayDesc< N > &  input_desc,
const uint8_t *  input_data,
const NdArrayDesc< N > &  output_desc,
uint8_t *  output_data,
int  indexes[N],
int  dim,
const int  last_broadcasting_dim,
const uint32_t  type_size 
)

Definition at line 30 of file PALBroadcastTo.h.

33{
34 // Copy data from input to output.
35 if (dim == last_broadcasting_dim)
36 {
37 int copy_size = output_desc.strides[dim] * type_size;
38 const uint8_t *data_src = input_data + subscriptToIndex(input_desc, indexes) * type_size;
39 uint8_t *data_dst = output_data + subscriptToIndex(output_desc, indexes) * type_size;
40 for (int i = 0; i < output_desc.extents[dim]; ++i, data_dst += copy_size)
41 {
42 memcpy(data_dst, data_src, copy_size);
43 }
44 return;
45 }
46
47 // Recursive call to find the next broadcasting.
48 for (indexes[dim] = 0; indexes[dim] < input_desc.extents[dim]; ++indexes[dim])
49 {
50 BroadcastImpl<N>(input_desc, input_data, output_desc, output_data, indexes, dim + 1,
51 last_broadcasting_dim, type_size);
52 }
53
54 // Duplicate data in output tensor.
55 indexes[dim] = 0;
56 if (input_desc.extents[dim] != output_desc.extents[dim])
57 {
58 int copy_size = output_desc.strides[dim] * type_size;
59 uint8_t *data_src = output_data + subscriptToIndex(output_desc, indexes) * type_size;
60 uint8_t *data_dst = data_src + copy_size;
61 for (int i = 1; i < output_desc.extents[dim]; ++i, data_dst += copy_size)
62 {
63 memcpy(data_dst, data_src, copy_size);
64 }
65 }
66}

References luci_interpreter_pal::NdArrayDesc< N >::extents, luci_interpreter_pal::NdArrayDesc< N >::strides, and subscriptToIndex().
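
An illustration of the memcpy-based copy (not from the source): broadcasting an input of shape [2,1,3] to an output of [2,4,3] with N = 3 gives row-major strides {3,3,1} for the input and {12,3,1} for the output; the only differing extent is at dim 1, so last_broadcasting_dim == 1 and each input row of 3 elements is memcpy'd output_desc.extents[1] == 4 times. The equivalent flat-loop form:

#include <cstring>
void broadcast_rows_by_memcpy(const float *in /* 2*1*3 */, float *out /* 2*4*3 */)
{
  const int copy_elems = 3;   // innermost block below the broadcast dimension
  for (int b = 0; b < 2; ++b) // outer, non-broadcast dimension
    for (int r = 0; r < 4; ++r) // broadcast dimension: duplicate the row
      std::memcpy(out + (b * 4 + r) * copy_elems, in + b * copy_elems,
                  copy_elems * sizeof(float));
}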

◆ BroadcastMaximum4DSlow()

void luci_interpreter_pal::BroadcastMaximum4DSlow ( const luci_interpreter::RuntimeShape input1_shape,
const float *  input1_data,
const luci_interpreter::RuntimeShape input2_shape,
const float *  input2_data,
const luci_interpreter::RuntimeShape output_shape,
float *  output_data 
)
inline

Definition at line 32 of file PALMaximumCommon.h.

35{
36 BroadcastBinaryOp4DSlow<float, MaximumFn<float>>(input1_shape, input1_data, input2_shape,
37 input2_data, output_shape, output_data);
38}

References output_shape.

Referenced by luci_interpreter::execute_kernel_CircleMaximum().

◆ BroadcastMinimum4DSlow()

template<typename T >
void luci_interpreter_pal::BroadcastMinimum4DSlow ( const luci_interpreter::RuntimeShape input1_shape,
const T *  input1_data,
const luci_interpreter::RuntimeShape input2_shape,
const T *  input2_data,
const luci_interpreter::RuntimeShape output_shape,
T *  output_data 
)
inline

Definition at line 33 of file PALMinimumCommon.h.

36{
37 BroadcastBinaryOp4DSlow<float, MinimumFn<float>>(input1_shape, input1_data, input2_shape,
38 input2_data, output_shape, output_data);
39}

References output_shape.

◆ BroadcastMul4DSlow()

template<typename T >
void luci_interpreter_pal::BroadcastMul4DSlow ( const ArithmeticParams params,
const luci_interpreter::RuntimeShape input1_shape,
const T *  input1_data,
const luci_interpreter::RuntimeShape input2_shape,
const T *  input2_data,
const luci_interpreter::RuntimeShape output_shape,
T *  output_data 
)
inline

Definition at line 41 of file PALMulCommon.h.

45{
46 BroadcastArithmeticOp4DSlow<T, MulFn<T>>(params, input1_shape, input1_data, input2_shape,
47 input2_data, output_shape, output_data);
48}

References BroadcastMul4DSlow(), and output_shape.

Referenced by BroadcastMul4DSlow().

◆ BroadcastPrelu4DSlowFloat()

void luci_interpreter_pal::BroadcastPrelu4DSlowFloat ( const luci_interpreter::RuntimeShape unextended_input1_shape,
const float *  input1_data,
const luci_interpreter::RuntimeShape unextended_input2_shape,
const float *  input2_data,
const luci_interpreter::RuntimeShape unextended_output_shape,
float *  output_data 
)

Definition at line 28 of file PALPreluCommon.h.

34{
35 const luci_interpreter::RuntimeShape output_shape =
36 luci_interpreter::RuntimeShape::extendedShape(4, unextended_output_shape);
37
38 NdArrayDesc<4> desc1;
39 NdArrayDesc<4> desc2;
40 NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, unextended_input2_shape, &desc1,
41 &desc2);
42
43 for (int b = 0; b < output_shape.dims(0); ++b)
44 {
45 for (int y = 0; y < output_shape.dims(1); ++y)
46 {
47 for (int x = 0; x < output_shape.dims(2); ++x)
48 {
49 for (int c = 0; c < output_shape.dims(3); ++c)
50 {
51 auto out_idx = offset(output_shape.dimsData(), b, y, x, c);
52 auto in1_idx = subscriptToIndex(desc1, b, y, x, c);
53 auto in2_idx = subscriptToIndex(desc2, b, y, x, c);
54 auto in1_val = input1_data[in1_idx];
55 auto in2_val = input2_data[in2_idx];
56 output_data[out_idx] = in1_val >= 0.0f ? in1_val : in1_val * in2_val;
57 }
58 }
59 }
60 }
61}

References desc1, desc2, luci_interpreter::RuntimeShape::dims(), luci_interpreter::RuntimeShape::dimsData(), luci_interpreter::RuntimeShape::extendedShape(), NdArrayDescsForElementwiseBroadcast(), offset(), output_shape, and subscriptToIndex().

Referenced by luci_interpreter::execute_kernel_CirclePRelu().
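
A small worked example (the (rank, dims) RuntimeShape constructor and the include of the declaring PAL header are assumptions): the per-channel alpha tensor is broadcast over the spatial positions and scales only the negative inputs.

// Hypothetical usage sketch.
#include <cstdint>
void example_broadcast_prelu()
{
  const int32_t in_dims[4] = {1, 1, 2, 2};    // 2 positions x 2 channels
  const int32_t alpha_dims[4] = {1, 1, 1, 2}; // one alpha per channel
  luci_interpreter::RuntimeShape in_shape(4, in_dims), alpha_shape(4, alpha_dims),
    out_shape(4, in_dims);

  const float in[4] = {2.f, -2.f, -4.f, 1.f};
  const float alpha[2] = {0.1f, 0.5f};
  float out[4]; // expected: 2, -1, -0.4, 1

  luci_interpreter_pal::BroadcastPrelu4DSlowFloat(in_shape, in, alpha_shape, alpha, out_shape,
                                                  out);
}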

◆ BroadcastSub4DSlow()

template<typename T >
void luci_interpreter_pal::BroadcastSub4DSlow ( const ArithmeticParams params,
const luci_interpreter::RuntimeShape input1_shape,
const T *  input1_data,
const luci_interpreter::RuntimeShape input2_shape,
const T *  input2_data,
const luci_interpreter::RuntimeShape output_shape,
T *  output_data 
)
inline

Definition at line 33 of file PALSub.h.

37{
38 BroadcastArithmeticOp4DSlow<T, SubFn<T>>(params, input1_shape, input1_data, input2_shape,
39 input2_data, output_shape, output_data);
40}

References output_shape.

◆ BroadcastTISO4DSlow()

template<typename T >
void luci_interpreter_pal::BroadcastTISO4DSlow ( const luci_interpreter::RuntimeShape input1_shape,
const T *  input1_data,
const luci_interpreter::RuntimeShape input2_shape,
const T *  input2_data,
const luci_interpreter::RuntimeShape output_shape,
T *  output_data,
std::function< const T &(const T &, const T &)>  func 
)
inline

Definition at line 27 of file Broadcast.h.

31{
32 NdArrayDesc<4> desc1;
33 NdArrayDesc<4> desc2;
34 NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2);
35
36 const luci_interpreter::RuntimeShape extended_output_shape =
37 luci_interpreter::RuntimeShape::extendedShape(4, output_shape);
38
39 // In Tensorflow, the dimensions are canonically named (batch_number, row,
40 // col, channel), with extents (batches, height, width, depth), with the
41 // trailing dimension changing most rapidly (channels has the smallest stride,
42 // typically 1 element).
43 //
44 // In generated C code, we store arrays with the dimensions reversed. The
45 // first dimension has smallest stride.
46 //
47 // We name our variables by their Tensorflow convention, but generate C code
48 // nesting loops such that the innermost loop has the smallest stride for the
49 // best cache behavior.
50
51 for (int b = 0; b < extended_output_shape.dims(0); ++b)
52 {
53 for (int y = 0; y < extended_output_shape.dims(1); ++y)
54 {
55 for (int x = 0; x < extended_output_shape.dims(2); ++x)
56 {
57 for (int c = 0; c < extended_output_shape.dims(3); ++c)
58 {
59 const int output_data_offset =
60 ((b * extended_output_shape.dims(1) + y) * extended_output_shape.dims(2) + x) *
61 extended_output_shape.dims(3) +
62 c;
63
64 output_data[output_data_offset] = func(input1_data[subscriptToIndex(desc1, b, y, x, c)],
65 input2_data[subscriptToIndex(desc2, b, y, x, c)]);
66 }
67 }
68 }
69 }
70}

References desc1, desc2, luci_interpreter::RuntimeShape::dims(), luci_interpreter::RuntimeShape::extendedShape(), NdArrayDescsForElementwiseBroadcast(), output_shape, and subscriptToIndex().
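
A usage sketch (assumptions as above for the RuntimeShape constructor and the include): because the std::function returns a const reference, a std::min-style selector that returns one of its arguments fits the signature directly.

// Hypothetical usage sketch.
#include <cstdint>
void example_broadcast_tiso()
{
  const int32_t dims[4] = {1, 1, 1, 3};
  luci_interpreter::RuntimeShape shape(4, dims);

  const float a[3] = {1.f, 5.f, 2.f};
  const float b[3] = {4.f, 3.f, 2.f};
  float out[3]; // expected: 1, 3, 2

  luci_interpreter_pal::BroadcastTISO4DSlow<float>(
    shape, a, shape, b, shape, out,
    [](const float &x, const float &y) -> const float & { return y < x ? y : x; });
}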

◆ BroadcastTo()

template<int N>
void luci_interpreter_pal::BroadcastTo ( const luci_interpreter::RuntimeShape unextended_input_shape,
const uint8_t *  input_data,
const luci_interpreter::RuntimeShape unextended_output_shape,
uint8_t *  output_data,
luci_interpreter::DataType  data_type 
)
inline

Definition at line 69 of file PALBroadcastTo.h.

73{
74 NdArrayDesc<N> input_desc;
75 NdArrayDesc<N> output_desc;
76 copyDimsToDesc(luci_interpreter::RuntimeShape::extendedShape(N, unextended_input_shape),
77 &input_desc);
78 copyDimsToDesc(luci_interpreter::RuntimeShape::extendedShape(N, unextended_output_shape),
79 &output_desc);
80
81 // Find the last dimension that has broadcasting. At this dimension, the data is
82 // copied from the input tensor to the output tensor.
83 int last_broadcast_dim = -1;
84 for (int i = N - 1; i >= 0; --i)
85 {
86 if (input_desc.extents[i] != output_desc.extents[i])
87 {
88 last_broadcast_dim = i;
89 break;
90 }
91 }
92
93 // If non-broadcasting, just copy data from input to output tensor.
94 if (last_broadcast_dim == -1)
95 {
96 memcpy(output_data, input_data, unextended_input_shape.flatSize() * sizeof(data_type));
97 return;
98 }
99
100 // Broadcasting using memcpy.
101 int indexes[N] = {0};
102 BroadcastImpl<N>(input_desc, input_data, output_desc, output_data, indexes, 0, last_broadcast_dim,
103 luci_interpreter::size(data_type));
104}
void copyDimsToDesc(const luci_interpreter::RuntimeShape &input_shape, NdArrayDesc< N > *desc_out)
uint32_t size(loco::DataType data_type)
Returns the size of the data type.

References copyDimsToDesc(), luci_interpreter::RuntimeShape::extendedShape(), luci_interpreter_pal::NdArrayDesc< N >::extents, luci_interpreter::RuntimeShape::flatSize(), and luci::size().
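
An illustration of how the last broadcast dimension is picked (not from the source): for an input of [1,3,1,5] broadcast to [2,3,4,5], the extents differ at dims 0 and 2, so last_broadcast_dim == 2; rows of 5 elements are memcpy'd at that depth by BroadcastImpl, and the remaining duplication along dim 0 happens in its post-recursion pass. A result of -1 means the shapes match and the whole flat buffer is copied once.

#include <cstdint>
int example_last_broadcast_dim()
{
  const int32_t in_extents[4] = {1, 3, 1, 5};
  const int32_t out_extents[4] = {2, 3, 4, 5};
  int last_broadcast_dim = -1;
  for (int i = 4 - 1; i >= 0; --i)
  {
    if (in_extents[i] != out_extents[i])
    {
      last_broadcast_dim = i; // here: 2
      break;
    }
  }
  return last_broadcast_dim;
}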

◆ calculateGRU()

void luci_interpreter_pal::calculateGRU ( const float *  input_data,
const float *  weight_input_data,
const float *  weight_hidden_data,
const float *  bias_input_data,
const float *  bias_hidden_data,
float *  output_data,
const tflite::RuntimeShape &  input_shape,
const tflite::RuntimeShape &  output_shape,
const tflite::RuntimeShape &  weight_input_shape,
const tflite::RuntimeShape &  weight_hidden_shape,
float *  output_input_data,
float *  output_hidden_data,
const tflite::RuntimeShape &  output_shape_fc 
)

Definition at line 59 of file PALGRU.h.

66{
67 tflite::FullyConnectedParams op_params{};
68 // As the FC nodes inside GRU don't have any activations, just use numeric limits
69 op_params.float_activation_min = std::numeric_limits<float>::lowest();
70 op_params.float_activation_max = std::numeric_limits<float>::max();
71
72 // FC Input
73 tflite::RuntimeShape bias_input_shape{weight_input_shape.Dims(0)};
74 tflite::reference_ops::FullyConnected(op_params, output_shape, output_data, weight_input_shape,
75 weight_input_data, bias_input_shape, bias_input_data,
76 output_shape_fc, output_input_data);
77
78 // FC Hidden
79 tflite::RuntimeShape bias_hidden_shape{weight_hidden_shape.Dims(0)};
80 // Note: input for this FC node will be saved without intermediate buffer
81 tflite::reference_ops::FullyConnected(op_params, input_shape, input_data, weight_hidden_shape,
82 weight_hidden_data, bias_hidden_shape, bias_hidden_data,
83 output_shape_fc, output_hidden_data);
84
85 int num_elements = output_shape_fc.Dims(1) / 3;
86
87 float *second_hidden_part = output_hidden_data + num_elements;
88 float *second_input_part = output_input_data + num_elements;
89
90 float *third_hidden_part = second_hidden_part + num_elements;
91 float *third_input_part = second_input_part + num_elements;
92
93 // Calculate Left part
94 for (int i = 0; i < num_elements; ++i)
95 {
96 output_input_data[i] += output_hidden_data[i];
97 }
98
99 Logistic(num_elements, output_input_data, output_input_data);
100
101 // Calculate most left mul
102 float *most_left_part_final = output_input_data;
103 float *first_part = output_input_data;
104 for (int i = 0; i < num_elements; ++i)
105 {
106 output_data[i] *= most_left_part_final[i];
107 first_part[i] = 1.0f - first_part[i];
108 }
109
110 // Calc second part
111 for (int i = 0; i < num_elements; ++i)
112 {
113 second_hidden_part[i] += second_input_part[i];
114 }
115
116 Logistic(num_elements, second_hidden_part, second_hidden_part);
117
118 for (int i = 0; i < num_elements; ++i)
119 {
120 second_hidden_part[i] *= third_input_part[i];
121 second_hidden_part[i] += third_hidden_part[i];
122 }
123
124 for (int i = 0; i < num_elements; ++i)
125 {
126 if (second_hidden_part[i] > 19)
127 {
128 second_hidden_part[i] = 1;
129 }
130 else if (second_hidden_part[i] < -19)
131 {
132 second_hidden_part[i] = -1;
133 }
134 else
135 {
136 second_hidden_part[i] = std::tanh(second_hidden_part[i]);
137 }
138 }
139
140 for (int i = 0; i < num_elements; ++i)
141 {
142 second_hidden_part[i] *= first_part[i];
143 output_data[i] += second_hidden_part[i];
144 }
145}
void Logistic(const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data)
Definition Logistic.h:32

References Logistic(), and output_shape.

Referenced by GRU().
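
Reading the three equal thirds of each FullyConnected output as the update gate, the reset gate and the candidate pre-activation is an assumption based only on the buffer offsets above (the actual ordering depends on how the frontend packs the GRU weights). Under that reading, with A the FC of the previous hidden state and B the FC of the current input, the loops compute the usual GRU update, with tanh clamped to +/-1 once its argument leaves [-19, 19]:

z_t = \sigma(A_1 + B_1), \qquad r_t = \sigma(A_2 + B_2)
\tilde{h}_t = \tanh(r_t \odot A_3 + B_3)
h_t = z_t \odot h_{t-1} + (1 - z_t) \odot \tilde{h}_t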

◆ Ceil()

void luci_interpreter_pal::Ceil ( const int32_t  flat_size,
const float *  input_data,
float *  output_data 
)
inline

Definition at line 28 of file PALCeil.h.

29{
30 for (int i = 0; i < flat_size; ++i)
31 {
32 output_data[i] = std::ceil(input_data[i]);
33 }
34}

Referenced by luci_interpreter::execute_kernel_CircleCeil().

◆ ComparisonNoScaling()

template<typename T >
void luci_interpreter_pal::ComparisonNoScaling ( const int64_t  flat_size,
const T *  input1_data,
const T *  input2_data,
bool *  output_data,
bool  F(T, T) 
)
inline

Definition at line 59 of file PALComparisons.h.

61{
62 for (int64_t i = 0; i < flat_size; ++i)
63 {
64 output_data[i] = F(input1_data[i], input2_data[i]);
65 }
66}

◆ ComparisonWithScaling()

template<typename T >
void luci_interpreter_pal::ComparisonWithScaling ( const ComparisonParams op_params,
const int64_t  flat_size,
const T *  input1_data,
const T *  input2_data,
bool *  output_data,
bool  F(T, T) 
)
inline

Definition at line 117 of file PALComparisons.h.

120{
121 int left_shift = op_params.left_shift;
122 int32_t input1_offset = op_params.input1_offset;
123 int32_t input1_multiplier = op_params.input1_multiplier;
124 int input1_shift = op_params.input1_shift;
125 int32_t input2_offset = op_params.input2_offset;
126 int32_t input2_multiplier = op_params.input2_multiplier;
127 int input2_shift = op_params.input2_shift;
128
129 for (int64_t i = 0; i < flat_size; ++i)
130 {
131 const int32_t input1_val = input1_offset + input1_data[i];
132 const int32_t input2_val = input2_offset + input2_data[i];
133 const int32_t shifted_input1_val = input1_val * (1 << left_shift);
134 const int32_t shifted_input2_val = input2_val * (1 << left_shift);
135 const int32_t scaled_input1_val = multiplyByQuantizedMultiplierSmallerThanOneExp(
136 shifted_input1_val, input1_multiplier, input1_shift);
137 const int32_t scaled_input2_val = multiplyByQuantizedMultiplierSmallerThanOneExp(
138 shifted_input2_val, input2_multiplier, input2_shift);
139 output_data[i] = F(scaled_input1_val, scaled_input2_val);
140 }
141}

References luci_interpreter_pal::ComparisonParams::input1_multiplier, luci_interpreter_pal::ComparisonParams::input1_offset, luci_interpreter_pal::ComparisonParams::input1_shift, luci_interpreter_pal::ComparisonParams::input2_multiplier, luci_interpreter_pal::ComparisonParams::input2_offset, luci_interpreter_pal::ComparisonParams::input2_shift, luci_interpreter_pal::ComparisonParams::left_shift, and multiplyByQuantizedMultiplierSmallerThanOneExp().

◆ ComputeInterpolationValues()

void luci_interpreter_pal::ComputeInterpolationValues ( const float  value,
const float  scale,
const bool  half_pixel_centers,
int32_t  input_size,
float *  scaled_value,
int32_t *  lower_bound,
int32_t *  upper_bound 
)
inline

Definition at line 39 of file PALResizeBilinear.h.

43{
44 if (half_pixel_centers)
45 {
46 *scaled_value = (value + 0.5f) * scale - 0.5f;
47 }
48 else
49 {
50 *scaled_value = value * scale;
51 }
52 float scaled_value_floor = std::floor(*scaled_value);
53 *lower_bound = std::max(static_cast<int32_t>(scaled_value_floor), static_cast<int32_t>(0));
54 *upper_bound = std::min(static_cast<int32_t>(std::ceil(*scaled_value)), input_size - 1);
55}
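
A worked example using only the function above (with its declaring PAL header assumed to be included): with half_pixel_centers the sample point is shifted by half a pixel before scaling.

#include <cstdint>
void example_interpolation_values()
{
  float scaled;
  int32_t lo, hi;
  luci_interpreter_pal::ComputeInterpolationValues(/*value=*/3.0f, /*scale=*/0.5f,
                                                   /*half_pixel_centers=*/true,
                                                   /*input_size=*/10, &scaled, &lo, &hi);
  // scaled == (3 + 0.5f) * 0.5f - 0.5f == 1.25f, lo == 1, hi == 2
}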

◆ Concatenation()

template<typename Scalar >
void luci_interpreter_pal::Concatenation ( const ConcatenationParams params,
const luci_interpreter::RuntimeShape *const *  input_shapes,
const Scalar *const *  input_data,
const luci_interpreter::RuntimeShape output_shape,
Scalar *  output_data 
)
inline

Definition at line 28 of file PALConcatenation.h.

32{
33 int axis = params.axis;
34 int inputs_count = params.inputs_count;
35 const int concat_dimensions = output_shape.dimensionsCount();
36
37 int64_t concat_size = 0;
38 for (int i = 0; i < inputs_count; i++)
39 {
40 concat_size += input_shapes[i]->dims(axis);
41 }
42 int64_t outer_size = 1;
43 for (int i = 0; i < axis; ++i)
44 {
45 outer_size *= output_shape.dims(i);
46 }
47 // For all input arrays,
48 // FlatSize() = outer_size * Dims(axis) * base_inner_size;
49 int64_t base_inner_size = 1;
50 for (int i = axis + 1; i < concat_dimensions; ++i)
51 {
52 base_inner_size *= output_shape.dims(i);
53 }
54
55 Scalar *output_ptr = output_data;
56 for (int k = 0; k < outer_size; k++)
57 {
58 for (int i = 0; i < inputs_count; ++i)
59 {
60 const int copy_size = input_shapes[i]->dims(axis) * base_inner_size;
61 const Scalar *input_ptr = input_data[i] + k * copy_size;
62 memcpy(output_ptr, input_ptr, copy_size * sizeof(Scalar));
63 output_ptr += copy_size;
64 }
65 }
66}

References luci_interpreter_pal::ConcatenationParams::axis, luci_interpreter::RuntimeShape::dimensionsCount(), luci_interpreter::RuntimeShape::dims(), luci_interpreter_pal::ConcatenationParams::inputs_count, and output_shape.
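
A worked size example that mirrors the loop above (illustration only): concatenating shapes [2,3,4] and [2,5,4] along axis 1 into [2,8,4] gives outer_size = 2 and base_inner_size = 4, so each outer step copies 3*4 = 12 values from input 0 followed by 5*4 = 20 values from input 1.

void example_concat_sizes()
{
  const int axis = 1;
  const int out_dims[3] = {2, 8, 4};
  int outer_size = 1, base_inner_size = 1;
  for (int i = 0; i < axis; ++i)
    outer_size *= out_dims[i]; // 2
  for (int i = axis + 1; i < 3; ++i)
    base_inner_size *= out_dims[i]; // 4
  (void)outer_size;
  (void)base_inner_size;
}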

◆ copyDimsToDesc()

template<int N>
void luci_interpreter_pal::copyDimsToDesc ( const luci_interpreter::RuntimeShape input_shape,
NdArrayDesc< N > *  desc_out 
)
inline

Definition at line 47 of file ProcessBroadcastShapes.h.

49{
50 int desc_stride = 1;
51 for (int i = N - 1; i >= 0; --i)
52 {
53 desc_out->extents[i] = input_shape.dims(i);
54 desc_out->strides[i] = desc_stride;
55 desc_stride *= input_shape.dims(i);
56 }
57}

References luci_interpreter::RuntimeShape::dims(), luci_interpreter_pal::NdArrayDesc< N >::extents, and luci_interpreter_pal::NdArrayDesc< N >::strides.

Referenced by BroadcastTo(), and TransposeImpl().
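
A worked example of the stride layout it produces (illustration only): for a shape of [2,3,4,5] the descriptor ends up with extents {2,3,4,5} and strides {60,20,5,1}, built innermost-first exactly as in the loop above.

void example_copy_dims_to_desc()
{
  const int dims[4] = {2, 3, 4, 5};
  int strides[4];
  int desc_stride = 1;
  for (int i = 3; i >= 0; --i) // innermost dimension first
  {
    strides[i] = desc_stride;
    desc_stride *= dims[i];
  }
  // strides is now {60, 20, 5, 1}
}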

◆ Cos()

void luci_interpreter_pal::Cos ( const int  flat_size,
const float *  input_data,
float *  output_data 
)
inline

Definition at line 27 of file PALCosCommon.h.

28{
29 for (int i = 0; i < flat_size; ++i)
30 {
31 output_data[i] = std::cos(input_data[i]);
32 }
33}

Referenced by luci_interpreter::execute_kernel_CircleCos().

◆ DepthToSpace()

template<typename T >
void luci_interpreter_pal::DepthToSpace ( const int32_t  block_size,
const luci_interpreter::RuntimeShape unextended_input_shape,
const T *  input_data,
const luci_interpreter::RuntimeShape unextended_output_shape,
T *  output_data 
)
inline

Definition at line 29 of file PALDepthToSpace.h.

32{
33 const luci_interpreter::RuntimeShape input_shape =
34 luci_interpreter::RuntimeShape::extendedShape(4, unextended_input_shape);
36 luci_interpreter::RuntimeShape::extendedShape(4, unextended_output_shape);
37
38 const int output_depth = output_shape.dims(3);
39 const int output_width = output_shape.dims(2);
40 const int output_height = output_shape.dims(1);
41 const int output_batch = output_shape.dims(0);
42
43 for (int out_b = 0; out_b < output_batch; ++out_b)
44 {
45 for (int out_h = 0; out_h < output_height; ++out_h)
46 {
47 for (int out_w = 0; out_w < output_width; ++out_w)
48 {
49 for (int out_d = 0; out_d < output_depth; ++out_d)
50 {
51 const int in_d =
52 out_d + ((out_h % block_size) * block_size + out_w % block_size) * output_depth;
53
54 const int in_w = out_w / block_size;
55 const int in_h = out_h / block_size;
56 const int in_b = out_b;
57
58 const int input_index = offset(input_shape.dimsData(), in_b, in_h, in_w, in_d);
59 const int output_index = offset(output_shape.dimsData(), out_b, out_h, out_w, out_d);
60
61 output_data[output_index] = input_data[input_index];
62 }
63 }
64 }
65 }
66}

References luci_interpreter::RuntimeShape::dims(), luci_interpreter::RuntimeShape::dimsData(), luci_interpreter::RuntimeShape::extendedShape(), offset(), and output_shape.
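
A worked index-mapping example (illustration only): with block_size = 2 an input of shape [1,1,1,4] becomes an output of shape [1,2,2,1], i.e. the four input channels fan out into a 2x2 spatial block.

void example_depth_to_space_mapping()
{
  const int block_size = 2, output_depth = 1;
  for (int out_h = 0; out_h < 2; ++out_h)
    for (int out_w = 0; out_w < 2; ++out_w)
    {
      // same in_d formula as in the loop above
      const int in_d =
        0 + ((out_h % block_size) * block_size + out_w % block_size) * output_depth;
      // (out_h, out_w) = (0,0)->channel 0, (0,1)->1, (1,0)->2, (1,1)->3
      (void)in_d;
    }
}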

◆ DepthwiseConvPerChannel< int8_t >() [1/3]

template<>
void luci_interpreter_pal::DepthwiseConvPerChannel< int8_t > ( const tflite::DepthwiseParams &  params,
const int32_t *  output_multiplier,
const int32_t *  output_shift,
const tflite::RuntimeShape &  input_shape,
const int8_t *  input_data,
const tflite::RuntimeShape &  filter_shape,
const int8_t *  filter_data,
const tflite::RuntimeShape &  bias_shape,
const int32_t *  bias_data,
const tflite::RuntimeShape &  output_shape,
int8_t *  output_data,
const tflite::RuntimeShape &  scratchpad_shape,
int8_t *  scratchpad_data 
)
inline

Definition at line 58 of file PALDepthwiseConv2d.h.

65{
66 if (scratchpad_data)
67 {
68 cmsis_nn_dw_conv_params dw_conv_params;
69 dw_conv_params.dilation.h = params.dilation_height_factor;
70 dw_conv_params.dilation.w = params.dilation_width_factor;
71 assert(dw_conv_params.dilation.h == 1);
72 assert(dw_conv_params.dilation.w == 1);
73
74 dw_conv_params.input_offset = params.input_offset;
75 dw_conv_params.output_offset = params.output_offset;
76 dw_conv_params.stride.h = params.stride_height;
77 dw_conv_params.stride.w = params.stride_width;
78 dw_conv_params.padding.h = params.padding_values.height;
79 dw_conv_params.padding.w = params.padding_values.width;
80
81 dw_conv_params.activation.min = params.quantized_activation_min;
82 dw_conv_params.activation.max = params.quantized_activation_max;
83 dw_conv_params.ch_mult = params.depth_multiplier;
84
85 cmsis_nn_per_channel_quant_params quant_params;
86 int32_t output_multiplier = params.output_multiplier;
87 int32_t output_shift = params.output_shift;
88
89 quant_params.multiplier = &output_multiplier;
90 quant_params.shift = &output_shift;
91
92 assert(dw_conv_params.activation.min <= dw_conv_params.activation.max);
93 const int batch_size = tflite::MatchingDim(input_shape, 0, output_shape, 0);
94 const int output_depth = tflite::MatchingDim(filter_shape, 3, output_shape, 3);
95 if (bias_data)
96 {
97 assert(bias_shape.FlatSize() == output_depth);
98 }
99
100 cmsis_nn_dims input_dims;
101 input_dims.n = batch_size;
102 input_dims.h = input_shape.Dims(1);
103 input_dims.w = input_shape.Dims(2);
104 input_dims.c = input_shape.Dims(3);
105
106 cmsis_nn_dims filter_dims;
107 filter_dims.n = filter_shape.Dims(0);
108 filter_dims.h = filter_shape.Dims(1);
109 filter_dims.w = filter_shape.Dims(2);
110 filter_dims.c = output_depth;
111
112 cmsis_nn_dims bias_dims;
113 bias_dims.n = 1;
114 bias_dims.h = 1;
115 bias_dims.w = 1;
116 bias_dims.c = output_depth;
117
118 cmsis_nn_dims output_dims;
119 output_dims.n = batch_size;
120 output_dims.h = output_shape.Dims(1);
121 output_dims.w = output_shape.Dims(2);
122 output_dims.c = output_depth;
123
124 cmsis_nn_context ctx;
125 ctx.buf = scratchpad_data;
126 ctx.size = scratchpad_shape.Dims(0);
127
128 auto res = arm_depthwise_conv_wrapper_s8(&ctx, &dw_conv_params, &quant_params, &input_dims,
129 input_data, &filter_dims, filter_data, &bias_dims,
130 bias_data, &output_dims, output_data);
131 assert(res == ARM_MATH_SUCCESS);
132 }
133 else
134 {
135 tflite::reference_integer_ops::DepthwiseConvPerChannel(
136 params, output_multiplier, output_shift, input_shape, input_data, filter_shape, filter_data,
137 bias_shape, bias_data, output_shape, output_data);
138 }
139}

References output_shape.

◆ DepthwiseConvPerChannel< int8_t >() [2/3]

template<>
void luci_interpreter_pal::DepthwiseConvPerChannel< int8_t > ( const tflite::DepthwiseParams &  params,
const int32_t *  output_multiplier,
const int32_t *  output_shift,
const tflite::RuntimeShape &  input_shape,
const int8_t *  input_data,
const tflite::RuntimeShape &  filter_shape,
const int8_t *  filter_data,
const tflite::RuntimeShape &  bias_shape,
const int32_t *  bias_data,
const tflite::RuntimeShape &  output_shape,
int8_t *  output_data,
const tflite::RuntimeShape &  scratchpad_shape,
int8_t *  scratchpad_data 
)
inline

Definition at line 57 of file PALDepthwiseConv2d.h.

64{
65 (void)scratchpad_shape;
66 (void)scratchpad_data;
67 tflite::reference_integer_ops::DepthwiseConvPerChannel(
68 params, output_multiplier, output_shift, input_shape, input_data, filter_shape, filter_data,
69 bias_shape, bias_data, output_shape, output_data);
70}

References output_shape.

◆ DepthwiseConvPerChannel< int8_t >() [3/3]

template<>
void luci_interpreter_pal::DepthwiseConvPerChannel< int8_t > ( const tflite::DepthwiseParams &  params,
const int32_t *  output_multiplier,
const int32_t *  output_shift,
const tflite::RuntimeShape &  input_shape,
const int8_t *  input_data,
const tflite::RuntimeShape &  filter_shape,
const int8_t *  filter_data,
const tflite::RuntimeShape &  bias_shape,
const int32_t *  bias_data,
const tflite::RuntimeShape &  output_shape,
int8_t *  output_data,
const tflite::RuntimeShape &  scratchpad_shape,
int8_t *  scratchpad_data 
)
inline

Definition at line 57 of file PALDepthwiseConv2d.h.

64{
65 (void)scratchpad_shape;
66 (void)scratchpad_data;
67 tflite::reference_integer_ops::DepthwiseConvPerChannel(
68 params, output_multiplier, output_shift, input_shape, input_data, filter_shape, filter_data,
69 bias_shape, bias_data, output_shape, output_data);
70}

References output_shape.

◆ Dequantize()

template<typename InputT , typename OutputT >
void luci_interpreter_pal::Dequantize ( const QuantizationParams op_params,
const int  flat_size,
const InputT *  input_data,
OutputT *  output_data 
)
inline

Definition at line 27 of file PALDequantize.h.

29{
30 const int32_t zero_point = op_params.zero_point;
31 const double scale = op_params.scale;
32
33 for (int i = 0; i < flat_size; i++)
34 {
35 const int32_t val = input_data[i];
36 const OutputT result = static_cast<OutputT>(scale * (val - zero_point));
37 output_data[i] = result;
38 }
39}

References luci_interpreter_pal::QuantizationParams::scale, and luci_interpreter_pal::QuantizationParams::zero_point.
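
A worked example (assuming QuantizationParams is an aggregate exposing just the scale and zero_point members listed above, and the declaring PAL header is included): scale = 0.5 and zero_point = -1 map the int8 value 5 to 0.5f * (5 - (-1)) == 3.0f.

#include <cstdint>
void example_dequantize()
{
  luci_interpreter_pal::QuantizationParams qp{};
  qp.scale = 0.5;
  qp.zero_point = -1;

  const int8_t in[2] = {5, -1};
  float out[2]; // expected: 3.0f, 0.0f
  luci_interpreter_pal::Dequantize(qp, /*flat_size=*/2, in, out);
}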

◆ Div()

template<typename T >
void luci_interpreter_pal::Div ( const ArithmeticParams params,
const int  flat_size,
const T *  input1_data,
const T *  input2_data,
T *  output_data 
)
inline

Definition at line 26 of file PALDiv.h.

28{
29 ArithmeticOp<T, DivFn<T>>(params, flat_size, input1_data, input2_data, output_data);
30}

◆ DivScalar()

template<typename T >
void luci_interpreter_pal::DivScalar ( const ArithmeticParams params,
const int  flat_size,
const T *  input_data,
const T  scalar_value,
T *  output_data 
)
inline

Definition at line 33 of file PALDiv.h.

35{
36 ArithmeticOpScalar<T, DivFn<T>>(params, flat_size, input_data, scalar_value, output_data);
37}

◆ Elu()

void luci_interpreter_pal::Elu ( const int  flat_size,
const float *  input_data,
float *  output_data 
)
inline

Definition at line 27 of file PALElu.h.

28{
29 for (int i = 0; i < flat_size; i++)
30 {
31 float val = input_data[i];
32 float result = val < 0.0f ? std::expm1(val) : val;
33 output_data[i] = result;
34 }
35}

References Elu().

Referenced by Elu().

◆ EqualFn()

template<typename T >
bool luci_interpreter_pal::EqualFn ( T  lhs,
T  rhs 
)
inline

Definition at line 53 of file PALComparisons.h.

53{ return lhs == rhs; }

Referenced by luci_interpreter::execute_kernel_CircleEqual().

◆ eval_integer_8x8_16_lstm()

void luci_interpreter_pal::eval_integer_8x8_16_lstm ( const luci_interpreter::Tensor *  input,
const luci_interpreter::Tensor *  input_to_input_weights,
const luci_interpreter::Tensor *  input_to_forget_weights,
const luci_interpreter::Tensor *  input_to_cell_weights,
const luci_interpreter::Tensor *  input_to_output_weights,
const luci_interpreter::Tensor *  recurrent_to_input_weights,
const luci_interpreter::Tensor *  recurrent_to_forget_weights,
const luci_interpreter::Tensor *  recurrent_to_cell_weights,
const luci_interpreter::Tensor *  recurrent_to_output_weights,
const luci_interpreter::Tensor *  cell_to_input_weights,
const luci_interpreter::Tensor *  cell_to_forget_weights,
const luci_interpreter::Tensor *  cell_to_output_weights,
const luci_interpreter::Tensor *  input_layer_norm_coefficients,
const luci_interpreter::Tensor *  forget_layer_norm_coefficients,
const luci_interpreter::Tensor *  cell_layer_norm_coefficients,
const luci_interpreter::Tensor *  output_layer_norm_coefficients,
const luci_interpreter::Tensor *  input_gate_bias,
const luci_interpreter::Tensor *  forget_gate_bias,
const luci_interpreter::Tensor *  cell_gate_bias,
const luci_interpreter::Tensor *  output_gate_bias,
const luci_interpreter::Tensor *  projection_weights,
const luci_interpreter::Tensor *  projection_bias,
const luci_interpreter::UnidirectionalSequenceLSTMParams &  params,
bool  forward_sequence,
bool  time_major,
const luci_interpreter::IntegerLSTMParams &  integer_lstm_param,
int32_t  output_state_zp,
luci_interpreter::Tensor *  output_state,
luci_interpreter::Tensor *  cell_state,
luci_interpreter::Tensor *  output,
int16_t *  scratch0,
int16_t *  scratch1,
int16_t *  scratch2,
int16_t *  scratch3,
int8_t *  scratch4,
int32_t *  scratch5 
)

Definition at line 126 of file PALUnidirectionalSequenceLSTM.h.

151{
152 // CMSIS-NN does not support these configurations currently.
153 // Please use MCU kernels instead
154 const bool use_layer_norm = (forget_layer_norm_coefficients != nullptr);
155 const bool use_peephole = (cell_to_output_weights != nullptr);
156 const bool use_projection = (projection_weights != nullptr);
157 const bool use_cifg = (input_to_input_weights == nullptr);
158 const bool unsupported_config = use_layer_norm || use_peephole || use_projection || use_cifg;
159
160 if (unsupported_config)
161 {
162 assert(false && "CMSIS-NN does not support these configurations currently");
163 return;
164 }
165
166 const auto input_shape = input->shape();
167 LUCI_INTERPRETER_CHECK(input_shape.num_dims() >= 2 && input_shape.num_dims() <= 3);
168
169 cmsis_nn_lstm_context scratch_buffers;
170 scratch_buffers.input_gate = scratch0;
171 scratch_buffers.forget_gate = scratch1;
172 scratch_buffers.cell_gate = scratch2;
173 scratch_buffers.output_gate = scratch3;
174 scratch_buffers.scratch = scratch4;
175
176 cmsis_nn_lstm_params cmsis_lstm_params = lstm::convert_lstm_params(
177 integer_lstm_param, time_major, output_state_zp,
178 luci_interpreter::kernels::getTensorData<int32_t>(input_gate_bias),
179 luci_interpreter::kernels::getTensorData<int32_t>(forget_gate_bias),
180 luci_interpreter::kernels::getTensorData<int32_t>(cell_gate_bias),
181 luci_interpreter::kernels::getTensorData<int32_t>(output_gate_bias),
182 const_cast<int16_t *>(
183 luci_interpreter::kernels::getTensorData<int16_t>(input_layer_norm_coefficients)),
184 const_cast<int16_t *>(
185 luci_interpreter::kernels::getTensorData<int16_t>(forget_layer_norm_coefficients)),
186 const_cast<int16_t *>(
187 luci_interpreter::kernels::getTensorData<int16_t>(cell_layer_norm_coefficients)),
188 const_cast<int16_t *>(
189 luci_interpreter::kernels::getTensorData<int16_t>(output_layer_norm_coefficients)));
190
191 const int n_input = input_shape.dim(input_shape.num_dims() - 1);
192 int max_time, n_batch;
193 if (input_shape.num_dims() == 2)
194 {
195 max_time = 1;
196 n_batch = input_shape.dim(0);
197 }
198 else
199 {
200 max_time = (time_major) ? input_shape.dim(0) : input_shape.dim(1);
201 n_batch = (time_major) ? input_shape.dim(1) : input_shape.dim(0);
202 }
203
204 // n_cell and n_output will be the same size when there is no projection.
205 const int n_cell = input_to_output_weights->shape().dim(0);
206 const int n_output = recurrent_to_output_weights->shape().dim(1);
207
208 cmsis_nn_lstm_dims lstm_dims;
209 lstm_dims.num_inputs = n_input;
210 lstm_dims.num_outputs = n_output;
211 lstm_dims.num_batches = n_batch;
212 lstm_dims.max_time = max_time;
213
214 arm_lstm_unidirectional_s16_s8(
215 &scratch_buffers, const_cast<int8_t *>(luci_interpreter::kernels::getTensorData<int8_t>(input)),
216 &lstm_dims,
217 const_cast<int8_t *>(luci_interpreter::kernels::getTensorData<int8_t>(input_to_input_weights)),
218 const_cast<int8_t *>(luci_interpreter::kernels::getTensorData<int8_t>(input_to_forget_weights)),
219 const_cast<int8_t *>(luci_interpreter::kernels::getTensorData<int8_t>(input_to_cell_weights)),
220 const_cast<int8_t *>(luci_interpreter::kernels::getTensorData<int8_t>(input_to_output_weights)),
221 const_cast<int8_t *>(
222 luci_interpreter::kernels::getTensorData<int8_t>(recurrent_to_input_weights)),
223 const_cast<int8_t *>(
224 luci_interpreter::kernels::getTensorData<int8_t>(recurrent_to_forget_weights)),
225 const_cast<int8_t *>(
226 luci_interpreter::kernels::getTensorData<int8_t>(recurrent_to_cell_weights)),
227 const_cast<int8_t *>(
228 luci_interpreter::kernels::getTensorData<int8_t>(recurrent_to_output_weights)),
229 const_cast<int16_t *>(luci_interpreter::kernels::getTensorData<int16_t>(cell_to_input_weights)),
230 const_cast<int16_t *>(
231 luci_interpreter::kernels::getTensorData<int16_t>(cell_to_forget_weights)),
232 const_cast<int16_t *>(
233 luci_interpreter::kernels::getTensorData<int16_t>(cell_to_output_weights)),
234 const_cast<int8_t *>(luci_interpreter::kernels::getTensorData<int8_t>(projection_weights)),
235 &cmsis_lstm_params,
236 const_cast<int8_t *>(luci_interpreter::kernels::getTensorData<int8_t>(output_state)),
237 const_cast<int16_t *>(luci_interpreter::kernels::getTensorData<int16_t>(cell_state)),
238 const_cast<int8_t *>(luci_interpreter::kernels::getTensorData<int8_t>(output)));
239}
int32_t dim(int i) const
Definition Tensor.h:41
const Shape & shape() const
Definition Tensor.h:107
#define LUCI_INTERPRETER_CHECK(cond)
Definition Utils.h:36

References luci_interpreter_pal::lstm::convert_lstm_params(), luci_interpreter::Shape::dim(), LUCI_INTERPRETER_CHECK, and luci_interpreter::Tensor::shape().

◆ evalLSTM()

template<typename ActivationType , typename WeightType , typename CellType , typename BiasType >
void luci_interpreter_pal::evalLSTM ( luci_interpreter::lstm::LSTMStruct *  lstm_struct,
luci_interpreter::lstm::LSTMParameters *  lstm_params,
luci_interpreter::lstm::CellStateInfo *  cell_state_info,
ActivationType *  output_state_data,
CellType *  cell_state_data,
CellType *  scratch0,
CellType *  scratch1,
CellType *  scratch2,
CellType *  scratch3,
luci_interpreter::BaseRuntimeGraph *  runtime_graph 
)

Definition at line 515 of file PALUnidirectionalSequenceLSTMCommon.h.

521{
522 lstm_internal::LstmSizeInfo size_info;
523
524 size_info.time_major = lstm_struct->options->time_major();
525 size_info.batch_size = size_info.time_major
526 ? luci_interpreter::Tensor::dim(lstm_struct->input(), 1)
527 : luci_interpreter::Tensor::dim(lstm_struct->input(), 0);
528 size_info.time_steps = size_info.time_major
529 ? luci_interpreter::Tensor::dim(lstm_struct->input(), 0)
530 : luci_interpreter::Tensor::dim(lstm_struct->input(), 1);
531 size_info.input_dimension = luci_interpreter::Tensor::dim(lstm_struct->input(), 2);
532 size_info.state_dimension = luci_interpreter::Tensor::dim(lstm_struct->output_state(), 1);
533
534 lstm_internal::LstmStepManager step_info(size_info);
535
536 // time is the first dimension, enable batch computation
537 if (size_info.time_major)
538 {
539 for (int t = 0; t < size_info.time_steps; t++)
540 {
541 lstm_internal::lstmStep<ActivationType, WeightType, CellType, BiasType>(
542 lstm_struct, lstm_params, &step_info, cell_state_info, output_state_data, cell_state_data,
543 scratch0, scratch1, scratch2, scratch3, runtime_graph);
544 // prepare for the next time step
545 step_info.updateTime();
546 }
547 }
548 else
549 {
550 // batch first: unable to batch the input data, so run single-batch inference
551 for (int b = 0; b < size_info.batch_size; b++)
552 {
553 for (int t = 0; t < size_info.time_steps; t++)
554 {
555 lstm_internal::lstmStep<ActivationType, WeightType, CellType, BiasType>(
556 lstm_struct, lstm_params, &step_info, cell_state_info, output_state_data, cell_state_data,
557 scratch0, scratch1, scratch2, scratch3, runtime_graph);
558 // prepare for the next time step
559 step_info.updateTime();
560 }
561 // prepare for the next batch
562 step_info.updateBatch();
563 step_info.resetTime();
564 }
565 }
566}
const loco::Dimension & dim(uint32_t axis) const
Definition Tensor.h:44
const circle::UnidirectionalSequenceLSTMOptions * options

References luci_interpreter_pal::lstm_internal::LstmSizeInfo::batch_size, circle_eval_diff::TensorShape::dim(), luci_interpreter::lstm::LSTMStruct::input(), luci_interpreter_pal::lstm_internal::LstmSizeInfo::input_dimension, luci_interpreter::lstm::LSTMStruct::options, luci_interpreter::lstm::LSTMStruct::output_state(), luci_interpreter_pal::lstm_internal::LstmStepManager::resetTime(), luci_interpreter_pal::lstm_internal::LstmSizeInfo::state_dimension, luci_interpreter_pal::lstm_internal::LstmSizeInfo::time_major, luci_interpreter_pal::lstm_internal::LstmSizeInfo::time_steps, luci_interpreter_pal::lstm_internal::LstmStepManager::updateBatch(), and luci_interpreter_pal::lstm_internal::LstmStepManager::updateTime().

◆ evalLSTM< int8_t, int8_t, int16_t, int32_t >()

template<>
void luci_interpreter_pal::evalLSTM< int8_t, int8_t, int16_t, int32_t > ( luci_interpreter::lstm::LSTMStruct *  lstm_struct,
luci_interpreter::lstm::LSTMParameters *  lstm_params,
luci_interpreter::lstm::CellStateInfo *  cell_state_info,
int8_t *  output_state_data,
int16_t *  cell_state_data,
int16_t *  scratch0,
int16_t *  scratch1,
int16_t *  scratch2,
int16_t *  scratch3,
luci_interpreter::BaseRuntimeGraph *  runtime_graph 
)

Definition at line 29 of file PALUnidirectionalSequenceLSTM.h.

35{
36 lstm_internal::LstmSizeInfo size_info;
37
38 size_info.time_major = lstm_struct->options->time_major();
39 size_info.batch_size = size_info.time_major
40 ? luci_interpreter::Tensor::dim(lstm_struct->input(), 1)
41 : luci_interpreter::Tensor::dim(lstm_struct->input(), 0);
42 size_info.time_steps = size_info.time_major
43 ? luci_interpreter::Tensor::dim(lstm_struct->input(), 0)
44 : luci_interpreter::Tensor::dim(lstm_struct->input(), 1);
45 size_info.input_dimension = luci_interpreter::Tensor::dim(lstm_struct->input(), 2);
46 size_info.state_dimension = luci_interpreter::Tensor::dim(lstm_struct->output_state(), 1);
47
48 lstm_internal::LstmStepManager step_info(size_info);
49
50 // time is the first dimension, enable batch computation
51 if (size_info.time_major)
52 {
53 for (int t = 0; t < size_info.time_steps; t++)
54 {
55 lstm_internal::lstmStep<int8_t, int8_t, int16_t, int32_t>(
56 lstm_struct, lstm_params, &step_info, cell_state_info, output_state_data, cell_state_data,
57 scratch0, scratch1, scratch2, scratch3, runtime_graph);
58 // prepare for the next time step
59 step_info.updateTime();
60 }
61 }
62 else
63 {
64 // batch first: unable to batch the input data, so run single-batch inference
65 for (int b = 0; b < size_info.batch_size; b++)
66 {
67 for (int t = 0; t < size_info.time_steps; t++)
68 {
69 lstm_internal::lstmStep<int8_t, int8_t, int16_t, int32_t>(
70 lstm_struct, lstm_params, &step_info, cell_state_info, output_state_data, cell_state_data,
71 scratch0, scratch1, scratch2, scratch3, runtime_graph);
72 // prepare for the next time step
73 step_info.updateTime();
74 }
75 // prepare for the next batch
76 step_info.updateBatch();
77 step_info.resetTime();
78 }
79 }
80}

References luci_interpreter_pal::lstm_internal::LstmSizeInfo::batch_size, circle_eval_diff::TensorShape::dim(), luci_interpreter::lstm::LSTMStruct::input(), luci_interpreter_pal::lstm_internal::LstmSizeInfo::input_dimension, luci_interpreter::lstm::LSTMStruct::options, luci_interpreter::lstm::LSTMStruct::output_state(), luci_interpreter_pal::lstm_internal::LstmStepManager::resetTime(), luci_interpreter_pal::lstm_internal::LstmSizeInfo::state_dimension, luci_interpreter_pal::lstm_internal::LstmSizeInfo::time_major, luci_interpreter_pal::lstm_internal::LstmSizeInfo::time_steps, luci_interpreter_pal::lstm_internal::LstmStepManager::updateBatch(), and luci_interpreter_pal::lstm_internal::LstmStepManager::updateTime().

◆ Exp()

void luci_interpreter_pal::Exp ( const int  flat_size,
const float *  input_data,
float *  output_data 
)
inline

Definition at line 26 of file PALExp.h.

27{
28 for (int i = 0; i < flat_size; i++)
29 {
30 const float val = input_data[i];
31 const float result = std::exp(val);
32 output_data[i] = result;
33 }
34}

Referenced by luci_interpreter::execute_kernel_CircleExp().

◆ flatSizeSkipDim()

int luci_interpreter_pal::flatSizeSkipDim ( const int32_t *  dims_data,
int  skip_dim,
int  num_dims 
)
inline

Definition at line 183 of file PALUtils.h.

184{
185 int flat_size = 1;
186 for (int i = 0; i < num_dims; ++i)
187 {
188 flat_size *= (i == skip_dim) ? 1 : dims_data[i];
189 }
190 return flat_size;
191}

Referenced by FullyConnected(), FullyConnected(), FullyConnected(), FullyConnected< int8_t >(), L2Normalization(), and LogSoftmax().
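
A worked example (the declaring PAL header is assumed to be included): for dims {2, 3, 4}, skipping dimension 1 gives 2 * 4 = 8; FullyConnected uses exactly this to turn the output shape minus its depth axis into a batch count.

#include <cstdint>
void example_flat_size_skip_dim()
{
  const int32_t dims[3] = {2, 3, 4};
  const int batches = luci_interpreter_pal::flatSizeSkipDim(dims, /*skip_dim=*/1, /*num_dims=*/3);
  // batches == 8
  (void)batches;
}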

◆ Floor()

void luci_interpreter_pal::Floor ( const luci_interpreter::RuntimeShape input_shape,
const float *  input_data,
const luci_interpreter::RuntimeShape output_shape,
float *  output_data 
)
inline

Definition at line 25 of file PALFloorCommon.h.

27{
28 // check that input and output dimensions are equal
29 int N = input_shape.dimensionsCount();
30 assert(N == output_shape.dimensionsCount());
31
32 // check that sizes of all dimensions are equal
33 for (int i = 0; i < N; ++i)
34 {
35 assert(input_shape.dims(i) == output_shape.dims(i));
36 }
37
38 const int flat_size = input_shape.flatSize();
39 for (int i = 0; i < flat_size; i++)
40 {
41 int offset = i;
42 output_data[offset] = std::floor(input_data[offset]);
43 }
44}

References luci_interpreter::RuntimeShape::dimensionsCount(), luci_interpreter::RuntimeShape::dims(), luci_interpreter::RuntimeShape::flatSize(), offset(), and output_shape.

Referenced by luci_interpreter::execute_kernel_CircleFloor().

◆ FloorDiv()

void luci_interpreter_pal::FloorDiv ( const int  flat_size,
const float *  input1_data,
const float *  input2_data,
float *  output_data 
)
inline

Definition at line 25 of file PALFloorDivCommon.h.

27{
28 BinaryOp<float, FloorDivFn<float>>(flat_size, input1_data, input2_data, output_data);
29}

Referenced by luci_interpreter::execute_kernel_CircleFloorDiv().

◆ FloorMod()

void luci_interpreter_pal::FloorMod ( const int  flat_size,
const float *  input1_data,
const float *  input2_data,
float *  output_data 
)
inline

Definition at line 25 of file PALFloorModCommon.h.

27{
28 BinaryOp<float, FloorModFn<float>>(flat_size, input1_data, input2_data, output_data);
29}

Referenced by luci_interpreter::execute_kernel_CircleFloorMod().

◆ FullyConnected() [1/5]

template<typename WeightType >
void luci_interpreter_pal::FullyConnected ( const FullyConnectedParams params,
const int32_t *  input_shape,
const float *  input_data,
const int32_t *  filter_shape,
const WeightType *  filter_data,
const float *  bias_data,
const int32_t *  output_shape,
float *  output_data,
uint32_t  output_dims_count,
uint32_t  weights_dims_count 
)
inline

Definition at line 72 of file PALFullyConnectedCommon.h.

77{
78 const float output_activation_min = params.float_activation_min;
79 const float output_activation_max = params.float_activation_max;
80
81 const int batches = flatSizeSkipDim(output_shape, output_dims_count - 1, output_dims_count);
82 const int output_depth = output_shape[output_dims_count - 1];
83 const int accum_depth = filter_shape[weights_dims_count - 1];
84
85 for (int b = 0; b < batches; ++b)
86 {
87 const float *weight_scale_ptr = params.weights_scales;
88 for (int out_c = 0; out_c < output_depth; ++out_c)
89 {
90 float total = 0.f;
91 for (int d = 0; d < accum_depth; ++d)
92 {
93 auto input_value = input_data[b * accum_depth + d];
94 if (std::is_same<WeightType, float>::value)
95 {
96 total += input_value * filter_data[out_c * accum_depth + d];
97 }
98 else
99 {
100 const float filter_scale = *weight_scale_ptr;
101 const float filter_value =
102 static_cast<float>(filter_data[out_c * accum_depth + d]) * filter_scale;
103 total += input_value * filter_value;
104 }
105 }
106 float bias_value = 0.0f;
107 if (bias_data)
108 {
109 bias_value = bias_data[out_c];
110 }
111 output_data[out_c + output_depth * b] =
112 std::min(std::max(total + bias_value, output_activation_min), output_activation_max);
113 if (std::is_same<WeightType, int8_t>::value)
114 {
115 if (params.is_channel_wise_quant)
116 weight_scale_ptr++;
117 }
118 }
119 }
120}
int flatSizeSkipDim(const int32_t *dims_data, int skip_dim, int num_dims)
Definition PALUtils.h:183

References flatSizeSkipDim(), luci_interpreter_pal::FullyConnectedParams::float_activation_max, luci_interpreter_pal::FullyConnectedParams::float_activation_min, luci_interpreter_pal::FullyConnectedParams::is_channel_wise_quant, output_shape, and luci_interpreter_pal::FullyConnectedParams::weights_scales.
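
A worked number for the hybrid path above (int8 weights with float activations), as an illustration only: a stored weight of 50 and a per-channel scale of 0.02f dequantize to 1.0f, so an input value of 3.0f contributes 3.0f to total. The scale pointer only advances per output channel when is_channel_wise_quant is set, as in the code above.

#include <cstdint>
void example_hybrid_weight_contribution()
{
  const int8_t stored_weight = 50;
  const float weight_scale = 0.02f;
  const float input_value = 3.0f;
  const float contribution =
    input_value * (static_cast<float>(stored_weight) * weight_scale); // 3.0f
  (void)contribution;
}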

◆ FullyConnected() [2/5]

template<typename InputType , typename WeightType , typename OutputType , typename BiasType >
void luci_interpreter_pal::FullyConnected ( const FullyConnectedParams params,
const int32_t *  input_shape,
const InputType *  input_data,
const int32_t *  filter_shape,
const WeightType *  filter_data,
const BiasType *  bias_data,
const int32_t *  output_shape,
OutputType *  output_data,
uint32_t  output_dims_count,
uint32_t  weights_dims_count 
)
inline

Definition at line 30 of file PALFullyConnectedCommon.h.

35{
36 const int32_t input_offset = params.input_offset;
37 const int32_t filter_offset = params.weights_offset;
38 const int32_t output_offset = params.output_offset;
39 const int32_t output_multiplier = params.output_multiplier;
40 const int output_shift = params.output_shift;
41 const int32_t output_activation_min = params.quantized_activation_min;
42 const int32_t output_activation_max = params.quantized_activation_max;
43
44 const int batches = flatSizeSkipDim(output_shape, output_dims_count - 1, output_dims_count);
45 const int output_depth = output_shape[output_dims_count - 1];
46 const int accum_depth = filter_shape[weights_dims_count - 1];
47
48 for (int b = 0; b < batches; ++b)
49 {
50 for (int out_c = 0; out_c < output_depth; ++out_c)
51 {
52 BiasType acc = 0;
53 for (int d = 0; d < accum_depth; ++d)
54 {
55 int32_t input_val = input_data[b * accum_depth + d];
56 int32_t filter_val = filter_data[out_c * accum_depth + d];
57 acc += (filter_val + filter_offset) * (input_val + input_offset);
58 }
59 if (bias_data)
60 {
61 acc += bias_data[out_c];
62 }
63 int32_t acc_scaled = multiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
64 acc_scaled += output_offset;
65 acc_scaled = std::max(acc_scaled, output_activation_min);
66 acc_scaled = std::min(acc_scaled, output_activation_max);
67 output_data[out_c + output_depth * b] = static_cast<OutputType>(acc_scaled);
68 }
69 }
70}
int32_t multiplyByQuantizedMultiplier(int32_t x, int32_t quantized_multiplier, int shift)
Definition PALUtils.h:77

References flatSizeSkipDim(), luci_interpreter_pal::FullyConnectedParams::input_offset, multiplyByQuantizedMultiplier(), luci_interpreter_pal::FullyConnectedParams::output_multiplier, luci_interpreter_pal::FullyConnectedParams::output_offset, output_shape, luci_interpreter_pal::FullyConnectedParams::output_shift, luci_interpreter_pal::FullyConnectedParams::quantized_activation_max, luci_interpreter_pal::FullyConnectedParams::quantized_activation_min, and luci_interpreter_pal::FullyConnectedParams::weights_offset.

◆ FullyConnected() [3/5]

template<>
void luci_interpreter_pal::FullyConnected ( const luci_interpreter_pal::FullyConnectedParams ,
const int32_t *  ,
const int16_t *  ,
const int32_t *  ,
const int8_t *  ,
const int64_t *  ,
const int32_t *  ,
int16_t *  ,
uint32_t  ,
uint32_t   
)
inline

Definition at line 46 of file PALFullyConnected.h.

49{
50 // MARK: at this moment this operation is not supported
51 assert(false && "FullyConnected INT16 NYI");
52}

◆ FullyConnected() [4/5]

template<>
void luci_interpreter_pal::FullyConnected ( const luci_interpreter_pal::FullyConnectedParams params,
const int32_t *  ,
const int16_t *  input_data,
const int32_t *  filter_shape,
const int8_t *  filter_data,
const int64_t *  bias_data,
const int32_t *  output_shape,
int16_t *  output_data,
uint32_t  output_dims_count,
uint32_t  weights_dims_count 
)
inline

Definition at line 89 of file PALFullyConnected.h.

94{
95 const int batches = flatSizeSkipDim(output_shape, output_dims_count - 1, output_dims_count);
96 const int output_depth = output_shape[output_dims_count - 1];
97 const int accum_depth = filter_shape[weights_dims_count - 1];
98
99 cmsis_nn_fc_params fc_params;
100 fc_params.input_offset = params.input_offset;
101 fc_params.output_offset = params.output_offset;
102 fc_params.filter_offset = params.weights_offset;
103 fc_params.activation.min = params.quantized_activation_min;
104 fc_params.activation.max = params.quantized_activation_max;
105
106 cmsis_nn_per_tensor_quant_params quant_params;
107 quant_params.multiplier = params.output_multiplier;
108 quant_params.shift = params.output_shift;
109
110 cmsis_nn_dims input_dims;
111 input_dims.n = batches;
112 input_dims.h = 1;
113 input_dims.w = 1;
114 input_dims.c = accum_depth;
115
116 cmsis_nn_dims filter_dims;
117 filter_dims.n = accum_depth;
118 filter_dims.h = 1;
119 filter_dims.w = 1;
120 filter_dims.c = output_depth;
121
122 cmsis_nn_dims bias_dims;
123 bias_dims.n = 1;
124 bias_dims.h = 1;
125 bias_dims.w = 1;
126 bias_dims.c = output_depth;
127
128 cmsis_nn_dims output_dims;
129 output_dims.n = batches;
130 output_dims.h = 1;
131 output_dims.w = 1;
132 output_dims.c = output_depth;
133
134 int32_t buf_size = arm_fully_connected_s16_get_buffer_size(&filter_dims);
135 auto buffer = std::make_unique<int8_t[]>(buf_size);
136 assert(buffer != nullptr);
137
138 cmsis_nn_context ctx;
139 ctx.buf = buffer.get();
140 ctx.size = buf_size;
141
142 auto res =
143 arm_fully_connected_s16(&ctx, &fc_params, &quant_params, &input_dims, input_data, &filter_dims,
144 filter_data, &bias_dims, bias_data, &output_dims, output_data);
145 assert(res == ARM_CMSIS_NN_SUCCESS);
146}

References flatSizeSkipDim(), luci_interpreter_pal::FullyConnectedParams::input_offset, luci_interpreter_pal::FullyConnectedParams::output_multiplier, luci_interpreter_pal::FullyConnectedParams::output_offset, output_shape, luci_interpreter_pal::FullyConnectedParams::output_shift, luci_interpreter_pal::FullyConnectedParams::quantized_activation_max, luci_interpreter_pal::FullyConnectedParams::quantized_activation_min, and luci_interpreter_pal::FullyConnectedParams::weights_offset.

◆ FullyConnected() [5/5]

template<>
void luci_interpreter_pal::FullyConnected ( const luci_interpreter_pal::FullyConnectedParams params,
const int32_t *  input_shape,
const int8_t *  input_data,
const int32_t *  filter_shape,
const int8_t *  filter_data,
const int32_t *  bias_data,
const int32_t *  output_shape,
int8_t *  output_data,
uint32_t  ,
uint32_t   
)
inline

Definition at line 27 of file PALFullyConnected.h.

32{
33 // MARK: at this moment this operation is not supported
34 assert(false && "FullyConnected INT8 NYI");
35 (void)params;
36 (void)input_shape;
37 (void)input_data;
38 (void)filter_shape;
39 (void)filter_data;
40 (void)bias_data;
41 (void)output_shape;
42 (void)output_data;
43}

References output_shape.

◆ FullyConnected< int8_t >() [1/4]

template<>
void luci_interpreter_pal::FullyConnected< int8_t > ( const luci_interpreter_pal::FullyConnectedParams params,
const int32_t *  ,
const int8_t *  input_data,
const int32_t *  filter_shape,
const int8_t *  filter_data,
const int32_t *  bias_data,
const int32_t *  output_shape,
int8_t *  output_data,
uint32_t  output_dims_count,
uint32_t  weights_dims_count 
)
inline

Definition at line 28 of file PALFullyConnected.h.

34{
35 const int batches = flatSizeSkipDim(output_shape, output_dims_count - 1, output_dims_count);
36 const int output_depth = output_shape[output_dims_count - 1];
37 const int accum_depth = filter_shape[weights_dims_count - 1];
38
39 cmsis_nn_fc_params fc_params;
40 fc_params.input_offset = params.input_offset;
41 fc_params.output_offset = params.output_offset;
42 fc_params.filter_offset = params.weights_offset;
43 fc_params.activation.min = params.quantized_activation_min;
44 fc_params.activation.max = params.quantized_activation_max;
45
46 cmsis_nn_per_tensor_quant_params quant_params;
47 quant_params.multiplier = params.output_multiplier;
48 quant_params.shift = params.output_shift;
49
50 cmsis_nn_dims input_dims;
51 input_dims.n = batches;
52 input_dims.h = 1;
53 input_dims.w = 1;
54 input_dims.c = accum_depth;
55
56 cmsis_nn_dims filter_dims;
57 filter_dims.n = accum_depth;
58 filter_dims.h = 1;
59 filter_dims.w = 1;
60 filter_dims.c = output_depth;
61
62 cmsis_nn_dims bias_dims;
63 bias_dims.n = 1;
64 bias_dims.h = 1;
65 bias_dims.w = 1;
66 bias_dims.c = output_depth;
67
68 cmsis_nn_dims output_dims;
69 output_dims.n = batches;
70 output_dims.h = 1;
71 output_dims.w = 1;
72 output_dims.c = output_depth;
73
74 int32_t buf_size = arm_fully_connected_s8_get_buffer_size(&filter_dims);
75 auto buffer = std::make_unique<int8_t[]>(buf_size);
76 assert(buffer != nullptr);
77
78 cmsis_nn_context ctx;
79 ctx.buf = buffer.get();
80 ctx.size = buf_size;
81
82 auto res =
83 arm_fully_connected_s8(&ctx, &fc_params, &quant_params, &input_dims, input_data, &filter_dims,
84 filter_data, &bias_dims, bias_data, &output_dims, output_data);
85 assert(res == ARM_CMSIS_NN_SUCCESS);
86}

References flatSizeSkipDim(), luci_interpreter_pal::FullyConnectedParams::input_offset, luci_interpreter_pal::FullyConnectedParams::output_multiplier, luci_interpreter_pal::FullyConnectedParams::output_offset, output_shape, luci_interpreter_pal::FullyConnectedParams::output_shift, luci_interpreter_pal::FullyConnectedParams::quantized_activation_max, luci_interpreter_pal::FullyConnectedParams::quantized_activation_min, and luci_interpreter_pal::FullyConnectedParams::weights_offset.

◆ FullyConnected< int8_t >() [2/4]

template<>
void luci_interpreter_pal::FullyConnected< int8_t > ( const tflite::FullyConnectedParams &  params,
const tflite::RuntimeShape &  input_shape,
const int8_t *  input_data,
const tflite::RuntimeShape &  filter_shape,
const int8_t *  filter_data,
const tflite::RuntimeShape &  bias_shape,
const int32_t *  bias_data,
const tflite::RuntimeShape &  output_shape,
int8_t *  output_data 
)
inline

Definition at line 49 of file PALFullyConnected.h.

55{
56 assert(output_shape.DimensionsCount() == 2);
57
58 const int batches = output_shape.Dims(0);
59 const int output_depth = output_shape.Dims(1);
60
61 const int filter_dim_count = filter_shape.DimensionsCount();
62 const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
63
64 cmsis_nn_fc_params fc_params;
65 fc_params.input_offset = params.input_offset;
66 fc_params.output_offset = params.output_offset;
67 fc_params.filter_offset = params.weights_offset;
68 fc_params.activation.min = params.quantized_activation_min;
69 fc_params.activation.max = params.quantized_activation_max;
70
71 cmsis_nn_per_tensor_quant_params quant_params;
72 quant_params.multiplier = params.output_multiplier;
73 quant_params.shift = params.output_shift;
74
75 cmsis_nn_dims input_dims;
76 input_dims.n = batches;
77 input_dims.h = 1;
78 input_dims.w = 1;
79 input_dims.c = accum_depth;
80
81 cmsis_nn_dims filter_dims;
82 filter_dims.n = accum_depth;
83 filter_dims.h = 1;
84 filter_dims.w = 1;
85 filter_dims.c = output_depth;
86
87 cmsis_nn_dims bias_dims;
88 bias_dims.n = 1;
89 bias_dims.h = 1;
90 bias_dims.w = 1;
91 bias_dims.c = output_depth;
92
93 cmsis_nn_dims output_dims;
94 output_dims.n = batches;
95 output_dims.h = 1;
96 output_dims.w = 1;
97 output_dims.c = output_depth;
98
99 int32_t buf_size = arm_fully_connected_s8_get_buffer_size(&filter_dims);
100 auto buffer = std::make_unique<int8_t[]>(buf_size);
101 assert(buffer != nullptr);
102
103 cmsis_nn_context ctx;
104 ctx.buf = buffer.get();
105 ctx.size = buf_size;
106
107 auto res =
108 arm_fully_connected_s8(&ctx, &fc_params, &quant_params, &input_dims, input_data, &filter_dims,
109 filter_data, &bias_dims, bias_data, &output_dims, output_data);
110 assert(res == ARM_MATH_SUCCESS);
111}

References output_shape.

◆ FullyConnected< int8_t >() [3/4]

template<>
void luci_interpreter_pal::FullyConnected< int8_t > ( const tflite::FullyConnectedParams &  params,
const tflite::RuntimeShape &  input_shape,
const int8_t *  input_data,
const tflite::RuntimeShape &  filter_shape,
const int8_t *  filter_data,
const tflite::RuntimeShape &  bias_shape,
const int32_t *  bias_data,
const tflite::RuntimeShape &  output_shape,
int8_t *  output_data 
)
inline

Definition at line 48 of file PALFullyConnected.h.

54{
55 tflite::reference_integer_ops::FullyConnected(params, input_shape, input_data, filter_shape,
56 filter_data, bias_shape, bias_data, output_shape,
57 output_data);
58}

References output_shape.

◆ FullyConnected< int8_t >() [4/4]

template<>
void luci_interpreter_pal::FullyConnected< int8_t > ( const tflite::FullyConnectedParams &  params,
const tflite::RuntimeShape &  input_shape,
const int8_t *  input_data,
const tflite::RuntimeShape &  filter_shape,
const int8_t *  filter_data,
const tflite::RuntimeShape &  bias_shape,
const int32_t *  bias_data,
const tflite::RuntimeShape &  output_shape,
int8_t *  output_data 
)
inline

Definition at line 48 of file PALFullyConnected.h.

54{
55 tflite::reference_integer_ops::FullyConnected(params, input_shape, input_data, filter_shape,
56 filter_data, bias_shape, bias_data, output_shape,
57 output_data);
58}

References output_shape.

◆ GatherND()

template<typename ParamsT , typename IndicesT >
void luci_interpreter_pal::GatherND ( luci_interpreter::RuntimeShape  params_shape,
const ParamsT *  param_data,
luci_interpreter::RuntimeShape  indices_shape,
const IndicesT *  index_data,
ParamsT *  output_data 
)
inline

Definition at line 30 of file PALGatherND.h.

33{
34 const int indices_dims = indices_shape.dimensionsCount();
35 const int indices_nd = indices_shape.dims(indices_dims - 1);
36 const int params_dims = params_shape.dimensionsCount();
37
38 int n_slices = 1;
39 for (int i = 0; i < indices_dims - 1; ++i)
40 {
41 n_slices *= indices_shape.dims(i);
42 }
43
44 // If indices[-1] == params.rank, fetch single elements.
45 // If indices[-1] < params.rank, fetch slices.
46 int slice_size = 1;
47 for (int i = indices_nd; i < params_dims; ++i)
48 {
49 slice_size *= params_shape.dims(i);
50 }
51
52 int params_flat_size = params_shape.flatSize();
53 int remain_flat_size = params_flat_size;
54
55 // Number of elements per dimension
56 int dims_to_count[MAX_INDICES_ND];
57 for (int i = 0; i < indices_nd; ++i)
58 {
59 dims_to_count[i] = remain_flat_size / params_shape.dims(i);
60 remain_flat_size = dims_to_count[i];
61 }
62
63 for (int i = 0; i < n_slices; ++i)
64 {
65 int from_pos = 0;
66 for (int j = 0; j < indices_nd; ++j)
67 {
68 int offset = i * indices_nd + j;
69 IndicesT index = index_data[offset];
70 from_pos += index * dims_to_count[j];
71 }
72 if (from_pos < 0 || from_pos + slice_size > params_flat_size)
73 {
74 assert(false && "GatherND error");
75 return;
76 }
77 std::memcpy(output_data + i * slice_size, param_data + from_pos, sizeof(ParamsT) * slice_size);
78 }
79}
constexpr int MAX_INDICES_ND
Definition PALGatherND.h:27

References luci_interpreter::RuntimeShape::dimensionsCount(), luci_interpreter::RuntimeShape::dims(), luci_interpreter::RuntimeShape::flatSize(), MAX_INDICES_ND, and offset().

◆ getActivationParams() [1/3]

template<typename P >
void luci_interpreter_pal::getActivationParams ( const P &  params,
float *  min,
float *  max 
)
inline

Definition at line 99 of file PALUtils.h.

100{
101 *min = params.float_activation_min;
102 *max = params.float_activation_max;
103}

◆ getActivationParams() [2/3]

template<typename P >
void luci_interpreter_pal::getActivationParams ( const P &  params,
int32_t *  min,
int32_t *  max 
)
inline

Definition at line 93 of file PALUtils.h.

94{
95 *min = params.quantized_activation_min;
96 *max = params.quantized_activation_max;
97}

Referenced by ArithmeticOp(), ArithmeticOpScalar(), and BroadcastArithmeticOp4DSlow().

◆ getActivationParams() [3/3]

template<typename P >
void luci_interpreter_pal::getActivationParams ( const P &  params,
int64_t *  min,
int64_t *  max 
)
inline

Definition at line 105 of file PALUtils.h.

106{
107 *min = params.int64_activation_min;
108 *max = params.int64_activation_max;
109}

◆ getNearestNeighbor()

int32_t luci_interpreter_pal::getNearestNeighbor ( const int  input_value,
const int32_t  input_size,
const int32_t  output_size,
const bool  align_corners,
const bool  half_pixel_centers 
)
inline

Definition at line 26 of file PALResizeNearestNeighbor.h.

29{
30 const float scale = (align_corners && output_size > 1)
31 ? (input_size - 1) / static_cast<float>(output_size - 1)
32 : input_size / static_cast<float>(output_size);
33 const float offset = half_pixel_centers ? 0.5f : 0.0f;
34 int32_t output_value =
35 std::min(align_corners ? static_cast<int32_t>(std::round((input_value + offset) * scale))
36 : static_cast<int32_t>(std::floor((input_value + offset) * scale)),
37 input_size - 1);
38 if (half_pixel_centers)
39 {
40 output_value = std::max(static_cast<int32_t>(0), output_value);
41 }
42 return output_value;
43}

References offset().

Referenced by ResizeNearestNeighbor().
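A minimal usage sketch (the include name follows the definition note above; the exact include path is an assumption and may differ in your build). It checks the index mapping when a 4-element axis is upscaled to 8 elements with align_corners and half_pixel_centers disabled, so the scale is 4 / 8 = 0.5 and each output index x maps to floor(0.5 * x):

#include <cassert>
#include "PALResizeNearestNeighbor.h" // assumed include path; adjust to your build

int main()
{
  using luci_interpreter_pal::getNearestNeighbor;
  assert(getNearestNeighbor(0, 4, 8, false, false) == 0);
  assert(getNearestNeighbor(3, 4, 8, false, false) == 1); // floor(1.5)
  assert(getNearestNeighbor(7, 4, 8, false, false) == 3); // never exceeds input_size - 1
  return 0;
}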

◆ GreaterEqualFn()

template<typename T >
bool luci_interpreter_pal::GreaterEqualFn ( T  lhs,
T  rhs 
)
inline

Definition at line 55 of file PALComparisons.h.

55{ return lhs >= rhs; }

Referenced by luci_interpreter::execute_kernel_CircleGreaterEqual().

◆ GreaterFn()

template<typename T >
bool luci_interpreter_pal::GreaterFn ( T  lhs,
T  rhs 
)
inline

Definition at line 54 of file PALComparisons.h.

54{ return lhs > rhs; }

Referenced by luci_interpreter::execute_kernel_CircleGreater().

◆ GRU()

void luci_interpreter_pal::GRU ( const float *  input_data,
const float *  weight_input_data,
const float *  weight_hidden_data,
const float *  bias_input_data,
const float *  bias_hidden_data,
const float *  hidden_state_data,
float *  output_data,
float *  output_input_data,
float *  output_hidden_data,
const tflite::RuntimeShape &  input_shape,
const tflite::RuntimeShape &  output_shape,
const tflite::RuntimeShape &  weight_input_shape,
const tflite::RuntimeShape &  weight_hidden_shape 
)

Definition at line 147 of file PALGRU.h.

153{
154 const int32_t time = input_shape.Dims(0);
155
156 tflite::RuntimeShape output_shape_fc(2);
157 output_shape_fc.SetDim(0, 1);
158 output_shape_fc.SetDim(1, weight_hidden_shape.Dims(0));
159
160 std::memcpy(output_data, hidden_state_data, output_shape.FlatSize() * sizeof(float));
161
162 for (int i = 0; i < time; ++i)
163 {
164 calculateGRU(input_data, weight_input_data, weight_hidden_data, bias_input_data,
165 bias_hidden_data, output_data, input_shape, output_shape, weight_input_shape,
166 weight_hidden_shape, output_input_data, output_hidden_data, output_shape_fc);
167 input_data += input_shape.Dims(2);
168 }
169}
void calculateGRU(const float *input_data, const float *weight_input_data, const float *weight_hidden_data, const float *bias_input_data, const float *bias_hidden_data, float *output_data, const tflite::RuntimeShape &input_shape, const tflite::RuntimeShape &output_shape, const tflite::RuntimeShape &weight_input_shape, const tflite::RuntimeShape &weight_hidden_shape, float *output_input_data, float *output_hidden_data, const tflite::RuntimeShape &output_shape_fc)
Definition PALGRU.h:59

References calculateGRU(), and output_shape.

◆ L2Normalization()

void luci_interpreter_pal::L2Normalization ( const luci_interpreter::RuntimeShape input_shape,
const float *  input_data,
const luci_interpreter::RuntimeShape output_shape,
float *  output_data,
float  epsilon = 1e-6 
)
inline

Definition at line 27 of file PALL2Normalize.h.

31{
32 const int trailing_dim = input_shape.dimensionsCount() - 1;
33 const int outer_size =
34 flatSizeSkipDim(input_shape.dimsData(), trailing_dim, input_shape.dimensionsCount());
35 const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
36 for (int i = 0; i < outer_size; ++i)
37 {
38 float squared_l2_norm = 0;
39 for (int c = 0; c < depth; ++c)
40 {
41 const float val = input_data[depth * i + c];
42 squared_l2_norm += val * val;
43 }
44 float l2_norm = std::sqrt(squared_l2_norm);
45 l2_norm = std::fmax(l2_norm, epsilon);
46 for (int c = 0; c < depth; ++c)
47 {
48 output_data[depth * i + c] = input_data[depth * i + c] / l2_norm;
49 }
50 }
51}

References luci_interpreter::RuntimeShape::dimensionsCount(), luci_interpreter::RuntimeShape::dimsData(), flatSizeSkipDim(), L2Normalization(), MatchingDim(), and output_shape.

Referenced by L2Normalization().

◆ L2Pool()

void luci_interpreter_pal::L2Pool ( const PoolParams params,
const luci_interpreter::RuntimeShape input_shape,
const float *  input_data,
const luci_interpreter::RuntimeShape output_shape,
float *  output_data 
)
inline

Definition at line 27 of file PALL2Pool2D.h.

30{
31 const int batches = MatchingDim(input_shape, 0, output_shape, 0);
32 const int depth = MatchingDim(input_shape, 3, output_shape, 3);
33 const int input_height = input_shape.dims(1);
34 const int input_width = input_shape.dims(2);
35 const int output_height = output_shape.dims(1);
36 const int output_width = output_shape.dims(2);
37 const int stride_height = params.stride_height;
38 const int stride_width = params.stride_width;
39 for (int batch = 0; batch < batches; ++batch)
40 {
41 for (int out_y = 0; out_y < output_height; ++out_y)
42 {
43 for (int out_x = 0; out_x < output_width; ++out_x)
44 {
45 for (int channel = 0; channel < depth; ++channel)
46 {
47 const int in_x_origin = (out_x * stride_width) - params.padding_values.width;
48 const int in_y_origin = (out_y * stride_height) - params.padding_values.height;
49 // Compute the boundaries of the filter region clamped so as to
50 // ensure that the filter window fits in the input array.
51 const int filter_x_start = std::max(0, -in_x_origin);
52 const int filter_x_end = std::min(params.filter_width, input_width - in_x_origin);
53 const int filter_y_start = std::max(0, -in_y_origin);
54 const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin);
55 float sum_squares = 0.f;
56 int filter_count = 0;
57 for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y)
58 {
59 for (int filter_x = filter_x_start; filter_x < filter_x_end; ++filter_x)
60 {
61 const int in_x = in_x_origin + filter_x;
62 const int in_y = in_y_origin + filter_y;
63 const float val =
64 input_data[offset(input_shape.dimsData(), batch, in_y, in_x, channel)];
65 sum_squares += val * val;
66 filter_count++;
67 }
68 }
69 assert(filter_count != 0);
70 const float l2pool_result = std::sqrt(sum_squares / filter_count);
71 output_data[offset(output_shape.dimsData(), batch, out_y, out_x, channel)] =
 72 activationFunctionWithMinMax(l2pool_result, params.float_activation_min,
 73 params.float_activation_max);
 74 }
75 }
76 }
77 }
78}
T activationFunctionWithMinMax(T x, T output_activation_min, T output_activation_max)
Definition PALUtils.h:204

References activationFunctionWithMinMax(), luci_interpreter::RuntimeShape::dims(), luci_interpreter::RuntimeShape::dimsData(), luci_interpreter_pal::PoolParams::filter_height, luci_interpreter_pal::PoolParams::filter_width, luci_interpreter_pal::PoolParams::float_activation_max, luci_interpreter_pal::PoolParams::float_activation_min, luci_interpreter_pal::PaddingValues::height, L2Pool(), MatchingDim(), offset(), output_shape, luci_interpreter_pal::PoolParams::padding_values, luci_interpreter_pal::PoolParams::stride_height, luci_interpreter_pal::PoolParams::stride_width, and luci_interpreter_pal::PaddingValues::width.

Referenced by L2Pool().

◆ LessEqualFn()

template<typename T >
bool luci_interpreter_pal::LessEqualFn ( T  lhs,
T  rhs 
)
inline

Definition at line 52 of file PALComparisons.h.

52{ return lhs <= rhs; }

Referenced by luci_interpreter::execute_kernel_CircleLessEqual().

◆ LessFn()

template<typename T >
bool luci_interpreter_pal::LessFn ( T  lhs,
T  rhs 
)
inline

Definition at line 51 of file PALComparisons.h.

51{ return lhs < rhs; }

Referenced by luci_interpreter::execute_kernel_CircleLess().

◆ Log()

void luci_interpreter_pal::Log ( const int  flat_size,
const float *  input_data,
float *  output_data 
)
inline

Definition at line 26 of file PALLog.h.

27{
28 for (int i = 0; i < flat_size; i++)
29 {
30 const float val = input_data[i];
31 const float result = std::log(val);
32 output_data[i] = result;
33 }
34}

Referenced by luci_interpreter::execute_kernel_CircleLog().

◆ LogicalCommon()

void luci_interpreter_pal::LogicalCommon ( const int  flat_size,
const bool *  input1_data,
const bool *  input2_data,
bool *  output_data,
bool(*)(bool, bool)  f 
)
inline

Definition at line 24 of file PALLogicalCommon.h.

26{
27 for (int i = 0; i < flat_size; ++i)
28 {
29 output_data[i] = f(input1_data[i], input2_data[i]);
30 }
31}

Referenced by luci_interpreter::execute_kernel_CircleLogicalAnd(), and luci_interpreter::execute_kernel_CircleLogicalOr().

◆ LogicalNot()

void luci_interpreter_pal::LogicalNot ( const int  flat_size,
const bool *  input_data,
bool *  output_data 
)
inline

Definition at line 24 of file PALLogicalNotCommon.h.

25{
26 for (int i = 0; i < flat_size; ++i)
27 {
28 output_data[i] = !input_data[i];
29 }
30}

Referenced by luci_interpreter::execute_kernel_CircleLogicalNot().

◆ Logistic() [1/3]

void luci_interpreter_pal::Logistic ( const int  flat_size,
const float *  input_data,
float *  output_data 
)
inline

Definition at line 26 of file PALGRU.h.

27{
28 const float cutoff_upper = 16.619047164916992188f;
29 const float cutoff_lower = -9.f;
30
31 // Rational for using approximation in reference kernel.
32 // 0. This approximation gives enough precision for float.
33 // 1. This works around an issue on an embedded chipset where exp() does not
34 // return correctly as expected - exp(x) should return inf when overflown
35 // not 1.701417 IEEE 754 defines representation for inf.
36 // 2. This will speed up calculation and is matching the behavior in the
37 // optimized kernels. (check the definition of scalar_logistic_op<float>)
38
39 for (int i = 0; i < flat_size; i++)
40 {
41 float val = input_data[i];
42 float result;
43 if (val > cutoff_upper)
44 {
45 result = 1.0f;
46 }
47 else if (val < cutoff_lower)
48 {
49 result = std::exp(val);
50 }
51 else
52 {
53 result = 1.f / (1.f + std::exp(-val));
54 }
 55 output_data[i] = result;
 56 }
57}

Referenced by calculateGRU(), luci_interpreter::execute_kernel_CircleLogistic(), luci_interpreter_pal::lstm_internal::sigmoid(), and luci_interpreter_pal::lstm_internal::sigmoid().
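A minimal sketch of the float overload above. Per the definition note it lives in PALGRU.h in this backend; the include path is an assumption. It relies on sigmoid(0) = 0.5 and sigmoid(-x) = 1 - sigmoid(x):

#include <cassert>
#include <cmath>
#include "PALGRU.h" // assumed include path; the float Logistic overload above is defined here

int main()
{
  const float input[3] = {-1.0f, 0.0f, 1.0f};
  float output[3];
  luci_interpreter_pal::Logistic(3, input, output);
  assert(std::fabs(output[1] - 0.5f) < 1e-6f);
  assert(std::fabs(output[0] + output[2] - 1.0f) < 1e-6f);
  return 0;
}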

◆ Logistic() [2/3]

void luci_interpreter_pal::Logistic ( const int  flat_size,
const int8_t *  input_data,
float  input_scale,
int  input_zero_point,
int8_t *  output_data,
float  output_scale,
int  output_zero_point 
)
inline

Definition at line 60 of file PALLogistic.h.

63{
64 const float cutoff_upper = 16.619047164916992188f;
65 const float cutoff_lower = -9.f;
66
67 // Rational for using approximation in reference kernel.
68 // 0. This approximation gives enough precision for float.
69 // 1. This works around an issue on an embedded chipset where exp() does not
70 // return correctly as expected - exp(x) should return inf when overflown
71 // not 1.701417 IEEE 754 defines representation for inf.
72 // 2. This will speed up calculation and is matching the behavior in the
73 // optimized kernels. (check the definition of scalar_logistic_op<float>)
74
75 for (int i = 0; i < flat_size; i++)
76 {
77 // Dequantize.
78 float val = static_cast<float>((input_data[i] - input_zero_point) * input_scale);
79 float result;
80 if (val > cutoff_upper)
81 {
82 result = 1.0f;
83 }
84 else if (val < cutoff_lower)
85 {
86 result = std::exp(val);
87 }
88 else
89 {
90 result = 1.f / (1.f + std::exp(-val));
91 }
92 // Requantize
93 int8_t output = static_cast<int8_t>(result / output_scale + output_zero_point);
 94 output_data[i] = output;
 95 }
96}

◆ Logistic() [3/3]

void luci_interpreter_pal::Logistic ( int32_t  input_multiplier,
int32_t  input_left_shift,
int32_t  input_size,
const int16_t *  ptr_input_data,
int16_t *  ptr_output_data 
)
inline

Definition at line 98 of file PALLogistic.h.

100{
101 // We use the LUT for sigmoid and take into account, that
102 // tanh(x) = 2*sigmoid(2*x) - 1
103
104 // We scale by 3/4 to expand range [-8,8]->[-10.7,10.7].
105 // In case of general parameter scale, multiplier 3 is taken into account
106 // in TanhPrepare function and it is included in
107 // input_multiplier already.
108 if (input_multiplier == 0)
109 { // power of two case
110 input_multiplier = 3 << input_left_shift;
111 input_left_shift = 0;
112 }
113
114 int32_t round = (input_left_shift > 0) ? 1 << (input_left_shift - 1) : 0;
115
116 for (int i = 0; i < input_size; ++i, ptr_input_data++, ptr_output_data++)
117 {
118 int32_t input_data = ((*ptr_input_data) * input_multiplier + round) >> input_left_shift;
119
120 // We do interpolation on unsigned values.
121 uint32_t abs_input_data = abs(input_data);
122
123 // We divide by 2 power of 9, because
124 // we need to divide by 2 in power of 7 for
125 // the input conversion + 1/4 from the scale above.
126
127 // Define uh as uint32_t type not to make this function overflow.
128 uint32_t uh = abs_input_data >> 9;
129 uint32_t result;
130
131 if (uh >= 255)
132 {
133 // Saturate to maximum.
134 result = 0x7FFF << 10;
135 }
136 else
137 {
138 uint32_t ua = sigmoid_table_uint16[uh];
139 uint32_t ub = sigmoid_table_uint16[uh + 1];
140 uint32_t ut = abs_input_data & 0x1ff;
141 // Interpolation is done using the fractional bit.
142 result = (ua << 9) + ut * (ub - ua);
143 }
144
145 result = (input_data >= 0) ? (result + (1 << 9)) : ((1 << (16 + 9)) - result + (1 << 9) - 1);
146
147 // Back to 16-bit.
148 result >>= 10;
149
150 *ptr_output_data = result;
151 }
152}

◆ LogSoftmax()

void luci_interpreter_pal::LogSoftmax ( const luci_interpreter::RuntimeShape input_shape,
const float *  input_data,
const luci_interpreter::RuntimeShape output_shape,
float *  output_data 
)
inline

Definition at line 28 of file PALLogSoftmax.h.

30{
31 const int trailing_dim = input_shape.dimensionsCount() - 1;
32 const int outer_size =
33 flatSizeSkipDim(input_shape.dimsData(), trailing_dim, input_shape.dimensionsCount());
34 const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
35
36 for (int i = 0; i < outer_size; ++i)
37 {
38 // Find max element value which we'll use to ensure numerical stability
39 // taking advantage of the following equality:
40 // log(exp(x[i])/sum(exp(x[i]))) == log(exp(x[i]+C)/sum(exp(x[i]+C)))
41 float max = std::numeric_limits<float>::lowest();
42 for (int c = 0; c < depth; ++c)
43 {
44 max = std::max(max, input_data[i * depth + c]);
45 }
46
47 // Compute sum.
48 float sum = 0.f;
49 for (int c = 0; c < depth; ++c)
50 {
51 sum += std::exp(input_data[i * depth + c] - max);
52 }
53
54 // Compute result.
55 const float log_sum = std::log(sum);
56 for (int c = 0; c < depth; ++c)
57 {
58 output_data[i * depth + c] = input_data[i * depth + c] - max - log_sum;
59 }
60 }
61}

References luci_interpreter::RuntimeShape::dimensionsCount(), luci_interpreter::RuntimeShape::dimsData(), flatSizeSkipDim(), MatchingDim(), and output_shape.
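Because the entry point above takes luci_interpreter::RuntimeShape arguments whose construction is not shown on this page, the following standalone sketch re-implements only the per-row arithmetic to make the max-subtraction trick concrete; it does not call the PAL function itself:

#include <algorithm>
#include <cmath>
#include <vector>

// log_softmax(x)[c] = x[c] - max(x) - log(sum_k exp(x[k] - max(x)))
std::vector<float> log_softmax_row(const std::vector<float> &x)
{
  const float max = *std::max_element(x.begin(), x.end());
  float sum = 0.f;
  for (float v : x)
    sum += std::exp(v - max); // subtracting max keeps exp() in a safe range
  const float log_sum = std::log(sum);
  std::vector<float> y(x.size());
  for (size_t c = 0; c < x.size(); ++c)
    y[c] = x[c] - max - log_sum;
  return y;
}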

◆ MatchingDim()

int luci_interpreter_pal::MatchingDim ( const luci_interpreter::RuntimeShape shape1,
int  index1,
const luci_interpreter::RuntimeShape shape2,
int  index2 
)
inline

Definition at line 173 of file PALUtils.h.

175{
176 assert(shape1.dims(index1) == shape2.dims(index2));
177 return shape1.dims(index1);
178}

References luci_interpreter::RuntimeShape::dims().

Referenced by L2Normalization(), L2Pool(), LogSoftmax(), and ResizeNearestNeighbor().

◆ Maximum()

void luci_interpreter_pal::Maximum ( const int  flat_size,
const float *  input1_data,
const float *  input2_data,
float *  output_data 
)
inline

Definition at line 25 of file PALMaximumCommon.h.

27{
28 BinaryOp<float, MaximumFn<float>>(flat_size, input1_data, input2_data, output_data);
29}

Referenced by luci_interpreter::execute_kernel_CircleMaximum().

◆ MaxPool() [1/2]

void luci_interpreter_pal::MaxPool ( const PoolParams params,
const luci_interpreter::RuntimeShape input_shape,
const float *  input_data,
const luci_interpreter::RuntimeShape output_shape,
float *  output_data 
)
inline

Definition at line 27 of file PALMaxPool2DCommon.h.

30{
31 const int batches = input_shape.dims(0);
32 const int depth = output_shape.dims(3);
33 const int input_height = input_shape.dims(1);
34 const int input_width = input_shape.dims(2);
35 const int output_height = output_shape.dims(1);
36 const int output_width = output_shape.dims(2);
37 const int stride_height = params.stride_height;
38 const int stride_width = params.stride_width;
39 for (int batch = 0; batch < batches; ++batch)
40 {
41 for (int out_y = 0; out_y < output_height; ++out_y)
42 {
43 for (int out_x = 0; out_x < output_width; ++out_x)
44 {
45 for (int channel = 0; channel < depth; ++channel)
46 {
47 const int in_x_origin = (out_x * stride_width) - params.padding_values.width;
48 const int in_y_origin = (out_y * stride_height) - params.padding_values.height;
49 // Compute the boundaries of the filter region clamped so as to
50 // ensure that the filter window fits in the input array.
51 const int filter_x_start = std::max(0, -in_x_origin);
52 const int filter_x_end = std::min(params.filter_width, input_width - in_x_origin);
53 const int filter_y_start = std::max(0, -in_y_origin);
54 const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin);
55 float max = std::numeric_limits<float>::lowest();
56 for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y)
57 {
58 for (int filter_x = filter_x_start; filter_x < filter_x_end; ++filter_x)
59 {
60 const int in_x = in_x_origin + filter_x;
61 const int in_y = in_y_origin + filter_y;
62
63 const int input_data_offset =
64 ((batch * input_shape.dims(1) + in_y) * input_shape.dims(2) + in_x) *
65 input_shape.dims(3) +
66 channel;
67
68 max = std::max(max, input_data[input_data_offset]);
69 }
70 }
71 const int output_data_offset =
72 ((batch * output_shape.dims(1) + out_y) * output_shape.dims(2) + out_x) *
 73 output_shape.dims(3) +
 74 channel;
75
76 output_data[output_data_offset] =
77 std::min(std::max(max, params.float_activation_min), params.float_activation_max);
78 }
79 }
80 }
81 }
82}

References luci_interpreter::RuntimeShape::dims(), luci_interpreter_pal::PoolParams::filter_height, luci_interpreter_pal::PoolParams::filter_width, luci_interpreter_pal::PoolParams::float_activation_max, luci_interpreter_pal::PoolParams::float_activation_min, luci_interpreter_pal::PaddingValues::height, output_shape, luci_interpreter_pal::PoolParams::padding_values, luci_interpreter_pal::PoolParams::stride_height, luci_interpreter_pal::PoolParams::stride_width, and luci_interpreter_pal::PaddingValues::width.

◆ MaxPool() [2/2]

void luci_interpreter_pal::MaxPool ( const PoolParams params,
const luci_interpreter::RuntimeShape input_shape,
const uint8_t *  input_data,
const luci_interpreter::RuntimeShape output_shape,
uint8_t *  output_data,
luci_interpreter::DataType  data_type 
)
inline

Definition at line 28 of file PALMaxPool2D.h.

31{
32 cmsis_nn_dims input_dims;
33 cmsis_nn_dims output_dims;
34 cmsis_nn_pool_params pool_params;
35 cmsis_nn_dims filter_dims;
36 cmsis_nn_context ctx;
37
38 const int depth = input_shape.dims(3);
39 const int output_width = output_shape.dims(2);
40
41 input_dims.n = 1;
42 input_dims.h = input_shape.dims(1);
43 input_dims.w = input_shape.dims(2);
44 input_dims.c = depth;
45
46 output_dims.n = 1;
47 output_dims.h = output_shape.dims(1);
48 output_dims.w = output_width;
49 output_dims.c = depth;
50
51 pool_params.stride.h = params.stride_height;
52 pool_params.stride.w = params.stride_width;
53 pool_params.padding.h = params.padding_values.height;
54 pool_params.padding.w = params.padding_values.width;
55 pool_params.activation.min = params.quantized_activation_min;
56 pool_params.activation.max = params.quantized_activation_max;
57
58 filter_dims.n = 1;
59 filter_dims.h = params.filter_height;
60 filter_dims.w = params.filter_width;
61 filter_dims.c = 1;
62
63 if (data_type == luci_interpreter::DataType::S8)
64 {
65 arm_max_pool_s8(&ctx, &pool_params, &input_dims,
66 luci_interpreter::kernels::getTensorData<int8_t>(input_data), &filter_dims,
67 &output_dims, luci_interpreter::kernels::getTensorData<int8_t>(output_data));
68 }
69 else
70 {
71 arm_max_pool_s16(&ctx, &pool_params, &input_dims,
72 luci_interpreter::kernels::getTensorData<int16_t>(input_data), &filter_dims,
73 &output_dims, luci_interpreter::kernels::getTensorData<int16_t>(output_data));
74 }
75}

References luci_interpreter::RuntimeShape::dims(), luci_interpreter_pal::PoolParams::filter_height, luci_interpreter_pal::PoolParams::filter_width, luci_interpreter_pal::PaddingValues::height, output_shape, luci_interpreter_pal::PoolParams::padding_values, luci_interpreter_pal::PoolParams::quantized_activation_max, luci_interpreter_pal::PoolParams::quantized_activation_min, luci_interpreter_pal::PoolParams::stride_height, luci_interpreter_pal::PoolParams::stride_width, and luci_interpreter_pal::PaddingValues::width.

Referenced by luci_interpreter::execute_kernel_CircleMaxPool2D().

◆ Mean() [1/2]

void luci_interpreter_pal::Mean ( const MeanParams op_params,
const luci_interpreter::RuntimeShape unextended_input_shape,
const float *  input_data,
const luci_interpreter::RuntimeShape unextended_output_shape,
float *  output_data 
)
inline

Definition at line 167 of file PALMean.h.

171{
172 // Current implementation only supports dimension equals 4 and simultaneous
173 // reduction over width and height.
174 const luci_interpreter::RuntimeShape input_shape =
175 luci_interpreter::RuntimeShape::extendedShape(4, unextended_input_shape);
 176 const luci_interpreter::RuntimeShape output_shape =
 177 luci_interpreter::RuntimeShape::extendedShape(4, unextended_output_shape);
178
179 const int output_batch = output_shape.dims(0);
180 const int output_depth = output_shape.dims(3);
181
182 const int input_height = input_shape.dims(1);
183 const int input_width = input_shape.dims(2);
184
185 for (int out_b = 0; out_b < output_batch; ++out_b)
186 {
187 for (int out_d = 0; out_d < output_depth; ++out_d)
188 {
189 float value = 0;
190 for (int in_h = 0; in_h < input_height; ++in_h)
191 {
192 for (int in_w = 0; in_w < input_width; ++in_w)
193 {
194 value += input_data[offset(input_shape.dimsData(), out_b, in_h, in_w, out_d)];
195 }
196 }
197 output_data[offset(output_shape.dimsData(), out_b, 0, 0, out_d)] =
198 value / (input_width * input_height);
199 }
200 }
201}

References luci_interpreter::RuntimeShape::dims(), luci_interpreter::RuntimeShape::dimsData(), luci_interpreter::RuntimeShape::extendedShape(), offset(), and output_shape.

◆ Mean() [2/2]

template<typename T , typename U >
bool luci_interpreter_pal::Mean ( const T *  input_data,
const int *  input_dims,
const int  input_num_dims,
T *  output_data,
const int *  output_dims,
const int  output_num_dims,
const int *  axis,
const int  num_axis_dimensions,
bool  ,
int *  temp_index,
int *  resolved_axis,
U *  temp_sum 
)
inline

Definition at line 108 of file PALMean.h.

112{
113 // Reset output data.
114 size_t num_outputs = 1;
115 for (int idx = 0; idx < output_num_dims; ++idx)
116 {
117 size_t current = static_cast<size_t>(output_dims[idx]);
118 // Overflow prevention.
119 if (num_outputs > std::numeric_limits<size_t>::max() / current)
120 {
121 return false;
122 }
123 num_outputs *= current;
124 }
125 for (size_t idx = 0; idx < num_outputs; ++idx)
126 {
127 output_data[idx] = T();
128 temp_sum[idx] = U();
129 }
130
131 // Resolve axis.
132 int num_resolved_axis = 0;
133 if (!resolveAxis(input_num_dims, axis, num_axis_dimensions, resolved_axis, &num_resolved_axis))
134 {
135 return false;
136 }
137
138 if (!reduceSumImpl<T, U>(input_data, input_dims, output_dims, input_num_dims, output_num_dims,
139 resolved_axis, num_resolved_axis, temp_index, temp_sum))
140 {
141 return false;
142 }
143
144 // Calculate mean by dividing output_data by num of aggregated element.
145 size_t num_elements_in_axis = 1;
146 for (int idx = 0; idx < num_resolved_axis; ++idx)
147 {
148 size_t current = static_cast<size_t>(input_dims[resolved_axis[idx]]);
149 // Overflow prevention.
150 if (current > (std::numeric_limits<size_t>::max() / num_elements_in_axis))
151 {
152 return false;
153 }
154 num_elements_in_axis *= current;
155 }
156
157 if (num_elements_in_axis > 0)
158 {
159 for (size_t idx = 0; idx < num_outputs; ++idx)
160 {
161 output_data[idx] = static_cast<T>(temp_sum[idx] / static_cast<U>(num_elements_in_axis));
162 }
163 }
164 return true;
165}

Referenced by luci_interpreter::execute_kernel_CircleMean().

◆ Minimum()

void luci_interpreter_pal::Minimum ( const int  flat_size,
const float *  input1_data,
const float *  input2_data,
float *  output_data 
)
inline

Definition at line 25 of file PALMinimumCommon.h.

27{
28 BinaryOp<float, MinimumFn<float>>(flat_size, input1_data, input2_data, output_data);
29}

Referenced by luci_interpreter::execute_kernel_CircleMinimum().

◆ MirrorPad()

template<typename T >
void luci_interpreter_pal::MirrorPad ( const luci_interpreter::DataType  padding_matrix_type,
const uint8_t *  padding_matrix_data,
const int32_t *  input_dims,
int *  output_dims_num_elements,
int *  input_dims_num_elements,
const T *  input_data,
T *  output_data,
const int  offset,
const int  num_dims,
const int  output_size 
)

Definition at line 95 of file PALMirrorPad.h.

99{
100 for (int i = 0; i < output_size; ++i)
101 {
102 output_data[i] =
103 input_data[getFlatIndex(i, num_dims, padding_matrix_type, padding_matrix_data, input_dims,
104 output_dims_num_elements, input_dims_num_elements, offset)];
105 }
106}

References offset().

Referenced by luci_interpreter::execute_kernel_CircleMirrorPad().

◆ Mul() [1/2]

template<typename T >
void luci_interpreter_pal::Mul ( const ArithmeticParams params,
const int  flat_size,
const T *  input1_data,
const T *  input2_data,
T *  output_data 
)
inline

Definition at line 26 of file PALMulCommon.h.

28{
29 ArithmeticOp<T, MulFn<T>>(params, flat_size, input1_data, input2_data, output_data);
30}
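A minimal float sketch. It assumes ArithmeticParams exposes float_activation_min / float_activation_max (the fields read by the float getActivationParams overload documented earlier in this page) and that PALMulCommon.h is on the include path:

#include <limits>
#include "PALMulCommon.h" // assumed include path; the Mul template above is defined here

void mul_example()
{
  luci_interpreter_pal::ArithmeticParams params{};
  // Assumption: these are the float activation bounds consumed by getActivationParams.
  params.float_activation_min = std::numeric_limits<float>::lowest();
  params.float_activation_max = std::numeric_limits<float>::max();

  const float a[4] = {1.f, 2.f, 3.f, 4.f};
  const float b[4] = {10.f, 10.f, 10.f, 10.f};
  float out[4];
  luci_interpreter_pal::Mul<float>(params, 4, a, b, out); // out == {10, 20, 30, 40}
}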

◆ Mul() [2/2]

template<>
void luci_interpreter_pal::Mul ( tflite::ArithmeticParams &  params,
const tflite::RuntimeShape &  input1_shape,
const int64_t *  input1_data,
const tflite::RuntimeShape &  input2_shape,
const int64_t *  input2_data,
const tflite::RuntimeShape &  output_shape,
int64_t *  output_data 
)
inline

Definition at line 35 of file PALMul.h.

39{
40 tflite::optimized_ops::BroadcastMul4DSlow(params, input1_shape, input1_data, input2_shape,
41 input2_data, output_shape, output_data);
42}

References output_shape.

◆ Mul< int16_t >() [1/2]

template<>
void luci_interpreter_pal::Mul< int16_t > ( const ArithmeticParams ,
const int  ,
const int16_t *  ,
const int16_t *  ,
int16_t *   
)
inline

Definition at line 34 of file PALMul.h.

36{
37 assert(false && "Not IMPL yet");
38}

◆ Mul< int16_t >() [2/2]

template<>
void luci_interpreter_pal::Mul< int16_t > ( const ArithmeticParams params,
const int  flat_size,
const int16_t *  input1_data,
const int16_t *  input2_data,
int16_t *  output_data 
)
inline

◆ Mul< int8_t >() [1/2]

template<>
void luci_interpreter_pal::Mul< int8_t > ( const ArithmeticParams ,
const int  ,
const int8_t *  ,
const int8_t *  ,
int8_t *   
)
inline

Definition at line 27 of file PALMul.h.

29{
30 assert(false && "Not IMPL yet");
31}

◆ Mul< int8_t >() [2/2]

template<>
void luci_interpreter_pal::Mul< int8_t > ( const ArithmeticParams params,
const int  flat_size,
const int8_t *  input1_data,
const int8_t *  input2_data,
int8_t *  output_data 
)
inline

◆ MulScalar()

template<typename T >
void luci_interpreter_pal::MulScalar ( const ArithmeticParams params,
const int  flat_size,
const T *  input_data,
const T  scalar_value,
T *  output_data 
)
inline

Definition at line 33 of file PALMulCommon.h.

35{
36 ArithmeticOpScalar<T, MulFn<T>>(params, flat_size, input_data, scalar_value, output_data);
37}

◆ multiplyByQuantizedMultiplier()

int32_t luci_interpreter_pal::multiplyByQuantizedMultiplier ( int32_t  x,
int32_t  quantized_multiplier,
int  shift 
)
inline

Definition at line 77 of file PALUtils.h.

78{
79 int left_shift = shift > 0 ? shift : 0;
80 int right_shift = shift > 0 ? 0 : -shift;
 81 return roundingDivideByPOT(
 82 saturatingRoundingDoublingHighMul(x * (1 << left_shift), quantized_multiplier), right_shift);
83}
std::int32_t saturatingRoundingDoublingHighMul(std::int32_t a, std::int32_t b)
Definition PALUtils.h:52
int32_t roundingDivideByPOT(int32_t x, int32_t exponent)
Definition PALUtils.h:65

References roundingDivideByPOT(), and saturatingRoundingDoublingHighMul().

Referenced by FullyConnected(), and luci_interpreter_pal::lstm_internal::mulElementwise().
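A small numeric sketch (assumed include path). A real multiplier m is typically stored as quantized_multiplier = round(m * 2^31) with the power of two carried in shift; for example m = 0.5 is (1 << 30, shift = 0):

#include <cassert>
#include "PALUtils.h" // assumed include path; the fixed-point helpers live in this header

int main()
{
  // 1 << 30 encodes the real multiplier 2^30 / 2^31 = 0.5.
  assert(luci_interpreter_pal::multiplyByQuantizedMultiplier(100, 1 << 30, 0) == 50);
  // A positive shift doubles the effective multiplier: 0.5 * 2 = 1.0.
  assert(luci_interpreter_pal::multiplyByQuantizedMultiplier(100, 1 << 30, 1) == 100);
  // A negative shift halves it: 0.5 * 0.5 = 0.25.
  assert(luci_interpreter_pal::multiplyByQuantizedMultiplier(100, 1 << 30, -1) == 25);
  return 0;
}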

◆ multiplyByQuantizedMultiplierSmallerThanOneExp()

int32_t luci_interpreter_pal::multiplyByQuantizedMultiplierSmallerThanOneExp ( int32_t  x,
int32_t  quantized_multiplier,
int  left_shift 
)
inline

Definition at line 85 of file PALUtils.h.

88{
89 return roundingDivideByPOT(saturatingRoundingDoublingHighMul(x, quantized_multiplier),
90 -left_shift);
91}

References roundingDivideByPOT(), and saturatingRoundingDoublingHighMul().

Referenced by BroadcastComparison4DSlowWithScaling(), and ComparisonWithScaling().

◆ NdArrayDescsForElementwiseBroadcast()

template<int N>
void luci_interpreter_pal::NdArrayDescsForElementwiseBroadcast ( const luci_interpreter::RuntimeShape input0_shape,
const luci_interpreter::RuntimeShape input1_shape,
NdArrayDesc< N > *  desc0_out,
NdArrayDesc< N > *  desc1_out 
)
inline

Definition at line 89 of file ProcessBroadcastShapes.h.

93{
94
95 auto extended_input0_shape = luci_interpreter::RuntimeShape::extendedShape(N, input0_shape);
96 auto extended_input1_shape = luci_interpreter::RuntimeShape::extendedShape(N, input1_shape);
97
98 // Copy dims to desc, calculating strides.
99 copyDimsToDesc<N>(extended_input0_shape, desc0_out);
100 copyDimsToDesc<N>(extended_input1_shape, desc1_out);
101
102 // Walk over each dimension. If the extents are equal do nothing.
103 // Otherwise, set the desc with extent 1 to have extent equal to the other and
104 // stride 0.
105 for (int i = 0; i < N; ++i)
106 {
107 const int extent0 = extended_input0_shape.dims(i);
108 const int extent1 = extended_input1_shape.dims(i);
109 if (extent0 != extent1)
110 {
111 if (extent0 == 1)
112 {
113 desc0_out->strides[i] = 0;
114 desc0_out->extents[i] = extent1;
115 }
116 else
117 {
118 desc1_out->strides[i] = 0;
119 desc1_out->extents[i] = extent0;
120 }
121 }
122 }
123}

References luci_interpreter::RuntimeShape::extendedShape(), luci_interpreter_pal::NdArrayDesc< N >::extents, and luci_interpreter_pal::NdArrayDesc< N >::strides.

Referenced by BroadcastArithmeticOp4DSlow(), BroadcastBinaryOp4DSlow(), BroadcastPrelu4DSlowFloat(), and BroadcastTISO4DSlow().

◆ NDOpsHelper()

template<int N, typename Calc >
void luci_interpreter_pal::NDOpsHelper ( const NdArrayDesc< N > &  output,
const Calc &  calc 
)
inline

Definition at line 82 of file ProcessBroadcastShapes.h.

83{
84 int indexes[N] = {0};
85 NDOpsHelperImpl<N, 0, Calc>(output, calc, indexes);
86}

◆ NDOpsHelperImpl() [1/2]

template<int N, int DIM, typename Calc >
std::enable_if< DIM==N-1, void >::type luci_interpreter_pal::NDOpsHelperImpl ( const NdArrayDesc< N > &  output,
const Calc &  calc,
int  indexes[N] 
)

Definition at line 60 of file ProcessBroadcastShapes.h.

62{
63 for (indexes[DIM] = 0; indexes[DIM] < output.extents[DIM]; ++indexes[DIM])
64 {
65 calc(indexes);
66 }
67}

◆ NDOpsHelperImpl() [2/2]

template<int N, int DIM, typename Calc >
std::enable_if< DIM!=N-1, void >::type luci_interpreter_pal::NDOpsHelperImpl ( const NdArrayDesc< N > &  output,
const Calc &  calc,
int  indexes[N] 
)

Definition at line 70 of file ProcessBroadcastShapes.h.

72{
73 for (indexes[DIM] = 0; indexes[DIM] < output.extents[DIM]; ++indexes[DIM])
74 {
75 NDOpsHelperImpl<N, DIM + 1, Calc>(output, calc, indexes);
76 }
77}

◆ Negate()

template<typename T >
void luci_interpreter_pal::Negate ( const luci_interpreter::RuntimeShape input_shape,
const T *  input_data,
const luci_interpreter::RuntimeShape output_shape,
T *  output_data 
)
inline

Definition at line 23 of file PALNeg.h.

26{
27 // check that input and output dimensions are equal
28 int N = input_shape.dimensionsCount();
29 assert(N == output_shape.dimensionsCount());
30
31 // check that sizes of all dimensions are equal
32 for (int i = 0; i < N; ++i)
33 {
34 assert(input_shape.dims(i) == output_shape.dims(i));
35 }
36
37 const int flat_size = input_shape.flatSize();
38
39 for (int i = 0; i < flat_size; ++i)
40 {
41 output_data[i] = -input_data[i];
42 }
43}

References luci_interpreter::RuntimeShape::dimensionsCount(), luci_interpreter::RuntimeShape::dims(), luci_interpreter::RuntimeShape::flatSize(), Negate(), and output_shape.

Referenced by Negate().

◆ nextIndex()

bool luci_interpreter_pal::nextIndex ( const int  num_dims,
const int *  dims,
int *  current 
)
inline

Definition at line 148 of file PALUtils.h.

149{
150 if (num_dims == 0)
151 {
152 return false;
153 }
154 int carry = 1;
155 for (int idx = num_dims - 1; idx >= 0; --idx)
156 {
157 int current_val = current[idx] + carry;
158 if (dims[idx] == current_val)
159 {
160 current[idx] = 0;
161 }
162 else
163 {
164 current[idx] = current_val;
165 carry = 0;
166 break;
167 }
168 }
169 return (carry == 0);
170}

Referenced by ReduceGeneric().
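A small sketch of the odometer-style iteration (assumed include path): starting from an all-zero index, nextIndex() returns true until the index wraps back to zero, so a 2 x 3 index space yields the start position plus five successful increments:

#include <cassert>
#include "PALUtils.h" // assumed include path

int main()
{
  const int dims[2] = {2, 3};
  int current[2] = {0, 0};
  int visited = 1; // the starting index {0, 0} counts as the first position
  while (luci_interpreter_pal::nextIndex(2, dims, current))
    ++visited;
  assert(visited == 6); // every (i, j) with i < 2 and j < 3 was enumerated
  return 0;
}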

◆ NotEqualFn()

template<typename T >
bool luci_interpreter_pal::NotEqualFn ( T  lhs,
T  rhs 
)
inline

Definition at line 56 of file PALComparisons.h.

56{ return lhs != rhs; }

Referenced by luci_interpreter::execute_kernel_CircleNotEqual().

◆ offset() [1/2]

int luci_interpreter_pal::offset ( const int32_t *  dims_data,
int  i0,
int  i1,
int  i2,
int  i3 
)
inline

Definition at line 193 of file PALUtils.h.

194{
195 return ((i0 * dims_data[1] + i1) * dims_data[2] + i2) * dims_data[3] + i3;
196}

Referenced by BatchToSpaceND(), BroadcastPrelu4DSlowFloat(), DepthToSpace(), Floor(), GatherND(), getNearestNeighbor(), L2Pool(), Mean(), MirrorPad(), reducedOutputOffset(), SpaceToBatchND(), SpaceToDepth(), and TransposeConv().
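A quick numeric check of the NHWC row-major linearization (assumed include path); note that dims_data[0], the batch extent, is never read:

#include <cassert>
#include <cstdint>
#include "PALUtils.h" // assumed include path

int main()
{
  const int32_t dims[4] = {2, 4, 5, 3}; // N, H, W, C
  // ((1 * 4 + 2) * 5 + 3) * 3 + 1 == 100
  assert(luci_interpreter_pal::offset(dims, 1, 2, 3, 1) == 100);
  return 0;
}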

◆ offset() [2/2]

int luci_interpreter_pal::offset ( const int32_t *  dims_data,
int  i0,
int  i1,
int  i2,
int  i3,
int  i4 
)
inline

Definition at line 198 of file PALUtils.h.

199{
200 return (((i0 * dims_data[1] + i1) * dims_data[2] + i2) * dims_data[3] + i3) * dims_data[4] + i4;
201}

◆ Offset()

int luci_interpreter_pal::Offset ( const luci_interpreter::RuntimeShape shape,
int  i0,
int  i1,
int  i2,
int  i3 
)
inline

Definition at line 27 of file PALResizeBilinear.h.

28{
29 assert(shape.dimensionsCount() == 4);
30
31 const int32_t *dims_data = reinterpret_cast<const int32_t *>(shape.dimsData());
32 LUCI_INTERPRETER_CHECK(i0 >= 0 && i0 < dims_data[0]);
33 LUCI_INTERPRETER_CHECK(i1 >= 0 && i1 < dims_data[1]);
34 LUCI_INTERPRETER_CHECK(i2 >= 0 && i2 < dims_data[2]);
35 LUCI_INTERPRETER_CHECK(i3 >= 0 && i3 < dims_data[3]);
36 return ((i0 * dims_data[1] + i1) * dims_data[2] + i2) * dims_data[3] + i3;
37}

References luci_interpreter::RuntimeShape::dimensionsCount(), luci_interpreter::RuntimeShape::dimsData(), and LUCI_INTERPRETER_CHECK.

◆ Pad()

void luci_interpreter_pal::Pad ( const PadParams op_params,
const luci_interpreter::RuntimeShape input_shape,
const float *  input_data,
const float *  pad_value_ptr,
const luci_interpreter::RuntimeShape output_shape,
float *  output_data 
)

Definition at line 28 of file PALPad.h.

31{
32 // Runtime calls are currently fixed at 5 dimensions. Copy inputs so we can
33 // pad them to 5 dims (yes, we are "padding the padding").
34 int left_padding_copy[PadKernelMaxDimensionCount()];
35 for (int i = 0; i < PadKernelMaxDimensionCount(); i++)
36 {
37 left_padding_copy[i] = 0;
38 }
39 for (int i = 0; i < op_params.left_padding_count; ++i)
40 {
41 left_padding_copy[i + PadKernelMaxDimensionCount() - op_params.left_padding_count] =
42 op_params.left_padding[i];
43 }
44 int right_padding_copy[PadKernelMaxDimensionCount()];
45 for (int i = 0; i < PadKernelMaxDimensionCount(); i++)
46 {
47 right_padding_copy[i] = 0;
48 }
49 for (int i = 0; i < op_params.right_padding_count; ++i)
50 {
51 right_padding_copy[i + PadKernelMaxDimensionCount() - op_params.right_padding_count] =
52 op_params.right_padding[i];
53 }
54 const auto extended_output =
 55 luci_interpreter::RuntimeShape::extendedShape(PadKernelMaxDimensionCount(), output_shape);
 56 const int output_batch = extended_output.dims(0);
57 const int output_plane = extended_output.dims(1);
58 const int output_height = extended_output.dims(2);
59 const int output_width = extended_output.dims(3);
60 const int output_depth = extended_output.dims(4);
61
62 const int left_b_padding = left_padding_copy[0];
63 const int left_p_padding = left_padding_copy[1];
64 const int left_h_padding = left_padding_copy[2];
65 const int left_w_padding = left_padding_copy[3];
66 const int left_d_padding = left_padding_copy[4];
67
68 const int right_b_padding = right_padding_copy[0];
69 const int right_p_padding = right_padding_copy[1];
70 const int right_h_padding = right_padding_copy[2];
71 const int right_w_padding = right_padding_copy[3];
72 const int right_d_padding = right_padding_copy[4];
73
74 const float pad_value = *pad_value_ptr;
75
76 const float *in_ptr = input_data;
77 float *out_ptr = output_data;
78 for (int out_b = 0; out_b < output_batch; ++out_b)
79 {
80 for (int out_p = 0; out_p < output_plane; ++out_p)
81 {
82 for (int out_h = 0; out_h < output_height; ++out_h)
83 {
84 for (int out_w = 0; out_w < output_width; ++out_w)
85 {
86 for (int out_d = 0; out_d < output_depth; ++out_d)
87 {
88 if (out_b < left_b_padding || out_b >= output_batch - right_b_padding ||
89 out_p < left_p_padding || out_p >= output_plane - right_p_padding ||
90 out_h < left_h_padding || out_h >= output_height - right_h_padding ||
91 out_w < left_w_padding || out_w >= output_width - right_w_padding ||
92 out_d < left_d_padding || out_d >= output_depth - right_d_padding)
93 {
94 *out_ptr++ = pad_value;
95 }
96 else
97 {
98 *out_ptr++ = *in_ptr++;
99 }
100 }
101 }
102 }
103 }
104 }
105}
constexpr int PadKernelMaxDimensionCount()
Definition PALPad.h:26

References luci_interpreter::RuntimeShape::extendedShape(), luci_interpreter_pal::PadParams::left_padding, luci_interpreter_pal::PadParams::left_padding_count, output_shape, PadKernelMaxDimensionCount(), luci_interpreter_pal::PadParams::right_padding, and luci_interpreter_pal::PadParams::right_padding_count.

Referenced by luci_interpreter::execute_kernel_CirclePadCommon().

◆ PadKernelMaxDimensionCount()

constexpr int luci_interpreter_pal::PadKernelMaxDimensionCount ( )
constexpr

Definition at line 26 of file PALPad.h.

26{ return 5; }

Referenced by Pad().

◆ ProcessBroadcastShapes()

bool luci_interpreter_pal::ProcessBroadcastShapes ( const luci_interpreter::RuntimeShape shape0,
const luci_interpreter::RuntimeShape shape1,
luci_interpreter_pal::ArithmeticParams *  params 
)
inline

Definition at line 150 of file ProcessBroadcastShapes.h.

153{
154 const int dims_count = std::max(shape0.dimensionsCount(), shape1.dimensionsCount());
155
156 params->broadcast_category = BroadcastableOpCategory::kGenericBroadcast;
157
158 auto extended_shape0 = luci_interpreter::RuntimeShape::extendedShape(dims_count, shape0);
159 auto extended_shape1 = luci_interpreter::RuntimeShape::extendedShape(dims_count, shape1);
160
161 // Check for "exact" match, implicitly accepting any scalar shapes.
162 if (extended_shape0 == extended_shape1)
163 {
164 params->broadcast_category = BroadcastableOpCategory::kNonBroadcast;
165 return false;
166 }
167
168 if (shape0.flatSize() == 1)
169 {
170 params->broadcast_category = BroadcastableOpCategory::kScalarFirstBroadcast;
171 return true;
172 }
173 else if (shape1.flatSize() == 1)
174 {
175 params->broadcast_category = BroadcastableOpCategory::kScalarSecondBroadcast;
176 return true;
177 }
178
179 for (int i = dims_count - 1; i >= 0; --i)
180 {
181 if (extended_shape0.dims(i) == extended_shape1.dims(i))
182 {
183 continue;
184 }
185 else if (extended_shape0.dims(i) == 1)
186 {
187 params->broadcast_category = BroadcastableOpCategory::kFirstInputBroadcastsFast;
188 return true;
189 }
190 else if (extended_shape1.dims(i) == 1)
191 {
192 params->broadcast_category = BroadcastableOpCategory::kSecondInputBroadcastsFast;
193 return true;
194 }
195 else
196 {
197 // This case is erroneous: there is a dimension that does not match and
198 // is not a broadcast from one shape to the other.
199 params->broadcast_category = BroadcastableOpCategory::kGenericBroadcast;
200 return true;
201 }
202 }
203
204 return false;
205}
BroadcastableOpCategory broadcast_category
Definition Params.h:180

References luci_interpreter_pal::ArithmeticParams::broadcast_category, luci_interpreter::RuntimeShape::dimensionsCount(), luci_interpreter::RuntimeShape::extendedShape(), luci_interpreter::RuntimeShape::flatSize(), kFirstInputBroadcastsFast, kGenericBroadcast, kNonBroadcast, kScalarFirstBroadcast, kScalarSecondBroadcast, and kSecondInputBroadcastsFast.

Referenced by luci_interpreter::kernels::evalTISOKernel(), and luci_interpreter::kernels::evalTISOQuantizedKernel().

◆ Quantize()

template<typename InputT , typename OutputT >
void luci_interpreter_pal::Quantize ( const QuantizationParams op_params,
const int  flat_size,
const InputT *  input_data,
OutputT *  output_data 
)
inline

Definition at line 27 of file PALQuantize.h.

29{
30 const int32_t zero_point = op_params.zero_point;
31 const double scale = op_params.scale;
32 static constexpr int32_t min_val = std::numeric_limits<OutputT>::min();
33 static constexpr int32_t max_val = std::numeric_limits<OutputT>::max();
34
35 for (int i = 0; i < flat_size; i++)
36 {
37 const InputT val = input_data[i];
38 int32_t unclamped =
39 static_cast<int32_t>(std::round(val / static_cast<float>(scale))) + zero_point;
40 int32_t clamped = std::min(std::max(unclamped, min_val), max_val);
41 output_data[i] = clamped;
42 }
43}

References luci_interpreter_pal::QuantizationParams::scale, and luci_interpreter_pal::QuantizationParams::zero_point.
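A minimal sketch of affine quantization to int8 (field names scale and zero_point per the References note; the include path is an assumption): each value becomes round(v / scale) + zero_point, clamped to the output type's range.

#include <cassert>
#include <cstdint>
#include "PALQuantize.h" // assumed include path

int main()
{
  luci_interpreter_pal::QuantizationParams qp;
  qp.scale = 0.5;
  qp.zero_point = 10;

  const float input[3] = {-5.0f, 0.0f, 5.0f};
  int8_t output[3];
  luci_interpreter_pal::Quantize(qp, 3, input, output);
  assert(output[0] == 0 && output[1] == 10 && output[2] == 20);
  return 0;
}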

◆ reducedOutputOffset()

size_t luci_interpreter_pal::reducedOutputOffset ( const int  num_dims,
const int *  dims,
const int *  index,
const int  num_axis,
const int *  axis 
)
inline

Definition at line 116 of file PALUtils.h.

118{
119 if (num_dims == 0)
120 {
121 return 0;
122 }
123 size_t offset = 0;
124 for (int idx = 0; idx < num_dims; ++idx)
125 {
126 // if we need to skip this axis
127 bool is_axis = false;
128 if (axis != nullptr)
129 {
130 for (int axis_idx = 0; axis_idx < num_axis; ++axis_idx)
131 {
132 if (idx == axis[axis_idx])
133 {
134 is_axis = true;
135 break;
136 }
137 }
138 }
139 if (!is_axis)
140 {
141 offset = offset * static_cast<size_t>(dims[idx]) + static_cast<size_t>(index[idx]);
142 }
143 }
144 return offset;
145}

References offset().

Referenced by ReduceGeneric().
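A small sketch contrasting the full linear offset with the offset into a reduced output (assumed include path). With the axis list {1}, dimension 1 of a 2 x 3 x 4 array is skipped, so index (1, 2, 3) maps to position (1, 3) of the 2 x 4 result:

#include <cassert>
#include "PALUtils.h" // assumed include path

int main()
{
  const int dims[3] = {2, 3, 4};
  const int index[3] = {1, 2, 3};
  const int axis[1] = {1};
  // Full row-major offset: (1 * 3 + 2) * 4 + 3 == 23.
  assert(luci_interpreter_pal::reducedOutputOffset(3, dims, index, 0, nullptr) == 23);
  // Offset with axis 1 collapsed: 1 * 4 + 3 == 7.
  assert(luci_interpreter_pal::reducedOutputOffset(3, dims, index, 1, axis) == 7);
  return 0;
}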

◆ ReduceGeneric()

template<typename T >
void luci_interpreter_pal::ReduceGeneric ( const T *  input_data,
const int *  input_dims,
const int  input_num_dims,
T *  output_data,
const int *  axis,
const int64_t  num_axis_dimensions,
T  init_value,
const int  output_flat_size,
T  reducer(const T, const T) 
)
inline

Definition at line 73 of file PALReduceCommon.h.

76{
77 // Return early when input shape has zero dim.
78 for (int i = 0; i < input_num_dims; ++i)
79 {
80 if (input_dims[i] == 0)
81 return;
82 }
83
84 for (size_t idx = 0; idx < output_flat_size; ++idx)
85 {
86 output_data[idx] = init_value;
87 }
88
89 // Resolve axis.
90 int num_resolved_axis = 0;
91 if (!resolveAxis(input_num_dims, axis, num_axis_dimensions, &num_resolved_axis))
92 {
93 return;
94 }
95
96 int temp_index[5];
97 // Reset input iterator.
98 for (int idx = 0; idx < input_num_dims; ++idx)
99 {
100 temp_index[idx] = 0;
101 }
102 // Iterate through input_data.
103 do
104 {
105 size_t input_offset = reducedOutputOffset(input_num_dims, input_dims, temp_index, 0, nullptr);
106 size_t output_offset =
107 reducedOutputOffset(input_num_dims, input_dims, temp_index, num_resolved_axis, axis);
108 output_data[output_offset] = reducer(output_data[output_offset], input_data[input_offset]);
109 } while (nextIndex(input_num_dims, input_dims, temp_index));
110}
bool nextIndex(const int num_dims, const int *dims, int *current)
Definition PALUtils.h:148
size_t reducedOutputOffset(const int num_dims, const int *dims, const int *index, const int num_axis, const int *axis)
Definition PALUtils.h:116

References nextIndex(), and reducedOutputOffset().
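A minimal sum-reduction sketch over axis 1 of a 2 x 3 tensor (assumed include path; it also assumes resolveAxis keeps the single positive axis as given). Any non-capturing callable matching the reducer signature works; a plain function is used here:

#include <cassert>
#include "PALReduceCommon.h" // assumed include path

static float add(const float acc, const float value) { return acc + value; }

int main()
{
  const float input[6] = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f};
  const int input_dims[2] = {2, 3};
  const int axis[1] = {1};
  float output[2];

  luci_interpreter_pal::ReduceGeneric<float>(input, input_dims, /*input_num_dims=*/2, output, axis,
                                             /*num_axis_dimensions=*/1, /*init_value=*/0.0f,
                                             /*output_flat_size=*/2, add);
  assert(output[0] == 6.f && output[1] == 15.f); // row sums of the 2 x 3 input
  return 0;
}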

◆ ReLUCommon()

void luci_interpreter_pal::ReLUCommon ( const int  flat_size,
const float *  input_data,
float *  output_data,
const float  alpha,
const bool  is_relu_6 
)
inline

Definition at line 26 of file PALReluCommon.h.

28{
29 const float relu_6_value = 6.0f;
30 for (int i = 0; i < flat_size; i++)
31 {
32 const float val = input_data[i];
33 float result = val > 0 ? val : val * alpha;
34 result = is_relu_6 ? (result > relu_6_value ? relu_6_value : result) : result;
35 output_data[i] = result;
36 }
37}

Referenced by luci_interpreter::execute_kernel_CircleLeakyRelu(), luci_interpreter::execute_kernel_CircleRelu(), and luci_interpreter::execute_kernel_CircleRelu6().
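A minimal sketch showing how this single helper covers ReLU, leaky ReLU, and ReLU6 through the alpha and is_relu_6 arguments (assumed include path):

#include <cassert>
#include "PALReluCommon.h" // assumed include path

int main()
{
  const float input[4] = {-2.0f, -0.5f, 3.0f, 8.0f};
  float relu[4], leaky6[4];

  luci_interpreter_pal::ReLUCommon(4, input, relu, /*alpha=*/0.0f, /*is_relu_6=*/false);
  // relu == {0, 0, 3, 8}: plain ReLU.
  luci_interpreter_pal::ReLUCommon(4, input, leaky6, /*alpha=*/0.1f, /*is_relu_6=*/true);
  // leaky6 == {-0.2, -0.05, 3, 6}: negatives scaled by alpha, positives clamped at 6.
  assert(relu[3] == 8.0f && leaky6[3] == 6.0f);
  return 0;
}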

◆ ResizeNearestNeighbor()

template<typename T >
void luci_interpreter_pal::ResizeNearestNeighbor ( const ResizeNearestNeighborParams op_params,
const luci_interpreter::RuntimeShape unextended_input_shape,
const T *  input_data,
const luci_interpreter::RuntimeShape output_size_shape,
const int32_t *  output_size_data,
const luci_interpreter::RuntimeShape unextended_output_shape,
T *  output_data 
)
inline

Definition at line 47 of file PALResizeNearestNeighbor.h.

52{
53 const luci_interpreter::RuntimeShape input_shape =
54 luci_interpreter::RuntimeShape::extendedShape(4, unextended_input_shape);
 55 const luci_interpreter::RuntimeShape output_shape =
 56 luci_interpreter::RuntimeShape::extendedShape(4, unextended_output_shape);
57
58 int32_t batches = MatchingDim(input_shape, 0, output_shape, 0);
59 int32_t input_height = input_shape.dims(1);
60 int32_t input_width = input_shape.dims(2);
61 int32_t depth = MatchingDim(input_shape, 3, output_shape, 3);
62
63 int32_t output_height = output_size_data[0];
64 int32_t output_width = output_size_data[1];
65
66 const int col_offset = input_shape.dims(3);
67 const int row_offset = input_shape.dims(2) * col_offset;
68 const int batch_offset = input_shape.dims(1) * row_offset;
69
70 const T *input_ptr = input_data;
71 T *output_ptr = output_data;
72 for (int b = 0; b < batches; ++b)
73 {
74 for (int y = 0; y < output_height; ++y)
75 {
76 int32_t in_y = getNearestNeighbor(y, input_height, output_height, op_params.align_corners,
77 op_params.half_pixel_centers);
78 const T *y_input_ptr = input_ptr + in_y * row_offset;
79 for (int x = 0; x < output_width; ++x)
80 {
81 int32_t in_x = getNearestNeighbor(x, input_width, output_width, op_params.align_corners,
82 op_params.half_pixel_centers);
83 const T *x_input_ptr = y_input_ptr + in_x * col_offset;
84 memcpy(output_ptr, x_input_ptr, depth * sizeof(T));
85 output_ptr += depth;
86 }
87 }
88 input_ptr += batch_offset;
89 }
90}
int32_t getNearestNeighbor(const int input_value, const int32_t input_size, const int32_t output_size, const bool align_corners, const bool half_pixel_centers)

References luci_interpreter_pal::ResizeNearestNeighborParams::align_corners, luci_interpreter::RuntimeShape::dims(), luci_interpreter::RuntimeShape::extendedShape(), getNearestNeighbor(), luci_interpreter_pal::ResizeNearestNeighborParams::half_pixel_centers, MatchingDim(), output_shape, and ResizeNearestNeighbor().

Referenced by ResizeNearestNeighbor().

◆ Round()

void luci_interpreter_pal::Round ( const int32_t  flat_size,
const float *  input_data,
float *  output_data 
)
inline

Definition at line 37 of file PALRound.h.

38{
39 for (int i = 0; i < flat_size; ++i)
40 {
41 // Note that this implementation matches that of tensorFlow tf.round
42 // and corresponds to the bankers rounding method.
43 // cfenv (for fesetround) is not yet supported universally on Android, so
44 // using a work around.
45 output_data[i] = RoundToNearest(input_data[i]);
46 }
47}

References RoundToNearest().

Referenced by luci_interpreter::execute_kernel_CircleRound().

◆ roundingDivideByPOT()

int32_t luci_interpreter_pal::roundingDivideByPOT ( int32_t  x,
int32_t  exponent 
)
inline

Definition at line 65 of file PALUtils.h.

66{
67 assert(exponent >= 0);
68 assert(exponent <= 31);
69 const int32_t mask = int32_t((1ll << exponent) - 1);
70 const int32_t zero = int32_t(0);
71 const int32_t one = int32_t(1);
72 const int32_t remainder = x & mask;
73 const int32_t threshold = (mask >> 1) + ((x < zero ? one : zero) & one);
74 return (x >> exponent) + ((remainder > threshold ? one : zero) & one);
75}

Referenced by multiplyByQuantizedMultiplier(), and multiplyByQuantizedMultiplierSmallerThanOneExp().
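A quick numeric sketch of the rounding division by 2^exponent (assumed include path); ties, i.e. exact halves, round away from zero:

#include <cassert>
#include "PALUtils.h" // assumed include path

int main()
{
  assert(luci_interpreter_pal::roundingDivideByPOT(4, 1) == 2);   // 4 / 2
  assert(luci_interpreter_pal::roundingDivideByPOT(5, 1) == 3);   // 2.5 rounds up
  assert(luci_interpreter_pal::roundingDivideByPOT(6, 2) == 2);   // 1.5 rounds up
  assert(luci_interpreter_pal::roundingDivideByPOT(-5, 1) == -3); // -2.5 rounds away from zero
  return 0;
}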

◆ RoundToNearest()

float luci_interpreter_pal::RoundToNearest ( float  value)
inline

Definition at line 23 of file PALRound.h.

24{
25 auto floor_val = std::floor(value);
26 auto diff = value - floor_val;
27 if ((diff < 0.5f) || ((diff == 0.5f) && (static_cast<int>(floor_val) % 2 == 0)))
28 {
29 return floor_val;
30 }
31 else
32 {
33 return floor_val + 1.0f;
34 }
35}

Referenced by Round().
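
A small illustration of the tie-to-even behaviour that the comment in Round() refers to (a sketch assuming PALRound.h is included):

#include <cassert>

void roundToNearestExamples()
{
  using luci_interpreter_pal::RoundToNearest;
  assert(RoundToNearest(1.4f) == 1.0f); // below the halfway point: round down
  assert(RoundToNearest(1.6f) == 2.0f); // above the halfway point: round up
  assert(RoundToNearest(0.5f) == 0.0f); // tie: round to the even neighbour 0
  assert(RoundToNearest(1.5f) == 2.0f); // tie: round to the even neighbour 2
  assert(RoundToNearest(2.5f) == 2.0f); // tie: round to the even neighbour 2
}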

◆ Rsqrt()

void luci_interpreter_pal::Rsqrt ( const int  flat_size,
const float *  input_data,
float *  output_data 
)
inline

Definition at line 27 of file PALRsqrt.h.

28{
29 for (int i = 0; i < flat_size; ++i)
30 {
31 output_data[i] = 1.f / std::sqrt(input_data[i]);
32 }
33}

Referenced by luci_interpreter::execute_kernel_CircleRsqrt().

◆ saturatingRoundingDoublingHighMul()

std::int32_t luci_interpreter_pal::saturatingRoundingDoublingHighMul ( std::int32_t  a,
std::int32_t  b 
)
inline

Definition at line 52 of file PALUtils.h.

53{
54 bool overflow = a == b && a == std::numeric_limits<std::int32_t>::min();
55 std::int64_t a_64(a);
56 std::int64_t b_64(b);
57 std::int64_t ab_64 = a_64 * b_64;
58 std::int32_t nudge = ab_64 >= 0 ? (1 << 30) : (1 - (1 << 30));
59 std::int32_t ab_x2_high32 = static_cast<std::int32_t>((ab_64 + nudge) / (1ll << 31));
60 return overflow ? std::numeric_limits<std::int32_t>::max() : ab_x2_high32;
61}

Referenced by multiplyByQuantizedMultiplier(), and multiplyByQuantizedMultiplierSmallerThanOneExp().
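
Interpreting both operands as Q31 fixed-point values, the function returns the rounded high 32 bits of 2*a*b, i.e. approximately a*b / 2^31, saturating on the single input pair that would overflow. A sketch assuming PALUtils.h is included:

#include <cassert>
#include <cstdint>
#include <limits>

void saturatingRoundingDoublingHighMulExamples()
{
  using luci_interpreter_pal::saturatingRoundingDoublingHighMul;
  const std::int32_t half = 1 << 30;    // 0.5 in Q31
  const std::int32_t quarter = 1 << 29; // 0.25 in Q31
  assert(saturatingRoundingDoublingHighMul(half, half) == quarter); // 0.5 * 0.5 = 0.25
  // INT32_MIN * INT32_MIN is the only pair that would overflow; it saturates.
  const std::int32_t min = std::numeric_limits<std::int32_t>::min();
  assert(saturatingRoundingDoublingHighMul(min, min) ==
         std::numeric_limits<std::int32_t>::max());
}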

◆ Select()

template<typename D , typename T >
void luci_interpreter_pal::Select ( const luci_interpreter::RuntimeShape input_condition_shape,
const D *  input_condition_data,
const luci_interpreter::RuntimeShape input_x_shape,
const T *  input_x_data,
const luci_interpreter::RuntimeShape input_y_shape,
const T *  input_y_data,
const luci_interpreter::RuntimeShape output_shape,
T *  output_data 
)

Definition at line 27 of file PALSelectV2.h.

32{
33 int64_t flatsize;
34 // Allow the select operator to execute on mixed scalar and
35 // one-element tensors.
36 if (input_condition_shape.flatSize() == 1 && input_x_shape.flatSize() == 1 &&
37 input_y_shape.flatSize() == 1 && output_shape.flatSize() == 1)
38 {
39 flatsize = 1;
40 }
41 else
42 {
43 flatsize = input_condition_shape.flatSize();
44 }
45 for (int64_t i = 0; i < flatsize; ++i)
46 {
47 output_data[i] = input_condition_data[i] ? input_x_data[i] : input_y_data[i];
48 }
49}

References luci_interpreter::RuntimeShape::flatSize(), and output_shape.
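
The element-wise core of Select reduces to the ternary in its inner loop. A minimal standalone analogue (it does not call the PAL function itself, since constructing RuntimeShape objects is out of scope here):

#include <array>
#include <cassert>
#include <cstddef>

int main()
{
  const std::array<bool, 4> cond = {true, false, true, false};
  const std::array<float, 4> x = {1.f, 2.f, 3.f, 4.f};
  const std::array<float, 4> y = {10.f, 20.f, 30.f, 40.f};
  std::array<float, 4> out{};
  for (std::size_t i = 0; i < cond.size(); ++i)
    out[i] = cond[i] ? x[i] : y[i]; // same ternary as the kernel's inner loop
  assert(out[0] == 1.f && out[1] == 20.f && out[2] == 3.f && out[3] == 40.f);
  return 0;
}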

◆ Sin()

void luci_interpreter_pal::Sin ( const int  flat_size,
const float *  input_data,
float *  output_data 
)
inline

Definition at line 27 of file PALSinCommon.h.

28{
29 for (int i = 0; i < flat_size; ++i)
30 {
31 output_data[i] = std::sin(input_data[i]);
32 }
33}

Referenced by luci_interpreter::execute_kernel_CircleSin().

◆ Softmax() [1/4]

void luci_interpreter_pal::Softmax ( const SoftmaxParams params,
const float *  input_data,
float *  output_data 
)
inline

Definition at line 25 of file PALSoftmaxCommon.h.

26{
27 const int outer_size = params.num_rows;
28 const int depth = params.row_size;
29 const double beta = params.beta;
30
31 for (int i = 0; i < outer_size; ++i)
32 {
33 // Find max element value which we'll use to ensure numerical stability
34 // taking advantage of the following equality:
35 // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C))
36 float max = std::numeric_limits<float>::lowest();
37 for (int c = 0; c < depth; ++c)
38 {
39 max = std::max(max, input_data[i * depth + c]);
40 }
41
42 // Compute sum.
43 float sum = 0.f;
44 for (int c = 0; c < depth; ++c)
45 {
46 const float exp_c = std::exp((input_data[i * depth + c] - max) * static_cast<float>(beta));
47 output_data[i * depth + c] = exp_c;
48 sum += exp_c;
49 }
50
51 // Compute result.
52 for (int c = 0; c < depth; ++c)
53 {
54 output_data[i * depth + c] = output_data[i * depth + c] / sum;
55 }
56 }
57}

References luci_interpreter_pal::SoftmaxParams::beta, luci_interpreter_pal::SoftmaxParams::num_rows, and luci_interpreter_pal::SoftmaxParams::row_size.
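
A worked example for the float path with beta = 1: softmax({1, 2, 3}) subtracts the max (3), exponentiates, and normalizes. This is a sketch that assumes PALSoftmaxCommon.h is included and that the remaining SoftmaxParams fields may be left zero-initialized:

#include <cassert>
#include <cmath>

void softmaxFloatExample()
{
  luci_interpreter_pal::SoftmaxParams params{};
  params.num_rows = 1; // one row ...
  params.row_size = 3; // ... of three elements
  params.beta = 1.0;

  const float input[3] = {1.f, 2.f, 3.f};
  float output[3] = {};
  luci_interpreter_pal::Softmax(params, input, output);

  // exp(-2), exp(-1), exp(0), normalized: roughly 0.090, 0.245, 0.665.
  assert(std::abs(output[0] - 0.0900306f) < 1e-4f);
  assert(std::abs(output[1] - 0.2447285f) < 1e-4f);
  assert(std::abs(output[2] - 0.6652409f) < 1e-4f);
  assert(std::abs(output[0] + output[1] + output[2] - 1.f) < 1e-5f);
}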

◆ Softmax() [2/4]

void luci_interpreter_pal::Softmax ( const SoftmaxParams params,
const int16_t *  input_data,
int16_t *  output_data 
)
inline

Definition at line 116 of file PALSoftmax.h.

117{
118 cmsis_nn_softmax_lut_s16 softmax_params{};
119
120 auto raw_exp_lut = std::make_unique<int16_t[]>(kInt16LUTArraySize);
121 auto one_over_one_plus_x_lut = std::make_unique<int16_t[]>(kInt16LUTArraySize);
122
123 // The exp LUT is only used on negative values;
124 // we consider exp(-10.0) insignificant to the accumulation.
125 const int32_t range = std::numeric_limits<int16_t>::max() - std::numeric_limits<int16_t>::min();
126
127 LUTPopulate<int16_t>(
128 10.0f / range, std::numeric_limits<int16_t>::max(), 2.0f / range, 0,
129 [](float value) { return std::exp(value); }, raw_exp_lut.get());
130
131 LUTPopulate<int16_t>(
132 1.0f / range, std::numeric_limits<int16_t>::min(), 2.0f / range, 0,
133 [](float value) { return 1.0f / (1.0f + value); }, one_over_one_plus_x_lut.get());
134
135 softmax_params.exp_lut = raw_exp_lut.get();
136 softmax_params.one_by_one_lut = one_over_one_plus_x_lut.get();
137
138 arm_softmax_s16(input_data, params.num_rows, params.row_size, params.input_multiplier,
139 params.input_left_shift, &softmax_params, output_data);
140}

References luci_interpreter_pal::SoftmaxParams::input_left_shift, luci_interpreter_pal::SoftmaxParams::input_multiplier, luci_interpreter_pal::SoftmaxParams::num_rows, and luci_interpreter_pal::SoftmaxParams::row_size.

◆ Softmax() [3/4]

void luci_interpreter_pal::Softmax ( const SoftmaxParams params,
const int8_t *  input_data,
int16_t *  output_data 
)
inline

◆ Softmax() [4/4]

void luci_interpreter_pal::Softmax ( const SoftmaxParams params,
const int8_t *  input_data,
int8_t *  output_data 
)
inline

◆ Softmax< int8_t >()

template<>
void luci_interpreter_pal::Softmax< int8_t > ( const tflite::SoftmaxParams &  params,
const tflite::RuntimeShape &  input_shape,
const int8_t *  input_data,
const tflite::RuntimeShape &  output_shape,
int8_t *  output_data 
)
inline

Definition at line 63 of file PALSoftmax.h.

66{
67 const int trailing_dim = input_shape.DimensionsCount() - 1;
68 const int outer_size = tflite::MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
69 const int depth = tflite::MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
70 const int32_t mult = params.input_multiplier;
71 const int32_t shift = params.input_left_shift;
72 const int32_t diff_min = params.diff_min;
73
74 arm_softmax_s8(input_data, outer_size, depth, mult, shift, diff_min, output_data);
75}

References output_shape.

◆ SpaceToBatchND()

template<typename T >
void luci_interpreter_pal::SpaceToBatchND ( const int32_t  pad_value,
const luci_interpreter::RuntimeShape unextended_input1_shape,
const T *  input1_data,
const luci_interpreter::RuntimeShape unextended_input2_shape,
const int32_t *  block_shape_data,
const luci_interpreter::RuntimeShape unextended_input3_shape,
const int32_t *  paddings_data,
const luci_interpreter::RuntimeShape unextended_output_shape,
T *  output_data 
)
inline

Definition at line 46 of file PALSpaceToBatchND.h.

52{
53 // Extends the input/output shape from 3D to 4D if needed, NHC -> NH1C.
54 const luci_interpreter::RuntimeShape input1_shape =
55 extendShapeSpaceToBatch(unextended_input1_shape);
56 const luci_interpreter::RuntimeShape output_shape =
57 extendShapeSpaceToBatch(unextended_output_shape);
58
59 const int depth = input1_shape.dims(3);
60 const int input_width = input1_shape.dims(2);
61 const int input_height = input1_shape.dims(1);
62 const int input_batch_size = input1_shape.dims(0);
63
64 const int output_width = output_shape.dims(2);
65 const int output_height = output_shape.dims(1);
66 const int output_batch_size = output_shape.dims(0);
67
68 const int block_shape_height = block_shape_data[0];
69 const int block_shape_width =
70 unextended_input1_shape.dimensionsCount() == 4 ? block_shape_data[1] : 1;
71 const int padding_top = paddings_data[0];
72 const int padding_left = unextended_input1_shape.dimensionsCount() == 4 ? paddings_data[2] : 0;
73
74 for (int out_b = 0; out_b < output_batch_size; ++out_b)
75 {
76 int input_batch = out_b % input_batch_size;
77 int shift_w = (out_b / input_batch_size) % block_shape_width;
78 int shift_h = (out_b / input_batch_size) / block_shape_width;
79 for (int out_h = 0; out_h < output_height; ++out_h)
80 {
81 for (int out_w = 0; out_w < output_width; ++out_w)
82 {
83 T *out = output_data + offset(output_shape.dimsData(), out_b, out_h, out_w, 0);
84 if (out_h * block_shape_height + shift_h < padding_top ||
85 out_h * block_shape_height + shift_h >= padding_top + input_height ||
86 out_w * block_shape_width + shift_w < padding_left ||
87 out_w * block_shape_width + shift_w >= padding_left + input_width)
88 {
89 // This may not execute correctly when pad_value != 0 and T != uint8.
90 memset(out, pad_value, depth * sizeof(T));
91 }
92 else
93 {
94 const T *in =
95 input1_data + offset(input1_shape.dimsData(), input_batch,
96 (out_h * block_shape_height + shift_h) - padding_top,
97 (out_w * block_shape_width + shift_w) - padding_left, 0);
98 memcpy(out, in, depth * sizeof(T));
99 }
100 }
101 }
102 }
103}

References luci_interpreter::RuntimeShape::dimensionsCount(), luci_interpreter::RuntimeShape::dims(), luci_interpreter::RuntimeShape::dimsData(), offset(), and output_shape.
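
For intuition, a standalone analogue of the no-padding case: a 1x2x2x1 input with block shape [2, 2] becomes a 4x1x1x1 output, where output batch b picks the input pixel at (shift_h, shift_w) = (b / 2, b % 2), matching the shift_w / shift_h decomposition above.

#include <array>
#include <cassert>

int main()
{
  // Input 1x2x2x1 (NHWC), values laid out row-major: {1, 2, 3, 4}.
  const std::array<float, 4> input = {1.f, 2.f, 3.f, 4.f};
  const int input_batch_size = 1, block_shape_h = 2, block_shape_w = 2;
  const int input_w = 2;
  std::array<float, 4> output{};

  const int output_batch_size = input_batch_size * block_shape_h * block_shape_w; // 4
  for (int out_b = 0; out_b < output_batch_size; ++out_b)
  {
    // Same decomposition as the kernel (input_batch is always 0 here).
    const int shift_w = (out_b / input_batch_size) % block_shape_w;
    const int shift_h = (out_b / input_batch_size) / block_shape_w;
    // The output spatial size is 1x1, so out_h = out_w = 0 and there is no padding.
    output[out_b] = input[shift_h * input_w + shift_w];
  }
  assert(output[0] == 1.f && output[1] == 2.f && output[2] == 3.f && output[3] == 4.f);
  return 0;
}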

◆ SpaceToDepth()

template<typename T >
void luci_interpreter_pal::SpaceToDepth ( const int32_t  block_size,
const luci_interpreter::RuntimeShape unextended_input_shape,
const T *  input_data,
const luci_interpreter::RuntimeShape unextended_output_shape,
T *  output_data 
)
inline

Definition at line 29 of file PALSpaceToDepth.h.

32{
33 const luci_interpreter::RuntimeShape input_shape =
34 luci_interpreter::RuntimeShape::extendedShape(4, unextended_input_shape);
35 const luci_interpreter::RuntimeShape output_shape =
36 luci_interpreter::RuntimeShape::extendedShape(4, unextended_output_shape);
37
38 const int input_depth = input_shape.dims(3);
39 const int input_width = input_shape.dims(2);
40 const int input_height = input_shape.dims(1);
41 const int input_batch = input_shape.dims(0);
42
43 for (int in_b = 0; in_b < input_batch; ++in_b)
44 {
45 for (int in_h = 0; in_h < input_height; ++in_h)
46 {
47 for (int in_w = 0; in_w < input_width; ++in_w)
48 {
49 for (int in_d = 0; in_d < input_depth; ++in_d)
50 {
51 const int out_d =
52 in_d + ((in_h % block_size) * block_size + in_w % block_size) * input_depth;
53 const int out_w = in_w / block_size;
54 const int out_h = in_h / block_size;
55 const int out_b = in_b;
56
57 const int input_index = offset(input_shape.dimsData(), in_b, in_h, in_w, in_d);
58 const int output_index = offset(output_shape.dimsData(), out_b, out_h, out_w, out_d);
59
60 output_data[output_index] = input_data[input_index];
61 }
62 }
63 }
64 }
65}

References luci_interpreter::RuntimeShape::dims(), luci_interpreter::RuntimeShape::dimsData(), luci_interpreter::RuntimeShape::extendedShape(), offset(), and output_shape.
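
The index mapping for the common block_size = 2 case: a 1x2x2x1 input collapses to a 1x1x1x4 output whose depth dimension holds the 2x2 spatial block in row-major order. A minimal standalone check:

#include <array>
#include <cassert>

int main()
{
  // 1x2x2x1 input {1, 2, 3, 4} -> 1x1x1x4 output, block_size = 2.
  const std::array<float, 4> input = {1.f, 2.f, 3.f, 4.f};
  std::array<float, 4> output{};
  const int block_size = 2, input_h = 2, input_w = 2, input_depth = 1;

  for (int in_h = 0; in_h < input_h; ++in_h)
    for (int in_w = 0; in_w < input_w; ++in_w)
      for (int in_d = 0; in_d < input_depth; ++in_d)
      {
        // Same out_d formula as the kernel; out_h and out_w are both 0 here.
        const int out_d =
          in_d + ((in_h % block_size) * block_size + in_w % block_size) * input_depth;
        output[out_d] = input[(in_h * input_w + in_w) * input_depth + in_d];
      }
  assert(output[0] == 1.f && output[1] == 2.f && output[2] == 3.f && output[3] == 4.f);
  return 0;
}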

◆ Sqrt()

void luci_interpreter_pal::Sqrt ( const int  flat_size,
const float *  input_data,
float *  output_data 
)
inline

Definition at line 27 of file PALSqrt.h.

28{
29 for (int i = 0; i < flat_size; ++i)
30 {
31 output_data[i] = std::sqrt(input_data[i]);
32 }
33}

Referenced by luci_interpreter::execute_kernel_CircleSqrt().

◆ Square()

void luci_interpreter_pal::Square ( const int  flat_size,
const float *  input_data,
float *  output_data 
)
inline

Definition at line 26 of file PALSquareCommon.h.

27{
28 for (int i = 0; i < flat_size; ++i)
29 {
30 output_data[i] = input_data[i] * input_data[i];
31 }
32}

Referenced by luci_interpreter::execute_kernel_CircleSquare().

◆ SquaredDifference()

void luci_interpreter_pal::SquaredDifference ( const int  flat_size,
const float *  input_data_1,
const float *  input_data_2,
float *  output_data 
)
inline

Definition at line 27 of file PALSquaredDifference.h.

29{
30 for (int i = 0; i < flat_size; ++i)
31 {
32 float diff = input_data_1[i] - input_data_2[i];
33 output_data[i] = diff * diff;
34 }
35}

Referenced by luci_interpreter::execute_kernel_CircleSquaredDifference().

◆ StridedSlice()

template<typename T >
void luci_interpreter_pal::StridedSlice ( StridedSliceParams op_params,
const luci_interpreter::RuntimeShape unextended_input_shape,
const T *  input_data,
T *  output_data 
)
inline

Definition at line 205 of file PALStridedSlice.h.

208{
209 const luci_interpreter::RuntimeShape input_shape =
210 luci_interpreter::RuntimeShape::extendedShape(5, unextended_input_shape);
211
212 // Reverse and pad to 5 dimensions because that is what the runtime code
213 // requires (i.e. all shapes must be 5D and are given backwards).
214 stridedSlicePadIndices(&op_params, 5);
215
216 const int start_0 = startForAxis(op_params, input_shape, 0);
217 const int stop_0 = stopForAxis(op_params, input_shape, 0, start_0);
218 const int start_1 = startForAxis(op_params, input_shape, 1);
219 const int stop_1 = stopForAxis(op_params, input_shape, 1, start_1);
220 const int start_2 = startForAxis(op_params, input_shape, 2);
221 const int stop_2 = stopForAxis(op_params, input_shape, 2, start_2);
222 const int start_3 = startForAxis(op_params, input_shape, 3);
223 const int stop_3 = stopForAxis(op_params, input_shape, 3, start_3);
224 const int start_4 = startForAxis(op_params, input_shape, 4);
225 const int stop_4 = stopForAxis(op_params, input_shape, 4, start_4);
226
227 for (int offset_0 = start_0 * input_shape.dims(1), end_0 = stop_0 * input_shape.dims(1),
228 step_0 = op_params.strides[0] * input_shape.dims(1);
229 !loopCondition(offset_0, end_0, op_params.strides[0]); offset_0 += step_0)
230 {
231 for (int offset_1 = (offset_0 + start_1) * input_shape.dims(2),
232 end_1 = (offset_0 + stop_1) * input_shape.dims(2),
233 step_1 = op_params.strides[1] * input_shape.dims(2);
234 !loopCondition(offset_1, end_1, op_params.strides[1]); offset_1 += step_1)
235 {
236 for (int offset_2 = (offset_1 + start_2) * input_shape.dims(3),
237 end_2 = (offset_1 + stop_2) * input_shape.dims(3),
238 step_2 = op_params.strides[2] * input_shape.dims(3);
239 !loopCondition(offset_2, end_2, op_params.strides[2]); offset_2 += step_2)
240 {
241 for (int offset_3 = (offset_2 + start_3) * input_shape.dims(4),
242 end_3 = (offset_2 + stop_3) * input_shape.dims(4),
243 step_3 = op_params.strides[3] * input_shape.dims(4);
244 !loopCondition(offset_3, end_3, op_params.strides[3]); offset_3 += step_3)
245 {
246 for (int offset_4 = offset_3 + start_4, end_4 = offset_3 + stop_4;
247 !loopCondition(offset_4, end_4, op_params.strides[4]);
248 offset_4 += op_params.strides[4])
249 {
250 *output_data++ = input_data[offset_4];
251 }
252 }
253 }
254 }
255 }
256}

References luci_interpreter::RuntimeShape::dims(), luci_interpreter::RuntimeShape::extendedShape(), and luci_interpreter_pal::StridedSliceParams::strides.

Referenced by luci_interpreter::execute_kernel_CircleStridedSlice().
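
The nested loops above collapse multi-dimensional indices into flat offsets, but the 1-D core is simply "walk from start to stop in steps of stride, in either direction". A standalone analogue, where done() plays the role of loopCondition() as a direction-aware termination test:

#include <cassert>
#include <vector>

// Direction-aware loop termination, mirroring what loopCondition() does.
static bool done(int index, int stop, int stride)
{
  return stride > 0 ? index >= stop : index <= stop;
}

int main()
{
  const std::vector<int> input = {0, 1, 2, 3, 4, 5, 6, 7};
  std::vector<int> output;

  // Equivalent of slicing [1:7:2] -> {1, 3, 5}.
  for (int i = 1; !done(i, 7, 2); i += 2)
    output.push_back(input[i]);
  assert((output == std::vector<int>{1, 3, 5}));

  output.clear();
  // Negative stride: [6:0:-2] -> {6, 4, 2}.
  for (int i = 6; !done(i, 0, -2); i += -2)
    output.push_back(input[i]);
  assert((output == std::vector<int>{6, 4, 2}));
  return 0;
}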

◆ subscriptToIndex() [1/2]

int luci_interpreter_pal::subscriptToIndex ( const NdArrayDesc< 4 > &  desc,
int  i0,
int  i1,
int  i2,
int  i3 
)
inline

◆ subscriptToIndex() [2/2]

int luci_interpreter_pal::subscriptToIndex ( const NdArrayDesc< 5 > &  desc,
int  indexes[5] 
)
inline

Definition at line 130 of file ProcessBroadcastShapes.h.

131{
132 return indexes[0] * desc.strides[0] + indexes[1] * desc.strides[1] +
133 indexes[2] * desc.strides[2] + indexes[3] * desc.strides[3] + indexes[4] * desc.strides[4];
134}

References luci_interpreter_pal::NdArrayDesc< N >::strides.

◆ SVDF()

void luci_interpreter_pal::SVDF ( const float *  input_data,
const float *  weights_feature_data,
const float *  weights_time_data,
const float *  bias_data,
float *  state_data,
float *  scratch_data,
float *  output_data,
const int  rank,
const int  input_size,
const int  batch_size,
const int  num_filters,
const int  num_units,
const int  memory_size,
const circle::ActivationFunctionType  activation 
)
inline

Definition at line 133 of file PALSVDFCommon.h.

138{
139 // Left shift the activation_state.
140 {
141 float *new_state_start = state_data;
142 const float *old_state_start = state_data + 1;
143 const float *old_state_end = state_data + batch_size * num_filters * memory_size;
144 while (old_state_start != old_state_end)
145 {
146 *new_state_start++ = *old_state_start++;
147 }
148 }
149
150 // Note: no need to clear the latest activation; the matmul below assigns rather than accumulates.
151
152 // Compute conv1d(inputs, weights_feature).
153 // The activation_state's rightmost column is used to save the current
154 // cycle's activation. This is achieved by starting at state_data[memory_size - 1]
155 // and using a stride equal to memory_size.
156
157 // Perform batched matrix vector multiply operation:
158 {
159 const float *matrix = weights_feature_data;
160 const float *vector = input_data;
161 float *result = &state_data[memory_size - 1];
162 float *result_in_batch = result;
163 for (int i = 0; i < batch_size; ++i)
164 {
165 const float *matrix_ptr = matrix;
166 for (int j = 0; j < num_filters; ++j)
167 {
168 float dot_prod = 0.0f;
169 const float *vector_in_batch = vector + i * input_size;
170 for (int k = 0; k < input_size; ++k)
171 {
172 dot_prod += *matrix_ptr++ * *vector_in_batch++;
173 }
174 *result_in_batch = dot_prod;
175 result_in_batch += memory_size;
176 }
177 }
178 }
179
180 applyTimeWeightsBiasAndActivation(batch_size, memory_size, num_filters, num_units, rank,
181 weights_time_data, bias_data, activation, state_data,
182 scratch_data, output_data);
183}

Referenced by luci_interpreter::execute_kernel_CircleSVDF().
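
A standalone sketch of the two preparation steps above, assuming a single batch: the activation state is shifted left by one time step, and the feature matmul writes each filter's new activation into the rightmost column of its memory_size-wide row.

#include <cassert>
#include <cstddef>
#include <vector>

int main()
{
  const int num_filters = 2, memory_size = 3, input_size = 2;
  // state is laid out as [num_filters][memory_size].
  std::vector<float> state = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f};
  const std::vector<float> weights_feature = {1.f, 0.f,  // filter 0
                                              0.f, 1.f}; // filter 1
  const std::vector<float> input = {7.f, 8.f};

  // 1) Shift the whole state left by one element (drops the oldest activation).
  for (std::size_t i = 0; i + 1 < state.size(); ++i)
    state[i] = state[i + 1];

  // 2) Write the new activation of each filter into its rightmost slot.
  for (int f = 0; f < num_filters; ++f)
  {
    float dot = 0.f;
    for (int k = 0; k < input_size; ++k)
      dot += weights_feature[f * input_size + k] * input[k];
    state[f * memory_size + (memory_size - 1)] = dot;
  }
  // Filter 0 row is now {2, 3, 7}; filter 1 row is now {5, 6, 8}.
  assert(state[0] == 2.f && state[2] == 7.f);
  assert(state[3] == 5.f && state[5] == 8.f);
  return 0;
}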

◆ Tanh() [1/2]

void luci_interpreter_pal::Tanh ( const int  flat_size,
const float *  input_data,
float *  output_data 
)
inline

Definition at line 26 of file PALTanh.h.

27{
28 for (int i = 0; i < flat_size; i++)
29 {
30 float val = input_data[i];
31 float result = std::tanh(val);
32 output_data[i] = result;
33 }
34}

Referenced by luci_interpreter::evalInteger(), luci_interpreter::execute_kernel_CircleTanh(), luci_interpreter_pal::lstm_internal::tanh(), and luci_interpreter_pal::lstm_internal::tanh().

◆ Tanh() [2/2]

void luci_interpreter_pal::Tanh ( int32_t  input_multiplier,
int32_t  input_left_shift,
const int  flat_size,
const int16_t *  ptr_input_data,
int16_t *  ptr_output_data 
)
inline

Definition at line 36 of file PALTanh.h.

38{
39 // We use the LUT for sigmoid and take into account, that
40 // tanh(x) = 2*sigmoid(2*x) - 1
41
42 // We scale by 3/4 to expand range [-8,8]->[-10.7,10.7].
43 // In case of general parameter scale, multiplier 3 is taken into account
44 // in TanhPrepare function and it is included in
45 // input_multiplier already.
46
47 if (input_multiplier == 0)
48 { // power of two case
49 input_multiplier = 3 << input_left_shift;
50 input_left_shift = 0;
51 }
52
53 int32_t round = (input_left_shift > 0) ? 1 << (input_left_shift - 1) : 0;
54
55 for (int i = 0; i < flat_size; ++i, ptr_input_data++, ptr_output_data++)
56 {
57 int32_t input_data = ((*ptr_input_data) * input_multiplier + round) >> input_left_shift;
58
59 uint32_t abs_input_data = abs(input_data);
60 uint32_t uh = abs_input_data >> 8;
61 int32_t result;
62
63 if (uh >= 255)
64 {
65 // Saturate to maximum.
66 result = 0xFFFF << 8;
67 }
68 else
69 {
70 uint32_t ua = sigmoid_table_uint16[uh];
71 uint32_t ub = sigmoid_table_uint16[uh + 1];
72
73 uint8_t ut = abs_input_data & 0xFF;
74
75 result = (ua << 8) + ut * (ub - ua);
76 }
77
78 result = (input_data >= 0) ? (result - (1 << (14 + 9)) + (1 << (9 - 2)))
79 : (-result + (1 << (14 + 9)) + (1 << (9 - 2)) - 1);
80
81 // Convert back to 16-bit.
82 result >>= (9 - 1);
83
84 *ptr_output_data = result;
85 }
86}

◆ Transpose()

template<typename T , int N = 5>
void luci_interpreter_pal::Transpose ( const TransposeParams params,
const luci_interpreter::RuntimeShape unextended_input_shape,
const T *  input_data,
const luci_interpreter::RuntimeShape unextended_output_shape,
T *  output_data 
)

Definition at line 70 of file PALTranspose.h.

73{
74 // The transpose kernel only rearranges values; it performs no numeric
75 // evaluation on individual cells. It is therefore safe to dispatch on the
76 // size of the scalar type, which keeps the total code size reasonable.
77 switch (sizeof(T))
78 {
79 case 1:
80 TransposeImpl<int8_t, N>(params, unextended_input_shape,
81 reinterpret_cast<const int8_t *>(input_data),
82 unextended_output_shape, reinterpret_cast<int8_t *>(output_data));
83 break;
84 case 2:
85 TransposeImpl<int16_t, N>(params, unextended_input_shape,
86 reinterpret_cast<const int16_t *>(input_data),
87 unextended_output_shape, reinterpret_cast<int16_t *>(output_data));
88 break;
89
90 case 4:
91 TransposeImpl<int32_t, N>(params, unextended_input_shape,
92 reinterpret_cast<const int32_t *>(input_data),
93 unextended_output_shape, reinterpret_cast<int32_t *>(output_data));
94 break;
95 case 8:
96 TransposeImpl<int64_t, N>(params, unextended_input_shape,
97 reinterpret_cast<const int64_t *>(input_data),
98 unextended_output_shape, reinterpret_cast<int64_t *>(output_data));
99 break;
100 }
101}

Referenced by luci_interpreter::execute_kernel_CircleTranspose().
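
The sizeof-based dispatch works because transposition only moves bytes. For reference, the effect on a concrete tensor (a standalone sketch, not calling the PAL entry point, since that needs RuntimeShape and TransposeParams objects): transposing a 2x3 row-major matrix with perm = {1, 0} yields its 3x2 transpose.

#include <array>
#include <cassert>

int main()
{
  // 2x3 row-major input:
  // 1 2 3
  // 4 5 6
  const std::array<float, 6> input = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f};
  const int rows = 2, cols = 3;
  std::array<float, 6> output{};

  // perm = {1, 0}: output element (r, c) reads input element (c, r).
  for (int r = 0; r < cols; ++r)
    for (int c = 0; c < rows; ++c)
      output[r * rows + c] = input[c * cols + r];

  const std::array<float, 6> expected = {1.f, 4.f, 2.f, 5.f, 3.f, 6.f};
  assert(output == expected);
  return 0;
}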

◆ TransposeConv()

void luci_interpreter_pal::TransposeConv ( const ConvParams params,
const luci_interpreter::RuntimeShape input_shape,
const float *  input_data,
const luci_interpreter::RuntimeShape filter_shape,
const float *  filter_data,
const luci_interpreter::RuntimeShape bias_shape,
const float *  bias_data,
const luci_interpreter::RuntimeShape output_shape,
float *  output_data 
)
inline

Definition at line 26 of file PALTransposeConv.h.

33{
34 const int stride_width = params.stride_width;
35 const int stride_height = params.stride_height;
36 const int pad_width = params.padding_values.width;
37 const int pad_height = params.padding_values.height;
38
39 const int batches = input_shape.dims(0);
40 const int input_depth = input_shape.dims(3);
41 const int output_depth = filter_shape.dims(0);
42 const int input_height = input_shape.dims(1);
43 const int input_width = input_shape.dims(2);
44 const int filter_height = filter_shape.dims(1);
45 const int filter_width = filter_shape.dims(2);
46 const int output_height = output_shape.dims(1);
47 const int output_width = output_shape.dims(2);
48 const float output_activation_min = params.float_activation_min;
49 const float output_activation_max = params.float_activation_max;
50
51 // Although transpose convolution simplifies to convolution with transposed
52 // weights for strides of 1, non-unitary striding complicates matters. To
53 // keep this reference implementation as clear as possible, we use a
54 // "scatter" access pattern, where we loop through all the input elements,
55 // computing their influence on the output, rather than looping through the
56 // output elements in the typical "gather" access pattern of a conv. We
57 // therefore must initialize the output array to zero.
58 const int num_elements = output_shape.flatSize();
59 for (int i = 0; i < num_elements; i++)
60 {
61 output_data[i] = 0.0f;
62 }
63
64 // Loop through input elements one at a time.
65 for (int batch = 0; batch < batches; ++batch)
66 {
67 for (int in_y = 0; in_y < input_height; ++in_y)
68 {
69 for (int in_x = 0; in_x < input_width; ++in_x)
70 {
71 for (int in_channel = 0; in_channel < input_depth; ++in_channel)
72 {
73 // Loop through the output elements it will influence
74 const int out_x_origin = (in_x * stride_width) - pad_width;
75 const int out_y_origin = (in_y * stride_height) - pad_height;
76 for (int filter_y = 0; filter_y < filter_height; ++filter_y)
77 {
78 for (int filter_x = 0; filter_x < filter_width; ++filter_x)
79 {
80 for (int out_channel = 0; out_channel < output_depth; ++out_channel)
81 {
82 // Compute output element location
83 const int out_x = out_x_origin + filter_x;
84 const int out_y = out_y_origin + filter_y;
85 // We cannot accumulate out of bounds
86 if ((out_x >= 0) && (out_x < output_width) && (out_y >= 0) &&
87 (out_y < output_height))
88 {
89 float input_value =
90 input_data[offset(input_shape.dimsData(), batch, in_y, in_x, in_channel)];
91 float filter_value = filter_data[offset(filter_shape.dimsData(), out_channel,
92 filter_y, filter_x, in_channel)];
93 output_data[offset(output_shape.dimsData(), batch, out_y, out_x, out_channel)] +=
94 input_value * filter_value;
95 }
96 }
97 }
98 }
99 }
100 }
101 }
102 }
103
104 for (int batch = 0; batch < batches; ++batch)
105 {
106 for (int out_y = 0; out_y < output_height; ++out_y)
107 {
108 for (int out_x = 0; out_x < output_width; ++out_x)
109 {
110 for (int out_channel = 0; out_channel < output_depth; ++out_channel)
111 {
112 float acc =
113 output_data[offset(output_shape.dimsData(), batch, out_y, out_x, out_channel)];
114 if (bias_data)
115 acc += bias_data[out_channel];
116
117 output_data[offset(output_shape.dimsData(), batch, out_y, out_x, out_channel)] =
118 activationFunctionWithMinMax(acc, output_activation_min, output_activation_max);
119 }
120 }
121 }
122 }
123}

References activationFunctionWithMinMax(), luci_interpreter::RuntimeShape::dims(), luci_interpreter::RuntimeShape::dimsData(), luci_interpreter::RuntimeShape::flatSize(), luci_interpreter_pal::ConvParams::float_activation_max, luci_interpreter_pal::ConvParams::float_activation_min, luci_interpreter_pal::PaddingValues::height, offset(), output_shape, luci_interpreter_pal::ConvParams::padding_values, luci_interpreter_pal::ConvParams::stride_height, luci_interpreter_pal::ConvParams::stride_width, and luci_interpreter_pal::PaddingValues::width.
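
A 1-D analogue of the scatter pattern described in the comments above (zero the output, then let every input element add its contribution): input {1, 2}, a 3-tap filter {1, 1, 1}, stride 2 and no padding produce an output of length 5.

#include <array>
#include <cassert>

int main()
{
  const std::array<float, 2> input = {1.f, 2.f};
  const std::array<float, 3> filter = {1.f, 1.f, 1.f};
  const int stride = 2, output_size = 5;
  std::array<float, 5> output{}; // must start at zero: the loop accumulates

  // Scatter: each input element adds filter-weighted values to every
  // output position it influences.
  for (int in_x = 0; in_x < static_cast<int>(input.size()); ++in_x)
  {
    const int out_origin = in_x * stride;
    for (int k = 0; k < static_cast<int>(filter.size()); ++k)
    {
      const int out_x = out_origin + k;
      if (out_x >= 0 && out_x < output_size)
        output[out_x] += input[in_x] * filter[k];
    }
  }
  const std::array<float, 5> expected = {1.f, 1.f, 3.f, 2.f, 2.f};
  assert(output == expected);
  return 0;
}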

◆ TransposeImpl()

template<typename T , int N>
void luci_interpreter_pal::TransposeImpl ( const TransposeParams params,
const luci_interpreter::RuntimeShape unextended_input_shape,
const T *  input_data,
const luci_interpreter::RuntimeShape unextended_output_shape,
T *  output_data 
)

Definition at line 27 of file PALTranspose.h.

31{
32 const int unextended_input_size = unextended_input_shape.dimensionsCount();
33 const int unextended_output_size = unextended_output_shape.dimensionsCount();
34
35 const int input_ext_size = N - unextended_input_size;
36 const int output_ext_size = N - unextended_output_size;
37 NdArrayDesc<N> input_desc;
38 NdArrayDesc<N> output_desc;
39 copyDimsToDesc(luci_interpreter::RuntimeShape::extendedShape(N, unextended_input_shape),
40 &input_desc);
41 copyDimsToDesc(luci_interpreter::RuntimeShape::extendedShape(N, unextended_output_shape),
42 &output_desc);
43
44 // The perm data is extended to match the output, each index incremented by
45 // the amount of front padding of the input shape.
46 int extended_perm[N];
47 for (int i = 0; i < N; ++i)
48 {
49 extended_perm[i] = i < output_ext_size ? i : params.perm[i - output_ext_size] + input_ext_size;
50 }
51
52 // Permute the input shape so we don't need to permute the indexes inside
53 // the loop. Check to make sure output_dims matches input_dims.
54 NdArrayDesc<N> perm_input_desc;
55 for (int k = 0; k < N; ++k)
56 {
57 perm_input_desc.extents[k] = input_desc.extents[extended_perm[k]];
58 perm_input_desc.strides[k] = input_desc.strides[extended_perm[k]];
59 }
60
61 // Naive transpose loop (iterate on output index and compute input index).
62 auto tranpose_func = [&](int indexes[N]) {
63 output_data[subscriptToIndex(output_desc, indexes)] =
64 input_data[subscriptToIndex(perm_input_desc, indexes)];
65 };
66 NDOpsHelper<N>(output_desc, tranpose_func);
67}

References copyDimsToDesc(), luci_interpreter::RuntimeShape::dimensionsCount(), luci_interpreter::RuntimeShape::extendedShape(), luci_interpreter_pal::NdArrayDesc< N >::extents, luci_interpreter_pal::TransposeParams::perm, luci_interpreter_pal::NdArrayDesc< N >::strides, and subscriptToIndex().

Variable Documentation

◆ MAX_INDICES_ND

constexpr int luci_interpreter_pal::MAX_INDICES_ND = 5
constexpr

Definition at line 27 of file PALGatherND.h.

Referenced by luci_interpreter::configure_kernel_CircleGatherND(), and GatherND().