ONE - On-device Neural Engine
luci_interpreter_pal Namespace Reference

Namespaces

namespace  lstm
 
namespace  lstm_internal
 

Data Structures

struct  AddFn
 
struct  ArithmeticParams
 
struct  ComparisonParams
 
struct  ConcatenationParams
 
struct  ConvParams
 
struct  DivFn
 
struct  FloorDivFn
 
struct  FloorModFn
 
struct  FullyConnectedParams
 
struct  MaximumFn
 
struct  MeanParams
 
struct  MinimumFn
 
struct  MulFn
 
struct  NdArrayDesc
 
struct  PaddingValues
 
struct  PadParams
 
struct  PoolParams
 
struct  PreluParams
 
struct  QuantizationParams
 
struct  ResizeNearestNeighborParams
 
struct  SoftmaxParams
 
struct  StridedSliceParams
 
struct  SubFn
 
struct  TransposeParams
 

Enumerations

enum class  PaddingType : uint8_t { None , Same , Valid }
 
enum class  BroadcastableOpCategory : uint8_t {
  kNone , kNonBroadcast , kFirstInputBroadcastsFast , kSecondInputBroadcastsFast ,
  kGenericBroadcast , kScalarFirstBroadcast , kScalarSecondBroadcast
}
 
enum class  FusedActivationFunctionType : uint8_t { kNone , kRelu6 , kRelu1 , kRelu }
 

Functions

template<>
void AveragePool< int8_t > (const tflite::PoolParams &params, const tflite::RuntimeShape &input_shape, const int8_t *input_data, const tflite::RuntimeShape &output_shape, int8_t *output_data, const tflite::RuntimeShape &scratchpad_shape, int8_t *scratchpad_data)
 
template<>
void DepthwiseConvPerChannel< int8_t > (const tflite::DepthwiseParams &params, const int32_t *output_multiplier, const int32_t *output_shift, const tflite::RuntimeShape &input_shape, const int8_t *input_data, const tflite::RuntimeShape &filter_shape, const int8_t *filter_data, const tflite::RuntimeShape &bias_shape, const int32_t *bias_data, const tflite::RuntimeShape &output_shape, int8_t *output_data, const tflite::RuntimeShape &scratchpad_shape, int8_t *scratchpad_data)
 
template<>
void FullyConnected< int8_t > (const tflite::FullyConnectedParams &params, const tflite::RuntimeShape &input_shape, const int8_t *input_data, const tflite::RuntimeShape &filter_shape, const int8_t *filter_data, const tflite::RuntimeShape &bias_shape, const int32_t *bias_data, const tflite::RuntimeShape &output_shape, int8_t *output_data)
 
template<>
void Softmax< int8_t > (const tflite::SoftmaxParams &params, const tflite::RuntimeShape &input_shape, const int8_t *input_data, const tflite::RuntimeShape &output_shape, int8_t *output_data)
 
template<>
void AveragePool< int8_t > (const tflite::PoolParams &params, const tflite::RuntimeShape &input_shape, const int8_t *input_data, const tflite::RuntimeShape &output_shape, int8_t *output_data, const tflite::RuntimeShape &scratchpad_shape, int8_t *scratchpad_data)
 
void BatchMatMul (const tflite::RuntimeShape &lhs_shape, const float *lhs_data, const tflite::RuntimeShape &rhs_shape, const float *rhs_data, const tflite::RuntimeShape &output_shape, float *output_data)
 
template<>
void DepthwiseConvPerChannel< int8_t > (const tflite::DepthwiseParams &params, const int32_t *output_multiplier, const int32_t *output_shift, const tflite::RuntimeShape &input_shape, const int8_t *input_data, const tflite::RuntimeShape &filter_shape, const int8_t *filter_data, const tflite::RuntimeShape &bias_shape, const int32_t *bias_data, const tflite::RuntimeShape &output_shape, int8_t *output_data, const tflite::RuntimeShape &scratchpad_shape, int8_t *scratchpad_data)
 
template<>
void FullyConnected< int8_t > (const tflite::FullyConnectedParams &params, const tflite::RuntimeShape &input_shape, const int8_t *input_data, const tflite::RuntimeShape &filter_shape, const int8_t *filter_data, const tflite::RuntimeShape &bias_shape, const int32_t *bias_data, const tflite::RuntimeShape &output_shape, int8_t *output_data)
 
void Logistic (const int flat_size, const float *input_data, float *output_data)
 
void calculateGRU (const float *input_data, const float *weight_input_data, const float *weight_hidden_data, const float *bias_input_data, const float *bias_hidden_data, float *output_data, const tflite::RuntimeShape &input_shape, const tflite::RuntimeShape &output_shape, const tflite::RuntimeShape &weight_input_shape, const tflite::RuntimeShape &weight_hidden_shape, float *output_input_data, float *output_hidden_data, const tflite::RuntimeShape &output_shape_fc)
 
void GRU (const float *input_data, const float *weight_input_data, const float *weight_hidden_data, const float *bias_input_data, const float *bias_hidden_data, const float *hidden_state_data, float *output_data, float *output_input_data, float *output_hidden_data, const tflite::RuntimeShape &input_shape, const tflite::RuntimeShape &output_shape, const tflite::RuntimeShape &weight_input_shape, const tflite::RuntimeShape &weight_hidden_shape)
 
template<>
void Mul (tflite::ArithmeticParams &params, const tflite::RuntimeShape &input1_shape, const int64_t *input1_data, const tflite::RuntimeShape &input2_shape, const int64_t *input2_data, const tflite::RuntimeShape &output_shape, int64_t *output_data)
 
template<>
void AveragePool< int8_t > (const tflite::PoolParams &params, const tflite::RuntimeShape &input_shape, const int8_t *input_data, const tflite::RuntimeShape &output_shape, int8_t *output_data, const tflite::RuntimeShape &scratchpad_shape, int8_t *scratchpad_data)
 
template<>
void DepthwiseConvPerChannel< int8_t > (const tflite::DepthwiseParams &params, const int32_t *output_multiplier, const int32_t *output_shift, const tflite::RuntimeShape &input_shape, const int8_t *input_data, const tflite::RuntimeShape &filter_shape, const int8_t *filter_data, const tflite::RuntimeShape &bias_shape, const int32_t *bias_data, const tflite::RuntimeShape &output_shape, int8_t *output_data, const tflite::RuntimeShape &scratchpad_shape, int8_t *scratchpad_data)
 
template<>
void FullyConnected< int8_t > (const tflite::FullyConnectedParams &params, const tflite::RuntimeShape &input_shape, const int8_t *input_data, const tflite::RuntimeShape &filter_shape, const int8_t *filter_data, const tflite::RuntimeShape &bias_shape, const int32_t *bias_data, const tflite::RuntimeShape &output_shape, int8_t *output_data)
 
template<>
void Add< int8_t > (const ArithmeticParams &params, const int flat_size, const int8_t *input1_data, const int8_t *input2_data, int8_t *output_data)
 
template<>
void Add< int16_t > (const ArithmeticParams &params, const int flat_size, const int16_t *input1_data, const int16_t *input2_data, int16_t *output_data)
 
void AveragePool (const PoolParams &params, const luci_interpreter::RuntimeShape &input_shape, const uint8_t *input_data, const luci_interpreter::RuntimeShape &output_shape, uint8_t *output_data, luci_interpreter::DataType data_type)
 
template<>
void FullyConnected< int8_t > (const luci_interpreter_pal::FullyConnectedParams &params, const int32_t *, const int8_t *input_data, const int32_t *filter_shape, const int8_t *filter_data, const int32_t *bias_data, const int32_t *output_shape, int8_t *output_data, uint32_t output_dims_count, uint32_t weights_dims_count)
 
template<>
void FullyConnected (const luci_interpreter_pal::FullyConnectedParams &params, const int32_t *, const int16_t *input_data, const int32_t *filter_shape, const int8_t *filter_data, const int64_t *bias_data, const int32_t *output_shape, int16_t *output_data, uint32_t output_dims_count, uint32_t weights_dims_count)
 
void MaxPool (const PoolParams &params, const luci_interpreter::RuntimeShape &input_shape, const uint8_t *input_data, const luci_interpreter::RuntimeShape &output_shape, uint8_t *output_data, luci_interpreter::DataType data_type)
 
template<>
void Mul< int8_t > (const ArithmeticParams &params, const int flat_size, const int8_t *input1_data, const int8_t *input2_data, int8_t *output_data)
 
template<>
void Mul< int16_t > (const ArithmeticParams &params, const int flat_size, const int16_t *input1_data, const int16_t *input2_data, int16_t *output_data)
 
void Softmax (const SoftmaxParams &params, const int8_t *input_data, int8_t *output_data)
 
void Softmax (const SoftmaxParams &params, const int8_t *input_data, int16_t *output_data)
 
void Softmax (const SoftmaxParams &params, const int16_t *input_data, int16_t *output_data)
 
void eval_integer_8x8_16_lstm (const luci_interpreter::Tensor *input, const luci_interpreter::Tensor *input_to_input_weights, const luci_interpreter::Tensor *input_to_forget_weights, const luci_interpreter::Tensor *input_to_cell_weights, const luci_interpreter::Tensor *input_to_output_weights, const luci_interpreter::Tensor *recurrent_to_input_weights, const luci_interpreter::Tensor *recurrent_to_forget_weights, const luci_interpreter::Tensor *recurrent_to_cell_weights, const luci_interpreter::Tensor *recurrent_to_output_weights, const luci_interpreter::Tensor *cell_to_input_weights, const luci_interpreter::Tensor *cell_to_forget_weights, const luci_interpreter::Tensor *cell_to_output_weights, const luci_interpreter::Tensor *input_layer_norm_coefficients, const luci_interpreter::Tensor *forget_layer_norm_coefficients, const luci_interpreter::Tensor *cell_layer_norm_coefficients, const luci_interpreter::Tensor *output_layer_norm_coefficients, const luci_interpreter::Tensor *input_gate_bias, const luci_interpreter::Tensor *forget_gate_bias, const luci_interpreter::Tensor *cell_gate_bias, const luci_interpreter::Tensor *output_gate_bias, const luci_interpreter::Tensor *projection_weights, const luci_interpreter::Tensor *projection_bias, const luci_interpreter::UnidirectionalSequenceLSTMParams &params, bool forward_sequence, bool time_major, const luci_interpreter::IntegerLSTMParams &integer_lstm_param, int32_t output_state_zp, luci_interpreter::Tensor *output_state, luci_interpreter::Tensor *cell_state, luci_interpreter::Tensor *output, int16_t *scratch0, int16_t *scratch1, int16_t *scratch2, int16_t *scratch3, int8_t *scratch4, int32_t *scratch5)
 
template<typename T >
void BroadcastTISO4DSlow (const luci_interpreter::RuntimeShape &input1_shape, const T *input1_data, const luci_interpreter::RuntimeShape &input2_shape, const T *input2_data, const luci_interpreter::RuntimeShape &output_shape, T *output_data, std::function< const T &(const T &, const T &)> func)
 
void Abs (const int flat_size, const float *input_data, float *output_data)
 
template<typename T >
void Add (const ArithmeticParams &params, const int flat_size, const T *input1_data, const T *input2_data, T *output_data)
 
template<typename T >
void BroadcastAdd4DSlow (const ArithmeticParams &params, const luci_interpreter::RuntimeShape &input1_shape, const T *input1_data, const luci_interpreter::RuntimeShape &input2_shape, const T *input2_data, const luci_interpreter::RuntimeShape &output_shape, T *output_data)
 
template<typename T >
void AddN (const size_t flat_size, const size_t num_inputs, const T *const *input_data, T *output_data)
 
template<typename T1 , typename T2 , typename T3 , typename Cmp >
void ArgMinMax (const luci_interpreter::RuntimeShape &input1_shape, const T1 *input1_data, const T3 *input2_data, const luci_interpreter::RuntimeShape &output_shape, T2 *output_data, const Cmp &cmp)
 
template<typename T , typename Fn >
void ArithmeticOp (const ArithmeticParams &params, const int flat_size, const T *input1_data, const T *input2_data, T *output_data)
 
template<typename T , typename Fn >
void ArithmeticOpScalar (const ArithmeticParams &params, const int flat_size, const T *input_data, const T scalar_value, T *output_data)
 
template<typename T , typename Fn >
void BroadcastArithmeticOp4DSlow (const ArithmeticParams &params, const luci_interpreter::RuntimeShape &input1_shape, const T *input1_data, const luci_interpreter::RuntimeShape &input2_shape, const T *input2_data, const luci_interpreter::RuntimeShape &output_shape, T *output_data)
 
void AveragePool (const PoolParams &params, const luci_interpreter::RuntimeShape &input_shape, const float *input_data, const luci_interpreter::RuntimeShape &output_shape, float *output_data)
 
template<typename T >
void BatchToSpaceND (const luci_interpreter::RuntimeShape &unextended_input1_shape, const T *input1_data, const luci_interpreter::RuntimeShape &unextended_input2_shape, const int32_t *block_shape_data, const luci_interpreter::RuntimeShape &unextended_input3_shape, const int32_t *crops_data, const luci_interpreter::RuntimeShape &unextended_output_shape, T *output_data)
 
template<typename T , typename Fn >
void BinaryOp (const int flat_size, const T *input1_data, const T *input2_data, T *output_data)
 
template<typename T , typename Fn >
void BroadcastBinaryOp4DSlow (const luci_interpreter::RuntimeShape &input1_shape, const float *input1_data, const luci_interpreter::RuntimeShape &input2_shape, const float *input2_data, const luci_interpreter::RuntimeShape &output_shape, float *output_data)
 
template<int N>
void BroadcastImpl (const NdArrayDesc< N > &input_desc, const uint8_t *input_data, const NdArrayDesc< N > &output_desc, uint8_t *output_data, int indexes[N], int dim, const int last_broadcasting_dim, const uint32_t type_size)
 
template<int N>
void BroadcastTo (const luci_interpreter::RuntimeShape &unextended_input_shape, const uint8_t *input_data, const luci_interpreter::RuntimeShape &unextended_output_shape, uint8_t *output_data, luci_interpreter::DataType data_type)
 
void Ceil (const int32_t flat_size, const float *input_data, float *output_data)
 
template<typename T >
bool LessFn (T lhs, T rhs)
 
template<typename T >
bool LessEqualFn (T lhs, T rhs)
 
template<typename T >
bool EqualFn (T lhs, T rhs)
 
template<typename T >
bool GreaterFn (T lhs, T rhs)
 
template<typename T >
bool GreaterEqualFn (T lhs, T rhs)
 
template<typename T >
bool NotEqualFn (T lhs, T rhs)
 
template<typename T >
void ComparisonNoScaling (const int64_t flat_size, const T *input1_data, const T *input2_data, bool *output_data, bool F(T, T))
 
template<typename T >
void BroadcastComparison4DSlowWithScaling (const ComparisonParams &op_params, const luci_interpreter::RuntimeShape &unextended_input1_shape, const T *input1_data, const luci_interpreter::RuntimeShape &unextended_input2_shape, const T *input2_data, const luci_interpreter::RuntimeShape &unextended_output_shape, bool *output_data, bool F(T, T))
 
template<typename T >
void ComparisonWithScaling (const ComparisonParams &op_params, const int64_t flat_size, const T *input1_data, const T *input2_data, bool *output_data, bool F(T, T))
 
template<typename T >
void BroadcastComparison4DSlowNoScaling (const ComparisonParams &op_params, const luci_interpreter::RuntimeShape &unextended_input1_shape, const T *input1_data, const luci_interpreter::RuntimeShape &unextended_input2_shape, const T *input2_data, const luci_interpreter::RuntimeShape &unextended_output_shape, bool *output_data, bool F(T, T))
 
template<typename Scalar >
void Concatenation (const ConcatenationParams &params, const luci_interpreter::RuntimeShape *const *input_shapes, const Scalar *const *input_data, const luci_interpreter::RuntimeShape &output_shape, Scalar *output_data)
 
void Cos (const int flat_size, const float *input_data, float *output_data)
 
template<typename T >
void DepthToSpace (const int32_t block_size, const luci_interpreter::RuntimeShape &unextended_input_shape, const T *input_data, const luci_interpreter::RuntimeShape &unextended_output_shape, T *output_data)
 
template<typename InputT , typename OutputT >
void Dequantize (const QuantizationParams &op_params, const int flat_size, const InputT *input_data, OutputT *output_data)
 
template<typename T >
void Div (const ArithmeticParams &params, const int flat_size, const T *input1_data, const T *input2_data, T *output_data)
 
template<typename T >
void DivScalar (const ArithmeticParams &params, const int flat_size, const T *input_data, const T scalar_value, T *output_data)
 
template<typename T >
void BroadcastDiv4DSlow (const ArithmeticParams &params, const luci_interpreter::RuntimeShape &input1_shape, const T *input1_data, const luci_interpreter::RuntimeShape &input2_shape, const T *input2_data, const luci_interpreter::RuntimeShape &output_shape, T *output_data)
 
void Elu (const int flat_size, const float *input_data, float *output_data)
 
void Exp (const int flat_size, const float *input_data, float *output_data)
 
void Floor (const luci_interpreter::RuntimeShape &input_shape, const float *input_data, const luci_interpreter::RuntimeShape &output_shape, float *output_data)
 
void FloorDiv (const int flat_size, const float *input1_data, const float *input2_data, float *output_data)
 
void BroadcastFloorDiv4DSlow (const luci_interpreter::RuntimeShape &input1_shape, const float *input1_data, const luci_interpreter::RuntimeShape &input2_shape, const float *input2_data, const luci_interpreter::RuntimeShape &output_shape, float *output_data)
 
void FloorMod (const int flat_size, const float *input1_data, const float *input2_data, float *output_data)
 
void BroadcastFloorMod4DSlow (const luci_interpreter::RuntimeShape &input1_shape, const float *input1_data, const luci_interpreter::RuntimeShape &input2_shape, const float *input2_data, const luci_interpreter::RuntimeShape &output_shape, float *output_data)
 
template<typename InputType , typename WeightType , typename OutputType , typename BiasType >
void FullyConnected (const FullyConnectedParams &params, const int32_t *input_shape, const InputType *input_data, const int32_t *filter_shape, const WeightType *filter_data, const BiasType *bias_data, const int32_t *output_shape, OutputType *output_data, uint32_t output_dims_count, uint32_t weights_dims_count)
 
template<typename WeightType >
void FullyConnected (const FullyConnectedParams &params, const int32_t *input_shape, const float *input_data, const int32_t *filter_shape, const WeightType *filter_data, const float *bias_data, const int32_t *output_shape, float *output_data, uint32_t output_dims_count, uint32_t weights_dims_count)
 
template<typename ParamsT , typename IndicesT >
void GatherND (luci_interpreter::RuntimeShape params_shape, const ParamsT *param_data, luci_interpreter::RuntimeShape indices_shape, const IndicesT *index_data, ParamsT *output_data)
 
void L2Normalization (const luci_interpreter::RuntimeShape &input_shape, const float *input_data, const luci_interpreter::RuntimeShape &output_shape, float *output_data, float epsilon=1e-6)
 
void L2Pool (const PoolParams &params, const luci_interpreter::RuntimeShape &input_shape, const float *input_data, const luci_interpreter::RuntimeShape &output_shape, float *output_data)
 
void Log (const int flat_size, const float *input_data, float *output_data)
 
void LogicalCommon (const int flat_size, const bool *input1_data, const bool *input2_data, bool *output_data, bool(*f)(bool, bool))
 
void LogicalNot (const int flat_size, const bool *input_data, bool *output_data)
 
void Logistic (const int flat_size, const int8_t *input_data, float input_scale, int input_zero_point, int8_t *output_data, float output_scale, int output_zero_point)
 
void Logistic (int32_t input_multiplier, int32_t input_left_shift, int32_t input_size, const int16_t *ptr_input_data, int16_t *ptr_output_data)
 
void LogSoftmax (const luci_interpreter::RuntimeShape &input_shape, const float *input_data, const luci_interpreter::RuntimeShape &output_shape, float *output_data)
 
void Maximum (const int flat_size, const float *input1_data, const float *input2_data, float *output_data)
 
void BroadcastMaximum4DSlow (const luci_interpreter::RuntimeShape &input1_shape, const float *input1_data, const luci_interpreter::RuntimeShape &input2_shape, const float *input2_data, const luci_interpreter::RuntimeShape &output_shape, float *output_data)
 
void MaxPool (const PoolParams &params, const luci_interpreter::RuntimeShape &input_shape, const float *input_data, const luci_interpreter::RuntimeShape &output_shape, float *output_data)
 
template<typename T , typename U >
bool Mean (const T *input_data, const int *input_dims, const int input_num_dims, T *output_data, const int *output_dims, const int output_num_dims, const int *axis, const int num_axis_dimensions, bool, int *temp_index, int *resolved_axis, U *temp_sum)
 
void Mean (const MeanParams &op_params, const luci_interpreter::RuntimeShape &unextended_input_shape, const float *input_data, const luci_interpreter::RuntimeShape &unextended_output_shape, float *output_data)
 
void Minimum (const int flat_size, const float *input1_data, const float *input2_data, float *output_data)
 
template<typename T >
void BroadcastMinimum4DSlow (const luci_interpreter::RuntimeShape &input1_shape, const T *input1_data, const luci_interpreter::RuntimeShape &input2_shape, const T *input2_data, const luci_interpreter::RuntimeShape &output_shape, T *output_data)
 
template<typename T >
void MirrorPad (const luci_interpreter::DataType padding_matrix_type, const uint8_t *padding_matrix_data, const int32_t *input_dims, int *output_dims_num_elements, int *input_dims_num_elements, const T *input_data, T *output_data, const int offset, const int num_dims, const int output_size)
 
template<typename T >
void Mul (const ArithmeticParams &params, const int flat_size, const T *input1_data, const T *input2_data, T *output_data)
 
template<typename T >
void MulScalar (const ArithmeticParams &params, const int flat_size, const T *input_data, const T scalar_value, T *output_data)
 
template<typename T >
void BroadcastMul4DSlow (const ArithmeticParams &params, const luci_interpreter::RuntimeShape &input1_shape, const T *input1_data, const luci_interpreter::RuntimeShape &input2_shape, const T *input2_data, const luci_interpreter::RuntimeShape &output_shape, T *output_data)
 
template<typename T >
void Negate (const luci_interpreter::RuntimeShape &input_shape, const T *input_data, const luci_interpreter::RuntimeShape &output_shape, T *output_data)
 
constexpr int PadKernelMaxDimensionCount ()
 
void Pad (const PadParams &op_params, const luci_interpreter::RuntimeShape &input_shape, const float *input_data, const float *pad_value_ptr, const luci_interpreter::RuntimeShape &output_shape, float *output_data)
 
void BroadcastPrelu4DSlowFloat (const luci_interpreter::RuntimeShape &unextended_input1_shape, const float *input1_data, const luci_interpreter::RuntimeShape &unextended_input2_shape, const float *input2_data, const luci_interpreter::RuntimeShape &unextended_output_shape, float *output_data)
 
template<typename InputT , typename OutputT >
void Quantize (const QuantizationParams &op_params, const int flat_size, const InputT *input_data, OutputT *output_data)
 
template<typename T >
void ReduceGeneric (const T *input_data, const int *input_dims, const int input_num_dims, T *output_data, const int *axis, const int64_t num_axis_dimensions, T init_value, const int output_flat_size, T reducer(const T, const T))
 
void ReLUCommon (const int flat_size, const float *input_data, float *output_data, const float alpha, const bool is_relu_6)
 
int Offset (const luci_interpreter::RuntimeShape &shape, int i0, int i1, int i2, int i3)
 
void ComputeInterpolationValues (const float value, const float scale, const bool half_pixel_centers, int32_t input_size, float *scaled_value, int32_t *lower_bound, int32_t *upper_bound)
 
int32_t getNearestNeighbor (const int input_value, const int32_t input_size, const int32_t output_size, const bool align_corners, const bool half_pixel_centers)
 
template<typename T >
void ResizeNearestNeighbor (const ResizeNearestNeighborParams &op_params, const luci_interpreter::RuntimeShape &unextended_input_shape, const T *input_data, const luci_interpreter::RuntimeShape &output_size_shape, const int32_t *output_size_data, const luci_interpreter::RuntimeShape &unextended_output_shape, T *output_data)
 
float RoundToNearest (float value)
 
void Round (const int32_t flat_size, const float *input_data, float *output_data)
 
void Rsqrt (const int flat_size, const float *input_data, float *output_data)
 
template<typename D , typename T >
void Select (const luci_interpreter::RuntimeShape &input_condition_shape, const D *input_condition_data, const luci_interpreter::RuntimeShape &input_x_shape, const T *input_x_data, const luci_interpreter::RuntimeShape &input_y_shape, const T *input_y_data, const luci_interpreter::RuntimeShape &output_shape, T *output_data)
 
void Sin (const int flat_size, const float *input_data, float *output_data)
 
void Softmax (const SoftmaxParams &params, const float *input_data, float *output_data)
 
template<typename T >
void SpaceToBatchND (const int32_t pad_value, const luci_interpreter::RuntimeShape &unextended_input1_shape, const T *input1_data, const luci_interpreter::RuntimeShape &unextended_input2_shape, const int32_t *block_shape_data, const luci_interpreter::RuntimeShape &unextended_input3_shape, const int32_t *paddings_data, const luci_interpreter::RuntimeShape &unextended_output_shape, T *output_data)
 
template<typename T >
void SpaceToDepth (const int32_t block_size, const luci_interpreter::RuntimeShape &unextended_input_shape, const T *input_data, const luci_interpreter::RuntimeShape &unextended_output_shape, T *output_data)
 
void Sqrt (const int flat_size, const float *input_data, float *output_data)
 
void Square (const int flat_size, const float *input_data, float *output_data)
 
void SquaredDifference (const int flat_size, const float *input_data_1, const float *input_data_2, float *output_data)
 
template<typename T >
void StridedSlice (StridedSliceParams &op_params, const luci_interpreter::RuntimeShape &unextended_input_shape, const T *input_data, T *output_data)
 
template<typename T >
void BroadcastSub4DSlow (const ArithmeticParams &params, const luci_interpreter::RuntimeShape &input1_shape, const T *input1_data, const luci_interpreter::RuntimeShape &input2_shape, const T *input2_data, const luci_interpreter::RuntimeShape &output_shape, T *output_data)
 
void SVDF (const float *input_data, const float *weights_feature_data, const float *weights_time_data, const float *bias_data, float *state_data, float *scratch_data, float *output_data, const int rank, const int input_size, const int batch_size, const int num_filters, const int num_units, const int memory_size, const circle::ActivationFunctionType activation)
 
void Tanh (const int flat_size, const float *input_data, float *output_data)
 
void Tanh (int32_t input_multiplier, int32_t input_left_shift, const int flat_size, const int16_t *ptr_input_data, int16_t *ptr_output_data)
 
template<typename T , int N>
void TransposeImpl (const TransposeParams &params, const luci_interpreter::RuntimeShape &unextended_input_shape, const T *input_data, const luci_interpreter::RuntimeShape &unextended_output_shape, T *output_data)
 
template<typename T , int N = 5>
void Transpose (const TransposeParams &params, const luci_interpreter::RuntimeShape &unextended_input_shape, const T *input_data, const luci_interpreter::RuntimeShape &unextended_output_shape, T *output_data)
 
void TransposeConv (const ConvParams &params, const luci_interpreter::RuntimeShape &input_shape, const float *input_data, const luci_interpreter::RuntimeShape &filter_shape, const float *filter_data, const luci_interpreter::RuntimeShape &bias_shape, const float *bias_data, const luci_interpreter::RuntimeShape &output_shape, float *output_data)
 
template<typename ActivationType , typename WeightType , typename CellType , typename BiasType >
void evalLSTM (luci_interpreter::lstm::LSTMStruct *lstm_struct, luci_interpreter::lstm::LSTMParameters *lstm_params, luci_interpreter::lstm::CellStateInfo *cell_state_info, ActivationType *output_state_data, CellType *cell_state_data, CellType *scratch0, CellType *scratch1, CellType *scratch2, CellType *scratch3, luci_interpreter::BaseRuntimeGraph *runtime_graph)
 
std::int32_t saturatingRoundingDoublingHighMul (std::int32_t a, std::int32_t b)
 
int32_t roundingDivideByPOT (int32_t x, int32_t exponent)
 
int32_t multiplyByQuantizedMultiplier (int32_t x, int32_t quantized_multiplier, int shift)
 
int32_t multiplyByQuantizedMultiplierSmallerThanOneExp (int32_t x, int32_t quantized_multiplier, int left_shift)
 
template<typename P >
void getActivationParams (const P &params, int32_t *min, int32_t *max)
 
template<typename P >
void getActivationParams (const P &params, float *min, float *max)
 
template<typename P >
void getActivationParams (const P &params, int64_t *min, int64_t *max)
 
size_t reducedOutputOffset (const int num_dims, const int *dims, const int *index, const int num_axis, const int *axis)
 
bool nextIndex (const int num_dims, const int *dims, int *current)
 
int MatchingDim (const luci_interpreter::RuntimeShape &shape1, int index1, const luci_interpreter::RuntimeShape &shape2, int index2)
 
int flatSizeSkipDim (const int32_t *dims_data, int skip_dim, int num_dims)
 
int offset (const int32_t *dims_data, int i0, int i1, int i2, int i3)
 
int offset (const int32_t *dims_data, int i0, int i1, int i2, int i3, int i4)
 
template<typename T >
T activationFunctionWithMinMax (T x, T output_activation_min, T output_activation_max)
 
template<int N>
void copyDimsToDesc (const luci_interpreter::RuntimeShape &input_shape, NdArrayDesc< N > *desc_out)
 
template<int N, int DIM, typename Calc >
std::enable_if< DIM==N-1, void >::type NDOpsHelperImpl (const NdArrayDesc< N > &output, const Calc &calc, int indexes[N])
 
template<int N, int DIM, typename Calc >
std::enable_if< DIM!=N-1, void >::type NDOpsHelperImpl (const NdArrayDesc< N > &output, const Calc &calc, int indexes[N])
 
template<int N, typename Calc >
void NDOpsHelper (const NdArrayDesc< N > &output, const Calc &calc)
 
template<int N>
void NdArrayDescsForElementwiseBroadcast (const luci_interpreter::RuntimeShape &input0_shape, const luci_interpreter::RuntimeShape &input1_shape, NdArrayDesc< N > *desc0_out, NdArrayDesc< N > *desc1_out)
 
int subscriptToIndex (const NdArrayDesc< 4 > &desc, int i0, int i1, int i2, int i3)
 
int subscriptToIndex (const NdArrayDesc< 5 > &desc, int indexes[5])
 
bool ProcessBroadcastShapes (const luci_interpreter::RuntimeShape &shape0, const luci_interpreter::RuntimeShape &shape1, luci_interpreter_pal::ArithmeticParams *params)
 
template<>
void Add< int8_t > (const ArithmeticParams &, const int, const int8_t *, const int8_t *, int8_t *)
 
template<>
void Add< int16_t > (const ArithmeticParams &, const int, const int16_t *, const int16_t *, int16_t *)
 
template<>
void FullyConnected (const luci_interpreter_pal::FullyConnectedParams &params, const int32_t *input_shape, const int8_t *input_data, const int32_t *filter_shape, const int8_t *filter_data, const int32_t *bias_data, const int32_t *output_shape, int8_t *output_data, uint32_t, uint32_t)
 
template<>
void FullyConnected (const luci_interpreter_pal::FullyConnectedParams &, const int32_t *, const int16_t *, const int32_t *, const int8_t *, const int64_t *, const int32_t *, int16_t *, uint32_t, uint32_t)
 
template<>
void Mul< int8_t > (const ArithmeticParams &, const int, const int8_t *, const int8_t *, int8_t *)
 
template<>
void Mul< int16_t > (const ArithmeticParams &, const int, const int16_t *, const int16_t *, int16_t *)
 
template<>
void evalLSTM< int8_t, int8_t, int16_t, int32_t > (luci_interpreter::lstm::LSTMStruct *lstm_struct, luci_interpreter::lstm::LSTMParameters *lstm_params, luci_interpreter::lstm::CellStateInfo *cell_state_info, int8_t *output_state_data, int16_t *cell_state_data, int16_t *scratch0, int16_t *scratch1, int16_t *scratch2, int16_t *scratch3, luci_interpreter::BaseRuntimeGraph *runtime_graph)
 

Variables

constexpr int MAX_INDICES_ND = 5
 

Enumeration Type Documentation

◆ BroadcastableOpCategory

enum class luci_interpreter_pal::BroadcastableOpCategory : uint8_t
strong
Enumerator
kNone 
kNonBroadcast 
kFirstInputBroadcastsFast 
kSecondInputBroadcastsFast 
kGenericBroadcast 
kScalarFirstBroadcast 
kScalarSecondBroadcast 

Definition at line 108 of file Params.h.

109{
110 kNone,
111 kNonBroadcast, // Matching input shapes.
112 kFirstInputBroadcastsFast, // Fivefold nested loops.
113 kSecondInputBroadcastsFast, // Fivefold nested loops.
114 kGenericBroadcast, // Fall-back.
115 kScalarFirstBroadcast, // Scalar
116 kScalarSecondBroadcast, // Scalar
117};

◆ FusedActivationFunctionType

enum class luci_interpreter_pal::FusedActivationFunctionType : uint8_t
strong
Enumerator
kNone 
kRelu6 
kRelu1 
kRelu 

Definition at line 215 of file Params.h.

◆ PaddingType

enum class luci_interpreter_pal::PaddingType : uint8_t
strong
Enumerator
None 
Same 
Valid 

Definition at line 64 of file Params.h.

Function Documentation

◆ Abs()

void luci_interpreter_pal::Abs ( const int  flat_size,
const float *  input_data,
float *  output_data 
)
inline

Definition at line 25 of file PALAbs.h.

26{
27 for (int i = 0; i < flat_size; ++i)
28 {
29 output_data[i] = std::abs(input_data[i]);
30 }
31}

Referenced by luci_interpreter::execute_kernel_CircleAbs().
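Usage sketch (not part of the generated reference): a minimal call of Abs over a flat float buffer. The include path "PALAbs.h" is an assumption; it depends on how the PAL headers are exposed in a particular build.

#include <cassert>
#include "PALAbs.h" // assumed include path; adjust to your build setup

void abs_example()
{
  const float input[4] = {-1.5f, 0.0f, 2.0f, -3.25f};
  float output[4] = {};
  // flat_size is the total number of elements in the buffer
  luci_interpreter_pal::Abs(4, input, output);
  assert(output[0] == 1.5f && output[3] == 3.25f);
}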

◆ activationFunctionWithMinMax()

template<typename T >
T luci_interpreter_pal::activationFunctionWithMinMax ( T  x,
T  output_activation_min,
T  output_activation_max 
)
inline

Definition at line 204 of file PALUtils.h.

205{
206 using std::max;
207 using std::min;
208 return min(max(x, output_activation_min), output_activation_max);
209}

Referenced by L2Pool(), and TransposeConv().
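Usage sketch: clamping a value to a fused-activation range, equivalent to std::min(std::max(x, min), max).

// minimal sketch: clamp a value to a ReLU6-like activation range
float relu6_like(float x)
{
  return luci_interpreter_pal::activationFunctionWithMinMax(x, 0.0f, 6.0f);
}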

◆ Add()

template<typename T >
void luci_interpreter_pal::Add ( const ArithmeticParams &  params,
const int  flat_size,
const T *  input1_data,
const T *  input2_data,
T *  output_data 
)
inline

Definition at line 28 of file PALAddCommon.h.

30{
31 ArithmeticOp<T, AddFn<T>>(params, flat_size, input1_data, input2_data, output_data);
32}
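A hedged usage sketch for the float instantiation. The float_activation_min/float_activation_max member names are an assumption inferred from the float overload of getActivationParams(); check Params.h for the exact fields of ArithmeticParams.

#include <limits>

void add_example(const float *a, const float *b, float *out, int flat_size)
{
  luci_interpreter_pal::ArithmeticParams params{};
  // assumed field names; an effectively unbounded activation range
  params.float_activation_min = std::numeric_limits<float>::lowest();
  params.float_activation_max = std::numeric_limits<float>::max();
  luci_interpreter_pal::Add<float>(params, flat_size, a, b, out);
}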

◆ Add< int16_t >() [1/2]

template<>
void luci_interpreter_pal::Add< int16_t > ( const ArithmeticParams & ,
const int  ,
const int16_t *  ,
const int16_t *  ,
int16_t *   
)
inline

Definition at line 33 of file PALAdd.h.

35{
36 assert(false && "Not IMPL yet");
37}

◆ Add< int16_t >() [2/2]

template<>
void luci_interpreter_pal::Add< int16_t > ( const ArithmeticParams &  params,
const int  flat_size,
const int16_t *  input1_data,
const int16_t *  input2_data,
int16_t *  output_data 
)
inline

Definition at line 39 of file PALAdd.h.

42{
43 auto status = arm_elementwise_add_s16(
44 input1_data, input2_data, params.input1_offset, params.input1_multiplier, params.input1_shift,
45 params.input2_offset, params.input2_multiplier, params.input2_shift, params.left_shift,
46 output_data, params.output_offset, params.output_multiplier, params.output_shift,
47 params.quantized_activation_min, params.quantized_activation_max, flat_size);
48 assert(status == ARM_CMSIS_NN_SUCCESS);
49}

References luci_interpreter_pal::ArithmeticParams::input1_multiplier, luci_interpreter_pal::ArithmeticParams::input1_offset, luci_interpreter_pal::ArithmeticParams::input1_shift, luci_interpreter_pal::ArithmeticParams::input2_multiplier, luci_interpreter_pal::ArithmeticParams::input2_offset, luci_interpreter_pal::ArithmeticParams::input2_shift, luci_interpreter_pal::ArithmeticParams::left_shift, luci_interpreter_pal::ArithmeticParams::output_multiplier, luci_interpreter_pal::ArithmeticParams::output_offset, luci_interpreter_pal::ArithmeticParams::output_shift, luci_interpreter_pal::ArithmeticParams::quantized_activation_max, and luci_interpreter_pal::ArithmeticParams::quantized_activation_min.

◆ Add< int8_t >() [1/2]

template<>
void luci_interpreter_pal::Add< int8_t > ( const ArithmeticParams & ,
const int  ,
const int8_t *  ,
const int8_t *  ,
int8_t *   
)
inline

Definition at line 26 of file PALAdd.h.

28{
29 assert(false && "Not IMPL yet");
30}

◆ Add< int8_t >() [2/2]

◆ AddN()

template<typename T >
void luci_interpreter_pal::AddN ( const size_t  flat_size,
const size_t  num_inputs,
const T *const *  input_data,
T *  output_data 
)
inline

Definition at line 29 of file PALAddN.h.

31{
32 // All inputs and output should have the same shape, this is checked during
33 // Prepare stage.
34 for (size_t i = 0; i < flat_size; ++i)
35 {
36 T x = 0;
37 for (size_t j = 0; j < num_inputs; ++j)
38 {
39 x += input_data[j][i];
40 }
41 output_data[i] = x;
42 }
43}
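Usage sketch: summing two same-shaped float buffers element-wise.

void addn_example()
{
  const float in0[3] = {1.f, 2.f, 3.f};
  const float in1[3] = {10.f, 20.f, 30.f};
  const float *inputs[2] = {in0, in1};
  float out[3] = {};
  // all inputs must share the same flat size (checked during Prepare)
  luci_interpreter_pal::AddN<float>(3, 2, inputs, out); // out == {11, 22, 33}
}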

◆ ArgMinMax()

template<typename T1 , typename T2 , typename T3 , typename Cmp >
void luci_interpreter_pal::ArgMinMax ( const luci_interpreter::RuntimeShape &  input1_shape,
const T1 *  input1_data,
const T3 *  input2_data,
const luci_interpreter::RuntimeShape &  output_shape,
T2 *  output_data,
const Cmp &  cmp 
)

Definition at line 28 of file PALArgMinMax.h.

31{
32 int axis = input2_data[0];
33 if (axis < 0)
34 {
35 axis += input1_shape.dimensionsCount();
36 }
37 const int axis_size = input1_shape.dims(axis);
38
39 int outer_size = 1;
40 for (int i = 0; i < axis; ++i)
41 {
42 outer_size *= input1_shape.dims(i);
43 }
44
45 int inner_size = 1;
46 const int dims_count = input1_shape.dimensionsCount();
47 for (int i = axis + 1; i < dims_count; ++i)
48 {
49 inner_size *= input1_shape.dims(i);
50 }
51 for (int outer = 0; outer < outer_size; ++outer)
52 {
53 for (int inner = 0; inner < inner_size; ++inner)
54 {
55 auto min_max_value = input1_data[outer * axis_size * inner_size + inner];
56 T2 min_max_index = 0;
57 for (int i = 1; i < axis_size; ++i)
58 {
59 const auto &curr_value = input1_data[(outer * axis_size + i) * inner_size + inner];
60 if (cmp(curr_value, min_max_value))
61 {
62 min_max_value = curr_value;
63 min_max_index = static_cast<T2>(i);
64 }
65 }
66 output_data[outer * inner_size + inner] = min_max_index;
67 }
68 }
69}
int32_t dimensionsCount() const
Definition Tensor.h:106
int32_t dims(int i) const
Definition Tensor.h:108

References luci_interpreter::RuntimeShape::dimensionsCount(), and luci_interpreter::RuntimeShape::dims().
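A hedged sketch of an argmax over the last axis of a 2x3 tensor, using a greater-than comparator. The RuntimeShape(rank, dims) constructor is an assumption; see luci_interpreter's Tensor.h for how shapes are actually constructed.

void argmax_example()
{
  const int32_t in_dims[2] = {2, 3};
  const int32_t out_dims[1] = {2};
  luci_interpreter::RuntimeShape input_shape(2, in_dims);   // assumed constructor
  luci_interpreter::RuntimeShape output_shape(1, out_dims); // assumed constructor

  const float input[6] = {0.1f, 0.9f, 0.3f, 0.7f, 0.2f, 0.5f};
  const int32_t axis[1] = {1}; // reduce over the last dimension
  int32_t output[2] = {};

  luci_interpreter_pal::ArgMinMax(input_shape, input, axis, output_shape, output,
                                  [](float a, float b) { return a > b; });
  // output == {1, 0}
}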

◆ ArithmeticOp()

template<typename T , typename Fn >
void luci_interpreter_pal::ArithmeticOp ( const ArithmeticParams &  params,
const int  flat_size,
const T *  input1_data,
const T *  input2_data,
T *  output_data 
)
inline

Definition at line 47 of file PALArithmeticOpCommon.h.

49{
50 T activation_min, activation_max;
51 getActivationParams(params, &activation_min, &activation_max);
52
53 Fn func;
54 for (int i = 0; i < flat_size; ++i)
55 output_data[i] =
56 std::min(std::max(func(input1_data[i], input2_data[i]), activation_min), activation_max);
57}
void getActivationParams(const P &params, int32_t *min, int32_t *max)
Definition PALUtils.h:93

References getActivationParams().
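ArithmeticOp is the common kernel behind Add/Sub/Mul/Div; a sketch of instantiating it with a user-defined functor (the functor type and names below are hypothetical). Fn only needs operator()(T, T); the activation bounds come from getActivationParams(params).

struct SquaredDiffFn
{
  float operator()(float a, float b) const { return (a - b) * (a - b); }
};

void squared_diff_example(const float *a, const float *b, float *out, int n,
                          const luci_interpreter_pal::ArithmeticParams &params)
{
  luci_interpreter_pal::ArithmeticOp<float, SquaredDiffFn>(params, n, a, b, out);
}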

◆ ArithmeticOpScalar()

template<typename T , typename Fn >
void luci_interpreter_pal::ArithmeticOpScalar ( const ArithmeticParams &  params,
const int  flat_size,
const T *  input_data,
const T  scalar_value,
T *  output_data 
)
inline

Definition at line 60 of file PALArithmeticOpCommon.h.

62{
63 T activation_min, activation_max;
64 getActivationParams(params, &activation_min, &activation_max);
65
66 for (int i = 0; i < flat_size; ++i)
67 output_data[i] =
68 std::min(std::max(func(input_data[i], scalar_value), activation_min), activation_max);
69}

References getActivationParams().

◆ AveragePool() [1/2]

void luci_interpreter_pal::AveragePool ( const PoolParams &  params,
const luci_interpreter::RuntimeShape &  input_shape,
const float *  input_data,
const luci_interpreter::RuntimeShape &  output_shape,
float *  output_data 
)
inline

Definition at line 28 of file PALAveragePool2DCommon.h.

31{
32 const int batches = input_shape.dims(0);
33 const int depth = output_shape.dims(3);
34 const int input_height = input_shape.dims(1);
35 const int input_width = input_shape.dims(2);
36 const int output_height = output_shape.dims(1);
37 const int output_width = output_shape.dims(2);
38 const int stride_height = params.stride_height;
39 const int stride_width = params.stride_width;
40 for (int batch = 0; batch < batches; ++batch)
41 {
42 for (int out_y = 0; out_y < output_height; ++out_y)
43 {
44 for (int out_x = 0; out_x < output_width; ++out_x)
45 {
46 for (int channel = 0; channel < depth; ++channel)
47 {
48 const int in_x_origin = (out_x * stride_width) - params.padding_values.width;
49 const int in_y_origin = (out_y * stride_height) - params.padding_values.height;
50 // Compute the boundaries of the filter region clamped so as to
51 // ensure that the filter window fits in the input array.
52 const int filter_x_start = std::max(0, -in_x_origin);
53 const int filter_x_end = std::min(params.filter_width, input_width - in_x_origin);
54 const int filter_y_start = std::max(0, -in_y_origin);
55 const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin);
56
57 float total = 0.f;
58 float filter_count = 0;
59
60 for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y)
61 {
62 for (int filter_x = filter_x_start; filter_x < filter_x_end; ++filter_x)
63 {
64 const int in_x = in_x_origin + filter_x;
65 const int in_y = in_y_origin + filter_y;
66
67 const int input_data_offset =
68 ((batch * input_shape.dims(1) + in_y) * input_shape.dims(2) + in_x) *
69 input_shape.dims(3) +
70 channel;
71
72 total += input_data[input_data_offset];
73 filter_count++;
74 }
75 }
76 const int output_data_offset =
77 ((batch * output_shape.dims(1) + out_y) * output_shape.dims(2) + out_x) *
 78 output_shape.dims(3) +
 79 channel;
80
81 assert(filter_count != 0);
82 const float average = total / filter_count;
83
84 output_data[output_data_offset] =
85 std::min(std::max(average, params.float_activation_min), params.float_activation_max);
86 }
87 }
88 }
89 }
90}
const luci_interpreter::RuntimeShape output_shape

References luci_interpreter::RuntimeShape::dims(), luci_interpreter_pal::PoolParams::filter_height, luci_interpreter_pal::PoolParams::filter_width, luci_interpreter_pal::PoolParams::float_activation_max, luci_interpreter_pal::PoolParams::float_activation_min, luci_interpreter_pal::PaddingValues::height, output_shape, luci_interpreter_pal::PoolParams::padding_values, luci_interpreter_pal::PoolParams::stride_height, luci_interpreter_pal::PoolParams::stride_width, and luci_interpreter_pal::PaddingValues::width.
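A hedged sketch: 2x2 average pooling over a 1x2x2x1 NHWC tensor. The PoolParams/PaddingValues members used below are the ones listed in the References above; the RuntimeShape(rank, dims) constructor is an assumption.

void avgpool_example()
{
  const int32_t in_dims[4] = {1, 2, 2, 1};
  const int32_t out_dims[4] = {1, 1, 1, 1};
  luci_interpreter::RuntimeShape input_shape(4, in_dims);   // assumed constructor
  luci_interpreter::RuntimeShape output_shape(4, out_dims); // assumed constructor

  luci_interpreter_pal::PoolParams params{};
  params.stride_height = 1;
  params.stride_width = 1;
  params.filter_height = 2;
  params.filter_width = 2;
  params.padding_values.height = 0;
  params.padding_values.width = 0;
  params.float_activation_min = -1e30f;
  params.float_activation_max = 1e30f;

  const float input[4] = {1.f, 2.f, 3.f, 4.f};
  float output[1] = {};
  luci_interpreter_pal::AveragePool(params, input_shape, input, output_shape, output);
  // output[0] == 2.5f
}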

◆ AveragePool() [2/2]

void luci_interpreter_pal::AveragePool ( const PoolParams &  params,
const luci_interpreter::RuntimeShape &  input_shape,
const uint8_t *  input_data,
const luci_interpreter::RuntimeShape &  output_shape,
uint8_t *  output_data,
luci_interpreter::DataType  data_type 
)
inline

Definition at line 27 of file PALAveragePool2D.h.

31{
32 cmsis_nn_dims input_dims;
33 cmsis_nn_dims output_dims;
34 cmsis_nn_pool_params pool_params;
35 cmsis_nn_dims filter_dims;
36 cmsis_nn_context ctx;
37
38 const int depth = input_shape.dims(3);
39 const int output_width = output_shape.dims(2);
40
41 input_dims.n = 1;
42 input_dims.h = input_shape.dims(1);
43 input_dims.w = input_shape.dims(2);
44 input_dims.c = depth;
45
46 output_dims.n = 1;
47 output_dims.h = output_shape.dims(1);
48 output_dims.w = output_width;
49 output_dims.c = depth;
50
51 pool_params.stride.h = params.stride_height;
52 pool_params.stride.w = params.stride_width;
53 pool_params.padding.h = params.padding_values.height;
54 pool_params.padding.w = params.padding_values.width;
55 pool_params.activation.min = params.quantized_activation_min;
56 pool_params.activation.max = params.quantized_activation_max;
57
58 filter_dims.n = 1;
59 filter_dims.h = params.filter_height;
60 filter_dims.w = params.filter_width;
61 filter_dims.c = 1;
62
63 const int32_t buffer_size = data_type == luci_interpreter::DataType::S16
64 ? arm_avgpool_s16_get_buffer_size(output_width, depth)
65 : arm_avgpool_s8_get_buffer_size(output_width, depth);
66 int8_t *buffer = nullptr;
67 if (buffer_size > 0)
68 {
69 buffer = new int8_t[buffer_size];
70 }
71
72 ctx.buf = buffer;
73 ctx.size = buffer_size;
74
75 if (data_type == luci_interpreter::DataType::S8)
76 {
77 arm_avgpool_s8(&ctx, &pool_params, &input_dims,
78 luci_interpreter::kernels::getTensorData<int8_t>(input_data), &filter_dims,
79 &output_dims, luci_interpreter::kernels::getTensorData<int8_t>(output_data));
80 }
81 else
82 {
83 arm_avgpool_s16(&ctx, &pool_params, &input_dims,
84 luci_interpreter::kernels::getTensorData<int16_t>(input_data), &filter_dims,
85 &output_dims, luci_interpreter::kernels::getTensorData<int16_t>(output_data));
86 }
87
88 if (buffer_size > 0)
89 delete[] buffer;
90}

References luci_interpreter::RuntimeShape::dims(), luci_interpreter_pal::PoolParams::filter_height, luci_interpreter_pal::PoolParams::filter_width, luci_interpreter_pal::PaddingValues::height, output_shape, luci_interpreter_pal::PoolParams::padding_values, luci_interpreter_pal::PoolParams::quantized_activation_max, luci_interpreter_pal::PoolParams::quantized_activation_min, luci_interpreter_pal::PoolParams::stride_height, luci_interpreter_pal::PoolParams::stride_width, and luci_interpreter_pal::PaddingValues::width.

◆ AveragePool< int8_t >() [1/3]

template<>
void luci_interpreter_pal::AveragePool< int8_t > ( const tflite::PoolParams &  params,
const tflite::RuntimeShape &  input_shape,
const int8_t *  input_data,
const tflite::RuntimeShape &  output_shape,
int8_t *  output_data,
const tflite::RuntimeShape &  scratchpad_shape,
int8_t *  scratchpad_data 
)
inline

Definition at line 47 of file PALAveragePool2d.h.

52{
53 assert(input_shape.DimensionsCount() == 4);
54 assert(output_shape.DimensionsCount() == 4);
55 assert(scratchpad_data != nullptr);
56
57 const int32_t batches = tflite::MatchingDim(input_shape, 0, output_shape, 0);
58 assert(batches == 1);
59
60 const int depth = tflite::MatchingDim(input_shape, 3, output_shape, 3);
61
62 cmsis_nn_dims input_dims;
63 input_dims.n = 1;
64 input_dims.h = input_shape.Dims(1);
65 input_dims.w = input_shape.Dims(2);
66 input_dims.c = depth;
67
68 cmsis_nn_dims output_dims;
69 output_dims.n = 1;
70 output_dims.h = output_shape.Dims(1);
71 output_dims.w = output_shape.Dims(2);
72 output_dims.c = depth;
73
74 cmsis_nn_pool_params pool_params;
75 pool_params.stride.h = params.stride_height;
76 pool_params.stride.w = params.stride_width;
77 pool_params.padding.h = params.padding_values.height;
78 pool_params.padding.w = params.padding_values.width;
79 pool_params.activation.min = params.quantized_activation_min;
80 pool_params.activation.max = params.quantized_activation_max;
81
82 cmsis_nn_dims filter_dims;
83 filter_dims.n = 1;
84 filter_dims.h = params.filter_height;
85 filter_dims.w = params.filter_width;
86 filter_dims.c = 1;
87
88 cmsis_nn_context ctx;
89 ctx.buf = scratchpad_data;
90 ctx.size = scratchpad_shape.Dims(0);
91 auto res = arm_avgpool_s8(&ctx, &pool_params, &input_dims, input_data, &filter_dims, &output_dims,
92 output_data);
93 assert(res == ARM_MATH_SUCCESS);
94}

References output_shape.

◆ AveragePool< int8_t >() [2/3]

template<>
void luci_interpreter_pal::AveragePool< int8_t > ( const tflite::PoolParams &  params,
const tflite::RuntimeShape &  input_shape,
const int8_t *  input_data,
const tflite::RuntimeShape &  output_shape,
int8_t *  output_data,
const tflite::RuntimeShape &  scratchpad_shape,
int8_t *  scratchpad_data 
)
inline

Definition at line 45 of file PALAveragePool2d.h.

50{
51 (void)scratchpad_shape;
52 (void)scratchpad_data;
53
54 tflite::reference_integer_ops::AveragePool(params, input_shape, input_data, output_shape,
55 output_data);
56}

References output_shape.

◆ AveragePool< int8_t >() [3/3]

template<>
void luci_interpreter_pal::AveragePool< int8_t > ( const tflite::PoolParams &  params,
const tflite::RuntimeShape &  input_shape,
const int8_t *  input_data,
const tflite::RuntimeShape &  output_shape,
int8_t *  output_data,
const tflite::RuntimeShape &  scratchpad_shape,
int8_t *  scratchpad_data 
)
inline

Definition at line 45 of file PALAveragePool2d.h.

50{
51 (void)scratchpad_shape;
52 (void)scratchpad_data;
53
54 tflite::reference_integer_ops::AveragePool(params, input_shape, input_data, output_shape,
55 output_data);
56}

References output_shape.

◆ BatchMatMul()

void luci_interpreter_pal::BatchMatMul ( const tflite::RuntimeShape &  lhs_shape,
const float *  lhs_data,
const tflite::RuntimeShape &  rhs_shape,
const float *  rhs_data,
const tflite::RuntimeShape &  output_shape,
float *  output_data 
)
inline

Definition at line 24 of file PALBatchMatMul.h.

27{
28 tflite::reference_ops::BatchMatMul(lhs_shape, lhs_data, rhs_shape, rhs_data, output_shape,
29 output_data);
30}

References output_shape.

Referenced by luci_interpreter::kernels::BatchMatMul::execute().

◆ BatchToSpaceND()

template<typename T >
void luci_interpreter_pal::BatchToSpaceND ( const luci_interpreter::RuntimeShape &  unextended_input1_shape,
const T *  input1_data,
const luci_interpreter::RuntimeShape &  unextended_input2_shape,
const int32_t *  block_shape_data,
const luci_interpreter::RuntimeShape &  unextended_input3_shape,
const int32_t *  crops_data,
const luci_interpreter::RuntimeShape &  unextended_output_shape,
T *  output_data 
)
inline

Definition at line 46 of file PALBatchToSpaceND.h.

51{
52 const luci_interpreter::RuntimeShape input1_shape =
53 extendShapeBatchToSpace(unextended_input1_shape);
 54 const luci_interpreter::RuntimeShape output_shape =
 55 extendShapeBatchToSpace(unextended_output_shape);
56
57 const int output_width = output_shape.dims(2);
58 const int output_height = output_shape.dims(1);
59 const int output_batch_size = output_shape.dims(0);
60
61 const int depth = input1_shape.dims(3);
62 const int input_width = input1_shape.dims(2);
63 const int input_height = input1_shape.dims(1);
64 const int input_batch_size = input1_shape.dims(0);
65
66 const int block_shape_height = block_shape_data[0];
67 const int block_shape_width =
68 unextended_input1_shape.dimensionsCount() == 4 ? block_shape_data[1] : 1;
69 const int crops_top = crops_data[0];
70 const int crops_left = unextended_input1_shape.dimensionsCount() == 4 ? crops_data[2] : 0;
71 for (int in_batch = 0; in_batch < input_batch_size; ++in_batch)
72 {
73 const int out_batch = in_batch % output_batch_size;
74 const int spatial_offset = in_batch / output_batch_size;
75 for (int in_h = 0; in_h < input_height; ++in_h)
76 {
77 const int out_h = in_h * block_shape_height + spatial_offset / block_shape_width - crops_top;
78 if (out_h < 0 || out_h >= output_height)
79 {
80 continue;
81 }
82 for (int in_w = 0; in_w < input_width; ++in_w)
83 {
84 const int out_w =
85 in_w * block_shape_width + spatial_offset % block_shape_width - crops_left;
86
87 if (out_w < 0 || out_w >= output_width)
88 {
89 continue;
90 }
91 T *out = output_data + offset(output_shape.dimsData(), out_batch, out_h, out_w, 0);
92 const T *in = input1_data + offset(input1_shape.dimsData(), in_batch, in_h, in_w, 0);
93 memcpy(out, in, depth * sizeof(T));
94 }
95 }
96 }
97}
int offset(const int32_t *dims_data, int i0, int i1, int i2, int i3)

References luci_interpreter::RuntimeShape::dimensionsCount(), luci_interpreter::RuntimeShape::dims(), luci_interpreter::RuntimeShape::dimsData(), offset(), and output_shape.

◆ BinaryOp()

template<typename T , typename Fn >
void luci_interpreter_pal::BinaryOp ( const int  flat_size,
const T *  input1_data,
const T *  input2_data,
T *  output_data 
)
inline

Definition at line 56 of file PALBinaryOpCommon.h.

58{
59 Fn func;
60 for (int i = 0; i < flat_size; ++i)
61 {
62 output_data[i] = func(input1_data[i], input2_data[i]);
63 }
64}
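Usage sketch: BinaryOp with a minimal functor (element-wise maximum). The functor name is hypothetical; Fn only needs operator()(T, T).

struct MaxOfTwoFn
{
  float operator()(float a, float b) const { return a > b ? a : b; }
};

void elementwise_max_example(const float *a, const float *b, float *out, int n)
{
  luci_interpreter_pal::BinaryOp<float, MaxOfTwoFn>(n, a, b, out);
}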

◆ BroadcastAdd4DSlow()

template<typename T >
void luci_interpreter_pal::BroadcastAdd4DSlow ( const ArithmeticParams &  params,
const luci_interpreter::RuntimeShape &  input1_shape,
const T *  input1_data,
const luci_interpreter::RuntimeShape &  input2_shape,
const T *  input2_data,
const luci_interpreter::RuntimeShape &  output_shape,
T *  output_data 
)
inline

Definition at line 36 of file PALAddCommon.h.

40{
41 BroadcastArithmeticOp4DSlow<T, AddFn<T>>(params, input1_shape, input1_data, input2_shape,
42 input2_data, output_shape, output_data);
43}

References output_shape.
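A hedged sketch: broadcast-adding a 1x1x1x2 "bias" over a 1x1x2x2 tensor. Both the RuntimeShape(rank, dims) constructor and the float_activation_* members are assumptions; see Tensor.h and Params.h for the exact definitions.

#include <limits>

void broadcast_add_example()
{
  const int32_t a_dims[4] = {1, 1, 2, 2};
  const int32_t b_dims[4] = {1, 1, 1, 2};
  luci_interpreter::RuntimeShape a_shape(4, a_dims);   // assumed constructor
  luci_interpreter::RuntimeShape b_shape(4, b_dims);   // assumed constructor
  luci_interpreter::RuntimeShape out_shape(4, a_dims); // assumed constructor

  luci_interpreter_pal::ArithmeticParams params{};
  params.float_activation_min = std::numeric_limits<float>::lowest(); // assumed field
  params.float_activation_max = std::numeric_limits<float>::max();    // assumed field

  const float a[4] = {1.f, 2.f, 3.f, 4.f};
  const float b[2] = {10.f, 20.f};
  float out[4] = {};
  luci_interpreter_pal::BroadcastAdd4DSlow<float>(params, a_shape, a, b_shape, b, out_shape, out);
  // out == {11, 22, 13, 24}
}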

◆ BroadcastArithmeticOp4DSlow()

template<typename T , typename Fn >
void luci_interpreter_pal::BroadcastArithmeticOp4DSlow ( const ArithmeticParams &  params,
const luci_interpreter::RuntimeShape &  input1_shape,
const T *  input1_data,
const luci_interpreter::RuntimeShape &  input2_shape,
const T *  input2_data,
const luci_interpreter::RuntimeShape &  output_shape,
T *  output_data 
)
inline

Definition at line 72 of file PALArithmeticOpCommon.h.

 76{
 77 NdArrayDesc<4> desc1;
 78 NdArrayDesc<4> desc2;
 79 NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2);
 80 const luci_interpreter::RuntimeShape extended_output_shape =
 81 luci_interpreter::RuntimeShape::extendedShape(4, output_shape);
83 T activation_min, activation_max;
84 getActivationParams(params, &activation_min, &activation_max);
85
86 // In Tensorflow, the dimensions are canonically named (batch_number, row,
87 // col, channel), with extents (batches, height, width, depth), with the
88 // trailing dimension changing most rapidly (channels has the smallest stride,
89 // typically 1 element).
90 //
91 // In generated C code, we store arrays with the dimensions reversed. The
92 // first dimension has smallest stride.
93 //
94 // We name our variables by their Tensorflow convention, but generate C code
95 // nesting loops such that the innermost loop has the smallest stride for the
96 // best cache behavior.
97 Fn func;
98 for (int b = 0; b < extended_output_shape.dims(0); ++b)
99 {
100 for (int y = 0; y < extended_output_shape.dims(1); ++y)
101 {
102 for (int x = 0; x < extended_output_shape.dims(2); ++x)
103 {
104 for (int c = 0; c < extended_output_shape.dims(3); ++c)
105 {
106 const int output_data_offset =
107 ((b * extended_output_shape.dims(1) + y) * extended_output_shape.dims(2) + x) *
108 extended_output_shape.dims(3) +
109 c;
110
111 output_data[output_data_offset] =
112 std::min(std::max(func(input1_data[subscriptToIndex(desc1, b, y, x, c)],
113 input2_data[subscriptToIndex(desc2, b, y, x, c)]),
114 activation_min),
115 activation_max);
116 }
117 }
118 }
119 }
120}
void NdArrayDescsForElementwiseBroadcast(const Dims< N > &input0_dims, const Dims< N > &input1_dims, NdArrayDesc< N > *desc0_out, NdArrayDesc< N > *desc1_out)
Definition NDArray.h:89
static RuntimeShape extendedShape(int new_shape_size, const RuntimeShape &shape)
Definition Tensor.h:95
NdArrayDesc< 4 > desc1
NdArrayDesc< 4 > desc2
int subscriptToIndex(const NdArrayDesc< 4 > &desc, int i0, int i1, int i2, int i3)

References desc1, desc2, luci_interpreter::RuntimeShape::dims(), luci_interpreter::RuntimeShape::extendedShape(), getActivationParams(), NdArrayDescsForElementwiseBroadcast(), output_shape, and subscriptToIndex().

◆ BroadcastBinaryOp4DSlow()

template<typename T , typename Fn >
void luci_interpreter_pal::BroadcastBinaryOp4DSlow ( const luci_interpreter::RuntimeShape &  input1_shape,
const float *  input1_data,
const luci_interpreter::RuntimeShape &  input2_shape,
const float *  input2_data,
const luci_interpreter::RuntimeShape &  output_shape,
float *  output_data 
)
inline

Definition at line 67 of file PALBinaryOpCommon.h.

 73{
 74 NdArrayDesc<4> desc1;
 75 NdArrayDesc<4> desc2;
 76 NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2);
 77
 78 const luci_interpreter::RuntimeShape extended_output_shape =
 79 luci_interpreter::RuntimeShape::extendedShape(4, output_shape);
81 // In Tensorflow, the dimensions are canonically named (batch_number, row,
82 // col, channel), with extents (batches, height, width, depth), with the
83 // trailing dimension changing most rapidly (channels has the smallest stride,
84 // typically 1 element).
85 //
86 // In generated C code, we store arrays with the dimensions reversed. The
87 // first dimension has smallest stride.
88 //
89 // We name our variables by their Tensorflow convention, but generate C code
90 // nesting loops such that the innermost loop has the smallest stride for the
91 // best cache behavior.
92
93 Fn func;
94 for (int b = 0; b < extended_output_shape.dims(0); ++b)
95 {
96 for (int y = 0; y < extended_output_shape.dims(1); ++y)
97 {
98 for (int x = 0; x < extended_output_shape.dims(2); ++x)
99 {
100 for (int c = 0; c < extended_output_shape.dims(3); ++c)
101 {
102 const int output_data_offset =
103 ((b * extended_output_shape.dims(1) + y) * extended_output_shape.dims(2) + x) *
104 extended_output_shape.dims(3) +
105 c;
106
107 output_data[output_data_offset] = func(input1_data[subscriptToIndex(desc1, b, y, x, c)],
108 input2_data[subscriptToIndex(desc2, b, y, x, c)]);
109 }
110 }
111 }
112 }
113}

References desc1, desc2, luci_interpreter::RuntimeShape::dims(), luci_interpreter::RuntimeShape::extendedShape(), NdArrayDescsForElementwiseBroadcast(), output_shape, and subscriptToIndex().

◆ BroadcastComparison4DSlowNoScaling()

template<typename T >
void luci_interpreter_pal::BroadcastComparison4DSlowNoScaling ( const ComparisonParams &  op_params,
const luci_interpreter::RuntimeShape &  unextended_input1_shape,
const T *  input1_data,
const luci_interpreter::RuntimeShape &  unextended_input2_shape,
const T *  input2_data,
const luci_interpreter::RuntimeShape &  unextended_output_shape,
bool *  output_data,
bool   F(T, T) 
)
inline

Definition at line 144 of file PALComparisons.h.

149{
150 const BroadcastComparison4DSlowCommon dims = BroadcastComparison4DSlowPreprocess(
151 unextended_input1_shape, unextended_input2_shape, unextended_output_shape);
152
153 for (int b = 0; b < dims.output_shape.dims(0); ++b)
154 {
155 for (int y = 0; y < dims.output_shape.dims(1); ++y)
156 {
157 for (int x = 0; x < dims.output_shape.dims(2); ++x)
158 {
159 for (int c = 0; c < dims.output_shape.dims(3); ++c)
160 {
161 const int output_data_offset =
162 ((b * dims.output_shape.dims(1) + y) * dims.output_shape.dims(2) + x) *
163 dims.output_shape.dims(3) +
164 c;
165 output_data[output_data_offset] =
166 F(input1_data[subscriptToIndex(dims.desc1, b, y, x, c)],
167 input2_data[subscriptToIndex(dims.desc2, b, y, x, c)]);
168 }
169 }
170 }
171 }
172}

References subscriptToIndex().
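
A sketch of how the trailing bool F(T, T) parameter is used (not from the source): any free function with that signature can be passed; op_params is not read on the no-scaling path, so a value-initialized ComparisonParams suffices. The (rank, dims) RuntimeShape constructor and the include of the declaring PAL header are assumptions.

// Hypothetical usage sketch.
#include <cstdint>
static bool less_than(float lhs, float rhs) { return lhs < rhs; }
void example_broadcast_less()
{
  const int32_t lhs_dims[4] = {1, 1, 2, 2};
  const int32_t rhs_dims[4] = {1, 1, 1, 1}; // scalar, broadcast against every element
  luci_interpreter::RuntimeShape lhs_shape(4, lhs_dims), rhs_shape(4, rhs_dims),
    out_shape(4, lhs_dims);

  const float lhs[4] = {0.f, 1.f, 2.f, 3.f};
  const float rhs[1] = {1.5f};
  bool out[4]; // expected: true, true, false, false

  luci_interpreter_pal::ComparisonParams op_params{}; // unused on this path
  luci_interpreter_pal::BroadcastComparison4DSlowNoScaling<float>(
    op_params, lhs_shape, lhs, rhs_shape, rhs, out_shape, out, less_than);
}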

◆ BroadcastComparison4DSlowWithScaling()

template<typename T >
void luci_interpreter_pal::BroadcastComparison4DSlowWithScaling ( const ComparisonParams op_params,
const luci_interpreter::RuntimeShape unextended_input1_shape,
const T *  input1_data,
const luci_interpreter::RuntimeShape unextended_input2_shape,
const T *  input2_data,
const luci_interpreter::RuntimeShape unextended_output_shape,
bool *  output_data,
bool  F(T, T) 
)
inline

Definition at line 69 of file PALComparisons.h.

74{
75 const BroadcastComparison4DSlowCommon dims = BroadcastComparison4DSlowPreprocess(
76 unextended_input1_shape, unextended_input2_shape, unextended_output_shape);
77
78 int left_shift = op_params.left_shift;
79 int32_t input1_offset = op_params.input1_offset;
80 int32_t input1_multiplier = op_params.input1_multiplier;
81 int input1_shift = op_params.input1_shift;
82 int32_t input2_offset = op_params.input2_offset;
83 int32_t input2_multiplier = op_params.input2_multiplier;
84 int input2_shift = op_params.input2_shift;
85
86 for (int b = 0; b < dims.output_shape.dims(0); ++b)
87 {
88 for (int y = 0; y < dims.output_shape.dims(1); ++y)
89 {
90 for (int x = 0; x < dims.output_shape.dims(2); ++x)
91 {
92 for (int c = 0; c < dims.output_shape.dims(3); ++c)
93 {
94 const int32_t input1_val =
95 input1_offset + input1_data[subscriptToIndex(dims.desc1, b, y, x, c)];
96 const int32_t input2_val =
97 input2_offset + input2_data[subscriptToIndex(dims.desc2, b, y, x, c)];
98 const int32_t shifted_input1_val = input1_val * (1 << left_shift);
99 const int32_t shifted_input2_val = input2_val * (1 << left_shift);
100 const int32_t scaled_input1_val = multiplyByQuantizedMultiplierSmallerThanOneExp(
101 shifted_input1_val, input1_multiplier, input1_shift);
102 const int32_t scaled_input2_val = multiplyByQuantizedMultiplierSmallerThanOneExp(
103 shifted_input2_val, input2_multiplier, input2_shift);
104
105 const int output_data_offset =
106 ((b * dims.output_shape.dims(1) + y) * dims.output_shape.dims(2) + x) *
107 dims.output_shape.dims(3) +
108 c;
109 output_data[output_data_offset] = F(scaled_input1_val, scaled_input2_val);
110 }
111 }
112 }
113 }
114}
int32_t multiplyByQuantizedMultiplierSmallerThanOneExp(int32_t x, int32_t quantized_multiplier, int left_shift)
Definition PALUtils.h:85

References luci_interpreter_pal::ComparisonParams::input1_multiplier, luci_interpreter_pal::ComparisonParams::input1_offset, luci_interpreter_pal::ComparisonParams::input1_shift, luci_interpreter_pal::ComparisonParams::input2_multiplier, luci_interpreter_pal::ComparisonParams::input2_offset, luci_interpreter_pal::ComparisonParams::input2_shift, luci_interpreter_pal::ComparisonParams::left_shift, multiplyByQuantizedMultiplierSmallerThanOneExp(), and subscriptToIndex().

◆ BroadcastDiv4DSlow()

template<typename T >
void luci_interpreter_pal::BroadcastDiv4DSlow ( const ArithmeticParams params,
const luci_interpreter::RuntimeShape input1_shape,
const T *  input1_data,
const luci_interpreter::RuntimeShape input2_shape,
const T *  input2_data,
const luci_interpreter::RuntimeShape output_shape,
T *  output_data 
)
inline

Definition at line 41 of file PALDiv.h.

45{
46 BroadcastArithmeticOp4DSlow<T, DivFn<T>>(params, input1_shape, input1_data, input2_shape,
47 input2_data, output_shape, output_data);
48}

References output_shape.

◆ BroadcastFloorDiv4DSlow()

void luci_interpreter_pal::BroadcastFloorDiv4DSlow ( const luci_interpreter::RuntimeShape input1_shape,
const float *  input1_data,
const luci_interpreter::RuntimeShape input2_shape,
const float *  input2_data,
const luci_interpreter::RuntimeShape output_shape,
float *  output_data 
)
inline

Definition at line 33 of file PALFloorDivCommon.h.

38{
39 BroadcastBinaryOp4DSlow<float, FloorDivFn<float>>(input1_shape, input1_data, input2_shape,
40 input2_data, output_shape, output_data);
41}

References output_shape.

Referenced by luci_interpreter::execute_kernel_CircleFloorDiv().
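
A small worked example, under the same assumptions as the sketches above (the (rank, dims) RuntimeShape constructor, the declaring PAL header included, and FloorDivFn computing std::floor(lhs / rhs)):

// Hypothetical usage sketch: divide a [1,1,1,4] tensor by a broadcast scalar.
#include <cstdint>
void example_broadcast_floor_div()
{
  const int32_t num_dims[4] = {1, 1, 1, 4};
  const int32_t den_dims[4] = {1, 1, 1, 1};
  luci_interpreter::RuntimeShape num_shape(4, num_dims), den_shape(4, den_dims),
    out_shape(4, num_dims);

  const float num[4] = {7.f, -7.f, 9.f, 10.f};
  const float den[1] = {4.f};
  float out[4]; // expected: 1, -2, 2, 2 (floor of 1.75, -1.75, 2.25, 2.5)

  luci_interpreter_pal::BroadcastFloorDiv4DSlow(num_shape, num, den_shape, den, out_shape, out);
}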

◆ BroadcastFloorMod4DSlow()

void luci_interpreter_pal::BroadcastFloorMod4DSlow ( const luci_interpreter::RuntimeShape input1_shape,
const float *  input1_data,
const luci_interpreter::RuntimeShape input2_shape,
const float *  input2_data,
const luci_interpreter::RuntimeShape output_shape,
float *  output_data 
)
inline

Definition at line 33 of file PALFloorModCommon.h.

38{
39 BroadcastBinaryOp4DSlow<float, FloorModFn<float>>(input1_shape, input1_data, input2_shape,
40 input2_data, output_shape, output_data);
41}

References output_shape.

Referenced by luci_interpreter::execute_kernel_CircleFloorMod().

◆ BroadcastImpl()

template<int N>
void luci_interpreter_pal::BroadcastImpl ( const NdArrayDesc< N > &  input_desc,
const uint8_t *  input_data,
const NdArrayDesc< N > &  output_desc,
uint8_t *  output_data,
int  indexes[N],
int  dim,
const int  last_broadcasting_dim,
const uint32_t  type_size 
)

Definition at line 30 of file PALBroadcastTo.h.

33{
34 // Copy data from input to output.
35 if (dim == last_broadcasting_dim)
36 {
37 int copy_size = output_desc.strides[dim] * type_size;
38 const uint8_t *data_src = input_data + subscriptToIndex(input_desc, indexes) * type_size;
39 uint8_t *data_dst = output_data + subscriptToIndex(output_desc, indexes) * type_size;
40 for (int i = 0; i < output_desc.extents[dim]; ++i, data_dst += copy_size)
41 {
42 memcpy(data_dst, data_src, copy_size);
43 }
44 return;
45 }
46
47 // Recursive call to find the next broadcasting.
48 for (indexes[dim] = 0; indexes[dim] < input_desc.extents[dim]; ++indexes[dim])
49 {
50 BroadcastImpl<N>(input_desc, input_data, output_desc, output_data, indexes, dim + 1,
51 last_broadcasting_dim, type_size);
52 }
53
54 // Duplicate data in output tensor.
55 indexes[dim] = 0;
56 if (input_desc.extents[dim] != output_desc.extents[dim])
57 {
58 int copy_size = output_desc.strides[dim] * type_size;
59 uint8_t *data_src = output_data + subscriptToIndex(output_desc, indexes) * type_size;
60 uint8_t *data_dst = data_src + copy_size;
61 for (int i = 1; i < output_desc.extents[dim]; ++i, data_dst += copy_size)
62 {
63 memcpy(data_dst, data_src, copy_size);
64 }
65 }
66}

References luci_interpreter_pal::NdArrayDesc< N >::extents, luci_interpreter_pal::NdArrayDesc< N >::strides, and subscriptToIndex().
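
An illustration of the memcpy-based copy (not from the source): broadcasting an input of shape [2,1,3] to an output of [2,4,3] with N = 3 gives row-major strides {3,3,1} for the input and {12,3,1} for the output; the only differing extent is at dim 1, so last_broadcasting_dim == 1 and each input row of 3 elements is memcpy'd output_desc.extents[1] == 4 times. The equivalent flat-loop form:

#include <cstring>
void broadcast_rows_by_memcpy(const float *in /* 2*1*3 */, float *out /* 2*4*3 */)
{
  const int copy_elems = 3;   // innermost block below the broadcast dimension
  for (int b = 0; b < 2; ++b) // outer, non-broadcast dimension
    for (int r = 0; r < 4; ++r) // broadcast dimension: duplicate the row
      std::memcpy(out + (b * 4 + r) * copy_elems, in + b * copy_elems,
                  copy_elems * sizeof(float));
}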

◆ BroadcastMaximum4DSlow()

void luci_interpreter_pal::BroadcastMaximum4DSlow ( const luci_interpreter::RuntimeShape input1_shape,
const float *  input1_data,
const luci_interpreter::RuntimeShape input2_shape,
const float *  input2_data,
const luci_interpreter::RuntimeShape output_shape,
float *  output_data 
)
inline

Definition at line 32 of file PALMaximumCommon.h.

35{
36 BroadcastBinaryOp4DSlow<float, MaximumFn<float>>(input1_shape, input1_data, input2_shape,
37 input2_data, output_shape, output_data);
38}

References output_shape.

Referenced by luci_interpreter::execute_kernel_CircleMaximum().

◆ BroadcastMinimum4DSlow()

template<typename T >
void luci_interpreter_pal::BroadcastMinimum4DSlow ( const luci_interpreter::RuntimeShape input1_shape,
const T *  input1_data,
const luci_interpreter::RuntimeShape input2_shape,
const T *  input2_data,
const luci_interpreter::RuntimeShape output_shape,
T *  output_data 
)
inline

Definition at line 33 of file PALMinimumCommon.h.

36{
37 BroadcastBinaryOp4DSlow<float, MinimumFn<float>>(input1_shape, input1_data, input2_shape,
38 input2_data, output_shape, output_data);
39}

References output_shape.

◆ BroadcastMul4DSlow()

template<typename T >
void luci_interpreter_pal::BroadcastMul4DSlow ( const ArithmeticParams params,
const luci_interpreter::RuntimeShape input1_shape,
const T *  input1_data,
const luci_interpreter::RuntimeShape input2_shape,
const T *  input2_data,
const luci_interpreter::RuntimeShape output_shape,
T *  output_data 
)
inline

Definition at line 41 of file PALMulCommon.h.

45{
46 BroadcastArithmeticOp4DSlow<T, MulFn<T>>(params, input1_shape, input1_data, input2_shape,
47 input2_data, output_shape, output_data);
48}

References BroadcastMul4DSlow(), and output_shape.

Referenced by BroadcastMul4DSlow().

◆ BroadcastPrelu4DSlowFloat()

void luci_interpreter_pal::BroadcastPrelu4DSlowFloat ( const luci_interpreter::RuntimeShape unextended_input1_shape,
const float *  input1_data,
const luci_interpreter::RuntimeShape unextended_input2_shape,
const float *  input2_data,
const luci_interpreter::RuntimeShape unextended_output_shape,
float *  output_data 
)

Definition at line 28 of file PALPreluCommon.h.

34{
35 const luci_interpreter::RuntimeShape output_shape =
36 luci_interpreter::RuntimeShape::extendedShape(4, unextended_output_shape);
37
38 NdArrayDesc<4> desc1;
39 NdArrayDesc<4> desc2;
40 NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, unextended_input2_shape, &desc1,
41 &desc2);
42
43 for (int b = 0; b < output_shape.dims(0); ++b)
44 {
45 for (int y = 0; y < output_shape.dims(1); ++y)
46 {
47 for (int x = 0; x < output_shape.dims(2); ++x)
48 {
49 for (int c = 0; c < output_shape.dims(3); ++c)
50 {
51 auto out_idx = offset(output_shape.dimsData(), b, y, x, c);
52 auto in1_idx = subscriptToIndex(desc1, b, y, x, c);
53 auto in2_idx = subscriptToIndex(desc2, b, y, x, c);
54 auto in1_val = input1_data[in1_idx];
55 auto in2_val = input2_data[in2_idx];
56 output_data[out_idx] = in1_val >= 0.0f ? in1_val : in1_val * in2_val;
57 }
58 }
59 }
60 }
61}

References desc1, desc2, luci_interpreter::RuntimeShape::dims(), luci_interpreter::RuntimeShape::dimsData(), luci_interpreter::RuntimeShape::extendedShape(), NdArrayDescsForElementwiseBroadcast(), offset(), output_shape, and subscriptToIndex().

Referenced by luci_interpreter::execute_kernel_CirclePRelu().
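
A small worked example (the (rank, dims) RuntimeShape constructor and the include of the declaring PAL header are assumptions): the per-channel alpha tensor is broadcast over the spatial positions and scales only the negative inputs.

// Hypothetical usage sketch.
#include <cstdint>
void example_broadcast_prelu()
{
  const int32_t in_dims[4] = {1, 1, 2, 2};    // 2 positions x 2 channels
  const int32_t alpha_dims[4] = {1, 1, 1, 2}; // one alpha per channel
  luci_interpreter::RuntimeShape in_shape(4, in_dims), alpha_shape(4, alpha_dims),
    out_shape(4, in_dims);

  const float in[4] = {2.f, -2.f, -4.f, 1.f};
  const float alpha[2] = {0.1f, 0.5f};
  float out[4]; // expected: 2, -1, -0.4, 1

  luci_interpreter_pal::BroadcastPrelu4DSlowFloat(in_shape, in, alpha_shape, alpha, out_shape,
                                                  out);
}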

◆ BroadcastSub4DSlow()

template<typename T >
void luci_interpreter_pal::BroadcastSub4DSlow ( const ArithmeticParams params,
const luci_interpreter::RuntimeShape input1_shape,
const T *  input1_data,
const luci_interpreter::RuntimeShape input2_shape,
const T *  input2_data,
const luci_interpreter::RuntimeShape output_shape,
T *  output_data 
)
inline

Definition at line 33 of file PALSub.h.

37{
38 BroadcastArithmeticOp4DSlow<T, SubFn<T>>(params, input1_shape, input1_data, input2_shape,
39 input2_data, output_shape, output_data);
40}

References output_shape.

◆ BroadcastTISO4DSlow()

template<typename T >
void luci_interpreter_pal::BroadcastTISO4DSlow ( const luci_interpreter::RuntimeShape input1_shape,
const T *  input1_data,
const luci_interpreter::RuntimeShape input2_shape,
const T *  input2_data,
const luci_interpreter::RuntimeShape output_shape,
T *  output_data,
std::function< const T &(const T &, const T &)>  func 
)
inline

Definition at line 27 of file Broadcast.h.

31{
32 NdArrayDesc<4> desc1;
33 NdArrayDesc<4> desc2;
34 NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2);
35
36 const luci_interpreter::RuntimeShape extended_output_shape =
37 luci_interpreter::RuntimeShape::extendedShape(4, output_shape);
38
39 // In Tensorflow, the dimensions are canonically named (batch_number, row,
40 // col, channel), with extents (batches, height, width, depth), with the
41 // trailing dimension changing most rapidly (channels has the smallest stride,
42 // typically 1 element).
43 //
44 // In generated C code, we store arrays with the dimensions reversed. The
45 // first dimension has smallest stride.
46 //
47 // We name our variables by their Tensorflow convention, but generate C code
48 // nesting loops such that the innermost loop has the smallest stride for the
49 // best cache behavior.
50
51 for (int b = 0; b < extended_output_shape.dims(0); ++b)
52 {
53 for (int y = 0; y < extended_output_shape.dims(1); ++y)
54 {
55 for (int x = 0; x < extended_output_shape.dims(2); ++x)
56 {
57 for (int c = 0; c < extended_output_shape.dims(3); ++c)
58 {
59 const int output_data_offset =
60 ((b * extended_output_shape.dims(1) + y) * extended_output_shape.dims(2) + x) *
61 extended_output_shape.dims(3) +
62 c;
63
64 output_data[output_data_offset] = func(input1_data[subscriptToIndex(desc1, b, y, x, c)],
65 input2_data[subscriptToIndex(desc2, b, y, x, c)]);
66 }
67 }
68 }
69 }
70}

References desc1, desc2, luci_interpreter::RuntimeShape::dims(), luci_interpreter::RuntimeShape::extendedShape(), NdArrayDescsForElementwiseBroadcast(), output_shape, and subscriptToIndex().
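
A usage sketch (assumptions as above for the RuntimeShape constructor and the include): because the std::function returns a const reference, a std::min-style selector that returns one of its arguments fits the signature directly.

// Hypothetical usage sketch.
#include <cstdint>
void example_broadcast_tiso()
{
  const int32_t dims[4] = {1, 1, 1, 3};
  luci_interpreter::RuntimeShape shape(4, dims);

  const float a[3] = {1.f, 5.f, 2.f};
  const float b[3] = {4.f, 3.f, 2.f};
  float out[3]; // expected: 1, 3, 2

  luci_interpreter_pal::BroadcastTISO4DSlow<float>(
    shape, a, shape, b, shape, out,
    [](const float &x, const float &y) -> const float & { return y < x ? y : x; });
}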

◆ BroadcastTo()

template<int N>
void luci_interpreter_pal::BroadcastTo ( const luci_interpreter::RuntimeShape unextended_input_shape,
const uint8_t *  input_data,
const luci_interpreter::RuntimeShape unextended_output_shape,
uint8_t *  output_data,
luci_interpreter::DataType  data_type 
)
inline

Definition at line 69 of file PALBroadcastTo.h.

73{
74 NdArrayDesc<N> input_desc;
75 NdArrayDesc<N> output_desc;
76 copyDimsToDesc(luci_interpreter::RuntimeShape::extendedShape(N, unextended_input_shape),
77 &input_desc);
78 copyDimsToDesc(luci_interpreter::RuntimeShape::extendedShape(N, unextended_output_shape),
79 &output_desc);
80
81 // Find the last dimension that has broadcasting. At this dimension, the data is
82 // copied from the input tensor to the output tensor.
83 int last_broadcast_dim = -1;
84 for (int i = N - 1; i >= 0; --i)
85 {
86 if (input_desc.extents[i] != output_desc.extents[i])
87 {
88 last_broadcast_dim = i;
89 break;
90 }
91 }
92
93 // If non-broadcasting, just copy data from input to output tensor.
94 if (last_broadcast_dim == -1)
95 {
96 memcpy(output_data, input_data, unextended_input_shape.flatSize() * sizeof(data_type));
97 return;
98 }
99
100 // Broadcasting using memcpy.
101 int indexes[N] = {0};
102 BroadcastImpl<N>(input_desc, input_data, output_desc, output_data, indexes, 0, last_broadcast_dim,
103 luci_interpreter::size(data_type));
104}
void copyDimsToDesc(const luci_interpreter::RuntimeShape &input_shape, NdArrayDesc< N > *desc_out)
uint32_t size(loco::DataType data_type)
Returns the size of the data type.

References copyDimsToDesc(), luci_interpreter::RuntimeShape::extendedShape(), luci_interpreter_pal::NdArrayDesc< N >::extents, luci_interpreter::RuntimeShape::flatSize(), and luci::size().
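
An illustration of how the last broadcast dimension is picked (not from the source): for an input of [1,3,1,5] broadcast to [2,3,4,5], the extents differ at dims 0 and 2, so last_broadcast_dim == 2; rows of 5 elements are memcpy'd at that depth by BroadcastImpl, and the remaining duplication along dim 0 happens in its post-recursion pass. A result of -1 means the shapes match and the whole flat buffer is copied once.

#include <cstdint>
int example_last_broadcast_dim()
{
  const int32_t in_extents[4] = {1, 3, 1, 5};
  const int32_t out_extents[4] = {2, 3, 4, 5};
  int last_broadcast_dim = -1;
  for (int i = 4 - 1; i >= 0; --i)
  {
    if (in_extents[i] != out_extents[i])
    {
      last_broadcast_dim = i; // here: 2
      break;
    }
  }
  return last_broadcast_dim;
}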

◆ calculateGRU()

void luci_interpreter_pal::calculateGRU ( const float *  input_data,
const float *  weight_input_data,
const float *  weight_hidden_data,
const float *  bias_input_data,
const float *  bias_hidden_data,
float *  output_data,
const tflite::RuntimeShape &  input_shape,
const tflite::RuntimeShape &  output_shape,
const tflite::RuntimeShape &  weight_input_shape,
const tflite::RuntimeShape &  weight_hidden_shape,
float *  output_input_data,
float *  output_hidden_data,
const tflite::RuntimeShape &  output_shape_fc 
)

Definition at line 59 of file PALGRU.h.

66{
67 tflite::FullyConnectedParams op_params{};
68 // As the FC nodes inside GRU don't have any activations, just use numeric limits
69 op_params.float_activation_min = std::numeric_limits<float>::lowest();
70 op_params.float_activation_max = std::numeric_limits<float>::max();
71
72 // FC Input
73 tflite::RuntimeShape bias_input_shape{weight_input_shape.Dims(0)};
74 tflite::reference_ops::FullyConnected(op_params, output_shape, output_data, weight_input_shape,
75 weight_input_data, bias_input_shape, bias_input_data,
76 output_shape_fc, output_input_data);
77
78 // FC Hidden
79 tflite::RuntimeShape bias_hidden_shape{weight_hidden_shape.Dims(0)};
80 // Note: input for this FC node will be saved without intermediate buffer
81 tflite::reference_ops::FullyConnected(op_params, input_shape, input_data, weight_hidden_shape,
82 weight_hidden_data, bias_hidden_shape, bias_hidden_data,
83 output_shape_fc, output_hidden_data);
84
85 int num_elements = output_shape_fc.Dims(1) / 3;
86
87 float *second_hidden_part = output_hidden_data + num_elements;
88 float *second_input_part = output_input_data + num_elements;
89
90 float *third_hidden_part = second_hidden_part + num_elements;
91 float *third_input_part = second_input_part + num_elements;
92
93 // Calculate Left part
94 for (int i = 0; i < num_elements; ++i)
95 {
96 output_input_data[i] += output_hidden_data[i];
97 }
98
99 Logistic(num_elements, output_input_data, output_input_data);
100
101 // Calculate most left mul
102 float *most_left_part_final = output_input_data;
103 float *first_part = output_input_data;
104 for (int i = 0; i < num_elements; ++i)
105 {
106 output_data[i] *= most_left_part_final[i];
107 first_part[i] = 1.0f - first_part[i];
108 }
109
110 // Calc second part
111 for (int i = 0; i < num_elements; ++i)
112 {
113 second_hidden_part[i] += second_input_part[i];
114 }
115
116 Logistic(num_elements, second_hidden_part, second_hidden_part);
117
118 for (int i = 0; i < num_elements; ++i)
119 {
120 second_hidden_part[i] *= third_input_part[i];
121 second_hidden_part[i] += third_hidden_part[i];
122 }
123
124 for (int i = 0; i < num_elements; ++i)
125 {
126 if (second_hidden_part[i] > 19)
127 {
128 second_hidden_part[i] = 1;
129 }
130 else if (second_hidden_part[i] < -19)
131 {
132 second_hidden_part[i] = -1;
133 }
134 else
135 {
136 second_hidden_part[i] = std::tanh(second_hidden_part[i]);
137 }
138 }
139
140 for (int i = 0; i < num_elements; ++i)
141 {
142 second_hidden_part[i] *= first_part[i];
143 output_data[i] += second_hidden_part[i];
144 }
145}
void Logistic(const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data)
Definition Logistic.h:32

References Logistic(), and output_shape.

Referenced by GRU().
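
Reading the three equal thirds of each FullyConnected output as the update gate, the reset gate and the candidate pre-activation is an assumption based only on the buffer offsets above (the actual ordering depends on how the frontend packs the GRU weights). Under that reading, with A the FC of the previous hidden state and B the FC of the current input, the loops compute the usual GRU update, with tanh clamped to +/-1 once its argument leaves [-19, 19]:

z_t = \sigma(A_1 + B_1), \qquad r_t = \sigma(A_2 + B_2)
\tilde{h}_t = \tanh(r_t \odot A_3 + B_3)
h_t = z_t \odot h_{t-1} + (1 - z_t) \odot \tilde{h}_t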

◆ Ceil()

void luci_interpreter_pal::Ceil ( const int32_t  flat_size,
const float *  input_data,
float *  output_data 
)
inline

Definition at line 28 of file PALCeil.h.

29{
30 for (int i = 0; i < flat_size; ++i)
31 {
32 output_data[i] = std::ceil(input_data[i]);
33 }
34}

Referenced by luci_interpreter::execute_kernel_CircleCeil().

◆ ComparisonNoScaling()

template<typename T >
void luci_interpreter_pal::ComparisonNoScaling ( const int64_t  flat_size,
const T *  input1_data,
const T *  input2_data,
bool *  output_data,
bool  F(T, T) 
)
inline

Definition at line 59 of file PALComparisons.h.

61{
62 for (int64_t i = 0; i < flat_size; ++i)
63 {
64 output_data[i] = F(input1_data[i], input2_data[i]);
65 }
66}

◆ ComparisonWithScaling()

template<typename T >
void luci_interpreter_pal::ComparisonWithScaling ( const ComparisonParams op_params,
const int64_t  flat_size,
const T *  input1_data,
const T *  input2_data,
bool *  output_data,
bool  F(T, T) 
)
inline

Definition at line 117 of file PALComparisons.h.

120{
121 int left_shift = op_params.left_shift;
122 int32_t input1_offset = op_params.input1_offset;
123 int32_t input1_multiplier = op_params.input1_multiplier;
124 int input1_shift = op_params.input1_shift;
125 int32_t input2_offset = op_params.input2_offset;
126 int32_t input2_multiplier = op_params.input2_multiplier;
127 int input2_shift = op_params.input2_shift;
128
129 for (int64_t i = 0; i < flat_size; ++i)
130 {
131 const int32_t input1_val = input1_offset + input1_data[i];
132 const int32_t input2_val = input2_offset + input2_data[i];
133 const int32_t shifted_input1_val = input1_val * (1 << left_shift);
134 const int32_t shifted_input2_val = input2_val * (1 << left_shift);
135 const int32_t scaled_input1_val = multiplyByQuantizedMultiplierSmallerThanOneExp(
136 shifted_input1_val, input1_multiplier, input1_shift);
137 const int32_t scaled_input2_val = multiplyByQuantizedMultiplierSmallerThanOneExp(
138 shifted_input2_val, input2_multiplier, input2_shift);
139 output_data[i] = F(scaled_input1_val, scaled_input2_val);
140 }
141}

References luci_interpreter_pal::ComparisonParams::input1_multiplier, luci_interpreter_pal::ComparisonParams::input1_offset, luci_interpreter_pal::ComparisonParams::input1_shift, luci_interpreter_pal::ComparisonParams::input2_multiplier, luci_interpreter_pal::ComparisonParams::input2_offset, luci_interpreter_pal::ComparisonParams::input2_shift, luci_interpreter_pal::ComparisonParams::left_shift, and multiplyByQuantizedMultiplierSmallerThanOneExp().

◆ ComputeInterpolationValues()

void luci_interpreter_pal::ComputeInterpolationValues ( const float  value,
const float  scale,
const bool  half_pixel_centers,
int32_t  input_size,
float *  scaled_value,
int32_t *  lower_bound,
int32_t *  upper_bound 
)
inline

Definition at line 39 of file PALResizeBilinear.h.

43{
44 if (half_pixel_centers)
45 {
46 *scaled_value = (value + 0.5f) * scale - 0.5f;
47 }
48 else
49 {
50 *scaled_value = value * scale;
51 }
52 float scaled_value_floor = std::floor(*scaled_value);
53 *lower_bound = std::max(static_cast<int32_t>(scaled_value_floor), static_cast<int32_t>(0));
54 *upper_bound = std::min(static_cast<int32_t>(std::ceil(*scaled_value)), input_size - 1);
55}
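
A worked example using only the function above (with its declaring PAL header assumed to be included): with half_pixel_centers the sample point is shifted by half a pixel before scaling.

#include <cstdint>
void example_interpolation_values()
{
  float scaled;
  int32_t lo, hi;
  luci_interpreter_pal::ComputeInterpolationValues(/*value=*/3.0f, /*scale=*/0.5f,
                                                   /*half_pixel_centers=*/true,
                                                   /*input_size=*/10, &scaled, &lo, &hi);
  // scaled == (3 + 0.5f) * 0.5f - 0.5f == 1.25f, lo == 1, hi == 2
}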

◆ Concatenation()

template<typename Scalar >
void luci_interpreter_pal::Concatenation ( const ConcatenationParams params,
const luci_interpreter::RuntimeShape *const *  input_shapes,
const Scalar *const *  input_data,
const luci_interpreter::RuntimeShape output_shape,
Scalar *  output_data 
)
inline

Definition at line 28 of file PALConcatenation.h.

32{
33 int axis = params.axis;
34 int inputs_count = params.inputs_count;
35 const int concat_dimensions = output_shape.dimensionsCount();
36
37 int64_t concat_size = 0;
38 for (int i = 0; i < inputs_count; i++)
39 {
40 concat_size += input_shapes[i]->dims(axis);
41 }
42 int64_t outer_size = 1;
43 for (int i = 0; i < axis; ++i)
44 {
45 outer_size *= output_shape.dims(i);
46 }
47 // For all input arrays,
48 // FlatSize() = outer_size * Dims(axis) * base_inner_size;
49 int64_t base_inner_size = 1;
50 for (int i = axis + 1; i < concat_dimensions; ++i)
51 {
52 base_inner_size *= output_shape.dims(i);
53 }
54
55 Scalar *output_ptr = output_data;
56 for (int k = 0; k < outer_size; k++)
57 {
58 for (int i = 0; i < inputs_count; ++i)
59 {
60 const int copy_size = input_shapes[i]->dims(axis) * base_inner_size;
61 const Scalar *input_ptr = input_data[i] + k * copy_size;
62 memcpy(output_ptr, input_ptr, copy_size * sizeof(Scalar));
63 output_ptr += copy_size;
64 }
65 }
66}

References luci_interpreter_pal::ConcatenationParams::axis, luci_interpreter::RuntimeShape::dimensionsCount(), luci_interpreter::RuntimeShape::dims(), luci_interpreter_pal::ConcatenationParams::inputs_count, and output_shape.
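
A worked size example that mirrors the loop above (illustration only): concatenating shapes [2,3,4] and [2,5,4] along axis 1 into [2,8,4] gives outer_size = 2 and base_inner_size = 4, so each outer step copies 3*4 = 12 values from input 0 followed by 5*4 = 20 values from input 1.

void example_concat_sizes()
{
  const int axis = 1;
  const int out_dims[3] = {2, 8, 4};
  int outer_size = 1, base_inner_size = 1;
  for (int i = 0; i < axis; ++i)
    outer_size *= out_dims[i]; // 2
  for (int i = axis + 1; i < 3; ++i)
    base_inner_size *= out_dims[i]; // 4
  (void)outer_size;
  (void)base_inner_size;
}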

◆ copyDimsToDesc()

template<int N>
void luci_interpreter_pal::copyDimsToDesc ( const luci_interpreter::RuntimeShape input_shape,
NdArrayDesc< N > *  desc_out 
)
inline

Definition at line 47 of file ProcessBroadcastShapes.h.

49{
50 int desc_stride = 1;
51 for (int i = N - 1; i >= 0; --i)
52 {
53 desc_out->extents[i] = input_shape.dims(i);
54 desc_out->strides[i] = desc_stride;
55 desc_stride *= input_shape.dims(i);
56 }
57}

References luci_interpreter::RuntimeShape::dims(), luci_interpreter_pal::NdArrayDesc< N >::extents, and luci_interpreter_pal::NdArrayDesc< N >::strides.

Referenced by BroadcastTo(), and TransposeImpl().
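
A worked example of the stride layout it produces (illustration only): for a shape of [2,3,4,5] the descriptor ends up with extents {2,3,4,5} and strides {60,20,5,1}, built innermost-first exactly as in the loop above.

void example_copy_dims_to_desc()
{
  const int dims[4] = {2, 3, 4, 5};
  int strides[4];
  int desc_stride = 1;
  for (int i = 3; i >= 0; --i) // innermost dimension first
  {
    strides[i] = desc_stride;
    desc_stride *= dims[i];
  }
  // strides is now {60, 20, 5, 1}
}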

◆ Cos()

void luci_interpreter_pal::Cos ( const int  flat_size,
const float *  input_data,
float *  output_data 
)
inline

Definition at line 27 of file PALCosCommon.h.

28{
29 for (int i = 0; i < flat_size; ++i)
30 {
31 output_data[i] = std::cos(input_data[i]);
32 }
33}

Referenced by luci_interpreter::execute_kernel_CircleCos().

◆ DepthToSpace()

template<typename T >
void luci_interpreter_pal::DepthToSpace ( const int32_t  block_size,
const luci_interpreter::RuntimeShape unextended_input_shape,
const T *  input_data,
const luci_interpreter::RuntimeShape unextended_output_shape,
T *  output_data 
)
inline

Definition at line 29 of file PALDepthToSpace.h.

32{
33 const luci_interpreter::RuntimeShape input_shape =
34 luci_interpreter::RuntimeShape::extendedShape(4, unextended_input_shape);
36 luci_interpreter::RuntimeShape::extendedShape(4, unextended_output_shape);
37
38 const int output_depth = output_shape.dims(3);
39 const int output_width = output_shape.dims(2);
40 const int output_height = output_shape.dims(1);
41 const int output_batch = output_shape.dims(0);
42
43 for (int out_b = 0; out_b < output_batch; ++out_b)
44 {
45 for (int out_h = 0; out_h < output_height; ++out_h)
46 {
47 for (int out_w = 0; out_w < output_width; ++out_w)
48 {
49 for (int out_d = 0; out_d < output_depth; ++out_d)
50 {
51 const int in_d =
52 out_d + ((out_h % block_size) * block_size + out_w % block_size) * output_depth;
53
54 const int in_w = out_w / block_size;
55 const int in_h = out_h / block_size;
56 const int in_b = out_b;
57
58 const int input_index = offset(input_shape.dimsData(), in_b, in_h, in_w, in_d);
59 const int output_index = offset(output_shape.dimsData(), out_b, out_h, out_w, out_d);
60
61 output_data[output_index] = input_data[input_index];
62 }
63 }
64 }
65 }
66}

References luci_interpreter::RuntimeShape::dims(), luci_interpreter::RuntimeShape::dimsData(), luci_interpreter::RuntimeShape::extendedShape(), offset(), and output_shape.
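
A worked index-mapping example (illustration only): with block_size = 2 an input of shape [1,1,1,4] becomes an output of shape [1,2,2,1], i.e. the four input channels fan out into a 2x2 spatial block.

void example_depth_to_space_mapping()
{
  const int block_size = 2, output_depth = 1;
  for (int out_h = 0; out_h < 2; ++out_h)
    for (int out_w = 0; out_w < 2; ++out_w)
    {
      // same in_d formula as in the loop above
      const int in_d =
        0 + ((out_h % block_size) * block_size + out_w % block_size) * output_depth;
      // (out_h, out_w) = (0,0)->channel 0, (0,1)->1, (1,0)->2, (1,1)->3
      (void)in_d;
    }
}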

◆ DepthwiseConvPerChannel< int8_t >() [1/3]

template<>
void luci_interpreter_pal::DepthwiseConvPerChannel< int8_t > ( const tflite::DepthwiseParams &  params,
const int32_t *  output_multiplier,
const int32_t *  output_shift,
const tflite::RuntimeShape &  input_shape,
const int8_t *  input_data,
const tflite::RuntimeShape &  filter_shape,
const int8_t *  filter_data,
const tflite::RuntimeShape &  bias_shape,
const int32_t *  bias_data,
const tflite::RuntimeShape &  output_shape,
int8_t *  output_data,
const tflite::RuntimeShape &  scratchpad_shape,
int8_t *  scratchpad_data 
)
inline

Definition at line 58 of file PALDepthwiseConv2d.h.

65{
66 if (scratchpad_data)
67 {
68 cmsis_nn_dw_conv_params dw_conv_params;
69 dw_conv_params.dilation.h = params.dilation_height_factor;
70 dw_conv_params.dilation.w = params.dilation_width_factor;
71 assert(dw_conv_params.dilation.h == 1);
72 assert(dw_conv_params.dilation.w == 1);
73
74 dw_conv_params.input_offset = params.input_offset;
75 dw_conv_params.output_offset = params.output_offset;
76 dw_conv_params.stride.h = params.stride_height;
77 dw_conv_params.stride.w = params.stride_width;
78 dw_conv_params.padding.h = params.padding_values.height;
79 dw_conv_params.padding.w = params.padding_values.width;
80
81 dw_conv_params.activation.min = params.quantized_activation_min;
82 dw_conv_params.activation.max = params.quantized_activation_max;
83 dw_conv_params.ch_mult = params.depth_multiplier;
84
85 cmsis_nn_per_channel_quant_params quant_params;
86 int32_t output_multiplier = params.output_multiplier;
87 int32_t output_shift = params.output_shift;
88
89 quant_params.multiplier = &output_multiplier;
90 quant_params.shift = &output_shift;
91
92 assert(dw_conv_params.activation.min <= dw_conv_params.activation.max);
93 const int batch_size = tflite::MatchingDim(input_shape, 0, output_shape, 0);
94 const int output_depth = tflite::MatchingDim(filter_shape, 3, output_shape, 3);
95 if (bias_data)
96 {
97 assert(bias_shape.FlatSize() == output_depth);
98 }
99
100 cmsis_nn_dims input_dims;
101 input_dims.n = batch_size;
102 input_dims.h = input_shape.Dims(1);
103 input_dims.w = input_shape.Dims(2);
104 input_dims.c = input_shape.Dims(3);
105
106 cmsis_nn_dims filter_dims;
107 filter_dims.n = filter_shape.Dims(0);
108 filter_dims.h = filter_shape.Dims(1);
109 filter_dims.w = filter_shape.Dims(2);
110 filter_dims.c = output_depth;
111
112 cmsis_nn_dims bias_dims;
113 bias_dims.n = 1;
114 bias_dims.h = 1;
115 bias_dims.w = 1;
116 bias_dims.c = output_depth;
117
118 cmsis_nn_dims output_dims;
119 output_dims.n = batch_size;
120 output_dims.h = output_shape.Dims(1);
121 output_dims.w = output_shape.Dims(2);
122 output_dims.c = output_depth;
123
124 cmsis_nn_context ctx;
125 ctx.buf = scratchpad_data;
126 ctx.size = scratchpad_shape.Dims(0);
127
128 auto res = arm_depthwise_conv_wrapper_s8(&ctx, &dw_conv_params, &quant_params, &input_dims,
129 input_data, &filter_dims, filter_data, &bias_dims,
130 bias_data, &output_dims, output_data);
131 assert(res == ARM_MATH_SUCCESS);
132 }
133 else
134 {
135 tflite::reference_integer_ops::DepthwiseConvPerChannel(
136 params, output_multiplier, output_shift, input_shape, input_data, filter_shape, filter_data,
137 bias_shape, bias_data, output_shape, output_data);
138 }
139}

References output_shape.

◆ DepthwiseConvPerChannel< int8_t >() [2/3]

template<>
void luci_interpreter_pal::DepthwiseConvPerChannel< int8_t > ( const tflite::DepthwiseParams &  params,
const int32_t *  output_multiplier,
const int32_t *  output_shift,
const tflite::RuntimeShape &  input_shape,
const int8_t *  input_data,
const tflite::RuntimeShape &  filter_shape,
const int8_t *  filter_data,
const tflite::RuntimeShape &  bias_shape,
const int32_t *  bias_data,
const tflite::RuntimeShape &  output_shape,
int8_t *  output_data,
const tflite::RuntimeShape &  scratchpad_shape,
int8_t *  scratchpad_data 
)
inline

Definition at line 57 of file PALDepthwiseConv2d.h.

64{
65 (void)scratchpad_shape;
66 (void)scratchpad_data;
67 tflite::reference_integer_ops::DepthwiseConvPerChannel(
68 params, output_multiplier, output_shift, input_shape, input_data, filter_shape, filter_data,
69 bias_shape, bias_data, output_shape, output_data);
70}

References output_shape.

◆ DepthwiseConvPerChannel< int8_t >() [3/3]

template<>
void luci_interpreter_pal::DepthwiseConvPerChannel< int8_t > ( const tflite::DepthwiseParams &  params,
const int32_t *  output_multiplier,
const int32_t *  output_shift,
const tflite::RuntimeShape &  input_shape,
const int8_t *  input_data,
const tflite::RuntimeShape &  filter_shape,
const int8_t *  filter_data,
const tflite::RuntimeShape &  bias_shape,
const int32_t *  bias_data,
const tflite::RuntimeShape &  output_shape,
int8_t *  output_data,
const tflite::RuntimeShape &  scratchpad_shape,
int8_t *  scratchpad_data 
)
inline

Definition at line 57 of file PALDepthwiseConv2d.h.

64{
65 (void)scratchpad_shape;
66 (void)scratchpad_data;
67 tflite::reference_integer_ops::DepthwiseConvPerChannel(
68 params, output_multiplier, output_shift, input_shape, input_data, filter_shape, filter_data,
69 bias_shape, bias_data, output_shape, output_data);
70}

References output_shape.

◆ Dequantize()

template<typename InputT , typename OutputT >
void luci_interpreter_pal::Dequantize ( const QuantizationParams op_params,
const int  flat_size,
const InputT *  input_data,
OutputT *  output_data 
)
inline

Definition at line 27 of file PALDequantize.h.

29{
30 const int32_t zero_point = op_params.zero_point;
31 const double scale = op_params.scale;
32
33 for (int i = 0; i < flat_size; i++)
34 {
35 const int32_t val = input_data[i];
36 const OutputT result = static_cast<OutputT>(scale * (val - zero_point));
37 output_data[i] = result;
38 }
39}

References luci_interpreter_pal::QuantizationParams::scale, and luci_interpreter_pal::QuantizationParams::zero_point.
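
A worked example (assuming QuantizationParams is an aggregate exposing just the scale and zero_point members listed above, and the declaring PAL header is included): scale = 0.5 and zero_point = -1 map the int8 value 5 to 0.5f * (5 - (-1)) == 3.0f.

#include <cstdint>
void example_dequantize()
{
  luci_interpreter_pal::QuantizationParams qp{};
  qp.scale = 0.5;
  qp.zero_point = -1;

  const int8_t in[2] = {5, -1};
  float out[2]; // expected: 3.0f, 0.0f
  luci_interpreter_pal::Dequantize(qp, /*flat_size=*/2, in, out);
}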

◆ Div()

template<typename T >
void luci_interpreter_pal::Div ( const ArithmeticParams params,
const int  flat_size,
const T *  input1_data,
const T *  input2_data,
T *  output_data 
)
inline

Definition at line 26 of file PALDiv.h.

28{
29 ArithmeticOp<T, DivFn<T>>(params, flat_size, input1_data, input2_data, output_data);
30}

◆ DivScalar()

template<typename T >
void luci_interpreter_pal::DivScalar ( const ArithmeticParams params,
const int  flat_size,
const T *  input_data,
const T  scalar_value,
T *  output_data 
)
inline

Definition at line 33 of file PALDiv.h.

35{
36 ArithmeticOpScalar<T, DivFn<T>>(params, flat_size, input_data, scalar_value, output_data);
37}

◆ Elu()

void luci_interpreter_pal::Elu ( const int  flat_size,
const float *  input_data,
float *  output_data 
)
inline

Definition at line 27 of file PALElu.h.

28{
29 for (int i = 0; i < flat_size; i++)
30 {
31 float val = input_data[i];
32 float result = val < 0.0f ? std::expm1(val) : val;
33 output_data[i] = result;
34 }
35}

References Elu().

Referenced by Elu().

◆ EqualFn()

template<typename T >
bool luci_interpreter_pal::EqualFn ( T  lhs,
T  rhs 
)
inline

Definition at line 53 of file PALComparisons.h.

53{ return lhs == rhs; }

Referenced by luci_interpreter::execute_kernel_CircleEqual().

◆ eval_integer_8x8_16_lstm()

void luci_interpreter_pal::eval_integer_8x8_16_lstm ( const luci_interpreter::Tensor *  input,
const luci_interpreter::Tensor *  input_to_input_weights,
const luci_interpreter::Tensor *  input_to_forget_weights,
const luci_interpreter::Tensor *  input_to_cell_weights,
const luci_interpreter::Tensor *  input_to_output_weights,
const luci_interpreter::Tensor *  recurrent_to_input_weights,
const luci_interpreter::Tensor *  recurrent_to_forget_weights,
const luci_interpreter::Tensor *  recurrent_to_cell_weights,
const luci_interpreter::Tensor *  recurrent_to_output_weights,
const luci_interpreter::Tensor *  cell_to_input_weights,
const luci_interpreter::Tensor *  cell_to_forget_weights,
const luci_interpreter::Tensor *  cell_to_output_weights,
const luci_interpreter::Tensor *  input_layer_norm_coefficients,
const luci_interpreter::Tensor *  forget_layer_norm_coefficients,
const luci_interpreter::Tensor *  cell_layer_norm_coefficients,
const luci_interpreter::Tensor *  output_layer_norm_coefficients,
const luci_interpreter::Tensor *  input_gate_bias,
const luci_interpreter::Tensor *  forget_gate_bias,
const luci_interpreter::Tensor *  cell_gate_bias,
const luci_interpreter::Tensor *  output_gate_bias,
const luci_interpreter::Tensor *  projection_weights,
const luci_interpreter::Tensor *  projection_bias,
const luci_interpreter::UnidirectionalSequenceLSTMParams &  params,
bool  forward_sequence,
bool  time_major,
const luci_interpreter::IntegerLSTMParams &  integer_lstm_param,
int32_t  output_state_zp,
luci_interpreter::Tensor *  output_state,
luci_interpreter::Tensor *  cell_state,
luci_interpreter::Tensor *  output,
int16_t *  scratch0,
int16_t *  scratch1,
int16_t *  scratch2,
int16_t *  scratch3,
int8_t *  scratch4,
int32_t *  scratch5 
)

Definition at line 126 of file PALUnidirectionalSequenceLSTM.h.

151{
152 // CMSIS-NN does not support these configurations currently.
153 // Please use MCU kernels instead
154 const bool use_layer_norm = (forget_layer_norm_coefficients != nullptr);
155 const bool use_peephole = (cell_to_output_weights != nullptr);
156 const bool use_projection = (projection_weights != nullptr);
157 const bool use_cifg = (input_to_input_weights == nullptr);
158 const bool unsupported_config = use_layer_norm || use_peephole || use_projection || use_cifg;
159
160 if (unsupported_config)
161 {
162 assert(false && "CMSIS-NN does not support these configurations currently");
163 return;
164 }
165
166 const auto input_shape = input->shape();
167 LUCI_INTERPRETER_CHECK(input_shape.num_dims() >= 2 && input_shape.num_dims() <= 3);
168
169 cmsis_nn_lstm_context scratch_buffers;
170 scratch_buffers.input_gate = scratch0;
171 scratch_buffers.forget_gate = scratch1;
172 scratch_buffers.cell_gate = scratch2;
173 scratch_buffers.output_gate = scratch3;
174 scratch_buffers.scratch = scratch4;
175
176 cmsis_nn_lstm_params cmsis_lstm_params = lstm::convert_lstm_params(
177 integer_lstm_param, time_major, output_state_zp,
178 luci_interpreter::kernels::getTensorData<int32_t>(input_gate_bias),
179 luci_interpreter::kernels::getTensorData<int32_t>(forget_gate_bias),
180 luci_interpreter::kernels::getTensorData<int32_t>(cell_gate_bias),
181 luci_interpreter::kernels::getTensorData<int32_t>(output_gate_bias),
182 const_cast<int16_t *>(
183 luci_interpreter::kernels::getTensorData<int16_t>(input_layer_norm_coefficients)),
184 const_cast<int16_t *>(
185 luci_interpreter::kernels::getTensorData<int16_t>(forget_layer_norm_coefficients)),
186 const_cast<int16_t *>(
187 luci_interpreter::kernels::getTensorData<int16_t>(cell_layer_norm_coefficients)),
188 const_cast<int16_t *>(
189 luci_interpreter::kernels::getTensorData<int16_t>(output_layer_norm_coefficients)));
190
191 const int n_input = input_shape.dim(input_shape.num_dims() - 1);
192 int max_time, n_batch;
193 if (input_shape.num_dims() == 2)
194 {
195 max_time = 1;
196 n_batch = input_shape.dim(0);
197 }
198 else
199 {
200 max_time = (time_major) ? input_shape.dim(0) : input_shape.dim(1);
201 n_batch = (time_major) ? input_shape.dim(1) : input_shape.dim(0);
202 }
203
204 // n_cell and n_output will be the same size when there is no projection.
205 const int n_cell = input_to_output_weights->shape().dim(0);
206 const int n_output = recurrent_to_output_weights->shape().dim(1);
207
208 cmsis_nn_lstm_dims lstm_dims;
209 lstm_dims.num_inputs = n_input;
210 lstm_dims.num_outputs = n_output;
211 lstm_dims.num_batches = n_batch;
212 lstm_dims.max_time = max_time;
213
214 arm_lstm_unidirectional_s16_s8(
215 &scratch_buffers, const_cast<int8_t *>(luci_interpreter::kernels::getTensorData<int8_t>(input)),
216 &lstm_dims,
217 const_cast<int8_t *>(luci_interpreter::kernels::getTensorData<int8_t>(input_to_input_weights)),
218 const_cast<int8_t *>(luci_interpreter::kernels::getTensorData<int8_t>(input_to_forget_weights)),
219 const_cast<int8_t *>(luci_interpreter::kernels::getTensorData<int8_t>(input_to_cell_weights)),
220 const_cast<int8_t *>(luci_interpreter::kernels::getTensorData<int8_t>(input_to_output_weights)),
221 const_cast<int8_t *>(
222 luci_interpreter::kernels::getTensorData<int8_t>(recurrent_to_input_weights)),
223 const_cast<int8_t *>(
224 luci_interpreter::kernels::getTensorData<int8_t>(recurrent_to_forget_weights)),
225 const_cast<int8_t *>(
226 luci_interpreter::kernels::getTensorData<int8_t>(recurrent_to_cell_weights)),
227 const_cast<int8_t *>(
228 luci_interpreter::kernels::getTensorData<int8_t>(recurrent_to_output_weights)),
229 const_cast<int16_t *>(luci_interpreter::kernels::getTensorData<int16_t>(cell_to_input_weights)),
230 const_cast<int16_t *>(
231 luci_interpreter::kernels::getTensorData<int16_t>(cell_to_forget_weights)),
232 const_cast<int16_t *>(
233 luci_interpreter::kernels::getTensorData<int16_t>(cell_to_output_weights)),
234 const_cast<int8_t *>(luci_interpreter::kernels::getTensorData<int8_t>(projection_weights)),
235 &cmsis_lstm_params,
236 const_cast<int8_t *>(luci_interpreter::kernels::getTensorData<int8_t>(output_state)),
237 const_cast<int16_t *>(luci_interpreter::kernels::getTensorData<int16_t>(cell_state)),
238 const_cast<int8_t *>(luci_interpreter::kernels::getTensorData<int8_t>(output)));
239}
int32_t dim(int i) const
Definition Tensor.h:41
const Shape & shape() const
Definition Tensor.h:107
#define LUCI_INTERPRETER_CHECK(cond)
Definition Utils.h:36

References luci_interpreter_pal::lstm::convert_lstm_params(), luci_interpreter::Shape::dim(), LUCI_INTERPRETER_CHECK, and luci_interpreter::Tensor::shape().

◆ evalLSTM()

template<typename ActivationType , typename WeightType , typename CellType , typename BiasType >
void luci_interpreter_pal::evalLSTM ( luci_interpreter::lstm::LSTMStruct *  lstm_struct,
luci_interpreter::lstm::LSTMParameters *  lstm_params,
luci_interpreter::lstm::CellStateInfo *  cell_state_info,
ActivationType *  output_state_data,
CellType *  cell_state_data,
CellType *  scratch0,
CellType *  scratch1,
CellType *  scratch2,
CellType *  scratch3,
luci_interpreter::BaseRuntimeGraph *  runtime_graph 
)

Definition at line 515 of file PALUnidirectionalSequenceLSTMCommon.h.

521{
522 lstm_internal::LstmSizeInfo size_info;
523
524 size_info.time_major = lstm_struct->options->time_major();
525 size_info.batch_size = size_info.time_major
526 ? luci_interpreter::Tensor::dim(lstm_struct->input(), 1)
527 : luci_interpreter::Tensor::dim(lstm_struct->input(), 0);
528 size_info.time_steps = size_info.time_major
529 ? luci_interpreter::Tensor::dim(lstm_struct->input(), 0)
530 : luci_interpreter::Tensor::dim(lstm_struct->input(), 1);
531 size_info.input_dimension = luci_interpreter::Tensor::dim(lstm_struct->input(), 2);
532 size_info.state_dimension = luci_interpreter::Tensor::dim(lstm_struct->output_state(), 1);
533
534 lstm_internal::LstmStepManager step_info(size_info);
535
536 // time is the first dimension, enable batch computation
537 if (size_info.time_major)
538 {
539 for (int t = 0; t < size_info.time_steps; t++)
540 {
541 lstm_internal::lstmStep<ActivationType, WeightType, CellType, BiasType>(
542 lstm_struct, lstm_params, &step_info, cell_state_info, output_state_data, cell_state_data,
543 scratch0, scratch1, scratch2, scratch3, runtime_graph);
544 // prepare for the next time step
545 step_info.updateTime();
546 }
547 }
548 else
549 {
550 // batch first: unable to batch the input data, so run single-batch inference
551 for (int b = 0; b < size_info.batch_size; b++)
552 {
553 for (int t = 0; t < size_info.time_steps; t++)
554 {
555 lstm_internal::lstmStep<ActivationType, WeightType, CellType, BiasType>(
556 lstm_struct, lstm_params, &step_info, cell_state_info, output_state_data, cell_state_data,
557 scratch0, scratch1, scratch2, scratch3, runtime_graph);
558 // prepare for the next time step
559 step_info.updateTime();
560 }
561 // prepare for the next batch
562 step_info.updateBatch();
563 step_info.resetTime();
564 }
565 }
566}
const loco::Dimension & dim(uint32_t axis) const
Definition Tensor.h:44
const circle::UnidirectionalSequenceLSTMOptions * options

References luci_interpreter_pal::lstm_internal::LstmSizeInfo::batch_size, circle_eval_diff::TensorShape::dim(), luci_interpreter::lstm::LSTMStruct::input(), luci_interpreter_pal::lstm_internal::LstmSizeInfo::input_dimension, luci_interpreter::lstm::LSTMStruct::options, luci_interpreter::lstm::LSTMStruct::output_state(), luci_interpreter_pal::lstm_internal::LstmStepManager::resetTime(), luci_interpreter_pal::lstm_internal::LstmSizeInfo::state_dimension, luci_interpreter_pal::lstm_internal::LstmSizeInfo::time_major, luci_interpreter_pal::lstm_internal::LstmSizeInfo::time_steps, luci_interpreter_pal::lstm_internal::LstmStepManager::updateBatch(), and luci_interpreter_pal::lstm_internal::LstmStepManager::updateTime().

◆ evalLSTM< int8_t, int8_t, int16_t, int32_t >()

template<>
void luci_interpreter_pal::evalLSTM< int8_t, int8_t, int16_t, int32_t > ( luci_interpreter::lstm::LSTMStruct *  lstm_struct,
luci_interpreter::lstm::LSTMParameters *  lstm_params,
luci_interpreter::lstm::CellStateInfo *  cell_state_info,
int8_t *  output_state_data,
int16_t *  cell_state_data,
int16_t *  scratch0,
int16_t *  scratch1,
int16_t *  scratch2,
int16_t *  scratch3,
luci_interpreter::BaseRuntimeGraph *  runtime_graph 
)

Definition at line 29 of file PALUnidirectionalSequenceLSTM.h.

35{
36 lstm_internal::LstmSizeInfo size_info;
37
38 size_info.time_major = lstm_struct->options->time_major();
39 size_info.batch_size = size_info.time_major
40 ? luci_interpreter::Tensor::dim(lstm_struct->input(), 1)
41 : luci_interpreter::Tensor::dim(lstm_struct->input(), 0);
42 size_info.time_steps = size_info.time_major
43 ? luci_interpreter::Tensor::dim(lstm_struct->input(), 0)
44 : luci_interpreter::Tensor::dim(lstm_struct->input(), 1);
45 size_info.input_dimension = luci_interpreter::Tensor::dim(lstm_struct->input(), 2);
46 size_info.state_dimension = luci_interpreter::Tensor::dim(lstm_struct->output_state(), 1);
47
48 lstm_internal::LstmStepManager step_info(size_info);
49
50 // time is the first dimension, enable batch computation
51 if (size_info.time_major)
52 {
53 for (int t = 0; t < size_info.time_steps; t++)
54 {
55 lstm_internal::lstmStep<int8_t, int8_t, int16_t, int32_t>(
56 lstm_struct, lstm_params, &step_info, cell_state_info, output_state_data, cell_state_data,
57 scratch0, scratch1, scratch2, scratch3, runtime_graph);
58 // prepare for the next time step
59 step_info.updateTime();
60 }
61 }
62 else
63 {
64 // batch first: unable to batch the input data, so run single-batch inference
65 for (int b = 0; b < size_info.batch_size; b++)
66 {
67 for (int t = 0; t < size_info.time_steps; t++)
68 {
69 lstm_internal::lstmStep<int8_t, int8_t, int16_t, int32_t>(
70 lstm_struct, lstm_params, &step_info, cell_state_info, output_state_data, cell_state_data,
71 scratch0, scratch1, scratch2, scratch3, runtime_graph);
72 // prepare for the next time step
73 step_info.updateTime();
74 }
75 // prepare for the next batch
76 step_info.updateBatch();
77 step_info.resetTime();
78 }
79 }
80}

References luci_interpreter_pal::lstm_internal::LstmSizeInfo::batch_size, circle_eval_diff::TensorShape::dim(), luci_interpreter::lstm::LSTMStruct::input(), luci_interpreter_pal::lstm_internal::LstmSizeInfo::input_dimension, luci_interpreter::lstm::LSTMStruct::options, luci_interpreter::lstm::LSTMStruct::output_state(), luci_interpreter_pal::lstm_internal::LstmStepManager::resetTime(), luci_interpreter_pal::lstm_internal::LstmSizeInfo::state_dimension, luci_interpreter_pal::lstm_internal::LstmSizeInfo::time_major, luci_interpreter_pal::lstm_internal::LstmSizeInfo::time_steps, luci_interpreter_pal::lstm_internal::LstmStepManager::updateBatch(), and luci_interpreter_pal::lstm_internal::LstmStepManager::updateTime().

◆ Exp()

void luci_interpreter_pal::Exp ( const int  flat_size,
const float *  input_data,
float *  output_data 
)
inline

Definition at line 26 of file PALExp.h.

27{
28 for (int i = 0; i < flat_size; i++)
29 {
30 const float val = input_data[i];
31 const float result = std::exp(val);
32 output_data[i] = result;
33 }
34}

Referenced by luci_interpreter::execute_kernel_CircleExp().

◆ flatSizeSkipDim()

int luci_interpreter_pal::flatSizeSkipDim ( const int32_t *  dims_data,
int  skip_dim,
int  num_dims 
)
inline

Definition at line 183 of file PALUtils.h.

184{
185 int flat_size = 1;
186 for (int i = 0; i < num_dims; ++i)
187 {
188 flat_size *= (i == skip_dim) ? 1 : dims_data[i];
189 }
190 return flat_size;
191}

Referenced by FullyConnected(), FullyConnected(), FullyConnected(), FullyConnected< int8_t >(), L2Normalization(), and LogSoftmax().
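
A worked example (the declaring PAL header is assumed to be included): for dims {2, 3, 4}, skipping dimension 1 gives 2 * 4 = 8; FullyConnected uses exactly this to turn the output shape minus its depth axis into a batch count.

#include <cstdint>
void example_flat_size_skip_dim()
{
  const int32_t dims[3] = {2, 3, 4};
  const int batches = luci_interpreter_pal::flatSizeSkipDim(dims, /*skip_dim=*/1, /*num_dims=*/3);
  // batches == 8
  (void)batches;
}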

◆ Floor()

void luci_interpreter_pal::Floor ( const luci_interpreter::RuntimeShape input_shape,
const float *  input_data,
const luci_interpreter::RuntimeShape output_shape,
float *  output_data 
)
inline

Definition at line 25 of file PALFloorCommon.h.

27{
28 // check that input and output dimensions are equal
29 int N = input_shape.dimensionsCount();
30 assert(N == output_shape.dimensionsCount());
31
32 // check that sizes of all dimensions are equal
33 for (int i = 0; i < N; ++i)
34 {
35 assert(input_shape.dims(i) == output_shape.dims(i));
36 }
37
38 const int flat_size = input_shape.flatSize();
39 for (int i = 0; i < flat_size; i++)
40 {
41 int offset = i;
42 output_data[offset] = std::floor(input_data[offset]);
43 }
44}

References luci_interpreter::RuntimeShape::dimensionsCount(), luci_interpreter::RuntimeShape::dims(), luci_interpreter::RuntimeShape::flatSize(), offset(), and output_shape.

Referenced by luci_interpreter::execute_kernel_CircleFloor().

◆ FloorDiv()

void luci_interpreter_pal::FloorDiv ( const int  flat_size,
const float *  input1_data,
const float *  input2_data,
float *  output_data 
)
inline

Definition at line 25 of file PALFloorDivCommon.h.

27{
28 BinaryOp<float, FloorDivFn<float>>(flat_size, input1_data, input2_data, output_data);
29}

Referenced by luci_interpreter::execute_kernel_CircleFloorDiv().

◆ FloorMod()

void luci_interpreter_pal::FloorMod ( const int  flat_size,
const float *  input1_data,
const float *  input2_data,
float *  output_data 
)
inline

Definition at line 25 of file PALFloorModCommon.h.

27{
28 BinaryOp<float, FloorModFn<float>>(flat_size, input1_data, input2_data, output_data);
29}

Referenced by luci_interpreter::execute_kernel_CircleFloorMod().

◆ FullyConnected() [1/5]

template<typename WeightType >
void luci_interpreter_pal::FullyConnected ( const FullyConnectedParams params,
const int32_t *  input_shape,
const float *  input_data,
const int32_t *  filter_shape,
const WeightType *  filter_data,
const float *  bias_data,
const int32_t *  output_shape,
float *  output_data,
uint32_t  output_dims_count,
uint32_t  weights_dims_count 
)
inline

Definition at line 72 of file PALFullyConnectedCommon.h.

77{
78 const float output_activation_min = params.float_activation_min;
79 const float output_activation_max = params.float_activation_max;
80
81 const int batches = flatSizeSkipDim(output_shape, output_dims_count - 1, output_dims_count);
82 const int output_depth = output_shape[output_dims_count - 1];
83 const int accum_depth = filter_shape[weights_dims_count - 1];
84
85 for (int b = 0; b < batches; ++b)
86 {
87 const float *weight_scale_ptr = params.weights_scales;
88 for (int out_c = 0; out_c < output_depth; ++out_c)
89 {
90 float total = 0.f;
91 for (int d = 0; d < accum_depth; ++d)
92 {
93 auto input_value = input_data[b * accum_depth + d];
94 if (std::is_same<WeightType, float>::value)
95 {
96 total += input_value * filter_data[out_c * accum_depth + d];
97 }
98 else
99 {
100 const float filter_scale = *weight_scale_ptr;
101 const float filter_value =
102 static_cast<float>(filter_data[out_c * accum_depth + d]) * filter_scale;
103 total += input_value * filter_value;
104 }
105 }
106 float bias_value = 0.0f;
107 if (bias_data)
108 {
109 bias_value = bias_data[out_c];
110 }
111 output_data[out_c + output_depth * b] =
112 std::min(std::max(total + bias_value, output_activation_min), output_activation_max);
113 if (std::is_same<WeightType, int8_t>::value)
114 {
115 if (params.is_channel_wise_quant)
116 weight_scale_ptr++;
117 }
118 }
119 }
120}
int flatSizeSkipDim(const int32_t *dims_data, int skip_dim, int num_dims)
Definition PALUtils.h:183

References flatSizeSkipDim(), luci_interpreter_pal::FullyConnectedParams::float_activation_max, luci_interpreter_pal::FullyConnectedParams::float_activation_min, luci_interpreter_pal::FullyConnectedParams::is_channel_wise_quant, output_shape, and luci_interpreter_pal::FullyConnectedParams::weights_scales.
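
A worked number for the hybrid path above (int8 weights with float activations), as an illustration only: a stored weight of 50 and a per-channel scale of 0.02f dequantize to 1.0f, so an input value of 3.0f contributes 3.0f to total. The scale pointer only advances per output channel when is_channel_wise_quant is set, as in the code above.

#include <cstdint>
void example_hybrid_weight_contribution()
{
  const int8_t stored_weight = 50;
  const float weight_scale = 0.02f;
  const float input_value = 3.0f;
  const float contribution =
    input_value * (static_cast<float>(stored_weight) * weight_scale); // 3.0f
  (void)contribution;
}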

◆ FullyConnected() [2/5]

template<typename InputType , typename WeightType , typename OutputType , typename BiasType >
void luci_interpreter_pal::FullyConnected ( const FullyConnectedParams params,
const int32_t *  input_shape,
const InputType *  input_data,
const int32_t *  filter_shape,
const WeightType *  filter_data,
const BiasType *  bias_data,
const int32_t *  output_shape,
OutputType *  output_data,
uint32_t  output_dims_count,
uint32_t  weights_dims_count 
)
inline

Definition at line 30 of file PALFullyConnectedCommon.h.

35{
36 const int32_t input_offset = params.input_offset;
37 const int32_t filter_offset = params.weights_offset;
38 const int32_t output_offset = params.output_offset;
39 const int32_t output_multiplier = params.output_multiplier;
40 const int output_shift = params.output_shift;
41 const int32_t output_activation_min = params.quantized_activation_min;
42 const int32_t output_activation_max = params.quantized_activation_max;
43
44 const int batches = flatSizeSkipDim(output_shape, output_dims_count - 1, output_dims_count);
45 const int output_depth = output_shape[output_dims_count - 1];
46 const int accum_depth = filter_shape[weights_dims_count - 1];
47
48 for (int b = 0; b < batches; ++b)
49 {
50 for (int out_c = 0; out_c < output_depth; ++out_c)
51 {
52 BiasType acc = 0;
53 for (int d = 0; d < accum_depth; ++d)
54 {
55 int32_t input_val = input_data[b * accum_depth + d];
56 int32_t filter_val = filter_data[out_c * accum_depth + d];
57 acc += (filter_val + filter_offset) * (input_val + input_offset);
58 }
59 if (bias_data)
60 {
61 acc += bias_data[out_c];
62 }
63 int32_t acc_scaled = multiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
64 acc_scaled += output_offset;
65 acc_scaled = std::max(acc_scaled, output_activation_min);
66 acc_scaled = std::min(acc_scaled, output_activation_max);
67 output_data[out_c + output_depth * b] = static_cast<OutputType>(acc_scaled);
68 }
69 }
70}
int32_t multiplyByQuantizedMultiplier(int32_t x, int32_t quantized_multiplier, int shift)
Definition PALUtils.h:77

References flatSizeSkipDim(), luci_interpreter_pal::FullyConnectedParams::input_offset, multiplyByQuantizedMultiplier(), luci_interpreter_pal::FullyConnectedParams::output_multiplier, luci_interpreter_pal::FullyConnectedParams::output_offset, output_shape, luci_interpreter_pal::FullyConnectedParams::output_shift, luci_interpreter_pal::FullyConnectedParams::quantized_activation_max, luci_interpreter_pal::FullyConnectedParams::quantized_activation_min, and luci_interpreter_pal::FullyConnectedParams::weights_offset.

◆ FullyConnected() [3/5]

template<>
void luci_interpreter_pal::FullyConnected ( const luci_interpreter_pal::FullyConnectedParams ,
const int32_t *  ,
const int16_t *  ,
const int32_t *  ,
const int8_t *  ,
const int64_t *  ,
const int32_t *  ,
int16_t *  ,
uint32_t  ,
uint32_t   
)
inline

Definition at line 46 of file PALFullyConnected.h.

49{
50 // MARK: at this moment this operation is not supported
51 assert(false && "FullyConnected INT16 NYI");
52}

◆ FullyConnected() [4/5]

template<>
void luci_interpreter_pal::FullyConnected ( const luci_interpreter_pal::FullyConnectedParams params,
const int32_t *  ,
const int16_t *  input_data,
const int32_t *  filter_shape,
const int8_t *  filter_data,
const int64_t *  bias_data,
const int32_t *  output_shape,
int16_t *  output_data,
uint32_t  output_dims_count,
uint32_t  weights_dims_count 
)
inline

Definition at line 89 of file PALFullyConnected.h.

94{
95 const int batches = flatSizeSkipDim(output_shape, output_dims_count - 1, output_dims_count);
96 const int output_depth = output_shape[output_dims_count - 1];
97 const int accum_depth = filter_shape[weights_dims_count - 1];
98
99 cmsis_nn_fc_params fc_params;
100 fc_params.input_offset = params.input_offset;
101 fc_params.output_offset = params.output_offset;
102 fc_params.filter_offset = params.weights_offset;
103 fc_params.activation.min = params.quantized_activation_min;
104 fc_params.activation.max = params.quantized_activation_max;
105
106 cmsis_nn_per_tensor_quant_params quant_params;
107 quant_params.multiplier = params.output_multiplier;
108 quant_params.shift = params.output_shift;
109
110 cmsis_nn_dims input_dims;
111 input_dims.n = batches;
112 input_dims.h = 1;
113 input_dims.w = 1;
114 input_dims.c = accum_depth;
115
116 cmsis_nn_dims filter_dims;
117 filter_dims.n = accum_depth;
118 filter_dims.h = 1;
119 filter_dims.w = 1;
120 filter_dims.c = output_depth;
121
122 cmsis_nn_dims bias_dims;
123 bias_dims.n = 1;
124 bias_dims.h = 1;
125 bias_dims.w = 1;
126 bias_dims.c = output_depth;
127
128 cmsis_nn_dims output_dims;
129 output_dims.n = batches;
130 output_dims.h = 1;
131 output_dims.w = 1;
132 output_dims.c = output_depth;
133
134 int32_t buf_size = arm_fully_connected_s16_get_buffer_size(&filter_dims);
135 auto buffer = std::make_unique<int8_t[]>(buf_size);
136 assert(buffer != nullptr);
137
138 cmsis_nn_context ctx;
139 ctx.buf = buffer.get();
140 ctx.size = buf_size;
141
142 auto res =
143 arm_fully_connected_s16(&ctx, &fc_params, &quant_params, &input_dims, input_data, &filter_dims,
144 filter_data, &bias_dims, bias_data, &output_dims, output_data);
145 assert(res == ARM_CMSIS_NN_SUCCESS);
146}

References flatSizeSkipDim(), luci_interpreter_pal::FullyConnectedParams::input_offset, luci_interpreter_pal::FullyConnectedParams::output_multiplier, luci_interpreter_pal::FullyConnectedParams::output_offset, output_shape, luci_interpreter_pal::FullyConnectedParams::output_shift, luci_interpreter_pal::FullyConnectedParams::quantized_activation_max, luci_interpreter_pal::FullyConnectedParams::quantized_activation_min, and luci_interpreter_pal::FullyConnectedParams::weights_offset.

◆ FullyConnected() [5/5]

template<>
void luci_interpreter_pal::FullyConnected ( const luci_interpreter_pal::FullyConnectedParams params,
const int32_t *  input_shape,
const int8_t *  input_data,
const int32_t *  filter_shape,
const int8_t *  filter_data,
const int32_t *  bias_data,
const int32_t *  output_shape,
int8_t *  output_data,
uint32_t  ,
uint32_t   
)
inline

Definition at line 27 of file PALFullyConnected.h.

32{
33 // MARK: at this moment this operation is not supported
34 assert(false && "FullyConnected INT8 NYI");
35 (void)params;
36 (void)input_shape;
37 (void)input_data;
38 (void)filter_shape;
39 (void)filter_data;
40 (void)bias_data;
41 (void)output_shape;
42 (void)output_data;
43}

References output_shape.

◆ FullyConnected< int8_t >() [1/4]

template<>
void luci_interpreter_pal::FullyConnected< int8_t > ( const luci_interpreter_pal::FullyConnectedParams params,
const int32_t *  ,
const int8_t *  input_data,
const int32_t *  filter_shape,
const int8_t *  filter_data,
const int32_t *  bias_data,
const int32_t *  output_shape,
int8_t *  output_data,
uint32_t  output_dims_count,
uint32_t  weights_dims_count 
)
inline

Definition at line 28 of file PALFullyConnected.h.

34{
35 const int batches = flatSizeSkipDim(output_shape, output_dims_count - 1, output_dims_count);
36 const int output_depth = output_shape[output_dims_count - 1];
37 const int accum_depth = filter_shape[weights_dims_count - 1];
38
39 cmsis_nn_fc_params fc_params;
40 fc_params.input_offset = params.input_offset;
41 fc_params.output_offset = params.output_offset;
42 fc_params.filter_offset = params.weights_offset;
43 fc_params.activation.min = params.quantized_activation_min;
44 fc_params.activation.max = params.quantized_activation_max;
45
46 cmsis_nn_per_tensor_quant_params quant_params;
47 quant_params.multiplier = params.output_multiplier;
48 quant_params.shift = params.output_shift;
49
50 cmsis_nn_dims input_dims;
51 input_dims.n = batches;
52 input_dims.h = 1;
53 input_dims.w = 1;
54 input_dims.c = accum_depth;
55
56 cmsis_nn_dims filter_dims;
57 filter_dims.n = accum_depth;
58 filter_dims.h = 1;
59 filter_dims.w = 1;
60 filter_dims.c = output_depth;
61
62 cmsis_nn_dims bias_dims;
63 bias_dims.n = 1;
64 bias_dims.h = 1;
65 bias_dims.w = 1;
66 bias_dims.c = output_depth;
67
68 cmsis_nn_dims output_dims;
69 output_dims.n = batches;
70 output_dims.h = 1;
71 output_dims.w = 1;
72 output_dims.c = output_depth;
73
74 int32_t buf_size = arm_fully_connected_s8_get_buffer_size(&filter_dims);
75 auto buffer = std::make_unique<int8_t[]>(buf_size);
76 assert(buffer != nullptr);
77
78 cmsis_nn_context ctx;
79 ctx.buf = buffer.get();
80 ctx.size = buf_size;
81
82 auto res =
83 arm_fully_connected_s8(&ctx, &fc_params, &quant_params, &input_dims, input_data, &filter_dims,
84 filter_data, &bias_dims, bias_data, &output_dims, output_data);
85 assert(res == ARM_CMSIS_NN_SUCCESS);
86}

References flatSizeSkipDim(), luci_interpreter_pal::FullyConnectedParams::input_offset, luci_interpreter_pal::FullyConnectedParams::output_multiplier, luci_interpreter_pal::FullyConnectedParams::output_offset, output_shape, luci_interpreter_pal::FullyConnectedParams::output_shift, luci_interpreter_pal::FullyConnectedParams::quantized_activation_max, luci_interpreter_pal::FullyConnectedParams::quantized_activation_min, and luci_interpreter_pal::FullyConnectedParams::weights_offset.

◆ FullyConnected< int8_t >() [2/4]

template<>
void luci_interpreter_pal::FullyConnected< int8_t > ( const tflite::FullyConnectedParams &  params,
const tflite::RuntimeShape &  input_shape,
const int8_t *  input_data,
const tflite::RuntimeShape &  filter_shape,
const int8_t *  filter_data,
const tflite::RuntimeShape &  bias_shape,
const int32_t *  bias_data,
const tflite::RuntimeShape &  output_shape,
int8_t *  output_data 
)
inline

Definition at line 49 of file PALFullyConnected.h.

55{
56 assert(output_shape.DimensionsCount() == 2);
57
58 const int batches = output_shape.Dims(0);
59 const int output_depth = output_shape.Dims(1);
60
61 const int filter_dim_count = filter_shape.DimensionsCount();
62 const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
63
64 cmsis_nn_fc_params fc_params;
65 fc_params.input_offset = params.input_offset;
66 fc_params.output_offset = params.output_offset;
67 fc_params.filter_offset = params.weights_offset;
68 fc_params.activation.min = params.quantized_activation_min;
69 fc_params.activation.max = params.quantized_activation_max;
70
71 cmsis_nn_per_tensor_quant_params quant_params;
72 quant_params.multiplier = params.output_multiplier;
73 quant_params.shift = params.output_shift;
74
75 cmsis_nn_dims input_dims;
76 input_dims.n = batches;
77 input_dims.h = 1;
78 input_dims.w = 1;
79 input_dims.c = accum_depth;
80
81 cmsis_nn_dims filter_dims;
82 filter_dims.n = accum_depth;
83 filter_dims.h = 1;
84 filter_dims.w = 1;
85 filter_dims.c = output_depth;
86
87 cmsis_nn_dims bias_dims;
88 bias_dims.n = 1;
89 bias_dims.h = 1;
90 bias_dims.w = 1;
91 bias_dims.c = output_depth;
92
93 cmsis_nn_dims output_dims;
94 output_dims.n = batches;
95 output_dims.h = 1;
96 output_dims.w = 1;
97 output_dims.c = output_depth;
98
99 int32_t buf_size = arm_fully_connected_s8_get_buffer_size(&filter_dims);
100 auto buffer = std::make_unique<int8_t[]>(buf_size);
101 assert(buffer != nullptr);
102
103 cmsis_nn_context ctx;
104 ctx.buf = buffer.get();
105 ctx.size = buf_size;
106
107 auto res =
108 arm_fully_connected_s8(&ctx, &fc_params, &quant_params, &input_dims, input_data, &filter_dims,
109 filter_data, &bias_dims, bias_data, &output_dims, output_data);
110 assert(res == ARM_MATH_SUCCESS);
111}

References output_shape.

◆ FullyConnected< int8_t >() [3/4]

template<>
void luci_interpreter_pal::FullyConnected< int8_t > ( const tflite::FullyConnectedParams &  params,
const tflite::RuntimeShape &  input_shape,
const int8_t *  input_data,
const tflite::RuntimeShape &  filter_shape,
const int8_t *  filter_data,
const tflite::RuntimeShape &  bias_shape,
const int32_t *  bias_data,
const tflite::RuntimeShape &  output_shape,
int8_t *  output_data 
)
inline

Definition at line 48 of file PALFullyConnected.h.

54{
55 tflite::reference_integer_ops::FullyConnected(params, input_shape, input_data, filter_shape,
56 filter_data, bias_shape, bias_data, output_shape,
57 output_data);
58}

References output_shape.

◆ FullyConnected< int8_t >() [4/4]

template<>
void luci_interpreter_pal::FullyConnected< int8_t > ( const tflite::FullyConnectedParams &  params,
const tflite::RuntimeShape &  input_shape,
const int8_t *  input_data,
const tflite::RuntimeShape &  filter_shape,
const int8_t *  filter_data,
const tflite::RuntimeShape &  bias_shape,
const int32_t *  bias_data,
const tflite::RuntimeShape &  output_shape,
int8_t *  output_data 
)
inline

Definition at line 48 of file PALFullyConnected.h.

54{
55 tflite::reference_integer_ops::FullyConnected(params, input_shape, input_data, filter_shape,
56 filter_data, bias_shape, bias_data, output_shape,
57 output_data);
58}

References output_shape.

◆ GatherND()

template<typename ParamsT , typename IndicesT >
void luci_interpreter_pal::GatherND ( luci_interpreter::RuntimeShape  params_shape,
const ParamsT *  param_data,
luci_interpreter::RuntimeShape  indices_shape,
const IndicesT *  index_data,
ParamsT *  output_data 
)
inline

Definition at line 30 of file PALGatherND.h.

33{
34 const int indices_dims = indices_shape.dimensionsCount();
35 const int indices_nd = indices_shape.dims(indices_dims - 1);
36 const int params_dims = params_shape.dimensionsCount();
37
38 int n_slices = 1;
39 for (int i = 0; i < indices_dims - 1; ++i)
40 {
41 n_slices *= indices_shape.dims(i);
42 }
43
44 // If indices[-1] == params.rank, fetch single elements.
45 // If indices[-1] < params.rank, fetch slices.
46 int slice_size = 1;
47 for (int i = indices_nd; i < params_dims; ++i)
48 {
49 slice_size *= params_shape.dims(i);
50 }
51
52 int params_flat_size = params_shape.flatSize();
53 int remain_flat_size = params_flat_size;
54
55 // Number of elements per dimension
56 int dims_to_count[MAX_INDICES_ND];
57 for (int i = 0; i < indices_nd; ++i)
58 {
59 dims_to_count[i] = remain_flat_size / params_shape.dims(i);
60 remain_flat_size = dims_to_count[i];
61 }
62
63 for (int i = 0; i < n_slices; ++i)
64 {
65 int from_pos = 0;
66 for (int j = 0; j < indices_nd; ++j)
67 {
68 int offset = i * indices_nd + j;
69 IndicesT index = index_data[offset];
70 from_pos += index * dims_to_count[j];
71 }
72 if (from_pos < 0 || from_pos + slice_size > params_flat_size)
73 {
74 assert(false && "GatherND error");
75 return;
76 }
77 std::memcpy(output_data + i * slice_size, param_data + from_pos, sizeof(ParamsT) * slice_size);
78 }
79}
constexpr int MAX_INDICES_ND
Definition PALGatherND.h:27

References luci_interpreter::RuntimeShape::dimensionsCount(), luci_interpreter::RuntimeShape::dims(), luci_interpreter::RuntimeShape::flatSize(), MAX_INDICES_ND, and offset().

◆ getActivationParams() [1/3]

template<typename P >
void luci_interpreter_pal::getActivationParams ( const P &  params,
float *  min,
float *  max 
)
inline

Definition at line 99 of file PALUtils.h.

100{
101 *min = params.float_activation_min;
102 *max = params.float_activation_max;
103}

◆ getActivationParams() [2/3]

template<typename P >
void luci_interpreter_pal::getActivationParams ( const P &  params,
int32_t *  min,
int32_t *  max 
)
inline

Definition at line 93 of file PALUtils.h.

94{
95 *min = params.quantized_activation_min;
96 *max = params.quantized_activation_max;
97}

Referenced by ArithmeticOp(), ArithmeticOpScalar(), and BroadcastArithmeticOp4DSlow().

◆ getActivationParams() [3/3]

template<typename P >
void luci_interpreter_pal::getActivationParams ( const P &  params,
int64_t *  min,
int64_t *  max 
)
inline

Definition at line 105 of file PALUtils.h.

106{
107 *min = params.int64_activation_min;
108 *max = params.int64_activation_max;
109}

◆ getNearestNeighbor()

int32_t luci_interpreter_pal::getNearestNeighbor ( const int  input_value,
const int32_t  input_size,
const int32_t  output_size,
const bool  align_corners,
const bool  half_pixel_centers 
)
inline

Definition at line 26 of file PALResizeNearestNeighbor.h.

29{
30 const float scale = (align_corners && output_size > 1)
31 ? (input_size - 1) / static_cast<float>(output_size - 1)
32 : input_size / static_cast<float>(output_size);
33 const float offset = half_pixel_centers ? 0.5f : 0.0f;
34 int32_t output_value =
35 std::min(align_corners ? static_cast<int32_t>(std::round((input_value + offset) * scale))
36 : static_cast<int32_t>(std::floor((input_value + offset) * scale)),
37 input_size - 1);
38 if (half_pixel_centers)
39 {
40 output_value = std::max(static_cast<int32_t>(0), output_value);
41 }
42 return output_value;
43}

References offset().

Referenced by ResizeNearestNeighbor().
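A minimal usage sketch (the include name follows the definition note above; the exact include path is an assumption and may differ in your build). It checks the index mapping when a 4-element axis is upscaled to 8 elements with align_corners and half_pixel_centers disabled, so the scale is 4 / 8 = 0.5 and each output index x maps to floor(0.5 * x):

#include <cassert>
#include "PALResizeNearestNeighbor.h" // assumed include path; adjust to your build

int main()
{
  using luci_interpreter_pal::getNearestNeighbor;
  assert(getNearestNeighbor(0, 4, 8, false, false) == 0);
  assert(getNearestNeighbor(3, 4, 8, false, false) == 1); // floor(1.5)
  assert(getNearestNeighbor(7, 4, 8, false, false) == 3); // never exceeds input_size - 1
  return 0;
}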

◆ GreaterEqualFn()

template<typename T >
bool luci_interpreter_pal::GreaterEqualFn ( T  lhs,
T  rhs 
)
inline

Definition at line 55 of file PALComparisons.h.

55{ return lhs >= rhs; }

Referenced by luci_interpreter::execute_kernel_CircleGreaterEqual().

◆ GreaterFn()

template<typename T >
bool luci_interpreter_pal::GreaterFn ( T  lhs,
T  rhs 
)
inline

Definition at line 54 of file PALComparisons.h.

54{ return lhs > rhs; }

Referenced by luci_interpreter::execute_kernel_CircleGreater().

◆ GRU()

void luci_interpreter_pal::GRU ( const float *  input_data,
const float *  weight_input_data,
const float *  weight_hidden_data,
const float *  bias_input_data,
const float *  bias_hidden_data,
const float *  hidden_state_data,
float *  output_data,
float *  output_input_data,
float *  output_hidden_data,
const tflite::RuntimeShape &  input_shape,
const tflite::RuntimeShape &  output_shape,
const tflite::RuntimeShape &  weight_input_shape,
const tflite::RuntimeShape &  weight_hidden_shape 
)

Definition at line 147 of file PALGRU.h.

153{
154 const int32_t time = input_shape.Dims(0);
155
156 tflite::RuntimeShape output_shape_fc(2);
157 output_shape_fc.SetDim(0, 1);
158 output_shape_fc.SetDim(1, weight_hidden_shape.Dims(0));
159
160 std::memcpy(output_data, hidden_state_data, output_shape.FlatSize() * sizeof(float));
161
162 for (int i = 0; i < time; ++i)
163 {
164 calculateGRU(input_data, weight_input_data, weight_hidden_data, bias_input_data,
165 bias_hidden_data, output_data, input_shape, output_shape, weight_input_shape,
166 weight_hidden_shape, output_input_data, output_hidden_data, output_shape_fc);
167 input_data += input_shape.Dims(2);
168 }
169}
void calculateGRU(const float *input_data, const float *weight_input_data, const float *weight_hidden_data, const float *bias_input_data, const float *bias_hidden_data, float *output_data, const tflite::RuntimeShape &input_shape, const tflite::RuntimeShape &output_shape, const tflite::RuntimeShape &weight_input_shape, const tflite::RuntimeShape &weight_hidden_shape, float *output_input_data, float *output_hidden_data, const tflite::RuntimeShape &output_shape_fc)
Definition PALGRU.h:59

References calculateGRU(), and output_shape.

◆ L2Normalization()

void luci_interpreter_pal::L2Normalization ( const luci_interpreter::RuntimeShape input_shape,
const float *  input_data,
const luci_interpreter::RuntimeShape output_shape,
float *  output_data,
float  epsilon = 1e-6 
)
inline

Definition at line 27 of file PALL2Normalize.h.

31{
32 const int trailing_dim = input_shape.dimensionsCount() - 1;
33 const int outer_size =
34 flatSizeSkipDim(input_shape.dimsData(), trailing_dim, input_shape.dimensionsCount());
35 const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
36 for (int i = 0; i < outer_size; ++i)
37 {
38 float squared_l2_norm = 0;
39 for (int c = 0; c < depth; ++c)
40 {
41 const float val = input_data[depth * i + c];
42 squared_l2_norm += val * val;
43 }
44 float l2_norm = std::sqrt(squared_l2_norm);
45 l2_norm = std::fmax(l2_norm, epsilon);
46 for (int c = 0; c < depth; ++c)
47 {
48 output_data[depth * i + c] = input_data[depth * i + c] / l2_norm;
49 }
50 }
51}

References luci_interpreter::RuntimeShape::dimensionsCount(), luci_interpreter::RuntimeShape::dimsData(), flatSizeSkipDim(), L2Normalization(), MatchingDim(), and output_shape.

Referenced by L2Normalization().

◆ L2Pool()

void luci_interpreter_pal::L2Pool ( const PoolParams params,
const luci_interpreter::RuntimeShape input_shape,
const float *  input_data,
const luci_interpreter::RuntimeShape output_shape,
float *  output_data 
)
inline

Definition at line 27 of file PALL2Pool2D.h.

30{
31 const int batches = MatchingDim(input_shape, 0, output_shape, 0);
32 const int depth = MatchingDim(input_shape, 3, output_shape, 3);
33 const int input_height = input_shape.dims(1);
34 const int input_width = input_shape.dims(2);
35 const int output_height = output_shape.dims(1);
36 const int output_width = output_shape.dims(2);
37 const int stride_height = params.stride_height;
38 const int stride_width = params.stride_width;
39 for (int batch = 0; batch < batches; ++batch)
40 {
41 for (int out_y = 0; out_y < output_height; ++out_y)
42 {
43 for (int out_x = 0; out_x < output_width; ++out_x)
44 {
45 for (int channel = 0; channel < depth; ++channel)
46 {
47 const int in_x_origin = (out_x * stride_width) - params.padding_values.width;
48 const int in_y_origin = (out_y * stride_height) - params.padding_values.height;
49 // Compute the boundaries of the filter region clamped so as to
50 // ensure that the filter window fits in the input array.
51 const int filter_x_start = std::max(0, -in_x_origin);
52 const int filter_x_end = std::min(params.filter_width, input_width - in_x_origin);
53 const int filter_y_start = std::max(0, -in_y_origin);
54 const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin);
55 float sum_squares = 0.f;
56 int filter_count = 0;
57 for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y)
58 {
59 for (int filter_x = filter_x_start; filter_x < filter_x_end; ++filter_x)
60 {
61 const int in_x = in_x_origin + filter_x;
62 const int in_y = in_y_origin + filter_y;
63 const float val =
64 input_data[offset(input_shape.dimsData(), batch, in_y, in_x, channel)];
65 sum_squares += val * val;
66 filter_count++;
67 }
68 }
69 assert(filter_count != 0);
70 const float l2pool_result = std::sqrt(sum_squares / filter_count);
71 output_data[offset(output_shape.dimsData(), batch, out_y, out_x, channel)] =
 72 activationFunctionWithMinMax(l2pool_result, params.float_activation_min,
 73 params.float_activation_max);
 74 }
75 }
76 }
77 }
78}
T activationFunctionWithMinMax(T x, T output_activation_min, T output_activation_max)
Definition PALUtils.h:204

References activationFunctionWithMinMax(), luci_interpreter::RuntimeShape::dims(), luci_interpreter::RuntimeShape::dimsData(), luci_interpreter_pal::PoolParams::filter_height, luci_interpreter_pal::PoolParams::filter_width, luci_interpreter_pal::PoolParams::float_activation_max, luci_interpreter_pal::PoolParams::float_activation_min, luci_interpreter_pal::PaddingValues::height, L2Pool(), MatchingDim(), offset(), output_shape, luci_interpreter_pal::PoolParams::padding_values, luci_interpreter_pal::PoolParams::stride_height, luci_interpreter_pal::PoolParams::stride_width, and luci_interpreter_pal::PaddingValues::width.

Referenced by L2Pool().

◆ LessEqualFn()

template<typename T >
bool luci_interpreter_pal::LessEqualFn ( T  lhs,
T  rhs 
)
inline

Definition at line 52 of file PALComparisons.h.

52{ return lhs <= rhs; }

Referenced by luci_interpreter::execute_kernel_CircleLessEqual().

◆ LessFn()

template<typename T >
bool luci_interpreter_pal::LessFn ( T  lhs,
T  rhs 
)
inline

Definition at line 51 of file PALComparisons.h.

51{ return lhs < rhs; }

Referenced by luci_interpreter::execute_kernel_CircleLess().

◆ Log()

void luci_interpreter_pal::Log ( const int  flat_size,
const float *  input_data,
float *  output_data 
)
inline

Definition at line 26 of file PALLog.h.

27{
28 for (int i = 0; i < flat_size; i++)
29 {
30 const float val = input_data[i];
31 const float result = std::log(val);
32 output_data[i] = result;
33 }
34}

Referenced by luci_interpreter::execute_kernel_CircleLog().

◆ LogicalCommon()

void luci_interpreter_pal::LogicalCommon ( const int  flat_size,
const bool *  input1_data,
const bool *  input2_data,
bool *  output_data,
bool(*)(bool, bool)  f 
)
inline

Definition at line 24 of file PALLogicalCommon.h.

26{
27 for (int i = 0; i < flat_size; ++i)
28 {
29 output_data[i] = f(input1_data[i], input2_data[i]);
30 }
31}

Referenced by luci_interpreter::execute_kernel_CircleLogicalAnd(), and luci_interpreter::execute_kernel_CircleLogicalOr().

◆ LogicalNot()

void luci_interpreter_pal::LogicalNot ( const int  flat_size,
const bool *  input_data,
bool *  output_data 
)
inline

Definition at line 24 of file PALLogicalNotCommon.h.

25{
26 for (int i = 0; i < flat_size; ++i)
27 {
28 output_data[i] = !input_data[i];
29 }
30}

Referenced by luci_interpreter::execute_kernel_CircleLogicalNot().

◆ Logistic() [1/3]

void luci_interpreter_pal::Logistic ( const int  flat_size,
const float *  input_data,
float *  output_data 
)
inline

Definition at line 26 of file PALGRU.h.

27{
28 const float cutoff_upper = 16.619047164916992188f;
29 const float cutoff_lower = -9.f;
30
31 // Rational for using approximation in reference kernel.
32 // 0. This approximation gives enough precision for float.
33 // 1. This works around an issue on an embedded chipset where exp() does not
34 // return correctly as expected - exp(x) should return inf when overflown
35 // not 1.701417 IEEE 754 defines representation for inf.
36 // 2. This will speed up calculation and is matching the behavior in the
37 // optimized kernels. (check the definition of scalar_logistic_op<float>)
38
39 for (int i = 0; i < flat_size; i++)
40 {
41 float val = input_data[i];
42 float result;
43 if (val > cutoff_upper)
44 {
45 result = 1.0f;
46 }
47 else if (val < cutoff_lower)
48 {
49 result = std::exp(val);
50 }
51 else
52 {
53 result = 1.f / (1.f + std::exp(-val));
54 }
 55 output_data[i] = result;
 56 }
57}

Referenced by calculateGRU(), luci_interpreter::execute_kernel_CircleLogistic(), luci_interpreter_pal::lstm_internal::sigmoid(), and luci_interpreter_pal::lstm_internal::sigmoid().
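A minimal sketch of the float overload above. Per the definition note it lives in PALGRU.h in this backend; the include path is an assumption. It relies on sigmoid(0) = 0.5 and sigmoid(-x) = 1 - sigmoid(x):

#include <cassert>
#include <cmath>
#include "PALGRU.h" // assumed include path; the float Logistic overload above is defined here

int main()
{
  const float input[3] = {-1.0f, 0.0f, 1.0f};
  float output[3];
  luci_interpreter_pal::Logistic(3, input, output);
  assert(std::fabs(output[1] - 0.5f) < 1e-6f);
  assert(std::fabs(output[0] + output[2] - 1.0f) < 1e-6f);
  return 0;
}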

◆ Logistic() [2/3]

void luci_interpreter_pal::Logistic ( const int  flat_size,
const int8_t *  input_data,
float  input_scale,
int  input_zero_point,
int8_t *  output_data,
float  output_scale,
int  output_zero_point 
)
inline

Definition at line 60 of file PALLogistic.h.

63{
64 const float cutoff_upper = 16.619047164916992188f;
65 const float cutoff_lower = -9.f;
66
67 // Rational for using approximation in reference kernel.
68 // 0. This approximation gives enough precision for float.
69 // 1. This works around an issue on an embedded chipset where exp() does not
70 // return correctly as expected - exp(x) should return inf when overflown
71 // not 1.701417 IEEE 754 defines representation for inf.
72 // 2. This will speed up calculation and is matching the behavior in the
73 // optimized kernels. (check the definition of scalar_logistic_op<float>)
74
75 for (int i = 0; i < flat_size; i++)
76 {
77 // Dequantize.
78 float val = static_cast<float>((input_data[i] - input_zero_point) * input_scale);
79 float result;
80 if (val > cutoff_upper)
81 {
82 result = 1.0f;
83 }
84 else if (val < cutoff_lower)
85 {
86 result = std::exp(val);
87 }
88 else
89 {
90 result = 1.f / (1.f + std::exp(-val));
91 }
92 // Requantize
93 int8_t output = static_cast<int8_t>(result / output_scale + output_zero_point);
 94 output_data[i] = output;
 95 }
96}

◆ Logistic() [3/3]

void luci_interpreter_pal::Logistic ( int32_t  input_multiplier,
int32_t  input_left_shift,
int32_t  input_size,
const int16_t *  ptr_input_data,
int16_t *  ptr_output_data 
)
inline

Definition at line 98 of file PALLogistic.h.

100{
101 // We use the LUT for sigmoid and take into account, that
102 // tanh(x) = 2*sigmoid(2*x) - 1
103
104 // We scale by 3/4 to expand range [-8,8]->[-10.7,10.7].
105 // In case of general parameter scale, multiplier 3 is taken into account
106 // in TanhPrepare function and it is included in
107 // input_multiplier already.
108 if (input_multiplier == 0)
109 { // power of two case
110 input_multiplier = 3 << input_left_shift;
111 input_left_shift = 0;
112 }
113
114 int32_t round = (input_left_shift > 0) ? 1 << (input_left_shift - 1) : 0;
115
116 for (int i = 0; i < input_size; ++i, ptr_input_data++, ptr_output_data++)
117 {
118 int32_t input_data = ((*ptr_input_data) * input_multiplier + round) >> input_left_shift;
119
120 // We do interpolation on unsigned values.
121 uint32_t abs_input_data = abs(input_data);
122
123 // We divide by 2 power of 9, because
124 // we need to divide by 2 in power of 7 for
125 // the input conversion + 1/4 from the scale above.
126
127 // Define uh as uint32_t type not to make this function overflow.
128 uint32_t uh = abs_input_data >> 9;
129 uint32_t result;
130
131 if (uh >= 255)
132 {
133 // Saturate to maximum.
134 result = 0x7FFF << 10;
135 }
136 else
137 {
138 uint32_t ua = sigmoid_table_uint16[uh];
139 uint32_t ub = sigmoid_table_uint16[uh + 1];
140 uint32_t ut = abs_input_data & 0x1ff;
141 // Interpolation is done using the fractional bit.
142 result = (ua << 9) + ut * (ub - ua);
143 }
144
145 result = (input_data >= 0) ? (result + (1 << 9)) : ((1 << (16 + 9)) - result + (1 << 9) - 1);
146
147 // Back to 16-bit.
148 result >>= 10;
149
150 *ptr_output_data = result;
151 }
152}

◆ LogSoftmax()

void luci_interpreter_pal::LogSoftmax ( const luci_interpreter::RuntimeShape input_shape,
const float *  input_data,
const luci_interpreter::RuntimeShape output_shape,
float *  output_data 
)
inline

Definition at line 28 of file PALLogSoftmax.h.

30{
31 const int trailing_dim = input_shape.dimensionsCount() - 1;
32 const int outer_size =
33 flatSizeSkipDim(input_shape.dimsData(), trailing_dim, input_shape.dimensionsCount());
34 const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
35
36 for (int i = 0; i < outer_size; ++i)
37 {
38 // Find max element value which we'll use to ensure numerical stability
39 // taking advantage of the following equality:
40 // log(exp(x[i])/sum(exp(x[i]))) == log(exp(x[i]+C)/sum(exp(x[i]+C)))
41 float max = std::numeric_limits<float>::lowest();
42 for (int c = 0; c < depth; ++c)
43 {
44 max = std::max(max, input_data[i * depth + c]);
45 }
46
47 // Compute sum.
48 float sum = 0.f;
49 for (int c = 0; c < depth; ++c)
50 {
51 sum += std::exp(input_data[i * depth + c] - max);
52 }
53
54 // Compute result.
55 const float log_sum = std::log(sum);
56 for (int c = 0; c < depth; ++c)
57 {
58 output_data[i * depth + c] = input_data[i * depth + c] - max - log_sum;
59 }
60 }
61}

References luci_interpreter::RuntimeShape::dimensionsCount(), luci_interpreter::RuntimeShape::dimsData(), flatSizeSkipDim(), MatchingDim(), and output_shape.
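Because the entry point above takes luci_interpreter::RuntimeShape arguments whose construction is not shown on this page, the following standalone sketch re-implements only the per-row arithmetic to make the max-subtraction trick concrete; it does not call the PAL function itself:

#include <algorithm>
#include <cmath>
#include <vector>

// log_softmax(x)[c] = x[c] - max(x) - log(sum_k exp(x[k] - max(x)))
std::vector<float> log_softmax_row(const std::vector<float> &x)
{
  const float max = *std::max_element(x.begin(), x.end());
  float sum = 0.f;
  for (float v : x)
    sum += std::exp(v - max); // subtracting max keeps exp() in a safe range
  const float log_sum = std::log(sum);
  std::vector<float> y(x.size());
  for (size_t c = 0; c < x.size(); ++c)
    y[c] = x[c] - max - log_sum;
  return y;
}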

◆ MatchingDim()

int luci_interpreter_pal::MatchingDim ( const luci_interpreter::RuntimeShape shape1,
int  index1,
const luci_interpreter::RuntimeShape shape2,
int  index2 
)
inline

Definition at line 173 of file PALUtils.h.

175{
176 assert(shape1.dims(index1) == shape2.dims(index2));
177 return shape1.dims(index1);
178}

References luci_interpreter::RuntimeShape::dims().

Referenced by L2Normalization(), L2Pool(), LogSoftmax(), and ResizeNearestNeighbor().

◆ Maximum()

void luci_interpreter_pal::Maximum ( const int  flat_size,
const float *  input1_data,
const float *  input2_data,
float *  output_data 
)
inline

Definition at line 25 of file PALMaximumCommon.h.

27{
28 BinaryOp<float, MaximumFn<float>>(flat_size, input1_data, input2_data, output_data);
29}

Referenced by luci_interpreter::execute_kernel_CircleMaximum().

◆ MaxPool() [1/2]

void luci_interpreter_pal::MaxPool ( const PoolParams params,
const luci_interpreter::RuntimeShape input_shape,
const float *  input_data,
const luci_interpreter::RuntimeShape output_shape,
float *  output_data 
)
inline

Definition at line 27 of file PALMaxPool2DCommon.h.

30{
31 const int batches = input_shape.dims(0);
32 const int depth = output_shape.dims(3);
33 const int input_height = input_shape.dims(1);
34 const int input_width = input_shape.dims(2);
35 const int output_height = output_shape.dims(1);
36 const int output_width = output_shape.dims(2);
37 const int stride_height = params.stride_height;
38 const int stride_width = params.stride_width;
39 for (int batch = 0; batch < batches; ++batch)
40 {
41 for (int out_y = 0; out_y < output_height; ++out_y)
42 {
43 for (int out_x = 0; out_x < output_width; ++out_x)
44 {
45 for (int channel = 0; channel < depth; ++channel)
46 {
47 const int in_x_origin = (out_x * stride_width) - params.padding_values.width;
48 const int in_y_origin = (out_y * stride_height) - params.padding_values.height;
49 // Compute the boundaries of the filter region clamped so as to
50 // ensure that the filter window fits in the input array.
51 const int filter_x_start = std::max(0, -in_x_origin);
52 const int filter_x_end = std::min(params.filter_width, input_width - in_x_origin);
53 const int filter_y_start = std::max(0, -in_y_origin);
54 const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin);
55 float max = std::numeric_limits<float>::lowest();
56 for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y)
57 {
58 for (int filter_x = filter_x_start; filter_x < filter_x_end; ++filter_x)
59 {
60 const int in_x = in_x_origin + filter_x;
61 const int in_y = in_y_origin + filter_y;
62
63 const int input_data_offset =
64 ((batch * input_shape.dims(1) + in_y) * input_shape.dims(2) + in_x) *
65 input_shape.dims(3) +
66 channel;
67
68 max = std::max(max, input_data[input_data_offset]);
69 }
70 }
71 const int output_data_offset =
72 ((batch * output_shape.dims(1) + out_y) * output_shape.dims(2) + out_x) *
 73 output_shape.dims(3) +
 74 channel;
75
76 output_data[output_data_offset] =
77 std::min(std::max(max, params.float_activation_min), params.float_activation_max);
78 }
79 }
80 }
81 }
82}

References luci_interpreter::RuntimeShape::dims(), luci_interpreter_pal::PoolParams::filter_height, luci_interpreter_pal::PoolParams::filter_width, luci_interpreter_pal::PoolParams::float_activation_max, luci_interpreter_pal::PoolParams::float_activation_min, luci_interpreter_pal::PaddingValues::height, output_shape, luci_interpreter_pal::PoolParams::padding_values, luci_interpreter_pal::PoolParams::stride_height, luci_interpreter_pal::PoolParams::stride_width, and luci_interpreter_pal::PaddingValues::width.

◆ MaxPool() [2/2]

void luci_interpreter_pal::MaxPool ( const PoolParams params,
const luci_interpreter::RuntimeShape input_shape,
const uint8_t *  input_data,
const luci_interpreter::RuntimeShape output_shape,
uint8_t *  output_data,
luci_interpreter::DataType  data_type 
)
inline

Definition at line 28 of file PALMaxPool2D.h.

31{
32 cmsis_nn_dims input_dims;
33 cmsis_nn_dims output_dims;
34 cmsis_nn_pool_params pool_params;
35 cmsis_nn_dims filter_dims;
36 cmsis_nn_context ctx;
37
38 const int depth = input_shape.dims(3);
39 const int output_width = output_shape.dims(2);
40
41 input_dims.n = 1;
42 input_dims.h = input_shape.dims(1);
43 input_dims.w = input_shape.dims(2);
44 input_dims.c = depth;
45
46 output_dims.n = 1;
47 output_dims.h = output_shape.dims(1);
48 output_dims.w = output_width;
49 output_dims.c = depth;
50
51 pool_params.stride.h = params.stride_height;
52 pool_params.stride.w = params.stride_width;
53 pool_params.padding.h = params.padding_values.height;
54 pool_params.padding.w = params.padding_values.width;
55 pool_params.activation.min = params.quantized_activation_min;
56 pool_params.activation.max = params.quantized_activation_max;
57
58 filter_dims.n = 1;
59 filter_dims.h = params.filter_height;
60 filter_dims.w = params.filter_width;
61 filter_dims.c = 1;
62
63 if (data_type == luci_interpreter::DataType::S8)
64 {
65 arm_max_pool_s8(&ctx, &pool_params, &input_dims,
66 luci_interpreter::kernels::getTensorData<int8_t>(input_data), &filter_dims,
67 &output_dims, luci_interpreter::kernels::getTensorData<int8_t>(output_data));
68 }
69 else
70 {
71 arm_max_pool_s16(&ctx, &pool_params, &input_dims,
72 luci_interpreter::kernels::getTensorData<int16_t>(input_data), &filter_dims,
73 &output_dims, luci_interpreter::kernels::getTensorData<int16_t>(output_data));
74 }
75}

References luci_interpreter::RuntimeShape::dims(), luci_interpreter_pal::PoolParams::filter_height, luci_interpreter_pal::PoolParams::filter_width, luci_interpreter_pal::PaddingValues::height, output_shape, luci_interpreter_pal::PoolParams::padding_values, luci_interpreter_pal::PoolParams::quantized_activation_max, luci_interpreter_pal::PoolParams::quantized_activation_min, luci_interpreter_pal::PoolParams::stride_height, luci_interpreter_pal::PoolParams::stride_width, and luci_interpreter_pal::PaddingValues::width.

Referenced by luci_interpreter::execute_kernel_CircleMaxPool2D().

◆ Mean() [1/2]

void luci_interpreter_pal::Mean ( const MeanParams op_params,
const luci_interpreter::RuntimeShape unextended_input_shape,
const float *  input_data,
const luci_interpreter::RuntimeShape unextended_output_shape,
float *  output_data 
)
inline

Definition at line 167 of file PALMean.h.

171{
172 // Current implementation only supports dimension equals 4 and simultaneous
173 // reduction over width and height.
174 const luci_interpreter::RuntimeShape input_shape =
175 luci_interpreter::RuntimeShape::extendedShape(4, unextended_input_shape);
 176 const luci_interpreter::RuntimeShape output_shape =
 177 luci_interpreter::RuntimeShape::extendedShape(4, unextended_output_shape);
178
179 const int output_batch = output_shape.dims(0);
180 const int output_depth = output_shape.dims(3);
181
182 const int input_height = input_shape.dims(1);
183 const int input_width = input_shape.dims(2);
184
185 for (int out_b = 0; out_b < output_batch; ++out_b)
186 {
187 for (int out_d = 0; out_d < output_depth; ++out_d)
188 {
189 float value = 0;
190 for (int in_h = 0; in_h < input_height; ++in_h)
191 {
192 for (int in_w = 0; in_w < input_width; ++in_w)
193 {
194 value += input_data[offset(input_shape.dimsData(), out_b, in_h, in_w, out_d)];
195 }
196 }
197 output_data[offset(output_shape.dimsData(), out_b, 0, 0, out_d)] =
198 value / (input_width * input_height);
199 }
200 }
201}

References luci_interpreter::RuntimeShape::dims(), luci_interpreter::RuntimeShape::dimsData(), luci_interpreter::RuntimeShape::extendedShape(), offset(), and output_shape.

◆ Mean() [2/2]

template<typename T , typename U >
bool luci_interpreter_pal::Mean ( const T *  input_data,
const int *  input_dims,
const int  input_num_dims,
T *  output_data,
const int *  output_dims,
const int  output_num_dims,
const int *  axis,
const int  num_axis_dimensions,
bool  ,
int *  temp_index,
int *  resolved_axis,
U *  temp_sum 
)
inline

Definition at line 108 of file PALMean.h.

112{
113 // Reset output data.
114 size_t num_outputs = 1;
115 for (int idx = 0; idx < output_num_dims; ++idx)
116 {
117 size_t current = static_cast<size_t>(output_dims[idx]);
118 // Overflow prevention.
119 if (num_outputs > std::numeric_limits<size_t>::max() / current)
120 {
121 return false;
122 }
123 num_outputs *= current;
124 }
125 for (size_t idx = 0; idx < num_outputs; ++idx)
126 {
127 output_data[idx] = T();
128 temp_sum[idx] = U();
129 }
130
131 // Resolve axis.
132 int num_resolved_axis = 0;
133 if (!resolveAxis(input_num_dims, axis, num_axis_dimensions, resolved_axis, &num_resolved_axis))
134 {
135 return false;
136 }
137
138 if (!reduceSumImpl<T, U>(input_data, input_dims, output_dims, input_num_dims, output_num_dims,
139 resolved_axis, num_resolved_axis, temp_index, temp_sum))
140 {
141 return false;
142 }
143
144 // Calculate mean by dividing output_data by num of aggregated element.
145 size_t num_elements_in_axis = 1;
146 for (int idx = 0; idx < num_resolved_axis; ++idx)
147 {
148 size_t current = static_cast<size_t>(input_dims[resolved_axis[idx]]);
149 // Overflow prevention.
150 if (current > (std::numeric_limits<size_t>::max() / num_elements_in_axis))
151 {
152 return false;
153 }
154 num_elements_in_axis *= current;
155 }
156
157 if (num_elements_in_axis > 0)
158 {
159 for (size_t idx = 0; idx < num_outputs; ++idx)
160 {
161 output_data[idx] = static_cast<T>(temp_sum[idx] / static_cast<U>(num_elements_in_axis));
162 }
163 }
164 return true;
165}

Referenced by luci_interpreter::execute_kernel_CircleMean().

◆ Minimum()

void luci_interpreter_pal::Minimum ( const int  flat_size,
const float *  input1_data,
const float *  input2_data,
float *  output_data 
)
inline

Definition at line 25 of file PALMinimumCommon.h.

27{
28 BinaryOp<float, MinimumFn<float>>(flat_size, input1_data, input2_data, output_data);
29}

Referenced by luci_interpreter::execute_kernel_CircleMinimum().

◆ MirrorPad()

template<typename T >
void luci_interpreter_pal::MirrorPad ( const luci_interpreter::DataType  padding_matrix_type,
const uint8_t *  padding_matrix_data,
const int32_t *  input_dims,
int *  output_dims_num_elements,
int *  input_dims_num_elements,
const T *  input_data,
T *  output_data,
const int  offset,
const int  num_dims,
const int  output_size 
)

Definition at line 95 of file PALMirrorPad.h.

99{
100 for (int i = 0; i < output_size; ++i)
101 {
102 output_data[i] =
103 input_data[getFlatIndex(i, num_dims, padding_matrix_type, padding_matrix_data, input_dims,
104 output_dims_num_elements, input_dims_num_elements, offset)];
105 }
106}

References offset().

Referenced by luci_interpreter::execute_kernel_CircleMirrorPad().

◆ Mul() [1/2]

template<typename T >
void luci_interpreter_pal::Mul ( const ArithmeticParams params,
const int  flat_size,
const T *  input1_data,
const T *  input2_data,
T *  output_data 
)
inline

Definition at line 26 of file PALMulCommon.h.

28{
29 ArithmeticOp<T, MulFn<T>>(params, flat_size, input1_data, input2_data, output_data);
30}
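A minimal float sketch. It assumes ArithmeticParams exposes float_activation_min / float_activation_max (the fields read by the float getActivationParams overload documented earlier in this page) and that PALMulCommon.h is on the include path:

#include <limits>
#include "PALMulCommon.h" // assumed include path; the Mul template above is defined here

void mul_example()
{
  luci_interpreter_pal::ArithmeticParams params{};
  // Assumption: these are the float activation bounds consumed by getActivationParams.
  params.float_activation_min = std::numeric_limits<float>::lowest();
  params.float_activation_max = std::numeric_limits<float>::max();

  const float a[4] = {1.f, 2.f, 3.f, 4.f};
  const float b[4] = {10.f, 10.f, 10.f, 10.f};
  float out[4];
  luci_interpreter_pal::Mul<float>(params, 4, a, b, out); // out == {10, 20, 30, 40}
}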

◆ Mul() [2/2]

template<>
void luci_interpreter_pal::Mul ( tflite::ArithmeticParams &  params,
const tflite::RuntimeShape &  input1_shape,
const int64_t *  input1_data,
const tflite::RuntimeShape &  input2_shape,
const int64_t *  input2_data,
const tflite::RuntimeShape &  output_shape,
int64_t *  output_data 
)
inline

Definition at line 35 of file PALMul.h.

39{
40 tflite::optimized_ops::BroadcastMul4DSlow(params, input1_shape, input1_data, input2_shape,
41 input2_data, output_shape, output_data);
42}

References output_shape.

◆ Mul< int16_t >() [1/2]

template<>
void luci_interpreter_pal::Mul< int16_t > ( const ArithmeticParams ,
const int  ,
const int16_t *  ,
const int16_t *  ,
int16_t *   
)
inline

Definition at line 34 of file PALMul.h.

36{
37 assert(false && "Not IMPL yet");
38}

◆ Mul< int16_t >() [2/2]

template<>
void luci_interpreter_pal::Mul< int16_t > ( const ArithmeticParams params,
const int  flat_size,
const int16_t *  input1_data,
const int16_t *  input2_data,
int16_t *  output_data 
)
inline

◆ Mul< int8_t >() [1/2]

template<>
void luci_interpreter_pal::Mul< int8_t > ( const ArithmeticParams ,
const int  ,
const int8_t *  ,
const int8_t *  ,
int8_t *   
)
inline

Definition at line 27 of file PALMul.h.

29{
30 assert(false && "Not IMPL yet");
31}

◆ Mul< int8_t >() [2/2]

template<>
void luci_interpreter_pal::Mul< int8_t > ( const ArithmeticParams params,
const int  flat_size,
const int8_t *  input1_data,
const int8_t *  input2_data,
int8_t *  output_data 
)
inline

◆ MulScalar()

template<typename T >
void luci_interpreter_pal::MulScalar ( const ArithmeticParams params,
const int  flat_size,
const T *  input_data,
const T  scalar_value,
T *  output_data 
)
inline

Definition at line 33 of file PALMulCommon.h.

35{
36 ArithmeticOpScalar<T, MulFn<T>>(params, flat_size, input_data, scalar_value, output_data);
37}

◆ multiplyByQuantizedMultiplier()

int32_t luci_interpreter_pal::multiplyByQuantizedMultiplier ( int32_t  x,
int32_t  quantized_multiplier,
int  shift 
)
inline

Definition at line 77 of file PALUtils.h.

78{
79 int left_shift = shift > 0 ? shift : 0;
80 int right_shift = shift > 0 ? 0 : -shift;
 81 return roundingDivideByPOT(
 82 saturatingRoundingDoublingHighMul(x * (1 << left_shift), quantized_multiplier), right_shift);
83}
std::int32_t saturatingRoundingDoublingHighMul(std::int32_t a, std::int32_t b)
Definition PALUtils.h:52
int32_t roundingDivideByPOT(int32_t x, int32_t exponent)
Definition PALUtils.h:65

References roundingDivideByPOT(), and saturatingRoundingDoublingHighMul().

Referenced by FullyConnected(), and luci_interpreter_pal::lstm_internal::mulElementwise().
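A small numeric sketch (assumed include path). A real multiplier m is typically stored as quantized_multiplier = round(m * 2^31) with the power of two carried in shift; for example m = 0.5 is (1 << 30, shift = 0):

#include <cassert>
#include "PALUtils.h" // assumed include path; the fixed-point helpers live in this header

int main()
{
  // 1 << 30 encodes the real multiplier 2^30 / 2^31 = 0.5.
  assert(luci_interpreter_pal::multiplyByQuantizedMultiplier(100, 1 << 30, 0) == 50);
  // A positive shift doubles the effective multiplier: 0.5 * 2 = 1.0.
  assert(luci_interpreter_pal::multiplyByQuantizedMultiplier(100, 1 << 30, 1) == 100);
  // A negative shift halves it: 0.5 * 0.5 = 0.25.
  assert(luci_interpreter_pal::multiplyByQuantizedMultiplier(100, 1 << 30, -1) == 25);
  return 0;
}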

◆ multiplyByQuantizedMultiplierSmallerThanOneExp()

int32_t luci_interpreter_pal::multiplyByQuantizedMultiplierSmallerThanOneExp ( int32_t  x,
int32_t  quantized_multiplier,
int  left_shift 
)
inline

Definition at line 85 of file PALUtils.h.

88{
89 return roundingDivideByPOT(saturatingRoundingDoublingHighMul(x, quantized_multiplier),
90 -left_shift);
91}

References roundingDivideByPOT(), and saturatingRoundingDoublingHighMul().

Referenced by BroadcastComparison4DSlowWithScaling(), and ComparisonWithScaling().

◆ NdArrayDescsForElementwiseBroadcast()

template<int N>
void luci_interpreter_pal::NdArrayDescsForElementwiseBroadcast ( const luci_interpreter::RuntimeShape input0_shape,
const luci_interpreter::RuntimeShape input1_shape,
NdArrayDesc< N > *  desc0_out,
NdArrayDesc< N > *  desc1_out 
)
inline

Definition at line 89 of file ProcessBroadcastShapes.h.

93{
94
95 auto extended_input0_shape = luci_interpreter::RuntimeShape::extendedShape(N, input0_shape);
96 auto extended_input1_shape = luci_interpreter::RuntimeShape::extendedShape(N, input1_shape);
97
98 // Copy dims to desc, calculating strides.
99 copyDimsToDesc<N>(extended_input0_shape, desc0_out);
100 copyDimsToDesc<N>(extended_input1_shape, desc1_out);
101
102 // Walk over each dimension. If the extents are equal do nothing.
103 // Otherwise, set the desc with extent 1 to have extent equal to the other and
104 // stride 0.
105 for (int i = 0; i < N; ++i)
106 {
107 const int extent0 = extended_input0_shape.dims(i);
108 const int extent1 = extended_input1_shape.dims(i);
109 if (extent0 != extent1)
110 {
111 if (extent0 == 1)
112 {
113 desc0_out->strides[i] = 0;
114 desc0_out->extents[i] = extent1;
115 }
116 else
117 {
118 desc1_out->strides[i] = 0;
119 desc1_out->extents[i] = extent0;
120 }
121 }
122 }
123}

References luci_interpreter::RuntimeShape::extendedShape(), luci_interpreter_pal::NdArrayDesc< N >::extents, and luci_interpreter_pal::NdArrayDesc< N >::strides.

Referenced by BroadcastArithmeticOp4DSlow(), BroadcastBinaryOp4DSlow(), BroadcastPrelu4DSlowFloat(), and BroadcastTISO4DSlow().

◆ NDOpsHelper()

template<int N, typename Calc >
void luci_interpreter_pal::NDOpsHelper ( const NdArrayDesc< N > &  output,
const Calc &  calc 
)
inline

Definition at line 82 of file ProcessBroadcastShapes.h.

83{
84 int indexes[N] = {0};
85 NDOpsHelperImpl<N, 0, Calc>(output, calc, indexes);
86}

◆ NDOpsHelperImpl() [1/2]

template<int N, int DIM, typename Calc >
std::enable_if< DIM==N-1, void >::type luci_interpreter_pal::NDOpsHelperImpl ( const NdArrayDesc< N > &  output,
const Calc &  calc,
int  indexes[N] 
)

Definition at line 60 of file ProcessBroadcastShapes.h.

62{
63 for (indexes[DIM] = 0; indexes[DIM] < output.extents[DIM]; ++indexes[DIM])
64 {
65 calc(indexes);
66 }
67}

◆ NDOpsHelperImpl() [2/2]

template<int N, int DIM, typename Calc >
std::enable_if< DIM!=N-1, void >::type luci_interpreter_pal::NDOpsHelperImpl ( const NdArrayDesc< N > &  output,
const Calc &  calc,
int  indexes[N] 
)

Definition at line 70 of file ProcessBroadcastShapes.h.

72{
73 for (indexes[DIM] = 0; indexes[DIM] < output.extents[DIM]; ++indexes[DIM])
74 {
75 NDOpsHelperImpl<N, DIM + 1, Calc>(output, calc, indexes);
76 }
77}

◆ Negate()

template<typename T >
void luci_interpreter_pal::Negate ( const luci_interpreter::RuntimeShape input_shape,
const T *  input_data,
const luci_interpreter::RuntimeShape output_shape,
T *  output_data 
)
inline

Definition at line 23 of file PALNeg.h.

26{
27 // check that input and output dimensions are equal
28 int N = input_shape.dimensionsCount();
29 assert(N == output_shape.dimensionsCount());
30
31 // check that sizes of all dimensions are equal
32 for (int i = 0; i < N; ++i)
33 {
34 assert(input_shape.dims(i) == output_shape.dims(i));
35 }
36
37 const int flat_size = input_shape.flatSize();
38
39 for (int i = 0; i < flat_size; ++i)
40 {
41 output_data[i] = -input_data[i];
42 }
43}

References luci_interpreter::RuntimeShape::dimensionsCount(), luci_interpreter::RuntimeShape::dims(), luci_interpreter::RuntimeShape::flatSize(), Negate(), and output_shape.

Referenced by Negate().

◆ nextIndex()

bool luci_interpreter_pal::nextIndex ( const int  num_dims,
const int *  dims,
int *  current 
)
inline

Definition at line 148 of file PALUtils.h.

149{
150 if (num_dims == 0)
151 {
152 return false;
153 }
154 int carry = 1;
155 for (int idx = num_dims - 1; idx >= 0; --idx)
156 {
157 int current_val = current[idx] + carry;
158 if (dims[idx] == current_val)
159 {
160 current[idx] = 0;
161 }
162 else
163 {
164 current[idx] = current_val;
165 carry = 0;
166 break;
167 }
168 }
169 return (carry == 0);
170}

Referenced by ReduceGeneric().
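A small sketch of the odometer-style iteration (assumed include path): starting from an all-zero index, nextIndex() returns true until the index wraps back to zero, so a 2 x 3 index space yields the start position plus five successful increments:

#include <cassert>
#include "PALUtils.h" // assumed include path

int main()
{
  const int dims[2] = {2, 3};
  int current[2] = {0, 0};
  int visited = 1; // the starting index {0, 0} counts as the first position
  while (luci_interpreter_pal::nextIndex(2, dims, current))
    ++visited;
  assert(visited == 6); // every (i, j) with i < 2 and j < 3 was enumerated
  return 0;
}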

◆ NotEqualFn()

template<typename T >
bool luci_interpreter_pal::NotEqualFn ( T  lhs,
T  rhs 
)
inline

Definition at line 56 of file PALComparisons.h.

56{ return lhs != rhs; }

Referenced by luci_interpreter::execute_kernel_CircleNotEqual().

◆ offset() [1/2]

int luci_interpreter_pal::offset ( const int32_t *  dims_data,
int  i0,
int  i1,
int  i2,
int  i3 
)
inline

Definition at line 193 of file PALUtils.h.

194{
195 return ((i0 * dims_data[1] + i1) * dims_data[2] + i2) * dims_data[3] + i3;
196}

Referenced by BatchToSpaceND(), BroadcastPrelu4DSlowFloat(), DepthToSpace(), Floor(), GatherND(), getNearestNeighbor(), L2Pool(), Mean(), MirrorPad(), reducedOutputOffset(), SpaceToBatchND(), SpaceToDepth(), and TransposeConv().
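A quick numeric check of the NHWC row-major linearization (assumed include path); note that dims_data[0], the batch extent, is never read:

#include <cassert>
#include <cstdint>
#include "PALUtils.h" // assumed include path

int main()
{
  const int32_t dims[4] = {2, 4, 5, 3}; // N, H, W, C
  // ((1 * 4 + 2) * 5 + 3) * 3 + 1 == 100
  assert(luci_interpreter_pal::offset(dims, 1, 2, 3, 1) == 100);
  return 0;
}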

◆ offset() [2/2]

int luci_interpreter_pal::offset ( const int32_t *  dims_data,
int  i0,
int  i1,
int  i2,
int  i3,
int  i4 
)
inline

Definition at line 198 of file PALUtils.h.

199{
200 return (((i0 * dims_data[1] + i1) * dims_data[2] + i2) * dims_data[3] + i3) * dims_data[4] + i4;
201}

◆ Offset()

int luci_interpreter_pal::Offset ( const luci_interpreter::RuntimeShape shape,
int  i0,
int  i1,
int  i2,
int  i3 
)
inline

Definition at line 27 of file PALResizeBilinear.h.

28{
29 assert(shape.dimensionsCount() == 4);
30
31 const int32_t *dims_data = reinterpret_cast<const int32_t *>(shape.dimsData());
32 LUCI_INTERPRETER_CHECK(i0 >= 0 && i0 < dims_data[0]);
33 LUCI_INTERPRETER_CHECK(i1 >= 0 && i1 < dims_data[1]);
34 LUCI_INTERPRETER_CHECK(i2 >= 0 && i2 < dims_data[2]);
35 LUCI_INTERPRETER_CHECK(i3 >= 0 && i3 < dims_data[3]);
36 return ((i0 * dims_data[1] + i1) * dims_data[2] + i2) * dims_data[3] + i3;
37}

References luci_interpreter::RuntimeShape::dimensionsCount(), luci_interpreter::RuntimeShape::dimsData(), and LUCI_INTERPRETER_CHECK.

◆ Pad()

void luci_interpreter_pal::Pad ( const PadParams op_params,
const luci_interpreter::RuntimeShape input_shape,
const float *  input_data,
const float *  pad_value_ptr,
const luci_interpreter::RuntimeShape output_shape,
float *  output_data 
)

Definition at line 28 of file PALPad.h.

31{
32 // Runtime calls are currently fixed at 5 dimensions. Copy inputs so we can
33 // pad them to 5 dims (yes, we are "padding the padding").
34 int left_padding_copy[PadKernelMaxDimensionCount()];
35 for (int i = 0; i < PadKernelMaxDimensionCount(); i++)
36 {
37 left_padding_copy[i] = 0;
38 }
39 for (int i = 0; i < op_params.left_padding_count; ++i)
40 {
41 left_padding_copy[i + PadKernelMaxDimensionCount() - op_params.left_padding_count] =
42 op_params.left_padding[i];
43 }
44 int right_padding_copy[PadKernelMaxDimensionCount()];
45 for (int i = 0; i < PadKernelMaxDimensionCount(); i++)
46 {
47 right_padding_copy[i] = 0;
48 }
49 for (int i = 0; i < op_params.right_padding_count; ++i)
50 {
51 right_padding_copy[i + PadKernelMaxDimensionCount() - op_params.right_padding_count] =
52 op_params.right_padding[i];
53 }
54 const auto extended_output =
 55 luci_interpreter::RuntimeShape::extendedShape(PadKernelMaxDimensionCount(), output_shape);
 56 const int output_batch = extended_output.dims(0);
57 const int output_plane = extended_output.dims(1);
58 const int output_height = extended_output.dims(2);
59 const int output_width = extended_output.dims(3);
60 const int output_depth = extended_output.dims(4);
61
62 const int left_b_padding = left_padding_copy[0];
63 const int left_p_padding = left_padding_copy[1];
64 const int left_h_padding = left_padding_copy[2];
65 const int left_w_padding = left_padding_copy[3];
66 const int left_d_padding = left_padding_copy[4];
67
68 const int right_b_padding = right_padding_copy[0];
69 const int right_p_padding = right_padding_copy[1];
70 const int right_h_padding = right_padding_copy[2];
71 const int right_w_padding = right_padding_copy[3];
72 const int right_d_padding = right_padding_copy[4];
73
74 const float pad_value = *pad_value_ptr;
75
76 const float *in_ptr = input_data;
77 float *out_ptr = output_data;
78 for (int out_b = 0; out_b < output_batch; ++out_b)
79 {
80 for (int out_p = 0; out_p < output_plane; ++out_p)
81 {
82 for (int out_h = 0; out_h < output_height; ++out_h)
83 {
84 for (int out_w = 0; out_w < output_width; ++out_w)
85 {
86 for (int out_d = 0; out_d < output_depth; ++out_d)
87 {
88 if (out_b < left_b_padding || out_b >= output_batch - right_b_padding ||
89 out_p < left_p_padding || out_p >= output_plane - right_p_padding ||
90 out_h < left_h_padding || out_h >= output_height - right_h_padding ||
91 out_w < left_w_padding || out_w >= output_width - right_w_padding ||
92 out_d < left_d_padding || out_d >= output_depth - right_d_padding)
93 {
94 *out_ptr++ = pad_value;
95 }
96 else
97 {
98 *out_ptr++ = *in_ptr++;
99 }
100 }
101 }
102 }
103 }
104 }
105}
constexpr int PadKernelMaxDimensionCount()
Definition PALPad.h:26

References luci_interpreter::RuntimeShape::extendedShape(), luci_interpreter_pal::PadParams::left_padding, luci_interpreter_pal::PadParams::left_padding_count, output_shape, PadKernelMaxDimensionCount(), luci_interpreter_pal::PadParams::right_padding, and luci_interpreter_pal::PadParams::right_padding_count.

Referenced by luci_interpreter::execute_kernel_CirclePadCommon().

◆ PadKernelMaxDimensionCount()

constexpr int luci_interpreter_pal::PadKernelMaxDimensionCount ( )
constexpr

Definition at line 26 of file PALPad.h.

26{ return 5; }

Referenced by Pad().

◆ ProcessBroadcastShapes()

bool luci_interpreter_pal::ProcessBroadcastShapes ( const luci_interpreter::RuntimeShape shape0,
const luci_interpreter::RuntimeShape shape1,
luci_interpreter_pal::ArithmeticParams *  params 
)
inline

Definition at line 150 of file ProcessBroadcastShapes.h.

153{
154 const int dims_count = std::max(shape0.dimensionsCount(), shape1.dimensionsCount());
155
156 params->broadcast_category = BroadcastableOpCategory::kGenericBroadcast;
157
158 auto extended_shape0 = luci_interpreter::RuntimeShape::extendedShape(dims_count, shape0);
159 auto extended_shape1 = luci_interpreter::RuntimeShape::extendedShape(dims_count, shape1);
160
161 // Check for "exact" match, implicitly accepting any scalar shapes.
162 if (extended_shape0 == extended_shape1)
163 {
164 params->broadcast_category = BroadcastableOpCategory::kNonBroadcast;
165 return false;
166 }
167
168 if (shape0.flatSize() == 1)
169 {
170 params->broadcast_category = BroadcastableOpCategory::kScalarFirstBroadcast;
171 return true;
172 }
173 else if (shape1.flatSize() == 1)
174 {
175 params->broadcast_category = BroadcastableOpCategory::kScalarSecondBroadcast;
176 return true;
177 }
178
179 for (int i = dims_count - 1; i >= 0; --i)
180 {
181 if (extended_shape0.dims(i) == extended_shape1.dims(i))
182 {
183 continue;
184 }
185 else if (extended_shape0.dims(i) == 1)
186 {
187 params->broadcast_category = BroadcastableOpCategory::kFirstInputBroadcastsFast;
188 return true;
189 }
190 else if (extended_shape1.dims(i) == 1)
191 {
192 params->broadcast_category = BroadcastableOpCategory::kSecondInputBroadcastsFast;
193 return true;
194 }
195 else
196 {
197 // This case is erroneous: there is a dimension that does not match and
198 // is not a broadcast from one shape to the other.
199 params->broadcast_category = BroadcastableOpCategory::kGenericBroadcast;
200 return true;
201 }
202 }
203
204 return false;
205}
BroadcastableOpCategory broadcast_category
Definition Params.h:180

References luci_interpreter_pal::ArithmeticParams::broadcast_category, luci_interpreter::RuntimeShape::dimensionsCount(), luci_interpreter::RuntimeShape::extendedShape(), luci_interpreter::RuntimeShape::flatSize(), kFirstInputBroadcastsFast, kGenericBroadcast, kNonBroadcast, kScalarFirstBroadcast, kScalarSecondBroadcast, and kSecondInputBroadcastsFast.

Referenced by luci_interpreter::kernels::evalTISOKernel(), and luci_interpreter::kernels::evalTISOQuantizedKernel().

◆ Quantize()

template<typename InputT , typename OutputT >
void luci_interpreter_pal::Quantize ( const QuantizationParams op_params,
const int  flat_size,
const InputT *  input_data,
OutputT *  output_data 
)
inline

Definition at line 27 of file PALQuantize.h.

29{
30 const int32_t zero_point = op_params.zero_point;
31 const double scale = op_params.scale;
32 static constexpr int32_t min_val = std::numeric_limits<OutputT>::min();
33 static constexpr int32_t max_val = std::numeric_limits<OutputT>::max();
34
35 for (int i = 0; i < flat_size; i++)
36 {
37 const InputT val = input_data[i];
38 int32_t unclamped =
39 static_cast<int32_t>(std::round(val / static_cast<float>(scale))) + zero_point;
40 int32_t clamped = std::min(std::max(unclamped, min_val), max_val);
41 output_data[i] = clamped;
42 }
43}

References luci_interpreter_pal::QuantizationParams::scale, and luci_interpreter_pal::QuantizationParams::zero_point.
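A minimal sketch of affine quantization to int8 (field names scale and zero_point per the References note; the include path is an assumption): each value becomes round(v / scale) + zero_point, clamped to the output type's range.

#include <cassert>
#include <cstdint>
#include "PALQuantize.h" // assumed include path

int main()
{
  luci_interpreter_pal::QuantizationParams qp;
  qp.scale = 0.5;
  qp.zero_point = 10;

  const float input[3] = {-5.0f, 0.0f, 5.0f};
  int8_t output[3];
  luci_interpreter_pal::Quantize(qp, 3, input, output);
  assert(output[0] == 0 && output[1] == 10 && output[2] == 20);
  return 0;
}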

◆ reducedOutputOffset()

size_t luci_interpreter_pal::reducedOutputOffset ( const int  num_dims,
const int *  dims,
const int *  index,
const int  num_axis,
const int *  axis 
)
inline

Definition at line 116 of file PALUtils.h.

118{
119 if (num_dims == 0)
120 {
121 return 0;
122 }
123 size_t offset = 0;
124 for (int idx = 0; idx < num_dims; ++idx)
125 {
126 // if we need to skip this axis
127 bool is_axis = false;
128 if (axis != nullptr)
129 {
130 for (int axis_idx = 0; axis_idx < num_axis; ++axis_idx)
131 {
132 if (idx == axis[axis_idx])
133 {
134 is_axis = true;
135 break;
136 }
137 }
138 }
139 if (!is_axis)
140 {
141 offset = offset * static_cast<size_t>(dims[idx]) + static_cast<size_t>(index[idx]);
142 }
143 }
144 return offset;
145}

References offset().

Referenced by ReduceGeneric().
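A small sketch contrasting the full linear offset with the offset into a reduced output (assumed include path). With the axis list {1}, dimension 1 of a 2 x 3 x 4 array is skipped, so index (1, 2, 3) maps to position (1, 3) of the 2 x 4 result:

#include <cassert>
#include "PALUtils.h" // assumed include path

int main()
{
  const int dims[3] = {2, 3, 4};
  const int index[3] = {1, 2, 3};
  const int axis[1] = {1};
  // Full row-major offset: (1 * 3 + 2) * 4 + 3 == 23.
  assert(luci_interpreter_pal::reducedOutputOffset(3, dims, index, 0, nullptr) == 23);
  // Offset with axis 1 collapsed: 1 * 4 + 3 == 7.
  assert(luci_interpreter_pal::reducedOutputOffset(3, dims, index, 1, axis) == 7);
  return 0;
}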

◆ ReduceGeneric()

template<typename T >
void luci_interpreter_pal::ReduceGeneric ( const T *  input_data,
const int *  input_dims,
const int  input_num_dims,
T *  output_data,
const int *  axis,
const int64_t  num_axis_dimensions,
T  init_value,
const int  output_flat_size,
T  reducer(const T, const T) 
)
inline

Definition at line 73 of file PALReduceCommon.h.

76{
77 // Return early when input shape has zero dim.
78 for (int i = 0; i < input_num_dims; ++i)
79 {
80 if (input_dims[i] == 0)
81 return;
82 }
83
84 for (size_t idx = 0; idx < output_flat_size; ++idx)
85 {
86 output_data[idx] = init_value;
87 }
88
89 // Resolve axis.
90 int num_resolved_axis = 0;
91 if (!resolveAxis(input_num_dims, axis, num_axis_dimensions, &num_resolved_axis))
92 {
93 return;
94 }
95
96 int temp_index[5];
97 // Reset input iterator.
98 for (int idx = 0; idx < input_num_dims; ++idx)
99 {
100 temp_index[idx] = 0;
101 }
102 // Iterate through input_data.
103 do
104 {
105 size_t input_offset = reducedOutputOffset(input_num_dims, input_dims, temp_index, 0, nullptr);
106 size_t output_offset =
107 reducedOutputOffset(input_num_dims, input_dims, temp_index, num_resolved_axis, axis);
108 output_data[output_offset] = reducer(output_data[output_offset], input_data[input_offset]);
109 } while (nextIndex(input_num_dims, input_dims, temp_index));
110}
bool nextIndex(const int num_dims, const int *dims, int *current)
Definition PALUtils.h:148
size_t reducedOutputOffset(const int num_dims, const int *dims, const int *index, const int num_axis, const int *axis)
Definition PALUtils.h:116

References nextIndex(), and reducedOutputOffset().
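A minimal sum-reduction sketch over axis 1 of a 2 x 3 tensor (assumed include path; it also assumes resolveAxis keeps the single positive axis as given). Any non-capturing callable matching the reducer signature works; a plain function is used here:

#include <cassert>
#include "PALReduceCommon.h" // assumed include path

static float add(const float acc, const float value) { return acc + value; }

int main()
{
  const float input[6] = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f};
  const int input_dims[2] = {2, 3};
  const int axis[1] = {1};
  float output[2];

  luci_interpreter_pal::ReduceGeneric<float>(input, input_dims, /*input_num_dims=*/2, output, axis,
                                             /*num_axis_dimensions=*/1, /*init_value=*/0.0f,
                                             /*output_flat_size=*/2, add);
  assert(output[0] == 6.f && output[1] == 15.f); // row sums of the 2 x 3 input
  return 0;
}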

◆ ReLUCommon()

void luci_interpreter_pal::ReLUCommon ( const int  flat_size,
const float *  input_data,
float *  output_data,
const float  alpha,
const bool  is_relu_6 
)
inline

Definition at line 26 of file PALReluCommon.h.

28{
29 const float relu_6_value = 6.0f;
30 for (int i = 0; i < flat_size; i++)
31 {
32 const float val = input_data[i];
33 float result = val > 0 ? val : val * alpha;
34 result = is_relu_6 ? (result > relu_6_value ? relu_6_value : result) : result;
35 output_data[i] = result;
36 }
37}

Referenced by luci_interpreter::execute_kernel_CircleLeakyRelu(), luci_interpreter::execute_kernel_CircleRelu(), and luci_interpreter::execute_kernel_CircleRelu6().
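A minimal sketch showing how this single helper covers ReLU, leaky ReLU, and ReLU6 through the alpha and is_relu_6 arguments (assumed include path):

#include <cassert>
#include "PALReluCommon.h" // assumed include path

int main()
{
  const float input[4] = {-2.0f, -0.5f, 3.0f, 8.0f};
  float relu[4], leaky6[4];

  luci_interpreter_pal::ReLUCommon(4, input, relu, /*alpha=*/0.0f, /*is_relu_6=*/false);
  // relu == {0, 0, 3, 8}: plain ReLU.
  luci_interpreter_pal::ReLUCommon(4, input, leaky6, /*alpha=*/0.1f, /*is_relu_6=*/true);
  // leaky6 == {-0.2, -0.05, 3, 6}: negatives scaled by alpha, positives clamped at 6.
  assert(relu[3] == 8.0f && leaky6[3] == 6.0f);
  return 0;
}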

◆ ResizeNearestNeighbor()

template<typename T >
void luci_interpreter_pal::ResizeNearestNeighbor ( const ResizeNearestNeighborParams op_params,
const luci_interpreter::RuntimeShape unextended_input_shape,
const T *  input_data,
const luci_interpreter::RuntimeShape output_size_shape,
const int32_t *  output_size_data,
const luci_interpreter::RuntimeShape unextended_output_shape,
T *  output_data 
)
inline

Definition at line 47 of file PALResizeNearestNeighbor.h.

52{
53 const luci_interpreter::RuntimeShape input_shape =
54 luci_interpreter::RuntimeShape::extendedShape(4, unextended_input_shape);
 55 const luci_interpreter::RuntimeShape output_shape =
 56 luci_interpreter::RuntimeShape::extendedShape(4, unextended_output_shape);
57
58 int32_t batches = MatchingDim(input_shape, 0, output_shape, 0);
59 int32_t input_height = input_shape.dims(1);
60 int32_t input_width = input_shape.dims(2);
61 int32_t depth = MatchingDim(input_shape, 3, output_shape, 3);
62
63 int32_t output_height = output_size_data[0];
64 int32_t output_width = output_size_data[1];
65
66 const int col_offset = input_shape.dims(3);
67 const int row_offset = input_shape.dims(2) * col_offset;
68 const int batch_offset = input_shape.dims(1) * row_offset;
69
70 const T *input_ptr = input_data;
71 T *output_ptr = output_data;
72 for (int b = 0; b < batches; ++b)
73 {
74 for (int y = 0; y < output_height; ++y)
75 {
76 int32_t in_y = getNearestNeighbor(y, input_height, output_height, op_params.align_corners,
77 op_params.half_pixel_centers);
78 const T *y_input_ptr = input_ptr + in_y * row_offset;
79 for (int x = 0; x < output_width; ++x)
80 {
81 int32_t in_x = getNearestNeighbor(x, input_width, output_width, op_params.align_corners,
82 op_params.half_pixel_centers);
83 const T *x_input_ptr = y_input_ptr + in_x * col_offset;
84 memcpy(output_ptr, x_input_ptr, depth * sizeof(T));
85 output_ptr += depth;
86 }
87 }
88 input_ptr += batch_offset;
89 }
90}
int32_t getNearestNeighbor(const int input_value, const int32_t input_size, const int32_t output_size, const bool align_corners, const bool half_pixel_centers)

References luci_interpreter_pal::ResizeNearestNeighborParams::align_corners, luci_interpreter::RuntimeShape::dims(), luci_interpreter::RuntimeShape::extendedShape(), getNearestNeighbor(), luci_interpreter_pal::ResizeNearestNeighborParams::half_pixel_centers, MatchingDim(), output_shape, and ResizeNearestNeighbor().

Referenced by ResizeNearestNeighbor().

◆ Round()

void luci_interpreter_pal::Round ( const int32_t  flat_size,
const float *  input_data,
float *  output_data 
)
inline

Definition at line 37 of file PALRound.h.

38{
39 for (int i = 0; i < flat_size; ++i)
40 {
41 // Note that this implementation matches that of tensorFlow tf.round
42 // and corresponds to the bankers rounding method.
43 // cfenv (for fesetround) is not yet supported universally on Android, so
44 // using a work around.
45 output_data[i] = RoundToNearest(input_data[i]);
46 }
47}

References RoundToNearest().

Referenced by luci_interpreter::execute_kernel_CircleRound().

◆ roundingDivideByPOT()

int32_t luci_interpreter_pal::roundingDivideByPOT ( int32_t  x,
int32_t  exponent 
)
inline

Definition at line 65 of file PALUtils.h.

66{
67 assert(exponent >= 0);
68 assert(exponent <= 31);
69 const int32_t mask = int32_t((1ll << exponent) - 1);
70 const int32_t zero = int32_t(0);
71 const int32_t one = int32_t(1);
72 const int32_t remainder = x & mask;
73 const int32_t threshold = (mask >> 1) + ((x < zero ? one : zero) & one);
74 return (x >> exponent) + ((remainder > threshold ? one : zero) & one);
75}

Referenced by multiplyByQuantizedMultiplier(), and multiplyByQuantizedMultiplierSmallerThanOneExp().
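A quick numeric sketch of the rounding division by 2^exponent (assumed include path); ties, i.e. exact halves, round away from zero:

#include <cassert>
#include "PALUtils.h" // assumed include path

int main()
{
  assert(luci_interpreter_pal::roundingDivideByPOT(4, 1) == 2);   // 4 / 2
  assert(luci_interpreter_pal::roundingDivideByPOT(5, 1) == 3);   // 2.5 rounds up
  assert(luci_interpreter_pal::roundingDivideByPOT(6, 2) == 2);   // 1.5 rounds up
  assert(luci_interpreter_pal::roundingDivideByPOT(-5, 1) == -3); // -2.5 rounds away from zero
  return 0;
}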

◆ RoundToNearest()

float luci_interpreter_pal::RoundToNearest ( float  value)
inline

Definition at line 23 of file PALRound.h.

24{
25 auto floor_val = std::floor(value);
26 auto diff = value - floor_val;
27 if ((diff < 0.5f) || ((diff == 0.5f) && (static_cast<int>(floor_val) % 2 == 0)))
28 {
29 return floor_val;
30 }
31 else
32 {
33 return floor_val + 1.0f;
34 }
35}

Referenced by Round().
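
A small illustration of the tie-to-even behaviour that the comment in Round() refers to (a sketch assuming PALRound.h is included):

#include <cassert>

void roundToNearestExamples()
{
  using luci_interpreter_pal::RoundToNearest;
  assert(RoundToNearest(1.4f) == 1.0f); // below the halfway point: round down
  assert(RoundToNearest(1.6f) == 2.0f); // above the halfway point: round up
  assert(RoundToNearest(0.5f) == 0.0f); // tie: round to the even neighbour 0
  assert(RoundToNearest(1.5f) == 2.0f); // tie: round to the even neighbour 2
  assert(RoundToNearest(2.5f) == 2.0f); // tie: round to the even neighbour 2
}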

◆ Rsqrt()

void luci_interpreter_pal::Rsqrt ( const int  flat_size,
const float *  input_data,
float *  output_data 
)
inline

Definition at line 27 of file PALRsqrt.h.

28{
29 for (int i = 0; i < flat_size; ++i)
30 {
31 output_data[i] = 1.f / std::sqrt(input_data[i]);
32 }
33}

Referenced by luci_interpreter::execute_kernel_CircleRsqrt().

◆ saturatingRoundingDoublingHighMul()

std::int32_t luci_interpreter_pal::saturatingRoundingDoublingHighMul ( std::int32_t  a,
std::int32_t  b 
)
inline

Definition at line 52 of file PALUtils.h.

53{
54 bool overflow = a == b && a == std::numeric_limits<std::int32_t>::min();
55 std::int64_t a_64(a);
56 std::int64_t b_64(b);
57 std::int64_t ab_64 = a_64 * b_64;
58 std::int32_t nudge = ab_64 >= 0 ? (1 << 30) : (1 - (1 << 30));
59 std::int32_t ab_x2_high32 = static_cast<std::int32_t>((ab_64 + nudge) / (1ll << 31));
60 return overflow ? std::numeric_limits<std::int32_t>::max() : ab_x2_high32;
61}

Referenced by multiplyByQuantizedMultiplier(), and multiplyByQuantizedMultiplierSmallerThanOneExp().
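
Interpreting both operands as Q31 fixed-point values, the function returns the rounded high 32 bits of 2*a*b, i.e. approximately a*b / 2^31, saturating on the single input pair that would overflow. A sketch assuming PALUtils.h is included:

#include <cassert>
#include <cstdint>
#include <limits>

void saturatingRoundingDoublingHighMulExamples()
{
  using luci_interpreter_pal::saturatingRoundingDoublingHighMul;
  const std::int32_t half = 1 << 30;    // 0.5 in Q31
  const std::int32_t quarter = 1 << 29; // 0.25 in Q31
  assert(saturatingRoundingDoublingHighMul(half, half) == quarter); // 0.5 * 0.5 = 0.25
  // INT32_MIN * INT32_MIN is the only pair that would overflow; it saturates.
  const std::int32_t min = std::numeric_limits<std::int32_t>::min();
  assert(saturatingRoundingDoublingHighMul(min, min) ==
         std::numeric_limits<std::int32_t>::max());
}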

◆ Select()

template<typename D , typename T >
void luci_interpreter_pal::Select ( const luci_interpreter::RuntimeShape input_condition_shape,
const D *  input_condition_data,
const luci_interpreter::RuntimeShape input_x_shape,
const T *  input_x_data,
const luci_interpreter::RuntimeShape input_y_shape,
const T *  input_y_data,
const luci_interpreter::RuntimeShape output_shape,
T *  output_data 
)

Definition at line 27 of file PALSelectV2.h.

32{
33 int64_t flatsize;
34 // Allow the select operator to execute on mixed scalar and
35 // one-element tensors.
36 if (input_condition_shape.flatSize() == 1 && input_x_shape.flatSize() == 1 &&
37 input_y_shape.flatSize() == 1 && output_shape.flatSize() == 1)
38 {
39 flatsize = 1;
40 }
41 else
42 {
43 flatsize = input_condition_shape.flatSize();
44 }
45 for (int64_t i = 0; i < flatsize; ++i)
46 {
47 output_data[i] = input_condition_data[i] ? input_x_data[i] : input_y_data[i];
48 }
49}

References luci_interpreter::RuntimeShape::flatSize(), and output_shape.
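
The element-wise core of Select reduces to the ternary in its inner loop. A minimal standalone analogue (it does not call the PAL function itself, since constructing RuntimeShape objects is out of scope here):

#include <array>
#include <cassert>
#include <cstddef>

int main()
{
  const std::array<bool, 4> cond = {true, false, true, false};
  const std::array<float, 4> x = {1.f, 2.f, 3.f, 4.f};
  const std::array<float, 4> y = {10.f, 20.f, 30.f, 40.f};
  std::array<float, 4> out{};
  for (std::size_t i = 0; i < cond.size(); ++i)
    out[i] = cond[i] ? x[i] : y[i]; // same ternary as the kernel's inner loop
  assert(out[0] == 1.f && out[1] == 20.f && out[2] == 3.f && out[3] == 40.f);
  return 0;
}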

◆ Sin()

void luci_interpreter_pal::Sin ( const int  flat_size,
const float *  input_data,
float *  output_data 
)
inline

Definition at line 27 of file PALSinCommon.h.

28{
29 for (int i = 0; i < flat_size; ++i)
30 {
31 output_data[i] = std::sin(input_data[i]);
32 }
33}

Referenced by luci_interpreter::execute_kernel_CircleSin().

◆ Softmax() [1/4]

void luci_interpreter_pal::Softmax ( const SoftmaxParams params,
const float *  input_data,
float *  output_data 
)
inline

Definition at line 25 of file PALSoftmaxCommon.h.

26{
27 const int outer_size = params.num_rows;
28 const int depth = params.row_size;
29 const double beta = params.beta;
30
31 for (int i = 0; i < outer_size; ++i)
32 {
33 // Find max element value which we'll use to ensure numerical stability
34 // taking advantage of the following equality:
35 // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C))
36 float max = std::numeric_limits<float>::lowest();
37 for (int c = 0; c < depth; ++c)
38 {
39 max = std::max(max, input_data[i * depth + c]);
40 }
41
42 // Compute sum.
43 float sum = 0.f;
44 for (int c = 0; c < depth; ++c)
45 {
46 const float exp_c = std::exp((input_data[i * depth + c] - max) * static_cast<float>(beta));
47 output_data[i * depth + c] = exp_c;
48 sum += exp_c;
49 }
50
51 // Compute result.
52 for (int c = 0; c < depth; ++c)
53 {
54 output_data[i * depth + c] = output_data[i * depth + c] / sum;
55 }
56 }
57}

References luci_interpreter_pal::SoftmaxParams::beta, luci_interpreter_pal::SoftmaxParams::num_rows, and luci_interpreter_pal::SoftmaxParams::row_size.
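
A worked example for the float path with beta = 1: softmax({1, 2, 3}) subtracts the max (3), exponentiates, and normalizes. This is a sketch that assumes PALSoftmaxCommon.h is included and that the remaining SoftmaxParams fields may be left zero-initialized:

#include <cassert>
#include <cmath>

void softmaxFloatExample()
{
  luci_interpreter_pal::SoftmaxParams params{};
  params.num_rows = 1; // one row ...
  params.row_size = 3; // ... of three elements
  params.beta = 1.0;

  const float input[3] = {1.f, 2.f, 3.f};
  float output[3] = {};
  luci_interpreter_pal::Softmax(params, input, output);

  // exp(-2), exp(-1), exp(0), normalized: roughly 0.090, 0.245, 0.665.
  assert(std::abs(output[0] - 0.0900306f) < 1e-4f);
  assert(std::abs(output[1] - 0.2447285f) < 1e-4f);
  assert(std::abs(output[2] - 0.6652409f) < 1e-4f);
  assert(std::abs(output[0] + output[1] + output[2] - 1.f) < 1e-5f);
}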

◆ Softmax() [2/4]

void luci_interpreter_pal::Softmax ( const SoftmaxParams params,
const int16_t *  input_data,
int16_t *  output_data 
)
inline

Definition at line 116 of file PALSoftmax.h.

117{
118 cmsis_nn_softmax_lut_s16 softmax_params{};
119
120 auto raw_exp_lut = std::make_unique<int16_t[]>(kInt16LUTArraySize);
121 auto one_over_one_plus_x_lut = std::make_unique<int16_t[]>(kInt16LUTArraySize);
122
123 // The exp LUT is only used on negative values;
124 // we consider exp(-10.0) insignificant to the accumulation.
125 const int32_t range = std::numeric_limits<int16_t>::max() - std::numeric_limits<int16_t>::min();
126
127 LUTPopulate<int16_t>(
128 10.0f / range, std::numeric_limits<int16_t>::max(), 2.0f / range, 0,
129 [](float value) { return std::exp(value); }, raw_exp_lut.get());
130
131 LUTPopulate<int16_t>(
132 1.0f / range, std::numeric_limits<int16_t>::min(), 2.0f / range, 0,
133 [](float value) { return 1.0f / (1.0f + value); }, one_over_one_plus_x_lut.get());
134
135 softmax_params.exp_lut = raw_exp_lut.get();
136 softmax_params.one_by_one_lut = one_over_one_plus_x_lut.get();
137
138 arm_softmax_s16(input_data, params.num_rows, params.row_size, params.input_multiplier,
139 params.input_left_shift, &softmax_params, output_data);
140}

References luci_interpreter_pal::SoftmaxParams::input_left_shift, luci_interpreter_pal::SoftmaxParams::input_multiplier, luci_interpreter_pal::SoftmaxParams::num_rows, and luci_interpreter_pal::SoftmaxParams::row_size.

◆ Softmax() [3/4]

void luci_interpreter_pal::Softmax ( const SoftmaxParams params,
const int8_t *  input_data,
int16_t *  output_data 
)
inline

◆ Softmax() [4/4]

void luci_interpreter_pal::Softmax ( const SoftmaxParams params,
const int8_t *  input_data,
int8_t *  output_data 
)
inline

◆ Softmax< int8_t >()

template<>
void luci_interpreter_pal::Softmax< int8_t > ( const tflite::SoftmaxParams &  params,
const tflite::RuntimeShape &  input_shape,
const int8_t *  input_data,
const tflite::RuntimeShape &  output_shape,
int8_t *  output_data 
)
inline

Definition at line 63 of file PALSoftmax.h.

66{
67 const int trailing_dim = input_shape.DimensionsCount() - 1;
68 const int outer_size = tflite::MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
69 const int depth = tflite::MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
70 const int32_t mult = params.input_multiplier;
71 const int32_t shift = params.input_left_shift;
72 const int32_t diff_min = params.diff_min;
73
74 arm_softmax_s8(input_data, outer_size, depth, mult, shift, diff_min, output_data);
75}

References output_shape.

◆ SpaceToBatchND()

template<typename T >
void luci_interpreter_pal::SpaceToBatchND ( const int32_t  pad_value,
const luci_interpreter::RuntimeShape unextended_input1_shape,
const T *  input1_data,
const luci_interpreter::RuntimeShape unextended_input2_shape,
const int32_t *  block_shape_data,
const luci_interpreter::RuntimeShape unextended_input3_shape,
const int32_t *  paddings_data,
const luci_interpreter::RuntimeShape unextended_output_shape,
T *  output_data 
)
inline

Definition at line 46 of file PALSpaceToBatchND.h.

52{
53 // Extends the input/output shape from 3D to 4D if needed, NHC -> NH1C.
54 const luci_interpreter::RuntimeShape input1_shape =
55 extendShapeSpaceToBatch(unextended_input1_shape);
56 const luci_interpreter::RuntimeShape output_shape =
57 extendShapeSpaceToBatch(unextended_output_shape);
58
59 const int depth = input1_shape.dims(3);
60 const int input_width = input1_shape.dims(2);
61 const int input_height = input1_shape.dims(1);
62 const int input_batch_size = input1_shape.dims(0);
63
64 const int output_width = output_shape.dims(2);
65 const int output_height = output_shape.dims(1);
66 const int output_batch_size = output_shape.dims(0);
67
68 const int block_shape_height = block_shape_data[0];
69 const int block_shape_width =
70 unextended_input1_shape.dimensionsCount() == 4 ? block_shape_data[1] : 1;
71 const int padding_top = paddings_data[0];
72 const int padding_left = unextended_input1_shape.dimensionsCount() == 4 ? paddings_data[2] : 0;
73
74 for (int out_b = 0; out_b < output_batch_size; ++out_b)
75 {
76 int input_batch = out_b % input_batch_size;
77 int shift_w = (out_b / input_batch_size) % block_shape_width;
78 int shift_h = (out_b / input_batch_size) / block_shape_width;
79 for (int out_h = 0; out_h < output_height; ++out_h)
80 {
81 for (int out_w = 0; out_w < output_width; ++out_w)
82 {
83 T *out = output_data + offset(output_shape.dimsData(), out_b, out_h, out_w, 0);
84 if (out_h * block_shape_height + shift_h < padding_top ||
85 out_h * block_shape_height + shift_h >= padding_top + input_height ||
86 out_w * block_shape_width + shift_w < padding_left ||
87 out_w * block_shape_width + shift_w >= padding_left + input_width)
88 {
89 // This may not execute correctly when pad_value != 0 and T != uint8.
90 memset(out, pad_value, depth * sizeof(T));
91 }
92 else
93 {
94 const T *in =
95 input1_data + offset(input1_shape.dimsData(), input_batch,
96 (out_h * block_shape_height + shift_h) - padding_top,
97 (out_w * block_shape_width + shift_w) - padding_left, 0);
98 memcpy(out, in, depth * sizeof(T));
99 }
100 }
101 }
102 }
103}

References luci_interpreter::RuntimeShape::dimensionsCount(), luci_interpreter::RuntimeShape::dims(), luci_interpreter::RuntimeShape::dimsData(), offset(), and output_shape.
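
For intuition, a standalone analogue of the no-padding case: a 1x2x2x1 input with block shape [2, 2] becomes a 4x1x1x1 output, where output batch b picks the input pixel at (shift_h, shift_w) = (b / 2, b % 2), matching the shift_w / shift_h decomposition above.

#include <array>
#include <cassert>

int main()
{
  // Input 1x2x2x1 (NHWC), values laid out row-major: {1, 2, 3, 4}.
  const std::array<float, 4> input = {1.f, 2.f, 3.f, 4.f};
  const int input_batch_size = 1, block_shape_h = 2, block_shape_w = 2;
  const int input_w = 2;
  std::array<float, 4> output{};

  const int output_batch_size = input_batch_size * block_shape_h * block_shape_w; // 4
  for (int out_b = 0; out_b < output_batch_size; ++out_b)
  {
    // Same decomposition as the kernel (input_batch is always 0 here).
    const int shift_w = (out_b / input_batch_size) % block_shape_w;
    const int shift_h = (out_b / input_batch_size) / block_shape_w;
    // The output spatial size is 1x1, so out_h = out_w = 0 and there is no padding.
    output[out_b] = input[shift_h * input_w + shift_w];
  }
  assert(output[0] == 1.f && output[1] == 2.f && output[2] == 3.f && output[3] == 4.f);
  return 0;
}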

◆ SpaceToDepth()

template<typename T >
void luci_interpreter_pal::SpaceToDepth ( const int32_t  block_size,
const luci_interpreter::RuntimeShape unextended_input_shape,
const T *  input_data,
const luci_interpreter::RuntimeShape unextended_output_shape,
T *  output_data 
)
inline

Definition at line 29 of file PALSpaceToDepth.h.

32{
33 const luci_interpreter::RuntimeShape input_shape =
34 luci_interpreter::RuntimeShape::extendedShape(4, unextended_input_shape);
35 const luci_interpreter::RuntimeShape output_shape =
36 luci_interpreter::RuntimeShape::extendedShape(4, unextended_output_shape);
37
38 const int input_depth = input_shape.dims(3);
39 const int input_width = input_shape.dims(2);
40 const int input_height = input_shape.dims(1);
41 const int input_batch = input_shape.dims(0);
42
43 for (int in_b = 0; in_b < input_batch; ++in_b)
44 {
45 for (int in_h = 0; in_h < input_height; ++in_h)
46 {
47 for (int in_w = 0; in_w < input_width; ++in_w)
48 {
49 for (int in_d = 0; in_d < input_depth; ++in_d)
50 {
51 const int out_d =
52 in_d + ((in_h % block_size) * block_size + in_w % block_size) * input_depth;
53 const int out_w = in_w / block_size;
54 const int out_h = in_h / block_size;
55 const int out_b = in_b;
56
57 const int input_index = offset(input_shape.dimsData(), in_b, in_h, in_w, in_d);
58 const int output_index = offset(output_shape.dimsData(), out_b, out_h, out_w, out_d);
59
60 output_data[output_index] = input_data[input_index];
61 }
62 }
63 }
64 }
65}

References luci_interpreter::RuntimeShape::dims(), luci_interpreter::RuntimeShape::dimsData(), luci_interpreter::RuntimeShape::extendedShape(), offset(), and output_shape.
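
The index mapping for the common block_size = 2 case: a 1x2x2x1 input collapses to a 1x1x1x4 output whose depth dimension holds the 2x2 spatial block in row-major order. A minimal standalone check:

#include <array>
#include <cassert>

int main()
{
  // 1x2x2x1 input {1, 2, 3, 4} -> 1x1x1x4 output, block_size = 2.
  const std::array<float, 4> input = {1.f, 2.f, 3.f, 4.f};
  std::array<float, 4> output{};
  const int block_size = 2, input_h = 2, input_w = 2, input_depth = 1;

  for (int in_h = 0; in_h < input_h; ++in_h)
    for (int in_w = 0; in_w < input_w; ++in_w)
      for (int in_d = 0; in_d < input_depth; ++in_d)
      {
        // Same out_d formula as the kernel; out_h and out_w are both 0 here.
        const int out_d =
          in_d + ((in_h % block_size) * block_size + in_w % block_size) * input_depth;
        output[out_d] = input[(in_h * input_w + in_w) * input_depth + in_d];
      }
  assert(output[0] == 1.f && output[1] == 2.f && output[2] == 3.f && output[3] == 4.f);
  return 0;
}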

◆ Sqrt()

void luci_interpreter_pal::Sqrt ( const int  flat_size,
const float *  input_data,
float *  output_data 
)
inline

Definition at line 27 of file PALSqrt.h.

28{
29 for (int i = 0; i < flat_size; ++i)
30 {
31 output_data[i] = std::sqrt(input_data[i]);
32 }
33}

Referenced by luci_interpreter::execute_kernel_CircleSqrt().

◆ Square()

void luci_interpreter_pal::Square ( const int  flat_size,
const float *  input_data,
float *  output_data 
)
inline

Definition at line 26 of file PALSquareCommon.h.

27{
28 for (int i = 0; i < flat_size; ++i)
29 {
30 output_data[i] = input_data[i] * input_data[i];
31 }
32}

Referenced by luci_interpreter::execute_kernel_CircleSquare().

◆ SquaredDifference()

void luci_interpreter_pal::SquaredDifference ( const int  flat_size,
const float *  input_data_1,
const float *  input_data_2,
float *  output_data 
)
inline

Definition at line 27 of file PALSquaredDifference.h.

29{
30 for (int i = 0; i < flat_size; ++i)
31 {
32 float diff = input_data_1[i] - input_data_2[i];
33 output_data[i] = diff * diff;
34 }
35}

Referenced by luci_interpreter::execute_kernel_CircleSquaredDifference().

◆ StridedSlice()

template<typename T >
void luci_interpreter_pal::StridedSlice ( StridedSliceParams op_params,
const luci_interpreter::RuntimeShape unextended_input_shape,
const T *  input_data,
T *  output_data 
)
inline

Definition at line 205 of file PALStridedSlice.h.

208{
209 const luci_interpreter::RuntimeShape input_shape =
210 luci_interpreter::RuntimeShape::extendedShape(5, unextended_input_shape);
211
212 // Reverse and pad to 5 dimensions because that is what the runtime code
213 // requires (i.e. all shapes must be 5D and are given backwards).
214 stridedSlicePadIndices(&op_params, 5);
215
216 const int start_0 = startForAxis(op_params, input_shape, 0);
217 const int stop_0 = stopForAxis(op_params, input_shape, 0, start_0);
218 const int start_1 = startForAxis(op_params, input_shape, 1);
219 const int stop_1 = stopForAxis(op_params, input_shape, 1, start_1);
220 const int start_2 = startForAxis(op_params, input_shape, 2);
221 const int stop_2 = stopForAxis(op_params, input_shape, 2, start_2);
222 const int start_3 = startForAxis(op_params, input_shape, 3);
223 const int stop_3 = stopForAxis(op_params, input_shape, 3, start_3);
224 const int start_4 = startForAxis(op_params, input_shape, 4);
225 const int stop_4 = stopForAxis(op_params, input_shape, 4, start_4);
226
227 for (int offset_0 = start_0 * input_shape.dims(1), end_0 = stop_0 * input_shape.dims(1),
228 step_0 = op_params.strides[0] * input_shape.dims(1);
229 !loopCondition(offset_0, end_0, op_params.strides[0]); offset_0 += step_0)
230 {
231 for (int offset_1 = (offset_0 + start_1) * input_shape.dims(2),
232 end_1 = (offset_0 + stop_1) * input_shape.dims(2),
233 step_1 = op_params.strides[1] * input_shape.dims(2);
234 !loopCondition(offset_1, end_1, op_params.strides[1]); offset_1 += step_1)
235 {
236 for (int offset_2 = (offset_1 + start_2) * input_shape.dims(3),
237 end_2 = (offset_1 + stop_2) * input_shape.dims(3),
238 step_2 = op_params.strides[2] * input_shape.dims(3);
239 !loopCondition(offset_2, end_2, op_params.strides[2]); offset_2 += step_2)
240 {
241 for (int offset_3 = (offset_2 + start_3) * input_shape.dims(4),
242 end_3 = (offset_2 + stop_3) * input_shape.dims(4),
243 step_3 = op_params.strides[3] * input_shape.dims(4);
244 !loopCondition(offset_3, end_3, op_params.strides[3]); offset_3 += step_3)
245 {
246 for (int offset_4 = offset_3 + start_4, end_4 = offset_3 + stop_4;
247 !loopCondition(offset_4, end_4, op_params.strides[4]);
248 offset_4 += op_params.strides[4])
249 {
250 *output_data++ = input_data[offset_4];
251 }
252 }
253 }
254 }
255 }
256}

References luci_interpreter::RuntimeShape::dims(), luci_interpreter::RuntimeShape::extendedShape(), and luci_interpreter_pal::StridedSliceParams::strides.

Referenced by luci_interpreter::execute_kernel_CircleStridedSlice().
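
The nested loops above collapse multi-dimensional indices into flat offsets, but the 1-D core is simply "walk from start to stop in steps of stride, in either direction". A standalone analogue, where done() plays the role of loopCondition() as a direction-aware termination test:

#include <cassert>
#include <vector>

// Direction-aware loop termination, mirroring what loopCondition() does.
static bool done(int index, int stop, int stride)
{
  return stride > 0 ? index >= stop : index <= stop;
}

int main()
{
  const std::vector<int> input = {0, 1, 2, 3, 4, 5, 6, 7};
  std::vector<int> output;

  // Equivalent of slicing [1:7:2] -> {1, 3, 5}.
  for (int i = 1; !done(i, 7, 2); i += 2)
    output.push_back(input[i]);
  assert((output == std::vector<int>{1, 3, 5}));

  output.clear();
  // Negative stride: [6:0:-2] -> {6, 4, 2}.
  for (int i = 6; !done(i, 0, -2); i += -2)
    output.push_back(input[i]);
  assert((output == std::vector<int>{6, 4, 2}));
  return 0;
}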

◆ subscriptToIndex() [1/2]

int luci_interpreter_pal::subscriptToIndex ( const NdArrayDesc< 4 > &  desc,
int  i0,
int  i1,
int  i2,
int  i3 
)
inline

◆ subscriptToIndex() [2/2]

int luci_interpreter_pal::subscriptToIndex ( const NdArrayDesc< 5 > &  desc,
int  indexes[5] 
)
inline

Definition at line 130 of file ProcessBroadcastShapes.h.

131{
132 return indexes[0] * desc.strides[0] + indexes[1] * desc.strides[1] +
133 indexes[2] * desc.strides[2] + indexes[3] * desc.strides[3] + indexes[4] * desc.strides[4];
134}

References luci_interpreter_pal::NdArrayDesc< N >::strides.

◆ SVDF()

void luci_interpreter_pal::SVDF ( const float *  input_data,
const float *  weights_feature_data,
const float *  weights_time_data,
const float *  bias_data,
float *  state_data,
float *  scratch_data,
float *  output_data,
const int  rank,
const int  input_size,
const int  batch_size,
const int  num_filters,
const int  num_units,
const int  memory_size,
const circle::ActivationFunctionType  activation 
)
inline

Definition at line 133 of file PALSVDFCommon.h.

138{
139 // Left shift the activation_state.
140 {
141 float *new_state_start = state_data;
142 const float *old_state_start = state_data + 1;
143 const float *old_state_end = state_data + batch_size * num_filters * memory_size;
144 while (old_state_start != old_state_end)
145 {
146 *new_state_start++ = *old_state_start++;
147 }
148 }
149
150 // Note: no need to clear the latest activation; the matmul below assigns rather than accumulates.
151
152 // Compute conv1d(inputs, weights_feature).
153 // The activation_state's rightmost column is used to save the current
154 // cycle's activation. This is achieved by starting at state_data[memory_size - 1]
155 // and using a stride equal to memory_size.
156
157 // Perform batched matrix vector multiply operation:
158 {
159 const float *matrix = weights_feature_data;
160 const float *vector = input_data;
161 float *result = &state_data[memory_size - 1];
162 float *result_in_batch = result;
163 for (int i = 0; i < batch_size; ++i)
164 {
165 const float *matrix_ptr = matrix;
166 for (int j = 0; j < num_filters; ++j)
167 {
168 float dot_prod = 0.0f;
169 const float *vector_in_batch = vector + i * input_size;
170 for (int k = 0; k < input_size; ++k)
171 {
172 dot_prod += *matrix_ptr++ * *vector_in_batch++;
173 }
174 *result_in_batch = dot_prod;
175 result_in_batch += memory_size;
176 }
177 }
178 }
179
180 applyTimeWeightsBiasAndActivation(batch_size, memory_size, num_filters, num_units, rank,
181 weights_time_data, bias_data, activation, state_data,
182 scratch_data, output_data);
183}

Referenced by luci_interpreter::execute_kernel_CircleSVDF().
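
A standalone sketch of the two preparation steps above, assuming a single batch: the activation state is shifted left by one time step, and the feature matmul writes each filter's new activation into the rightmost column of its memory_size-wide row.

#include <cassert>
#include <cstddef>
#include <vector>

int main()
{
  const int num_filters = 2, memory_size = 3, input_size = 2;
  // state is laid out as [num_filters][memory_size].
  std::vector<float> state = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f};
  const std::vector<float> weights_feature = {1.f, 0.f,  // filter 0
                                              0.f, 1.f}; // filter 1
  const std::vector<float> input = {7.f, 8.f};

  // 1) Shift the whole state left by one element (drops the oldest activation).
  for (std::size_t i = 0; i + 1 < state.size(); ++i)
    state[i] = state[i + 1];

  // 2) Write the new activation of each filter into its rightmost slot.
  for (int f = 0; f < num_filters; ++f)
  {
    float dot = 0.f;
    for (int k = 0; k < input_size; ++k)
      dot += weights_feature[f * input_size + k] * input[k];
    state[f * memory_size + (memory_size - 1)] = dot;
  }
  // Filter 0 row is now {2, 3, 7}; filter 1 row is now {5, 6, 8}.
  assert(state[0] == 2.f && state[2] == 7.f);
  assert(state[3] == 5.f && state[5] == 8.f);
  return 0;
}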

◆ Tanh() [1/2]

void luci_interpreter_pal::Tanh ( const int  flat_size,
const float *  input_data,
float *  output_data 
)
inline

Definition at line 26 of file PALTanh.h.

27{
28 for (int i = 0; i < flat_size; i++)
29 {
30 float val = input_data[i];
31 float result = std::tanh(val);
32 output_data[i] = result;
33 }
34}

Referenced by luci_interpreter::evalInteger(), luci_interpreter::execute_kernel_CircleTanh(), luci_interpreter_pal::lstm_internal::tanh(), and luci_interpreter_pal::lstm_internal::tanh().

◆ Tanh() [2/2]

void luci_interpreter_pal::Tanh ( int32_t  input_multiplier,
int32_t  input_left_shift,
const int  flat_size,
const int16_t *  ptr_input_data,
int16_t *  ptr_output_data 
)
inline

Definition at line 36 of file PALTanh.h.

38{
39 // We use the LUT for sigmoid and take into account, that
40 // tanh(x) = 2*sigmoid(2*x) - 1
41
42 // We scale by 3/4 to expand range [-8,8]->[-10.7,10.7].
43 // In case of general parameter scale, multiplier 3 is taken into account
44 // in TanhPrepare function and it is included in
45 // input_multiplier already.
46
47 if (input_multiplier == 0)
48 { // power of two case
49 input_multiplier = 3 << input_left_shift;
50 input_left_shift = 0;
51 }
52
53 int32_t round = (input_left_shift > 0) ? 1 << (input_left_shift - 1) : 0;
54
55 for (int i = 0; i < flat_size; ++i, ptr_input_data++, ptr_output_data++)
56 {
57 int32_t input_data = ((*ptr_input_data) * input_multiplier + round) >> input_left_shift;
58
59 uint32_t abs_input_data = abs(input_data);
60 uint32_t uh = abs_input_data >> 8;
61 int32_t result;
62
63 if (uh >= 255)
64 {
65 // Saturate to maximum.
66 result = 0xFFFF << 8;
67 }
68 else
69 {
70 uint32_t ua = sigmoid_table_uint16[uh];
71 uint32_t ub = sigmoid_table_uint16[uh + 1];
72
73 uint8_t ut = abs_input_data & 0xFF;
74
75 result = (ua << 8) + ut * (ub - ua);
76 }
77
78 result = (input_data >= 0) ? (result - (1 << (14 + 9)) + (1 << (9 - 2)))
79 : (-result + (1 << (14 + 9)) + (1 << (9 - 2)) - 1);
80
81 // Convert back to 16-bit.
82 result >>= (9 - 1);
83
84 *ptr_output_data = result;
85 }
86}

◆ Transpose()

template<typename T , int N = 5>
void luci_interpreter_pal::Transpose ( const TransposeParams params,
const luci_interpreter::RuntimeShape unextended_input_shape,
const T *  input_data,
const luci_interpreter::RuntimeShape unextended_output_shape,
T *  output_data 
)

Definition at line 70 of file PALTranspose.h.

73{
74 // The transpose kernel only rearranges values; it performs no numeric
75 // evaluation on individual cells. It is therefore safe to dispatch on the
76 // size of the scalar type, which keeps the total code size reasonable.
77 switch (sizeof(T))
78 {
79 case 1:
80 TransposeImpl<int8_t, N>(params, unextended_input_shape,
81 reinterpret_cast<const int8_t *>(input_data),
82 unextended_output_shape, reinterpret_cast<int8_t *>(output_data));
83 break;
84 case 2:
85 TransposeImpl<int16_t, N>(params, unextended_input_shape,
86 reinterpret_cast<const int16_t *>(input_data),
87 unextended_output_shape, reinterpret_cast<int16_t *>(output_data));
88 break;
89
90 case 4:
91 TransposeImpl<int32_t, N>(params, unextended_input_shape,
92 reinterpret_cast<const int32_t *>(input_data),
93 unextended_output_shape, reinterpret_cast<int32_t *>(output_data));
94 break;
95 case 8:
96 TransposeImpl<int64_t, N>(params, unextended_input_shape,
97 reinterpret_cast<const int64_t *>(input_data),
98 unextended_output_shape, reinterpret_cast<int64_t *>(output_data));
99 break;
100 }
101}

Referenced by luci_interpreter::execute_kernel_CircleTranspose().
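
The sizeof-based dispatch works because transposition only moves bytes. For reference, the effect on a concrete tensor (a standalone sketch, not calling the PAL entry point, since that needs RuntimeShape and TransposeParams objects): transposing a 2x3 row-major matrix with perm = {1, 0} yields its 3x2 transpose.

#include <array>
#include <cassert>

int main()
{
  // 2x3 row-major input:
  // 1 2 3
  // 4 5 6
  const std::array<float, 6> input = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f};
  const int rows = 2, cols = 3;
  std::array<float, 6> output{};

  // perm = {1, 0}: output element (r, c) reads input element (c, r).
  for (int r = 0; r < cols; ++r)
    for (int c = 0; c < rows; ++c)
      output[r * rows + c] = input[c * cols + r];

  const std::array<float, 6> expected = {1.f, 4.f, 2.f, 5.f, 3.f, 6.f};
  assert(output == expected);
  return 0;
}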

◆ TransposeConv()

void luci_interpreter_pal::TransposeConv ( const ConvParams params,
const luci_interpreter::RuntimeShape input_shape,
const float *  input_data,
const luci_interpreter::RuntimeShape filter_shape,
const float *  filter_data,
const luci_interpreter::RuntimeShape bias_shape,
const float *  bias_data,
const luci_interpreter::RuntimeShape output_shape,
float *  output_data 
)
inline

Definition at line 26 of file PALTransposeConv.h.

33{
34 const int stride_width = params.stride_width;
35 const int stride_height = params.stride_height;
36 const int pad_width = params.padding_values.width;
37 const int pad_height = params.padding_values.height;
38
39 const int batches = input_shape.dims(0);
40 const int input_depth = input_shape.dims(3);
41 const int output_depth = filter_shape.dims(0);
42 const int input_height = input_shape.dims(1);
43 const int input_width = input_shape.dims(2);
44 const int filter_height = filter_shape.dims(1);
45 const int filter_width = filter_shape.dims(2);
46 const int output_height = output_shape.dims(1);
47 const int output_width = output_shape.dims(2);
48 const float output_activation_min = params.float_activation_min;
49 const float output_activation_max = params.float_activation_max;
50
51 // Although transpose convolution simplifies to convolution with transposed
52 // weights for strides of 1, non-unitary striding complicates matters. To
53 // keep this reference implementation as clear as possible, we use a
54 // "scatter" access pattern, where we loop through all the input elements,
55 // computing their influence on the output, rather than looping through the
56 // output elements in the typical "gather" access pattern of a conv. We
57 // therefore must initialize the output array to zero.
58 const int num_elements = output_shape.flatSize();
59 for (int i = 0; i < num_elements; i++)
60 {
61 output_data[i] = 0.0f;
62 }
63
64 // Loop through input elements one at a time.
65 for (int batch = 0; batch < batches; ++batch)
66 {
67 for (int in_y = 0; in_y < input_height; ++in_y)
68 {
69 for (int in_x = 0; in_x < input_width; ++in_x)
70 {
71 for (int in_channel = 0; in_channel < input_depth; ++in_channel)
72 {
73 // Loop through the output elements it will influence
74 const int out_x_origin = (in_x * stride_width) - pad_width;
75 const int out_y_origin = (in_y * stride_height) - pad_height;
76 for (int filter_y = 0; filter_y < filter_height; ++filter_y)
77 {
78 for (int filter_x = 0; filter_x < filter_width; ++filter_x)
79 {
80 for (int out_channel = 0; out_channel < output_depth; ++out_channel)
81 {
82 // Compute output element location
83 const int out_x = out_x_origin + filter_x;
84 const int out_y = out_y_origin + filter_y;
85 // We cannot accumulate out of bounds
86 if ((out_x >= 0) && (out_x < output_width) && (out_y >= 0) &&
87 (out_y < output_height))
88 {
89 float input_value =
90 input_data[offset(input_shape.dimsData(), batch, in_y, in_x, in_channel)];
91 float filter_value = filter_data[offset(filter_shape.dimsData(), out_channel,
92 filter_y, filter_x, in_channel)];
93 output_data[offset(output_shape.dimsData(), batch, out_y, out_x, out_channel)] +=
94 input_value * filter_value;
95 }
96 }
97 }
98 }
99 }
100 }
101 }
102 }
103
104 for (int batch = 0; batch < batches; ++batch)
105 {
106 for (int out_y = 0; out_y < output_height; ++out_y)
107 {
108 for (int out_x = 0; out_x < output_width; ++out_x)
109 {
110 for (int out_channel = 0; out_channel < output_depth; ++out_channel)
111 {
112 float acc =
113 output_data[offset(output_shape.dimsData(), batch, out_y, out_x, out_channel)];
114 if (bias_data)
115 acc += bias_data[out_channel];
116
117 output_data[offset(output_shape.dimsData(), batch, out_y, out_x, out_channel)] =
118 activationFunctionWithMinMax(acc, output_activation_min, output_activation_max);
119 }
120 }
121 }
122 }
123}

References activationFunctionWithMinMax(), luci_interpreter::RuntimeShape::dims(), luci_interpreter::RuntimeShape::dimsData(), luci_interpreter::RuntimeShape::flatSize(), luci_interpreter_pal::ConvParams::float_activation_max, luci_interpreter_pal::ConvParams::float_activation_min, luci_interpreter_pal::PaddingValues::height, offset(), output_shape, luci_interpreter_pal::ConvParams::padding_values, luci_interpreter_pal::ConvParams::stride_height, luci_interpreter_pal::ConvParams::stride_width, and luci_interpreter_pal::PaddingValues::width.
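
A 1-D analogue of the scatter pattern described in the comments above (zero the output, then let every input element add its contribution): input {1, 2}, a 3-tap filter {1, 1, 1}, stride 2 and no padding produce an output of length 5.

#include <array>
#include <cassert>

int main()
{
  const std::array<float, 2> input = {1.f, 2.f};
  const std::array<float, 3> filter = {1.f, 1.f, 1.f};
  const int stride = 2, output_size = 5;
  std::array<float, 5> output{}; // must start at zero: the loop accumulates

  // Scatter: each input element adds filter-weighted values to every
  // output position it influences.
  for (int in_x = 0; in_x < static_cast<int>(input.size()); ++in_x)
  {
    const int out_origin = in_x * stride;
    for (int k = 0; k < static_cast<int>(filter.size()); ++k)
    {
      const int out_x = out_origin + k;
      if (out_x >= 0 && out_x < output_size)
        output[out_x] += input[in_x] * filter[k];
    }
  }
  const std::array<float, 5> expected = {1.f, 1.f, 3.f, 2.f, 2.f};
  assert(output == expected);
  return 0;
}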

◆ TransposeImpl()

template<typename T , int N>
void luci_interpreter_pal::TransposeImpl ( const TransposeParams params,
const luci_interpreter::RuntimeShape unextended_input_shape,
const T *  input_data,
const luci_interpreter::RuntimeShape unextended_output_shape,
T *  output_data 
)

Definition at line 27 of file PALTranspose.h.

31{
32 const int unextended_input_size = unextended_input_shape.dimensionsCount();
33 const int unextended_output_size = unextended_output_shape.dimensionsCount();
34
35 const int input_ext_size = N - unextended_input_size;
36 const int output_ext_size = N - unextended_output_size;
37 NdArrayDesc<N> input_desc;
38 NdArrayDesc<N> output_desc;
39 copyDimsToDesc(luci_interpreter::RuntimeShape::extendedShape(N, unextended_input_shape),
40 &input_desc);
41 copyDimsToDesc(luci_interpreter::RuntimeShape::extendedShape(N, unextended_output_shape),
42 &output_desc);
43
44 // The perm data is extended to match the output, each index incremented by
45 // the amount of front padding of the input shape.
46 int extended_perm[N];
47 for (int i = 0; i < N; ++i)
48 {
49 extended_perm[i] = i < output_ext_size ? i : params.perm[i - output_ext_size] + input_ext_size;
50 }
51
52 // Permute the input shape so we don't need to permute the indexes inside
53 // the loop. Check to make sure output_dims matches input_dims.
54 NdArrayDesc<N> perm_input_desc;
55 for (int k = 0; k < N; ++k)
56 {
57 perm_input_desc.extents[k] = input_desc.extents[extended_perm[k]];
58 perm_input_desc.strides[k] = input_desc.strides[extended_perm[k]];
59 }
60
61 // Naive transpose loop (iterate on output index and compute input index).
62 auto tranpose_func = [&](int indexes[N]) {
63 output_data[subscriptToIndex(output_desc, indexes)] =
64 input_data[subscriptToIndex(perm_input_desc, indexes)];
65 };
66 NDOpsHelper<N>(output_desc, tranpose_func);
67}

References copyDimsToDesc(), luci_interpreter::RuntimeShape::dimensionsCount(), luci_interpreter::RuntimeShape::extendedShape(), luci_interpreter_pal::NdArrayDesc< N >::extents, luci_interpreter_pal::TransposeParams::perm, luci_interpreter_pal::NdArrayDesc< N >::strides, and subscriptToIndex().

Variable Documentation

◆ MAX_INDICES_ND

constexpr int luci_interpreter_pal::MAX_INDICES_ND = 5
constexpr

Definition at line 27 of file PALGatherND.h.

Referenced by luci_interpreter::configure_kernel_CircleGatherND(), and GatherND().