ONE - On-device Neural Engine
nnfw::cker Namespace Reference

Namespaces

namespace  bias_op
 
namespace  cpu_backend_threadpool
 
namespace  depthwise_conv_op
 
namespace  detail
 
namespace  eigen_support
 
namespace  functor
 
namespace  gemm_support
 
namespace  multithreaded
 
namespace  optimized
 
namespace  optimized_integer_ops
 
namespace  random
 
namespace  reference
 
namespace  reference_integer_ops
 
namespace  ruy_support
 
namespace  train
 
namespace  training_ops
 
namespace  xent_ops
 

Data Structures

class  ActivationFunctor
 
class  BatchMatMul
 
struct  BatchMatMulParams
 
class  BCast
 
class  BCastList
 
struct  BinaryArithmeticOpParam
 
struct  ComparisonParams
 
struct  ConcatenationParams
 
class  Conv
 
struct  ConvHybridTempArena
 
struct  ConvParams
 
struct  DepthwiseConvParams
 
struct  DepthwiseConvWorkerTask
 
class  Einsum
 
class  FCTempArena
 
struct  FullyConnectedParams
 
class  FusedBatchNorm
 
struct  FusedBatchNormParams
 
struct  GatherParams
 
struct  GemmParams
 
struct  InputTensor
 
struct  InstanceNormParams
 
struct  is_quant8
 
struct  L2NormParams
 
struct  LeakyReluParams
 
struct  LSTMParams
 
class  MatMulBCast
 
struct  MatrixParams
 
struct  MaximumOp
 
struct  MinimumOp
 
struct  NdArrayDesc
 
struct  PackParams
 
struct  PaddingValues
 
struct  PadParams
 
struct  PoolParams
 
class  Reduce
 
class  ReduceMean
 
struct  ResizeBilinearParams
 
struct  RmsNormParams
 
class  SequentialTensorWriter
 
class  Shape
 
struct  ShapeIterator
 
struct  SliceParams
 
struct  SoftmaxParams
 
struct  SpaceToBatchParams
 
struct  SpaceToDepthParams
 
struct  SplitParams
 
struct  SplitVParams
 
struct  StridedSliceParams
 
struct  Tensor
 
struct  TransposeConvParams
 
struct  TransposeParams
 
struct  TTypes
 
struct  UnpackParams
 
struct  UNUSED_ALL
 

Typedefs

template<typename Scalar >
using VectorMap = typename std::conditional< std::is_const< Scalar >::value, Eigen::Map< const Eigen::Matrix< typename std::remove_const< Scalar >::type, Eigen::Dynamic, 1 > >, Eigen::Map< Eigen::Matrix< Scalar, Eigen::Dynamic, 1 > > >::type
 
template<typename Scalar >
using MatrixMap = typename std::conditional< std::is_const< Scalar >::value, Eigen::Map< const Eigen::Matrix< typename std::remove_const< Scalar >::type, Eigen::Dynamic, Eigen::Dynamic > >, Eigen::Map< Eigen::Matrix< Scalar, Eigen::Dynamic, Eigen::Dynamic > > >::type
 
template<typename T >
using ComparisonFn = bool(*)(T, T)
 
using ShapeVec = std::vector< int32_t >
 
using Labels = std::vector< int32_t >
 
using OperandLabels = std::vector< Labels >
 
using LabelCounts = std::vector< int32_t >
 
using OperandLabelCounts = std::vector< LabelCounts >
 
using LabelToDimSizes = std::vector< int32_t >
 
typedef Eigen::ThreadPoolDevice CPUDevice
 
typedef TTypes< float, 1 >::Tensor32Bit::Index Index32
 

Enumerations

enum  DimensionType {
  kBroadcasting = 0 , kBatch = 1 , kFree = 2 , kContract = 3 ,
  kReduce = 4
}
 
enum class  FusedActivationFunctionType {
  kNone = 0 , kRelu6 = 1 , kRelu1 = 2 , kRelu = 3 ,
  kTanh = 4 , kSigmoid = 6
}
 
enum class  PaddingType { kNone = 0 , kSame = 1 , kValid = 2 }
 
enum class  BinaryArithmeticOpType {
  ADD = 0 , SUB = 1 , MUL = 2 , DIV = 3 ,
  POW = 4
}
 
enum class  ComparisonOpType {
  Equal , NotEqual , Greater , GreaterEqual ,
  Less , LessEqual
}
 
enum class  RoPEMode { kGptNeox = 0 , kGptJ = 1 }
 
enum class  BroadcastableOpCategory : uint8_t {
  kNone , kNonBroadcast , kFirstInputBroadcastsFast , kSecondInputBroadcastsFast ,
  kGenericBroadcast
}
 
enum  LSTMKernelType { kTfLiteLSTMFullKernel = 0 , kTfLiteLSTMBasicKernel }
 
enum class  Order { kColMajor , kRowMajor }
 
enum class  CachePolicy : std::uint8_t { kNeverCache , kCacheIfLargeSpeedup , kAlwaysCache }
 
enum class  QuantizationFlavor { kFloatingPoint , kIntegerWithUniformMultiplier , kIntegerWithPerRowMultiplier }
 

Functions

template<typename Scalar >
VectorMap< Scalar > MapAsVector (Scalar *data, const Shape &shape)
 
template<typename Scalar >
MatrixMap< Scalar > MapAsMatrixWithLastDimAsRows (Scalar *data, const Shape &shape)
 
template<typename T >
void AddN (const Shape &input_shape, const size_t num_inputs, const T **input_data, T *output_data)
 
template<typename T1 , typename T2 , typename Cmp >
void ArgMinMax (const Shape &input1_shape, const T1 *input1_data, const Shape &output_shape, T2 *output_data, int32_t axis, const Cmp &cmp)
 
template<typename T >
void AveragePool (const PoolParams &, const Shape &, const T *, const Shape &, T *)
 
template<>
void AveragePool< float > (const PoolParams &params, const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data)
 
void AveragePool16 (const PoolParams &params, const Shape &input_shape, const uint8_t *input_data, const Shape &output_shape, uint8_t *output_data)
 
void AveragePool32 (const PoolParams &params, const Shape &input_shape, const uint8_t *input_data, const Shape &output_shape, uint8_t *output_data)
 
template<>
void AveragePool< uint8_t > (const PoolParams &params, const Shape &input_shape, const uint8_t *input_data, const Shape &output_shape, uint8_t *output_data)
 
template<>
void AveragePool< int8_t > (const PoolParams &params, const Shape &input_shape, const int8_t *input_data, const Shape &output_shape, int8_t *output_data)
 
void GetIndexRange (int spatial_index_dim, int block_shape_dim, int input_dim, int output_dim, int *start_index, int *end_index)
 
template<typename T >
void BatchToSpaceND (const Shape &unextended_input1_shape, const T *input1_data, const int32_t *block_shape_data, const int32_t *crops_data, const Shape &unextended_output_shape, T *output_data)
 
bool ProcessBroadcastShapes (const Shape &shape0, const Shape &shape1, BinaryArithmeticOpParam *params)
 
template<BinaryArithmeticOpType op_type, typename T >
std::enable_if_t<!is_quant8< T >::value &&!std::is_same< T, bool >::value > BinaryArithmeticOp (const BinaryArithmeticOpParam &params, const Shape &input1_shape, const T *input1_data, const Shape &input2_shape, const T *input2_data, const Shape &output_shape, T *output_data)
 
template<BinaryArithmeticOpType op_type, typename T >
std::enable_if_t<!is_quant8< T >::value &&std::is_same< T, bool >::value > BinaryArithmeticOp (const BinaryArithmeticOpParam &params, const Shape &input1_shape, const T *input1_data, const Shape &input2_shape, const T *input2_data, const Shape &output_shape, T *output_data)
 
template<BinaryArithmeticOpType op_type, typename T >
std::enable_if_t< is_quant8< T >::value > BinaryArithmeticOp (const BinaryArithmeticOpParam &params, const Shape &input1_shape, const T *input1_data, const Shape &input2_shape, const T *input2_data, const Shape &output_shape, T *output_data)
 
template<BinaryArithmeticOpType op_type>
void BinaryArithmeticOp (const BinaryArithmeticOpParam &params, const Shape &input1_shape, const float *input1_data, const Shape &input2_shape, const float *input2_data, const Shape &output_shape, float *output_data)
 
template<BinaryArithmeticOpType op_type, typename T >
std::enable_if_t<!is_quant8< T >::value > BroadcastBinaryArithmeticOp (BinaryArithmeticOpParam &params, const Shape &input1_shape, const T *input1_data, const Shape &input2_shape, const T *input2_data, const Shape &output_shape, T *output_data)
 
template<BinaryArithmeticOpType op_type, typename T >
std::enable_if_t< is_quant8< T >::value > BroadcastBinaryArithmeticOp (BinaryArithmeticOpParam &params, const Shape &input1_shape, const T *input1_data, const Shape &input2_shape, const T *input2_data, const Shape &output_shape, T *output_data)
 
template<BinaryArithmeticOpType op_type>
void BroadcastBinaryArithmeticOp (BinaryArithmeticOpParam &params, const Shape &input1_shape, const float *input1_data, const Shape &input2_shape, const float *input2_data, const Shape &output_shape, float *output_data)
 
template<typename T >
void BroadcastTo (const Shape &input_shape, T *input_data, const Shape &output_shape, T *output_data)
 
void BiasAndClamp (float clamp_min, float clamp_max, int bias_size, const float *bias_data, int array_size, float *array_data)
 
template<typename T >
bool EqualFn (T lhs, T rhs)
 
template<typename T >
bool NotEqualFn (T lhs, T rhs)
 
template<typename T >
bool GreaterFn (T lhs, T rhs)
 
template<typename T >
bool GreaterEqualFn (T lhs, T rhs)
 
template<typename T >
bool LessFn (T lhs, T rhs)
 
template<typename T >
bool LessEqualFn (T lhs, T rhs)
 
template<typename T , ComparisonFn< T > F>
void ComparisonImpl (const Shape &input1_shape, const T *input1_data, const Shape &input2_shape, const T *input2_data, const Shape &output_shape, bool *output_data)
 
template<ComparisonFn< float > F>
void Comparison (const Shape &input1_shape, const float *input1_data, const Shape &input2_shape, const float *input2_data, const Shape &output_shape, bool *output_data)
 
template<typename T , ComparisonFn< int32_t > F>
void ComparisonWithScaling (ComparisonParams &params, const Shape &input1_shape, const T *input1_data, const Shape &input2_shape, const T *input2_data, const Shape &output_shape, bool *output_data)
 
template<typename T , ComparisonFn< T > F>
void BroadcastComparison4DSlowImpl (const Shape &unextended_input1_shape, const T *input1_data, const Shape &unextended_input2_shape, const T *input2_data, const Shape &unextended_output_shape, bool *output_data)
 
template<typename T , ComparisonFn< T > F>
void BroadcastComparison4DSlow (const Shape &input1_shape, const T *input1_data, const Shape &input2_shape, const T *input2_data, const Shape &output_shape, bool *output_data)
 
template<typename T , ComparisonFn< int32_t > F>
void BroadcastComparison4DSlowWithScaling (ComparisonParams &params, const Shape &input1_shape, const T *input1_data, const Shape &input2_shape, const T *input2_data, const Shape &output_shape, bool *output_data)
 
 TFLITE_COMPARISON_OP (Equal)
 
 TFLITE_COMPARISON_OP (NotEqual)
 
 TFLITE_COMPARISON_OP (Greater)
 
 TFLITE_COMPARISON_OP (GreaterEqual)
 
 TFLITE_COMPARISON_OP (Less)
 
 TFLITE_COMPARISON_OP (LessEqual)
 
template<typename Scalar >
void Concatenation (const ConcatenationParams &params, const Shape *const *input_shapes, const Scalar *const *input_data, const Shape &output_shape, Scalar *output_data)
 
void ConcatenationWithScaling (const ConcatenationParams &params, const Shape *const *input_shapes, const uint8_t *const *input_data, const Shape &output_shape, uint8_t *output_data)
 
template<typename T >
void DepthToSpace (const Shape &unextended_input_shape, const T *input_data, const Shape &unextended_output_shape, T *output_data, int32_t block_size)
 
int HowManyConvThreads (const Shape &output_shape, const Shape &filter_shape)
 
bool MultithreadAlongBatches (int thread_count, int batches)
 
template<typename T , typename TS >
void DepthwiseConv (const DepthwiseConvParams &params, const Shape &input_shape, const T *input_data, const Shape &filter_shape, const T *filter_data, const Shape &bias_shape, const TS *bias_data, const Shape &output_shape, T *output_data, ruy::Context *ruy_context)
 
void DepthwiseConvOp (const DepthwiseConvParams &params, const Shape &input_shape, const float *input_data, const Shape &filter_shape, const float *filter_data, const Shape &bias_shape, const float *bias_data, float *padded_filter_data, bool pad_filter, float *filter_buffers_data, const Shape &output_shape, float *output_data)
 
void Dequantize (const Shape &input_shape, const uint8_t *input_data, const Shape &output_shape, float *output_data, const float scale, const int32_t zero_point)
 
void Dequantize (const Shape &input_shape, const int8_t *input_data, const Shape &output_shape, float *output_data, const float scale, const int32_t zero_point)
 
void Dequantize (const Shape &input_shape, const int16_t *input_data, const Shape &output_shape, float *output_data, const float scale, const int32_t zero_point)
 
void Sin (const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data)
 
void Cos (const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data)
 
void Abs (const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data)
 
void Rsqrt (const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data)
 
template<typename T >
void Neg (const Shape &input_shape, const T *input_data, const Shape &output_shape, T *output_data)
 
void Log (const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data)
 
void Floor (const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data)
 
void Sqrt (const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data)
 
void Square (const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data)
 
void ELU (const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data)
 
void Erf (const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data)
 
void Exp (const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data)
 
template<typename T >
void Fill (const T *value_data, const Shape &output_shape, T *output_data)
 
template<typename T >
void FloorDivBroadcast (const Shape &unextended_input1_shape, const T *input1_data, const Shape &unextended_input2_shape, const T *input2_data, const Shape &unextended_output_shape, T *output_data)
 
template<typename T >
void FloorDivElementwise (const Shape &shape, const T *input1_data, const T *input2_data, T *output_data)
 
template<typename T >
void FloorModBroadcast (const Shape &unextended_input1_shape, const T *input1_data, const Shape &unextended_input2_shape, const T *input2_data, const Shape &unextended_output_shape, T *output_data)
 
template<typename T >
void FloorModElementwise (const Shape &shape, const T *input1_data, const T *input2_data, T *output_data)
 
void FullyConnected (const FullyConnectedParams &params, const Shape &input_shape, const float *input_data, const Shape &weights_shape, const float *weights_data, const Shape &, const float *bias_data, const Shape &, float *output_data)
 
void FullyConnected (const FullyConnectedParams &params, const Shape &input_shape, const uint8_t *input_data, const Shape &filter_shape, const uint8_t *filter_data, const Shape &bias_shape, const int32_t *bias_data, const Shape &output_shape, uint8_t *output_data)
 
void FullyConnectedHybrid (const FullyConnectedParams &params, const Shape &input_shape, const float *input_data, const Shape &filter_shape, const int8_t *filter_data, const Shape &, const float *bias_data, const Shape &output_shape, float *output_data, FCTempArena &temp_arena, ruy::Context *ruy_context)
 
void FullyConnectedSparseWeightRandom (const FullyConnectedParams &params, const Shape &input_shape, const float *input_data, const Shape &weights_shape, const float *weights_data, const Shape &bias_shape, const float *bias_data, const Shape &output_shape, float *output_data, const uint16_t *w1_segments, const uint16_t *w1_indices)
 
void FullyConnectedSparseWeight16x1 (const FullyConnectedParams &params, const Shape &input_shape, const float *input_data, const Shape &weights_shape, const float *weights_data, const Shape &bias_shape, const float *bias_data, const Shape &output_shape, float *output_data, const uint16_t *w1_segments, const uint16_t *w1_indices)
 
template<typename T , typename CoordsT = int32_t>
void Gather (const GatherParams &op_params, const Shape &input_shape, const T *input_data, const Shape &coords_shape, const CoordsT *coords_data, const Shape &, T *output_data)
 
void ComputeBatchIndices (const int32_t output_batch_size, const std::vector< int32_t > &reshape, const std::vector< int32_t > &bcast, std::vector< int32_t > *out_indices)
 
template<typename DSizes >
Eigen::DSizes< Index32, DSizes::count > To32BitDims (const DSizes &in)
 
template<typename TensorType >
TTypes< typename TensorType::Scalar, TensorType::NumIndices >::Tensor32Bit To32Bit (TensorType in)
 
void InstanceNorm (const InstanceNormParams &params, const Shape &input_shape, const float *input_data, const Shape &gamma_shape, const float *gamma_data, const Shape &beta_shape, const float *beta_data, const Shape &output_shape, float *output_data)
 
void L2NormalizeFloat32 (const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data)
 
void L2NormalizeQuant8 (L2NormParams &params, const Shape &input_shape, const uint8_t *input_data, const Shape &output_shape, uint8_t *output_data)
 
void LeakyReLU (const LeakyReluParams &params, const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data)
 
template<typename T >
void LogicalAndBroadcast (const Shape &unextended_input1_shape, const T *input1_data, const Shape &unextended_input2_shape, const T *input2_data, const Shape &unextended_output_shape, T *output_data)
 
template<typename T >
void LogicalAndElementwise (const Shape &shape, const T *input1_data, const T *input2_data, T *output_data)
 
void LogicalNot (const Shape &input_shape, const bool *input_data, const Shape &output_shape, bool *output_data)
 
template<typename T >
void LogicalOrBroadcast (const Shape &unextended_input1_shape, const T *input1_data, const Shape &unextended_input2_shape, const T *input2_data, const Shape &unextended_output_shape, T *output_data)
 
template<typename T >
void LogicalOrElementwise (const Shape &shape, const T *input1_data, const T *input2_data, T *output_data)
 
void Logistic (const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data)
 
void LogSoftmax (const SoftmaxParams &params, const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data)
 
void LogSoftmax (const SoftmaxParams &params, float input_scale, const Shape &input_shape, const uint8_t *input_data, const Shape &output_shape, uint8_t *output_data)
 
void CalculateLstmGateFloat (const float *input, const float *input_to_gate_weights, const float *aux_input, const float *aux_input_to_gate_weights, const float *output_state, const float *recurrent_to_gate_weights, const float *cell_state, const float *cell_to_gate_weights, const float *layer_norm_coefficients, const float *gate_bias, const int n_batch, const int n_input, const int n_aux_input, const int n_output, const int n_cell, const FusedActivationFunctionType activation, float *gate, const bool is_input_all_zeros, const bool is_aux_input_all_zeros)
 
void UpdateLstmCellFloat (int n_batch, int n_cell, float *cell_state, const float *input_gate, float *forget_gate, const float *cell_gate, bool use_cifg, float clip)
 
void CalculateLstmOutputFloat (int n_batch, int n_cell, int n_output, const float *cell_state, const float *output_gate, FusedActivationFunctionType activation, const float *projection_weights, const float *projection_bias, const float proj_clip, float *output_state, float *scratch)
 
void LstmStepFloat (const float *input_ptr, const float *input_to_input_weights_ptr, const float *input_to_forget_weights_ptr, const float *input_to_cell_weights_ptr, const float *input_to_output_weights_ptr, const float *aux_input_ptr, const float *aux_input_to_input_weights_ptr, const float *aux_input_to_forget_weights_ptr, const float *aux_input_to_cell_weights_ptr, const float *aux_input_to_output_weights_ptr, const float *recurrent_to_input_weights_ptr, const float *recurrent_to_forget_weights_ptr, const float *recurrent_to_cell_weights_ptr, const float *recurrent_to_output_weights_ptr, const float *cell_to_input_weights_ptr, const float *cell_to_forget_weights_ptr, const float *cell_to_output_weights_ptr, const float *input_layer_norm_coefficients_ptr, const float *forget_layer_norm_coefficients_ptr, const float *cell_layer_norm_coefficients_ptr, const float *output_layer_norm_coefficients_ptr, const float *input_gate_bias_ptr, const float *forget_gate_bias_ptr, const float *cell_gate_bias_ptr, const float *output_gate_bias_ptr, const float *projection_weights_ptr, const float *projection_bias_ptr, const LSTMParams *params, int n_batch, int n_cell, int n_input, int n_aux_input, int n_output, int output_batch_leading_dim, float *output_state_ptr, float *cell_state_ptr, float *scratch0, float *scratch1, float *scratch2, float *scratch3, float *output_ptr)
 
template<typename T >
void MatrixBandPart (const T num_lower_diags, const T num_upper_diags, const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data)
 
template<typename T , typename Op >
void MaximumMinimumBroadcast4DSlow (const Shape &unextended_input1_shape, const T *input1_data, const Shape &unextended_input2_shape, const T *input2_data, const Shape &unextended_output_shape, T *output_data, Op op)
 
template<typename T >
void Max (const Shape &unextended_input1_shape, const T *input1_data, const Shape &unextended_input2_shape, const T *input2_data, const Shape &unextended_output_shape, T *output_data)
 
template<typename T >
void Min (const Shape &unextended_input1_shape, const T *input1_data, const Shape &unextended_input2_shape, const T *input2_data, const Shape &unextended_output_shape, T *output_data)
 
template<typename T >
void MaxPool (const PoolParams &, const Shape &, const T *, const Shape &, T *)
 
template<>
void MaxPool< float > (const PoolParams &params, const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data)
 
template<>
void MaxPool< uint8_t > (const PoolParams &params, const Shape &input_shape, const uint8_t *input_data, const Shape &output_shape, uint8_t *output_data)
 
template<typename T , typename TI >
void OneHot (const int32_t depth, const T on_value, const T off_value, int32_t axis, const Shape &indices_shape, const TI *indices_data, const Shape &, T *output_data)
 
template<typename Scalar >
void Pack (const PackParams &params, const Scalar *const *input_data, const Shape &output_shape, Scalar *output_data)
 
template<typename T >
void Pad (const int32_t *padding_data, int32_t pad_rank, const Shape &input_shape, const T *input_data, const Shape &output_shape, T *output_data, const T *constant_value_data)
 
template<typename T >
void powImpl (const Shape &input1_shape, const T *input1_data, const Shape &input2_shape, const T *input2_data, const Shape &output_shape, T *output_data)
 
template<typename InputT , typename OutputT >
void Quantize (const Shape &input_shape, const InputT *input_data, const Shape &output_shape, OutputT *output_data, const float output_scale, const int32_t output_offset)
 
template<>
void Quantize (const Shape &input_shape, const float *input_data, const Shape &output_shape, int8_t *output_data, const float scale, const int32_t zero_point)
 
template<>
void Quantize (const Shape &input_shape, const float *input_data, const Shape &output_shape, uint8_t *output_data, const float scale, const int32_t zero_point)
 
template<>
void Quantize (const Shape &input_shape, const float *input_data, const Shape &output_shape, int16_t *output_data, const float scale, const int32_t zero_point)
 
void Quantize (const int32_t *multiplier, const int32_t *shift, int32_t channel_size, int32_t total_size, int32_t output_zp, int32_t output_min, int32_t output_max, int32_t *scratch, int8_t *output)
 
template<typename input_type , typename output_type >
void Requantize (const input_type *input_data, int32_t size, int32_t effective_scale_multiplier, int32_t effective_scale_shift, int32_t input_zeropoint, int32_t output_zeropoint, output_type *output_data)
 
template<>
void Requantize< uint8_t, int8_t > (const uint8_t *input_data, int32_t size, int32_t effective_scale_multiplier, int32_t effective_scale_shift, int32_t input_zeropoint, int32_t output_zeropoint, int8_t *output_data)
 
template<>
void Requantize< int8_t, uint8_t > (const int8_t *input_data, int32_t size, int32_t effective_scale_multiplier, int32_t effective_scale_shift, int32_t input_zeropoint, int32_t output_zeropoint, uint8_t *output_data)
 
template<typename T >
int GetSize (T start, T limit, T delta)
 
template<typename T >
void Range (const T *start_data, const T *limit_data, const T *delta_data, T *output_data)
 
template<typename In , typename Out >
bool ReduceImpl (const In *input_data, const Shape &input_shape, const Shape &, const int *axis, const int num_axis, int *input_iter, Out reducer(const Out current, const In in), Out *output_data)
 
bool ResolveAxis (const int num_dims, const std::vector< int > &axes, int *out_axis, int *out_num_axis)
 
template<typename T >
bool InitTensorDataForReduce (const Shape &shape, const T init_value, T *data)
 
float round_nearest (float value)
 
template<typename Out , typename In >
Out mean_reducer (const Out data1, const In data2, int normalizer)
 
template<typename In >
int sum_reducer (const int data1, const In data2)
 
template<typename In , typename Out >
bool ReduceMeanImpl (const In *input_data, const Shape &input_shape, const int *axis, const int num_axis, int *input_iter, Out reducer(const Out current, const In in, int normalizer), Out *output_data)
 
template<typename In >
size_t ReduceSumQuantImpl (const In *input_data, const Shape &input_shape, const int *axis, const int num_axis, int *input_iter, int reducer(const int current, const In in), int *temp_sum)
 
template<typename In , typename Out >
void Mean (const Shape &input_shape, const In *input_data, const Shape &output_shape, Out *output_data, const std::vector< int > &axes)
 
template<typename In , typename Out >
void MeanQ8Asymm (const Shape &input_shape, const In *input_data, float input_scale, int32_t input_offset, const Shape &output_shape, Out *output_data, float output_scale, int32_t output_offset, const std::vector< int > &axes)
 
template<typename In , typename Out >
void MeanAxis1And2 (const Shape &input_shape, const In *input_data, const Shape &output_shape, Out *output_data)
 
void ReLU (const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data)
 
void ReLU6 (const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data)
 
void ResizeBilinearKernel2x2 (int32_t x0, int32_t x1, int32_t y0, int32_t y1, int32_t x, int32_t y, int32_t depth, int32_t batch, const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data)
 
void ResizeBilinear2x2 (int32_t batches, int32_t input_height, int32_t input_width, int32_t depth, int32_t output_height, int32_t output_width, const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data)
 
void ResizeBilinearKernel (const float *input_ptr, int32_t depth, float scale, float *output_ptr)
 
void ComputeInterpolationValues (const float value, const float scale, const bool half_pixel_centers, int32_t input_size, float *scaled_value, int32_t *lower_bound, int32_t *upper_bound)
 
void ResizeBilinearGeneric (int32_t batches, int32_t input_height, int32_t input_width, int32_t depth, int32_t output_height, int32_t output_width, float height_scale, float width_scale, const Shape &input_shape, const float *input_data, float *output_data, const bool half_pixel_centers)
 
template<typename T >
void ResizeBilinearGenericSmallChannel (int32_t batches, int32_t input_height, int32_t input_width, int32_t depth, int32_t output_height, int32_t output_width, float height_scale, float width_scale, const Shape &input_shape, const T *input_data, T *output_data, const bool half_pixel_centers)
 
void ResizeBilinear (ResizeBilinearParams &params, const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data)
 
void ResizeBilinear (ResizeBilinearParams &params, const Shape &input_shape, const uint8_t *input_data, const Shape &output_shape, uint8_t *output_data)
 
void ComputeInterpolationValues (const int32_t value, const int32_t scale_10, const bool half_pixel_centers, int32_t input_size, int32_t *scaled_value, int32_t *lower_bound, int32_t *upper_bound)
 
void ResizeBilinear (const ResizeBilinearParams &op_params, const Shape &unextended_input_shape, const int8_t *input_data, const Shape &unextended_output_shape, int8_t *output_data)
 
template<typename Scalar >
void Reverse (int axis, const Shape &input_shape, const Scalar *input_data, const Shape &, Scalar *output_data)
 
void RmsNorm (const RmsNormParams &params, const Shape &input_shape, const float *input_data, const Shape &gamma_shape, const float *gamma_data, const Shape &output_shape, float *output_data)
 
template<typename T >
void RoPE (const RoPEMode mode, const Shape &input_shape, const T *input_data, const Shape &sin_table_shape, const T *sin_table_data, const Shape &cos_table_shape, const T *cos_table_data, const Shape &output_shape, T *output_data)
 
float RoundToNearest (float value)
 
void Round (const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data)
 
template<typename D , typename T >
void Select (const Shape &input_condition_shape, const D *input_condition_data, const Shape &input_x_shape, const T *input_x_data, const Shape &input_y_shape, const T *input_y_data, const Shape &output_shape, T *output_data)
 
template<typename D , typename T >
void RankOneSelect (const Shape &input_condition_shape, const D *input_condition_data, const Shape &input_x_shape, const T *input_x_data, const Shape &input_y_shape, const T *input_y_data, const Shape &output_shape, T *output_data)
 
template<typename D , typename T >
void BroadcastSelect4DSlow (const Shape &input_condition_shape, const D *input_condition_data, const Shape &input_x_shape, const T *input_x_data, const Shape &input_y_shape, const T *input_y_data, const Shape &output_shape, T *output_data)
 
template<typename T >
void Slice (const SliceParams &op_params, const Shape &input_shape, SequentialTensorWriter< T > *writer)
 
template<typename T >
void Slice (const SliceParams &op_params, const Shape &input_shape, const T *input_data, T *output_data)
 
void Softmax (const float *in, const int input_size, const int batch_size, const float beta, float *out)
 
void Softmax (const SoftmaxParams &params, const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data)
 
template<typename T >
int32_t QuantizeSoftmaxOutput (float prob_rescaled, int32_t zero_point)
 
template<>
int32_t QuantizeSoftmaxOutput< uint8_t > (float prob_rescaled, int32_t)
 
void PopulateSoftmaxLookupTable (float *table, float input_scale, float beta)
 
template<typename In , typename Out >
void Softmax (const SoftmaxParams &params, const Shape &input_shape, const In *input_data, const Shape &output_shape, Out *output_data)
 
template<typename T >
void SpaceToBatchND (const SpaceToBatchParams &params, const Shape &unextended_input_shape, const T *input_data, const Shape &unextended_block_shape_shape, const int32_t *block_shape_data, const Shape &unextended_padding_shape, const int32_t *paddings_data, const Shape &unextended_output_shape, T *output_data)
 
template<typename T >
void SpaceToDepth (const SpaceToDepthParams &params, const Shape &unextended_input_shape, const T *input_data, const Shape &unextended_output_shape, T *output_data)
 
template<typename Scalar >
void Split (const SplitParams &params, const Shape &input_shape, const Scalar *input_data, const Shape &output_shape, Scalar *const *output_data)
 
template<typename Scalar >
void SplitV (const SplitVParams &params, const Shape &input_shape, const Scalar *input_data, std::vector< nnfw::cker::Shape > &output_shapes, Scalar *const *output_data)
 
template<typename T , int N>
void SqDiffImpl (const Shape &input1_shape, const T *input1_data, const Shape &input2_shape, const T *input2_data, const Shape &output_shape, T *output_data, NdArrayDesc< N > *desc1_in, NdArrayDesc< N > *desc2_in, NdArrayDesc< N > *desc_out)
 
template<typename T >
void SqDiff (const Shape &input1_shape, const T *input1_data, const Shape &input2_shape, const T *input2_data, const Shape &output_shape, T *output_data)
 
void GenerateKey (Tensor seed, random::PhiloxRandom::Key *out_key, random::PhiloxRandom::ResultType *out_counter)
 
template<typename Device , class Distribution >
void Fill (random::PhiloxRandom random, Tensor *output)
 
void StatelessRandomUniform (const Shape &shape_shape, const int32_t *shape_data, const Shape &seed_shape, const int32_t *seed_data, const Shape &output_shape, float *output_data)
 
int Clamp (const int v, const int lo, const int hi)
 
void StridedSlicePadIndices (StridedSliceParams *p, int dim_count)
 
int StartForAxis (const StridedSliceParams &params, const Shape &input_shape, int axis)
 
int StopForAxis (const StridedSliceParams &params, const Shape &input_shape, int axis, int start_for_axis)
 
bool LoopCondition (int index, int stop, int stride)
 
template<typename T >
StridedSliceParams buildStridedSliceParams (const T *begin, const T *end, const T *strides, const uint32_t begin_mask, const uint32_t end_mask, const uint32_t shrink_axis_mask, const uint8_t rank)
 
void checkOutputSize (const StridedSliceParams &op_params, const Shape &input_shape, const Shape &output_shape, uint32_t rank)
 
template<typename T >
void StridedSlice (const StridedSliceParams &op_params, const Shape &unextended_input_shape, const T *input_data, const Shape &unextended_output_shape, T *output_data)
 
void Tanh (const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data)
 
template<typename T , typename M >
void CopyMultipleTimes (const T *in_data, int32_t in_size, M multiplier, T *out_data)
 
template<typename T , typename M >
std::pair< int, int > TileOneDimension (const Shape &in_dimensions, const T *in_data, const M *multipliers, T *out_data, int dimension)
 
template<typename T >
void Transpose2D (const Shape &input_shape, const T *input_data, const Shape &output_shape, T *output_data)
 
template<typename T >
void Transpose3D (const TransposeParams &params, const Shape &input_shape, const T *input_data, const Shape &, T *output_data)
 
template<typename T >
void TransposeImpl (const TransposeParams &params, const Shape &input_shape, const T *input_data, const Shape &output_shape, T *output_data)
 
template<typename T >
void Transpose (const TransposeParams &unshrunk_params, const Shape &unshrunk_input_shape, const T *input_data, const Shape &unshrunk_output_shape, T *output_data)
 
void TransposeConv (const TransposeConvParams &params, const Shape &input_shape, const float *input_data, const Shape &filter_shape, const float *filter_data, const Shape &output_shape, float *output_data)
 
template<typename Scalar >
void Unpack (const UnpackParams &params, const Shape &input_shape, const Scalar *input_data, const Shape &output_shape, Scalar *const *output_datas)
 
template<typename T >
void PortableCwiseClipping (T *vector, const int v_size, const T clipping_value)
 
void PortableVectorBatchVectorAssign (const float *vector, int v_size, int n_batch, float *batch_vector)
 
void PortableVectorBatchVectorAdd (const float *vector, int v_size, int n_batch, float *batch_vector)
 
bool PortableIsZeroVector (const float *vector, int v_size)
 
void PortableApplyActivationToVector (const float *vector, int v_size, FusedActivationFunctionType activation, float *result)
 
void PortableSub1Vector (const float *vector, int v_size, float *result)
 
void PortableSymmetricQuantizeFloats (const float *values, const int size, int8_t *quantized_values, float *min_value, float *max_value, float *scaling_factor)
 
void PortableAsymmetricQuantizeFloats (const float *values, const int size, int8_t *quantized_values, float *scaling_factor, int32_t *offset)
 
void PortableMatrixBatchVectorMultiplyAccumulate (const int8_t *__restrict__ matrix, const int m_rows, const int m_cols, const int8_t *__restrict__ vectors, const float *scaling_factors, int n_batch, float *__restrict__ result, int result_stride)
 
void PortableMatrixBatchVectorMultiplyAccumulate (const int8_t *__restrict__ matrix, const int m_rows, const int m_cols, const int8_t *__restrict__ vector, const float *scaling_factors, int n_batch, int32_t *, float *__restrict__ result, int result_stride, ruy::Context *)
 
void PortableMatrixBatchVectorMultiplyAccumulate (const float *matrix, int m_rows, int m_cols, const float *vector, int n_batch, float *result, int result_stride)
 
void PortableMeanStddevNormalization (const float *input_vector, float *output_vector, int v_size, int n_batch)
 
void PortableZeroVector (float *vector, int v_size)
 
int MatchingDim (const Shape &shape1, int index1, const Shape &shape2, int index2)
 
template<typename... Args>
int MatchingDim (const Shape &shape1, int index1, const Shape &shape2, int index2, Args... args)
 
Shape GetShape (const std::vector< int32_t > &data)
 
int Offset (const Shape &shape, int i0, int i1, int i2, int i3)
 
int Offset (const Shape &shape, int *index)
 
int FlatSizeSkipDim (const Shape &shape, int skip_dim)
 
template<typename... Ts>
bool checkMatching (const Shape &shape, Ts... check_shapes)
 
template<typename... Ts>
int MatchingFlatSize (const Shape &shape, Ts... check_shapes)
 
int MatchingFlatSizeSkipDim (const Shape &shape, int skip_dim, const Shape &check_shape_0)
 
int MatchingFlatSizeSkipDim (const Shape &shape, int skip_dim, const Shape &check_shape_0, const Shape &check_shape_1)
 
int MatchingElementsSize (const Shape &shape, const Shape &check_shape_0, const Shape &check_shape_1)
 
ShapeIterator begin (const Shape &s)
 
ShapeIterator end (const Shape &s)
 
void CwiseClipping (float *vector, const int v_size, const float clipping_value)
 
void VectorBatchVectorAdd (const float *vector, int v_size, int n_batch, float *batch_vector)
 
void VectorBatchVectorAssign (const float *vector, int v_size, int n_batch, float *batch_vector)
 
template<typename T >
void VectorVectorCwiseProduct (const T *__restrict__ vector1, const T *__restrict__ vector2, int v_size, T *__restrict__ result)
 
template<typename T >
void VectorVectorCwiseProductAccumulate (const T *__restrict__ vector1, const T *__restrict__ vector2, int v_size, T *__restrict__ result)
 
template<typename T >
void VectorBatchVectorCwiseProduct (const T *vector, int v_size, const T *batch_vector, int n_batch, T *result)
 
template<typename T >
void VectorBatchVectorCwiseProductAccumulate (const T *vector, int v_size, const T *batch_vector, int n_batch, T *result)
 
bool IsZeroVector (const float *vector, int v_size)
 
void ApplyActivationToVector (const float *vector, int v_size, FusedActivationFunctionType activation, float *result)
 
void Sub1Vector (const float *vector, int v_size, float *result)
 
void SymmetricQuantizeFloats (const float *values, const int size, int8_t *quantized_values, float *min, float *max, float *scaling_factor)
 
void MatrixBatchVectorMultiplyAccumulate (const int8_t *matrix, const int m_rows, const int m_cols, const int8_t *vector, const float *scaling_factors, int n_batch, float *result, int result_stride)
 
void MatrixBatchVectorMultiplyAccumulate (const float *matrix, int m_rows, int m_cols, const float *vector, int n_batch, float *result, int result_stride)
 
void MatrixBatchVectorMultiplyAccumulate (const int8_t *matrix, const int m_rows, const int m_cols, const int8_t *vectors, const float *scaling_factors, int n_batch, int32_t *scratch, float *result, int result_stride, ruy::Context *ruy_context)
 
void MeanStddevNormalization (const float *input_vector, float *output_vector, int v_size, int n_batch)
 
void ZeroVector (float *vector, int v_size)
 
template<typename AccumScalar , typename DstScalar , QuantizationFlavor quantization_flavor>
void ValidateGemmParams (const GemmParams< AccumScalar, DstScalar, quantization_flavor > &params)
 
template<typename T >
T ActivationFunctionWithMinMax (T x, T output_activation_min, T output_activation_max)
 
void QuantizeMultiplier (double double_multiplier, int32_t *quantized_multiplier, int *shift)
 
void QuantizeMultiplierSmallerThanOneExp (double double_multiplier, int32_t *quantized_multiplier, int *left_shift)
 
int32_t MultiplyByQuantizedMultiplier (int32_t x, int32_t quantized_multiplier, int shift)
 
int32_t MultiplyByQuantizedMultiplierGreaterThanOne (int32_t x, int32_t quantized_multiplier, int left_shift)
 
int32_t MultiplyByQuantizedMultiplierSmallerThanOneExp (int32_t x, int32_t quantized_multiplier, int left_shift)
 
int NodeOffset (int b, int h, int w, int height, int width)
 
int CountLeadingZeros (uint32_t integer_input)
 
void GetInvSqrtQuantizedMultiplierExp (int32_t input, int reverse_shift, int32_t *output_inv_sqrt, int *output_shift)
 
int SubscriptToIndex (const NdArrayDesc< 4 > &desc, int i0, int i1, int i2, int i3)
 
template<int N>
int SubscriptToIndexGeneric (const NdArrayDesc< N > *desc, int *iter)
 
template<int N>
void CopyDimsToDesc (const Shape &input_shape, NdArrayDesc< N > *desc_out)
 
template<int N>
void NdArrayDescsForElementwiseBroadcast (const Shape &input0_shape, const Shape &input1_shape, NdArrayDesc< N > *desc0_out, NdArrayDesc< N > *desc1_out)
 
template<int N>
void NdArrayDescsForElementwiseBroadcast (const Shape &input0_shape, const Shape &input1_shape, const Shape &input2_shape, NdArrayDesc< N > *desc0_out, NdArrayDesc< N > *desc1_out, NdArrayDesc< N > *desc2_out)
 
bool NextIndex (const int num_dims, const int *dims, int *current)
 
size_t ReducedOutputOffset (const int num_dims, const int *dims, const int *index, const int num_axis, const int *axis)
 
template<typename T >
void optimized_ops_preload_l1_keep (const T *ptr)
 
std::ostream & operator<< (std::ostream &os, const Shape &shape)
 

Typedef Documentation

◆ ComparisonFn

template<typename T >
using nnfw::cker::ComparisonFn = typedef bool (*)(T, T)

Definition at line 37 of file Comparison.h.

◆ CPUDevice

typedef Eigen::ThreadPoolDevice nnfw::cker::CPUDevice

Definition at line 51 of file RandomOpCpu.h.

◆ Index32

typedef TTypes<float,1>::Tensor32Bit::Index nnfw::cker::Index32

Definition at line 86 of file Tensor.h.

◆ LabelCounts

using nnfw::cker::LabelCounts = typedef std::vector<int32_t>

Definition at line 109 of file Einsum.h.

◆ Labels

using nnfw::cker::Labels = typedef std::vector<int32_t>

Definition at line 107 of file Einsum.h.

◆ LabelToDimSizes

using nnfw::cker::LabelToDimSizes = typedef std::vector<int32_t>

Definition at line 111 of file Einsum.h.

◆ MatrixMap

template<typename Scalar >
using nnfw::cker::MatrixMap = typedef typename std::conditional< std::is_const<Scalar>::value, Eigen::Map< const Eigen::Matrix<typename std::remove_const<Scalar>::type, Eigen::Dynamic, Eigen::Dynamic> >, Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic> >>::type

Definition at line 53 of file Utils.h.

◆ OperandLabelCounts

using nnfw::cker::OperandLabelCounts = typedef std::vector<LabelCounts>

Definition at line 110 of file Einsum.h.

◆ OperandLabels

using nnfw::cker::OperandLabels = typedef std::vector<Labels>

Definition at line 108 of file Einsum.h.

◆ ShapeVec

using nnfw::cker::ShapeVec = typedef std::vector<int32_t>

Definition at line 106 of file Einsum.h.

◆ VectorMap

template<typename Scalar >
using nnfw::cker::VectorMap = typedef typename std::conditional< std::is_const<Scalar>::value, Eigen::Map<const Eigen::Matrix<typename std::remove_const<Scalar>::type, Eigen::Dynamic, 1> >, Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, 1> >>::type

Definition at line 38 of file Utils.h.
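
A minimal usage sketch: MapAsVector() (declared above) wraps an existing buffer in this Eigen view so element-wise work can be expressed without copying. The include paths below are assumptions, not taken from this page.

#include <cker/Shape.h>        // assumed include path
#include <cker/eigen/Utils.h>  // assumed include path

// Scales a tensor in place through a VectorMap view; no copy of `data` is made.
void ScaleInPlace(float *data, const nnfw::cker::Shape &shape, float factor)
{
  auto view = nnfw::cker::MapAsVector(data, shape); // VectorMap<float>
  view *= factor;
}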

Enumeration Type Documentation

◆ BinaryArithmeticOpType

Enumerator
ADD 
SUB 
MUL 
DIV 
POW 

Definition at line 47 of file Types.h.

48{
49 ADD = 0,
50 SUB = 1,
51 MUL = 2,
52 DIV = 3,
53 POW = 4,
54};
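
A minimal dispatch sketch using the float overload of BinaryArithmeticOp() listed above. The BinaryArithmeticOpParam field names (float_activation_min/max) and the include path are assumptions.

#include <limits>
#include <cker/operation/BinaryArithmeticOps.h> // assumed include path

// Element-wise ADD of two equally shaped float tensors (no broadcasting).
void AddTensors(const nnfw::cker::Shape &shape, const float *lhs, const float *rhs, float *out)
{
  nnfw::cker::BinaryArithmeticOpParam param{};
  param.float_activation_min = std::numeric_limits<float>::lowest(); // assumed field name
  param.float_activation_max = std::numeric_limits<float>::max();    // assumed field name
  nnfw::cker::BinaryArithmeticOp<nnfw::cker::BinaryArithmeticOpType::ADD>(
    param, shape, lhs, shape, rhs, shape, out);
}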

◆ BroadcastableOpCategory

enum class nnfw::cker::BroadcastableOpCategory : uint8_t
strong
Enumerator
kNone 
kNonBroadcast 
kFirstInputBroadcastsFast 
kSecondInputBroadcastsFast 
kGenericBroadcast 

Definition at line 78 of file Types.h.

◆ CachePolicy

enum class nnfw::cker::CachePolicy : std::uint8_t
strong
Enumerator
kNeverCache 
kCacheIfLargeSpeedup 
kAlwaysCache 

Definition at line 425 of file Types.h.

◆ ComparisonOpType

enum class nnfw::cker::ComparisonOpType
strong
Enumerator
Equal 
NotEqual 
Greater 
GreaterEqual 
Less 
LessEqual 

Definition at line 56 of file Types.h.

◆ DimensionType

Enumerator
kBroadcasting 
kBatch 
kFree 
kContract 
kReduce 

Definition at line 116 of file Einsum.h.

117{
118 // Batch dimensions are those present in two inputs as well as the output.
119 // They are part of the batch dimensions during Tensor contraction.
120 // Such dimensions may be broadcasting dimensions (those mapping to
121 // ellipsis)
122 // or explicit batch dimensions corresponding to named axis labels.
123 kBroadcasting = 0,
124 kBatch = 1,
125 // Free dimensions are present in exactly one of the inputs, and also the
126 // output. These are non-contracted axes in the Tensor contraction.
127 kFree = 2,
128 // Contract dimensions are present in two inputs, but not the output. These
129 // dimensions are contracted in Tensor contraction.
130 kContract = 3,
131 // Reduce dimensions are present in exactly one input; and not in the output
132 // and are summed over prior to Tensor contraction.
133 kReduce = 4,
134};
@ kBroadcasting
Definition Einsum.h:123
@ kContract
Definition Einsum.h:130
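
As a concrete illustration of these categories (following the comments above): in the einsum expression bij,bjk->bik, label b appears in both inputs and in the output (kBatch), i and k each appear in exactly one input and in the output (kFree), and j appears in both inputs but not in the output, so it is contracted (kContract).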

◆ FusedActivationFunctionType

Enumerator
kNone 
kRelu6 
kRelu1 
kRelu 
kTanh 
kSigmoid 

Definition at line 31 of file Types.h.

◆ LSTMKernelType

Enumerator
kTfLiteLSTMFullKernel 
kTfLiteLSTMBasicKernel 

Definition at line 284 of file Types.h.

285{
286 kTfLiteLSTMFullKernel = 0,
287 kTfLiteLSTMBasicKernel,
288};
@ kTfLiteLSTMFullKernel
Definition Types.h:286
@ kTfLiteLSTMBasicKernel
Definition Types.h:287

◆ Order

enum class nnfw::cker::Order
strong
Enumerator
kColMajor 
kRowMajor 

Definition at line 419 of file Types.h.

◆ PaddingType

enum class nnfw::cker::PaddingType
strong
Enumerator
kNone 
kSame 
kValid 

Definition at line 40 of file Types.h.

41{
42 kNone = 0,
43 kSame = 1,
44 kValid = 2,
45};

◆ QuantizationFlavor

enum class nnfw::cker::QuantizationFlavor
strong
Enumerator
kFloatingPoint 
kIntegerWithUniformMultiplier 
kIntegerWithPerRowMultiplier 

Definition at line 474 of file Types.h.

475{
476 // Floating-point Gemm: the accumulators are not multiplied by any
477 // 'multiplier'.
478 kFloatingPoint,
479 // Quantized Gemm using a single multiplier for all accumulators.
480 kIntegerWithUniformMultiplier,
481 // Quantized Gemm using a separate multipliers for accumulators of each
482 // row of the destination matrix. This is what is called 'per-channel'
483 // in GemmParams. Here we use the more specific 'per-row' terminology
484 // to allow for the possibility of 'per-column' in the future, and to
485 // allow for that to be a separate code path in some back-end such as
486 // gemmlowp.
487 kIntegerWithPerRowMultiplier,
488};

◆ RoPEMode

enum class nnfw::cker::RoPEMode
strong
Enumerator
kGptNeox 
kGptJ 

Definition at line 66 of file Types.h.

67{
68 kGptNeox = 0,
69 kGptJ = 1,
70};

Function Documentation

◆ Abs()

void nnfw::cker::Abs ( const Shape input_shape,
const float *  input_data,
const Shape output_shape,
float *  output_data 
)
inline

Definition at line 51 of file Elementwise.h.

53{
54 auto input_map = MapAsVector(input_data, input_shape);
55 auto output_map = MapAsVector(output_data, output_shape);
56 output_map.array() = input_map.array().abs();
57}
VectorMap< Scalar > MapAsVector(Scalar *data, const Dims< N > &dims)
Definition Matrix.h:40
const luci_interpreter::RuntimeShape output_shape

References MapAsVector(), and output_shape.
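
A minimal usage sketch for Abs(); the Shape initializer-list constructor and the include paths are assumptions.

#include <cker/Shape.h>                  // assumed include path
#include <cker/operation/Elementwise.h>  // assumed include path

void AbsExample()
{
  const nnfw::cker::Shape shape{1, 4};
  const float input[] = {-1.0f, 2.0f, -3.0f, 4.0f};
  float output[4];
  nnfw::cker::Abs(shape, input, shape, output); // output = {1, 2, 3, 4}
}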

◆ ActivationFunctionWithMinMax()

◆ AddN()

template<typename T >
void nnfw::cker::AddN ( const Shape input_shape,
const size_t  num_inputs,
const T **  input_data,
T *  output_data 
)

Definition at line 29 of file AddN.h.

30{
31 const size_t size = input_shape.FlatSize();
32 for (size_t i = 0; i < size; ++i)
33 {
34 T x = 0;
35 for (size_t j = 0; j < num_inputs; ++j)
36 {
37 x += input_data[j][i];
38 }
39 output_data[i] = x;
40 }
41}
int FlatSize() const
Definition Shape.h:181
int32_t size[5]
Definition Slice.cpp:35

References nnfw::cker::Shape::FlatSize(), and size.
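
A minimal usage sketch for AddN(); all inputs must share input_shape, and the include paths are assumptions.

#include <cker/Shape.h>           // assumed include path
#include <cker/operation/AddN.h>  // assumed include path

void AddNExample()
{
  const nnfw::cker::Shape shape{2, 2};
  const float a[] = {1.f, 2.f, 3.f, 4.f};
  const float b[] = {10.f, 20.f, 30.f, 40.f};
  const float c[] = {100.f, 200.f, 300.f, 400.f};
  const float *inputs[] = {a, b, c};
  float out[4];
  nnfw::cker::AddN(shape, 3, inputs, out); // out = {111, 222, 333, 444}
}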

◆ ApplyActivationToVector()

void nnfw::cker::ApplyActivationToVector ( const float *  vector,
int  v_size,
FusedActivationFunctionType  activation,
float *  result 
)
inline

Definition at line 109 of file TensorUtils.h.

111{
112 PortableApplyActivationToVector(vector, v_size, activation, result);
113}
void PortableApplyActivationToVector(const float *vector, int v_size, FusedActivationFunctionType activation, float *result)

References PortableApplyActivationToVector().

Referenced by CalculateLstmGateFloat(), CalculateLstmOutputFloat(), FullyConnected(), FullyConnectedHybrid(), FullyConnectedSparseWeight16x1(), and FullyConnectedSparseWeightRandom().
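
A minimal usage sketch that clamps a buffer with a fused ReLU6 activation; the include path is an assumption (the definition above is cited from TensorUtils.h).

#include <cker/TensorUtils.h> // assumed include path

// Writes min(max(input[i], 0), 6) into output[i] via the portable fallback.
void Relu6Example(const float *input, int size, float *output)
{
  nnfw::cker::ApplyActivationToVector(input, size,
                                      nnfw::cker::FusedActivationFunctionType::kRelu6,
                                      output);
}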

◆ ArgMinMax()

template<typename T1 , typename T2 , typename Cmp >
void nnfw::cker::ArgMinMax ( const Shape input1_shape,
const T1 *  input1_data,
const Shape output_shape,
T2 *  output_data,
int32_t  axis,
const Cmp &  cmp 
)

Definition at line 29 of file ArgMinMax.h.

32{
33 assert(input1_shape.DimensionsCount() > 0);
34 assert(input1_shape.DimensionsCount() - 1 == output_shape.DimensionsCount());
35 if (axis < 0)
36 {
37 axis += input1_shape.DimensionsCount();
38 }
39 const int axis_size = input1_shape.Dims(axis);
40
41 int outer_size = 1;
42 for (int i = 0; i < axis; ++i)
43 {
44 assert(input1_shape.Dims(i) == output_shape.Dims(i));
45 outer_size *= input1_shape.Dims(i);
46 }
47
48 int inner_size = 1;
49 const int dims_count = input1_shape.DimensionsCount();
50 for (int i = axis + 1; i < dims_count; ++i)
51 {
52 assert(input1_shape.Dims(i) == output_shape.Dims(i - 1));
53 inner_size *= input1_shape.Dims(i);
54 }
55 for (int outer = 0; outer < outer_size; ++outer)
56 {
57 for (int inner = 0; inner < inner_size; ++inner)
58 {
59 auto min_max_value = input1_data[outer * axis_size * inner_size + inner];
60 T2 min_max_index = 0;
61 for (int i = 1; i < axis_size; ++i)
62 {
63 const auto &curr_value = input1_data[(outer * axis_size + i) * inner_size + inner];
64 if (cmp(curr_value, min_max_value))
65 {
66 min_max_value = curr_value;
67 min_max_index = static_cast<T2>(i);
68 }
69 }
70 output_data[outer * inner_size + inner] = min_max_index;
71 }
72 }
73}
int32_t DimensionsCount() const
Definition Shape.h:91
int32_t Dims(int i) const
Definition Shape.h:92

References nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), and output_shape.
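
A minimal argmax sketch: std::greater keeps the running maximum, and the output rank is the input rank minus one, as asserted above. Include paths are assumptions.

#include <cstdint>
#include <functional>
#include <cker/Shape.h>                // assumed include path
#include <cker/operation/ArgMinMax.h>  // assumed include path

void ArgMaxExample()
{
  const nnfw::cker::Shape in_shape{2, 3};
  const float in[] = {0.1f, 0.7f, 0.2f,
                      0.9f, 0.3f, 0.5f};
  const nnfw::cker::Shape out_shape{2};
  int32_t out[2];
  nnfw::cker::ArgMinMax(in_shape, in, out_shape, out, /*axis=*/1, std::greater<float>());
  // out = {1, 0}: index of the largest value along the last axis of each row
}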

◆ AveragePool()

template<typename T >
void nnfw::cker::AveragePool ( const PoolParams ,
const Shape ,
const T *  ,
const Shape ,
T *   
)

Definition at line 36 of file AveragePool.h.

37{
38 static_assert(std::is_integral<T>::value || std::is_floating_point<T>::value,
39 "cker::MaxPool : This function supports only integer or floating point");
40 throw std::runtime_error("cker::AveragePool : Unsupported data type");
41}
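
A minimal float pooling sketch using the AveragePool< float > specialization listed above. The float_activation_min/max field names and the include paths are assumptions; the other PoolParams fields are referenced in the quantized definitions below.

#include <limits>
#include <cker/Shape.h>                  // assumed include path
#include <cker/operation/AveragePool.h>  // assumed include path

void AveragePoolExample()
{
  nnfw::cker::PoolParams params{};
  params.stride_height = 1;
  params.stride_width = 1;
  params.filter_height = 2;
  params.filter_width = 2;
  params.padding_values.height = 0;
  params.padding_values.width = 0;
  params.float_activation_min = std::numeric_limits<float>::lowest(); // assumed field name
  params.float_activation_max = std::numeric_limits<float>::max();    // assumed field name

  const nnfw::cker::Shape in_shape{1, 2, 2, 1}; // NHWC
  const float in[] = {1.f, 2.f, 3.f, 4.f};
  const nnfw::cker::Shape out_shape{1, 1, 1, 1};
  float out[1];
  nnfw::cker::AveragePool<float>(params, in_shape, in, out_shape, out); // out[0] = 2.5
}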

◆ AveragePool16()

void nnfw::cker::AveragePool16 ( const PoolParams params,
const Shape input_shape,
const uint8_t *  input_data,
const Shape output_shape,
uint8_t *  output_data 
)
inline

Definition at line 106 of file AveragePool.h.

109{
110 // Here, and in other pooling ops, in order to maintain locality of reference,
111 // to minimize some recalculations, and to load into NEON vector registers, we
112 // use an inner loop down the depth. Since depths can be large and hence we
113 // would need arbitrarily large temporary storage, we divide the work up into
114 // depth tranches just within the batch loop.
115 static constexpr int kPoolingAccTrancheSize = 256;
116
117 assert(params.quantized_activation_min <= params.quantized_activation_max);
118 assert(input_shape.DimensionsCount() == 4);
119 assert(output_shape.DimensionsCount() == 4);
120 const int batches = MatchingDim(input_shape, 0, output_shape, 0);
121 const int depth = MatchingDim(input_shape, 3, output_shape, 3);
122 const int input_height = input_shape.Dims(1);
123 const int input_width = input_shape.Dims(2);
124 const int output_height = output_shape.Dims(1);
125 const int output_width = output_shape.Dims(2);
126 const int stride_height = params.stride_height;
127 const int stride_width = params.stride_width;
128
129 uint16_t acc[kPoolingAccTrancheSize];
130 for (int batch = 0; batch < batches; ++batch)
131 {
132 // We proceed through the depth in tranches (see comment above). The
133 // depth_base is the depth at the beginning of the tranche. The
134 // tranche_depth is the depth dimension of the tranche.
135 for (int depth_base = 0; depth_base < depth; depth_base += kPoolingAccTrancheSize)
136 {
137 const int tranche_depth = std::min(depth - depth_base, kPoolingAccTrancheSize);
138 for (int out_y = 0; out_y < output_height; ++out_y)
139 {
140 for (int out_x = 0; out_x < output_width; ++out_x)
141 {
142 const int in_x_origin = (out_x * stride_width) - params.padding_values.width;
143 const int in_y_origin = (out_y * stride_height) - params.padding_values.height;
144 const int filter_x_start = std::max(0, -in_x_origin);
145 const int filter_x_end = std::min(params.filter_width, input_width - in_x_origin);
146 const int filter_y_start = std::max(0, -in_y_origin);
147 const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin);
148 const int filter_count =
149 (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start);
150 memset(acc, 0, tranche_depth * sizeof(acc[0]));
151 const uint8_t *input_ptr =
152 input_data + depth_base +
153 depth * (in_x_origin + input_width * (in_y_origin + input_height * batch));
154 for (int fy = filter_y_start; fy < filter_y_end; fy++)
155 {
156 const uint8_t *input_row_ptr = input_ptr + depth * (fy * input_width + filter_x_start);
157 for (int fx = filter_x_start; fx < filter_x_end; fx++)
158 {
159 const uint8_t *input_channel_ptr = input_row_ptr;
160 int channel = 0;
161#ifdef USE_NEON
162 for (; channel <= tranche_depth - 16; channel += 16)
163 {
164 uint16x8_t acc_reg[2];
165 for (int i = 0; i < 2; i++)
166 {
167 acc_reg[i] = vld1q_u16(acc + channel + 8 * i);
168 }
169 uint8x16_t input_reg = vld1q_u8(input_channel_ptr);
170 input_channel_ptr += 16;
171 acc_reg[0] = vaddw_u8(acc_reg[0], vget_low_u8(input_reg));
172 acc_reg[1] = vaddw_u8(acc_reg[1], vget_high_u8(input_reg));
173 for (int i = 0; i < 2; i++)
174 {
175 vst1q_u16(acc + channel + 8 * i, acc_reg[i]);
176 }
177 }
178 for (; channel <= tranche_depth - 8; channel += 8)
179 {
180 uint16x8_t acc_reg = vld1q_u16(acc + channel);
181 uint8x8_t input_reg = vld1_u8(input_channel_ptr);
182 input_channel_ptr += 8;
183 acc_reg = vaddw_u8(acc_reg, input_reg);
184 vst1q_u16(acc + channel, acc_reg);
185 }
186#endif
187 for (; channel < tranche_depth; ++channel)
188 {
189 acc[channel] += *input_channel_ptr++;
190 }
191 input_row_ptr += depth;
192 }
193 }
194 uint8_t *output_ptr = output_data + Offset(output_shape, batch, out_y, out_x, depth_base);
195 int channel = 0;
196#ifdef USE_NEON
197#define AVGPOOL_DIVIDING_BY(FILTER_COUNT) \
198 if (filter_count == FILTER_COUNT) \
199 { \
200 for (; channel <= tranche_depth - 8; channel += 8) \
201 { \
202 uint16_t buf[8]; \
203 for (int i = 0; i < 8; i++) \
204 { \
205 buf[i] = (acc[channel + i] + FILTER_COUNT / 2) / FILTER_COUNT; \
206 } \
207 uint8x8_t buf8 = vqmovn_u16(vld1q_u16(buf)); \
208 buf8 = vmin_u8(buf8, vdup_n_u8(params.quantized_activation_max)); \
209 buf8 = vmax_u8(buf8, vdup_n_u8(params.quantized_activation_min)); \
210 vst1_u8(output_ptr + channel, buf8); \
211 } \
212 }
213 AVGPOOL_DIVIDING_BY(9)
214 AVGPOOL_DIVIDING_BY(15)
215#undef AVGPOOL_DIVIDING_BY
216 for (; channel <= tranche_depth - 8; channel += 8)
217 {
218 uint16_t buf[8];
219 for (int i = 0; i < 8; i++)
220 {
221 buf[i] = (acc[channel + i] + filter_count / 2) / filter_count;
222 }
223 uint8x8_t buf8 = vqmovn_u16(vld1q_u16(buf));
224 buf8 = vmin_u8(buf8, vdup_n_u8(params.quantized_activation_max));
225 buf8 = vmax_u8(buf8, vdup_n_u8(params.quantized_activation_min));
226 vst1_u8(output_ptr + channel, buf8);
227 }
228#endif
229 for (; channel < tranche_depth; ++channel)
230 {
231 uint8_t a = (acc[channel] + filter_count / 2) / filter_count;
232 a = std::max<uint16_t>(a, params.quantized_activation_min);
233 a = std::min<uint16_t>(a, params.quantized_activation_max);
234 output_ptr[channel] = static_cast<uint8_t>(a);
235 }
236 }
237 }
238 }
239 }
240}
int Offset(const Dims< 4 > &dims, int i0, int i1, int i2, int i3)
Definition Dims.h:64
int MatchingDim(const Shape &shape1, int index1, const Shape &shape2, int index2)
Definition Shape.h:220
int32_t quantized_activation_min
Definition Types.h:95
int32_t quantized_activation_max
Definition Types.h:96
PaddingValues padding_values
Definition Types.h:89

References nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), nnfw::cker::PoolParams::filter_height, nnfw::cker::PoolParams::filter_width, nnfw::cker::PaddingValues::height, MatchingDim(), Offset(), output_shape, nnfw::cker::PoolParams::padding_values, nnfw::cker::PoolParams::quantized_activation_max, nnfw::cker::PoolParams::quantized_activation_min, nnfw::cker::PoolParams::stride_height, nnfw::cker::PoolParams::stride_width, and nnfw::cker::PaddingValues::width.

Referenced by AveragePool< uint8_t >().

◆ AveragePool32()

void nnfw::cker::AveragePool32 ( const PoolParams params,
const Shape input_shape,
const uint8_t *  input_data,
const Shape output_shape,
uint8_t *  output_data 
)
inline

Definition at line 242 of file AveragePool.h.

245{
246
247 // Here, and in other pooling ops, in order to maintain locality of reference,
248 // to minimize some recalculations, and to load into NEON vector registers, we
249 // use an inner loop down the depth. Since depths can be large and hence we
250 // would need arbitrarily large temporary storage, we divide the work up into
251 // depth tranches just within the batch loop.
252 static constexpr int kPoolingAccTrancheSize = 256;
253
254 assert(params.quantized_activation_min <= params.quantized_activation_max);
255 assert(input_shape.DimensionsCount() == 4);
256 assert(output_shape.DimensionsCount() == 4);
257 const int batches = MatchingDim(input_shape, 0, output_shape, 0);
258 const int depth = MatchingDim(input_shape, 3, output_shape, 3);
259 const int input_height = input_shape.Dims(1);
260 const int input_width = input_shape.Dims(2);
261 const int output_height = output_shape.Dims(1);
262 const int output_width = output_shape.Dims(2);
263 const int stride_height = params.stride_height;
264 const int stride_width = params.stride_width;
265
266 uint32_t acc[kPoolingAccTrancheSize];
267 for (int batch = 0; batch < batches; ++batch)
268 {
269 // We proceed through the depth in tranches (see comment above). The
270 // depth_base is the depth at the beginning of the tranche. The
271 // tranche_depth is the depth dimension of the tranche.
272 for (int depth_base = 0; depth_base < depth; depth_base += kPoolingAccTrancheSize)
273 {
274 const int tranche_depth = std::min(depth - depth_base, kPoolingAccTrancheSize);
275 for (int out_y = 0; out_y < output_height; ++out_y)
276 {
277 for (int out_x = 0; out_x < output_width; ++out_x)
278 {
279 const int in_x_origin = (out_x * stride_width) - params.padding_values.width;
280 const int in_y_origin = (out_y * stride_height) - params.padding_values.height;
281 const int filter_x_start = std::max(0, -in_x_origin);
282 const int filter_x_end = std::min(params.filter_width, input_width - in_x_origin);
283 const int filter_y_start = std::max(0, -in_y_origin);
284 const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin);
285 const int filter_count =
286 (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start);
287 memset(acc, 0, tranche_depth * sizeof(acc[0]));
288 const uint8_t *input_ptr =
289 input_data + depth_base +
290 depth * (in_x_origin + input_width * (in_y_origin + input_height * batch));
291 for (int fy = filter_y_start; fy < filter_y_end; fy++)
292 {
293 const uint8_t *input_row_ptr = input_ptr + depth * (fy * input_width + filter_x_start);
294 for (int fx = filter_x_start; fx < filter_x_end; fx++)
295 {
296 const uint8_t *input_channel_ptr = input_row_ptr;
297 int channel = 0;
298#ifdef USE_NEON
299 for (; channel <= tranche_depth - 16; channel += 16)
300 {
301 uint16x4_t acc_reg[4];
302 uint8x16_t input_reg = vld1q_u8(input_channel_ptr);
303 input_channel_ptr += 16;
304 acc_reg[0] = vget_low_u16(vmovl_u8(vget_low_u8(input_reg)));
305 acc_reg[1] = vget_high_u16(vmovl_u8(vget_low_u8(input_reg)));
306 acc_reg[2] = vget_low_u16(vmovl_u8(vget_high_u8(input_reg)));
307 acc_reg[3] = vget_high_u16(vmovl_u8(vget_high_u8(input_reg)));
308 for (int i = 0; i < 4; i++)
309 {
310 vst1q_u32(acc + channel + 4 * i,
311 vaddw_u16(vld1q_u32(acc + channel + 4 * i), acc_reg[i]));
312 }
313 }
314 for (; channel <= tranche_depth - 8; channel += 8)
315 {
316 uint16x4_t acc_reg[2];
317 uint16x8_t input_reg = vmovl_u8(vld1_u8(input_channel_ptr));
318 input_channel_ptr += 8;
319 acc_reg[0] = vget_low_u16(input_reg);
320 acc_reg[1] = vget_high_u16(input_reg);
321 for (int i = 0; i < 2; i++)
322 {
323 vst1q_u32(acc + channel + 4 * i,
324 vaddw_u16(vld1q_u32(acc + channel + 4 * i), acc_reg[i]));
325 }
326 }
327#endif
328 for (; channel < tranche_depth; ++channel)
329 {
330 acc[channel] += *input_channel_ptr++;
331 }
332 input_row_ptr += depth;
333 }
334 }
335 uint8_t *output_ptr = output_data + Offset(output_shape, batch, out_y, out_x, depth_base);
336 int channel = 0;
337#ifdef USE_NEON
338#define AVGPOOL_DIVIDING_BY(FILTER_COUNT) \
339 if (filter_count == FILTER_COUNT) \
340 { \
341 for (; channel <= tranche_depth - 8; channel += 8) \
342 { \
343 uint16_t buf[8]; \
344 for (int i = 0; i < 8; i++) \
345 { \
346 buf[i] = (acc[channel + i] + FILTER_COUNT / 2) / FILTER_COUNT; \
347 } \
348 uint8x8_t buf8 = vqmovn_u16(vld1q_u16(buf)); \
349 buf8 = vmin_u8(buf8, vdup_n_u8(params.quantized_activation_max)); \
350 buf8 = vmax_u8(buf8, vdup_n_u8(params.quantized_activation_min)); \
351 vst1_u8(output_ptr + channel, buf8); \
352 } \
353 }
354 AVGPOOL_DIVIDING_BY(9)
355 AVGPOOL_DIVIDING_BY(15)
356#undef AVGPOOL_DIVIDING_BY
357 for (; channel <= tranche_depth - 8; channel += 8)
358 {
359 uint16_t buf[8];
360 for (int i = 0; i < 8; i++)
361 {
362 buf[i] = (acc[channel + i] + filter_count / 2) / filter_count;
363 }
364 uint8x8_t buf8 = vqmovn_u16(vld1q_u16(buf));
365 buf8 = vmin_u8(buf8, vdup_n_u8(params.quantized_activation_max));
366 buf8 = vmax_u8(buf8, vdup_n_u8(params.quantized_activation_min));
367 vst1_u8(output_ptr + channel, buf8);
368 }
369#endif
370 for (; channel < tranche_depth; ++channel)
371 {
372 uint16_t a = (acc[channel] + filter_count / 2) / filter_count;
373 a = std::max<uint16_t>(a, params.quantized_activation_min);
374 a = std::min<uint16_t>(a, params.quantized_activation_max);
375 output_ptr[channel] = static_cast<uint8_t>(a);
376 }
377 }
378 }
379 }
380 }
381}

References nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), nnfw::cker::PoolParams::filter_height, nnfw::cker::PoolParams::filter_width, nnfw::cker::PaddingValues::height, MatchingDim(), Offset(), output_shape, nnfw::cker::PoolParams::padding_values, nnfw::cker::PoolParams::quantized_activation_max, nnfw::cker::PoolParams::quantized_activation_min, nnfw::cker::PoolParams::stride_height, nnfw::cker::PoolParams::stride_width, and nnfw::cker::PaddingValues::width.

Referenced by AveragePool< uint8_t >().

◆ AveragePool< float >()

template<>
void nnfw::cker::AveragePool< float > ( const PoolParams params,
const Shape input_shape,
const float *  input_data,
const Shape output_shape,
float *  output_data 
)

Definition at line 44 of file AveragePool.h.

46{
47 assert(input_shape.DimensionsCount() == 4);
48 assert(output_shape.DimensionsCount() == 4);
49 const int batches = MatchingDim(input_shape, 0, output_shape, 0);
50 const int input_height = input_shape.Dims(1);
51 const int input_width = input_shape.Dims(2);
52 const int output_height = output_shape.Dims(1);
53 const int output_width = output_shape.Dims(2);
54 const int stride_height = params.stride_height;
55 const int stride_width = params.stride_width;
56
57 // TODO(benoitjacob) make this a proper reference impl without Eigen!
58 const auto in_mat = MapAsMatrixWithLastDimAsRows(input_data, input_shape);
59 auto out_mat = MapAsMatrixWithLastDimAsRows(output_data, output_shape);
60 // TODO(benoitjacob) get rid of the dynamic memory allocation here!
61 Eigen::VectorXf out_count(out_mat.cols());
62 out_count.setZero();
63 // Prefill the output to 0.
64 out_mat.setZero();
65 for (int b = 0; b < batches; ++b)
66 {
67 for (int h = 0; h < input_height; ++h)
68 {
69 for (int w = 0; w < input_width; ++w)
70 {
71 // (h_start, h_end) * (w_start, w_end) is the range that the input
72 // vector projects to.
73 int hpad = h + params.padding_values.height;
74 int wpad = w + params.padding_values.width;
75 int h_start =
76 (hpad < params.filter_height) ? 0 : (hpad - params.filter_height) / stride_height + 1;
77 int h_end = std::min(hpad / stride_height + 1, output_height);
78 int w_start =
79 (wpad < params.filter_width) ? 0 : (wpad - params.filter_width) / stride_width + 1;
80 int w_end = std::min(wpad / stride_width + 1, output_width);
81 // compute elementwise sum
82 for (int ph = h_start; ph < h_end; ++ph)
83 {
84 for (int pw = w_start; pw < w_end; ++pw)
85 {
86 int out_offset = NodeOffset(b, ph, pw, output_height, output_width);
87 out_mat.col(out_offset) += in_mat.col(NodeOffset(b, h, w, input_height, input_width));
88 out_count(out_offset)++;
89 }
90 }
91 }
92 }
93 }
94 // Divide the output by the actual number of elements being averaged over
95 assert(out_count.minCoeff() > 0);
96 out_mat.array().rowwise() /= out_count.transpose().array();
97
98 const int flat_size = output_shape.FlatSize();
99 for (int i = 0; i < flat_size; ++i)
100 {
101 output_data[i] = ActivationFunctionWithMinMax(output_data[i], params.float_activation_min,
102 params.float_activation_max);
103 }
104}
int NodeOffset(int b, int h, int w, int height, int width)
Definition FeatureMap.h:21
MatrixMap< Scalar > MapAsMatrixWithLastDimAsRows(Scalar *data, const Shape &shape)
Definition Utils.h:60
T ActivationFunctionWithMinMax(T x, T output_activation_min, T output_activation_max)
Definition Utils.h:43
float float_activation_max
Definition Types.h:99
float float_activation_min
Definition Types.h:98

References ActivationFunctionWithMinMax(), nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), nnfw::cker::PoolParams::filter_height, nnfw::cker::PoolParams::filter_width, nnfw::cker::PoolParams::float_activation_max, nnfw::cker::PoolParams::float_activation_min, nnfw::cker::PaddingValues::height, MapAsMatrixWithLastDimAsRows(), MatchingDim(), NodeOffset(), output_shape, nnfw::cker::PoolParams::padding_values, nnfw::cker::PoolParams::stride_height, nnfw::cker::PoolParams::stride_width, and nnfw::cker::PaddingValues::width.
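
For reference, a minimal usage sketch of this specialization (not taken from the library sources; the include path, shapes, and values are illustrative assumptions):

#include <cker/operation/AveragePool.h> // assumed include path
#include <limits>
#include <vector>

void AveragePoolFloatSketch()
{
  nnfw::cker::PoolParams params{};
  params.filter_height = 2;
  params.filter_width = 2;
  params.stride_height = 2;
  params.stride_width = 2;
  params.padding_values.height = 0;
  params.padding_values.width = 0;
  params.float_activation_min = std::numeric_limits<float>::lowest();
  params.float_activation_max = std::numeric_limits<float>::max();

  // NHWC: pool a 1x4x4x1 input down to 1x2x2x1 with 2x2 windows.
  nnfw::cker::Shape input_shape(4, 1);
  input_shape.SetDim(1, 4);
  input_shape.SetDim(2, 4);
  nnfw::cker::Shape output_shape(4, 1);
  output_shape.SetDim(1, 2);
  output_shape.SetDim(2, 2);

  std::vector<float> input(16, 1.0f);
  std::vector<float> output(4, 0.0f);
  nnfw::cker::AveragePool<float>(params, input_shape, input.data(), output_shape, output.data());
  // Each output element is the mean of its 2x2 window (1.0f for this constant input).
}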

◆ AveragePool< int8_t >()

template<>
void nnfw::cker::AveragePool< int8_t > ( const PoolParams params,
const Shape input_shape,
const int8_t *  input_data,
const Shape output_shape,
int8_t *  output_data 
)

Definition at line 399 of file AveragePool.h.

401{
402 // Here, and in other pooling ops, in order to maintain locality of reference,
403 // to minimize some recalculations, and to load into NEON vector registers, we
404 // use an inner loop down the depth. Since depths can be large and hence we
405 // would need arbitrarily large temporary storage, we divide the work up into
406 // depth tranches just within the batch loop.
407 static constexpr int kPoolingAccTrancheSize = 256;
408
409 assert(params.quantized_activation_min <= params.quantized_activation_max);
410 assert(input_shape.DimensionsCount() == 4);
411 assert(output_shape.DimensionsCount() == 4);
412 const int batches = MatchingDim(input_shape, 0, output_shape, 0);
413 const int depth = MatchingDim(input_shape, 3, output_shape, 3);
414 const int input_height = input_shape.Dims(1);
415 const int input_width = input_shape.Dims(2);
416 const int output_height = output_shape.Dims(1);
417 const int output_width = output_shape.Dims(2);
418 const int stride_height = params.stride_height;
419 const int stride_width = params.stride_width;
420
421 int32_t acc[kPoolingAccTrancheSize];
422 for (int batch = 0; batch < batches; ++batch)
423 {
424 // We proceed through the depth in tranches (see comment above). The
425 // depth_base is the depth at the beginning of the tranche. The
426 // tranche_depth is the depth dimension of the tranche.
427 for (int depth_base = 0; depth_base < depth; depth_base += kPoolingAccTrancheSize)
428 {
429 const int tranche_depth = std::min(depth - depth_base, kPoolingAccTrancheSize);
430 for (int out_y = 0; out_y < output_height; ++out_y)
431 {
432 for (int out_x = 0; out_x < output_width; ++out_x)
433 {
434 const int in_x_origin = (out_x * stride_width) - params.padding_values.width;
435 const int in_y_origin = (out_y * stride_height) - params.padding_values.height;
436 const int filter_x_start = std::max(0, -in_x_origin);
437 const int filter_x_end = std::min(params.filter_width, input_width - in_x_origin);
438 const int filter_y_start = std::max(0, -in_y_origin);
439 const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin);
440 const int filter_count =
441 (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start);
442 memset(acc, 0, tranche_depth * sizeof(acc[0]));
443 const int8_t *input_ptr =
444 input_data + depth_base +
445 depth * (in_x_origin + input_width * (in_y_origin + input_height * batch));
446 for (int fy = filter_y_start; fy < filter_y_end; fy++)
447 {
448 const int8_t *input_row_ptr = input_ptr + depth * (fy * input_width + filter_x_start);
449 for (int fx = filter_x_start; fx < filter_x_end; fx++)
450 {
451 const int8_t *input_channel_ptr = input_row_ptr;
452 int channel = 0;
453#ifdef USE_NEON
454 for (; channel <= tranche_depth - 16; channel += 16)
455 {
456 int16x4_t acc_reg[4];
457 int8x16_t input_reg = vld1q_s8(input_channel_ptr);
458 input_channel_ptr += 16;
459 acc_reg[0] = vget_low_s16(vmovl_s8(vget_low_s8(input_reg)));
460 acc_reg[1] = vget_high_s16(vmovl_s8(vget_low_s8(input_reg)));
461 acc_reg[2] = vget_low_s16(vmovl_s8(vget_high_s8(input_reg)));
462 acc_reg[3] = vget_high_s16(vmovl_s8(vget_high_s8(input_reg)));
463 for (int i = 0; i < 4; i++)
464 {
465 vst1q_s32(acc + channel + 4 * i,
466 vaddw_s16(vld1q_s32(acc + channel + 4 * i), acc_reg[i]));
467 }
468 }
469 for (; channel <= tranche_depth - 8; channel += 8)
470 {
471 int16x4_t acc_reg[2];
472 int16x8_t input_reg = vmovl_s8(vld1_s8(input_channel_ptr));
473 input_channel_ptr += 8;
474 acc_reg[0] = vget_low_s16(input_reg);
475 acc_reg[1] = vget_high_s16(input_reg);
476 for (int i = 0; i < 2; i++)
477 {
478 vst1q_s32(acc + channel + 4 * i,
479 vaddw_s16(vld1q_s32(acc + channel + 4 * i), acc_reg[i]));
480 }
481 }
482#endif
483 for (; channel < tranche_depth; ++channel)
484 {
485 acc[channel] += *input_channel_ptr++;
486 }
487 input_row_ptr += depth;
488 }
489 }
490 int8_t *output_ptr = output_data + Offset(output_shape, batch, out_y, out_x, depth_base);
491 int channel = 0;
492#ifdef USE_NEON
493 for (; channel <= tranche_depth - 8; channel += 8)
494 {
495 int16_t buf[8];
496 for (int i = 0; i < 8; i++)
497 {
498 buf[i] = acc[channel + i] > 0 ? (acc[channel + i] + filter_count / 2) / filter_count
499 : (acc[channel + i] - filter_count / 2) / filter_count;
500 }
501 int8x8_t buf8 = vqmovn_s16(vld1q_s16(buf));
502 buf8 = vmin_s8(buf8, vdup_n_s8(params.quantized_activation_max));
503 buf8 = vmax_s8(buf8, vdup_n_s8(params.quantized_activation_min));
504 vst1_s8(output_ptr + channel, buf8);
505 }
506#endif
507 for (; channel < tranche_depth; ++channel)
508 {
509 int16_t a = acc[channel] > 0 ? (acc[channel] + filter_count / 2) / filter_count
510 : (acc[channel] - filter_count / 2) / filter_count;
511 a = std::max<int16_t>(a, params.quantized_activation_min);
512 a = std::min<int16_t>(a, params.quantized_activation_max);
513 output_ptr[channel] = static_cast<int8_t>(a);
514 }
515 }
516 }
517 }
518 }
519}

References nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), nnfw::cker::PoolParams::filter_height, nnfw::cker::PoolParams::filter_width, nnfw::cker::PaddingValues::height, MatchingDim(), Offset(), output_shape, nnfw::cker::PoolParams::padding_values, nnfw::cker::PoolParams::quantized_activation_max, nnfw::cker::PoolParams::quantized_activation_min, nnfw::cker::PoolParams::stride_height, nnfw::cker::PoolParams::stride_width, and nnfw::cker::PaddingValues::width.

◆ AveragePool< uint8_t >()

template<>
void nnfw::cker::AveragePool< uint8_t > ( const PoolParams params,
const Shape input_shape,
const uint8_t *  input_data,
const Shape output_shape,
uint8_t *  output_data 
)

Definition at line 384 of file AveragePool.h.

387{
388 if (params.filter_height * params.filter_width > 16 * 16)
389 {
390 AveragePool32(params, input_shape, input_data, output_shape, output_data);
391 }
392 else
393 {
394 AveragePool16(params, input_shape, input_data, output_shape, output_data);
395 }
396}
void AveragePool32(const PoolParams &params, const Shape &input_shape, const uint8_t *input_data, const Shape &output_shape, uint8_t *output_data)

References AveragePool16(), AveragePool32(), nnfw::cker::PoolParams::filter_height, nnfw::cker::PoolParams::filter_width, and output_shape.

◆ BatchToSpaceND()

template<typename T >
void nnfw::cker::BatchToSpaceND ( const Shape unextended_input1_shape,
const T *  input1_data,
const int32_t *  block_shape_data,
const int32_t *  crops_data,
const Shape unextended_output_shape,
T *  output_data 
)
inline

Definition at line 50 of file BatchToSpaceND.h.

53{
54 auto input_dim = unextended_input1_shape.DimensionsCount();
55 auto output_dim = unextended_output_shape.DimensionsCount();
56
57 assert(input_dim == 3 || input_dim == 4);
58 assert(input_dim == output_dim);
59
60 UNUSED(input_dim);
61 UNUSED(output_dim);
62
63 // Extends the input/output shape from 3D to 4D if needed, NHC -> NH1C.
64 auto extend_shape = [](const Shape &shape) {
65 if (shape.DimensionsCount() == 4)
66 {
67 return shape;
68 }
69 Shape new_shape(4, 1);
70 new_shape.SetDim(0, shape.Dims(0));
71 new_shape.SetDim(1, shape.Dims(1));
72 new_shape.SetDim(3, shape.Dims(2));
73 return new_shape;
74 };
75 const Shape input1_shape = extend_shape(unextended_input1_shape);
76 const Shape output_shape = extend_shape(unextended_output_shape);
77
78 const int32_t output_width = output_shape.Dims(2);
79 const int32_t output_height = output_shape.Dims(1);
80 const int32_t output_batch_size = output_shape.Dims(0);
81
82 const int32_t depth = input1_shape.Dims(3);
83 const int32_t input_width = input1_shape.Dims(2);
84 const int32_t input_height = input1_shape.Dims(1);
85 const int32_t input_batch_size = input1_shape.Dims(0);
86
87 const int32_t block_shape_height = block_shape_data[0];
88 const int32_t block_shape_width = block_shape_data[1];
89
90 const int32_t crops_top = crops_data[0];
91 const int32_t crops_left = crops_data[2];
92
93 for (int in_batch = 0; in_batch < input_batch_size; ++in_batch)
94 {
95 const int out_batch = in_batch % output_batch_size;
96 const int spatial_offset = in_batch / output_batch_size;
97
98 int in_h_start = 0;
99 int in_h_end = 0;
100 // GetIndexRange ensures start and end indices are in [0, output_height).
101 GetIndexRange(spatial_offset / block_shape_width - crops_top, block_shape_height, input_height,
102 output_height, &in_h_start, &in_h_end);
103
104 for (int in_h = in_h_start; in_h < in_h_end; ++in_h)
105 {
106 const int out_h = in_h * block_shape_height + spatial_offset / block_shape_width - crops_top;
107 assert(out_h >= 0);
108 assert(out_h < output_height);
109
110 int in_w_start = 0;
111 int in_w_end = 0;
112 // GetIndexRange ensures start and end indices are in [0, output_width).
113 GetIndexRange(spatial_offset % block_shape_width - crops_left, block_shape_width, input_width,
114 output_width, &in_w_start, &in_w_end);
115
116 for (int in_w = in_w_start; in_w < in_w_end; ++in_w)
117 {
118 const int out_w =
119 in_w * block_shape_width + spatial_offset % block_shape_width - crops_left;
120 assert(out_w >= 0);
121 assert(out_w < output_width);
122 T *out = output_data + Offset(output_shape, out_batch, out_h, out_w, 0);
123 const T *in = input1_data + Offset(input1_shape, in_batch, in_h, in_w, 0);
124 memcpy(out, in, depth * sizeof(T));
125 }
126 }
127 }
128}
#define UNUSED(x)
Definition Shape.h:28

References nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), GetIndexRange(), Offset(), output_shape, nnfw::cker::Shape::SetDim(), and UNUSED.
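
A small illustrative call (not from the sources; shapes and values are assumptions): four single-pixel batch entries are rearranged into one 2x2 spatial map using a 2x2 block shape and no cropping.

#include <cker/operation/BatchToSpaceND.h> // assumed include path

void BatchToSpaceNDSketch()
{
  const int32_t block_shape[2] = {2, 2};
  const int32_t crops[4] = {0, 0, 0, 0};

  nnfw::cker::Shape input_shape(4, 1);
  input_shape.SetDim(0, 4); // 4x1x1x1 (NHWC)
  nnfw::cker::Shape output_shape(4, 1);
  output_shape.SetDim(1, 2);
  output_shape.SetDim(2, 2); // 1x2x2x1

  const float input[4] = {1.f, 2.f, 3.f, 4.f};
  float output[4] = {0.f, 0.f, 0.f, 0.f};
  nnfw::cker::BatchToSpaceND(input_shape, input, block_shape, crops, output_shape, output);
  // The batch entries are interleaved across the 2x2 block: output == {1, 2, 3, 4}.
}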

◆ begin()

ShapeIterator nnfw::cker::begin ( const Shape s)
inline

Definition at line 88 of file ShapeIterator.h.

◆ BiasAndClamp()

void nnfw::cker::BiasAndClamp ( float  clamp_min,
float  clamp_max,
int  bias_size,
const float *  bias_data,
int  array_size,
float *  array_data 
)
inline

Definition at line 29 of file Common.h.

31{
32 // Note: see b/132215220: in May 2019 we thought it would be OK to replace
33 // this with the Eigen one-liner:
34 // return (array.colwise() + bias).cwiseMin(clamp_max).cwiseMin(clamp_max).
35 // This turned out to severely regress performance: +4ms (i.e. 8%) on
36 // MobileNet v2 / 1.0 / 224. So we keep custom NEON code for now.
37 assert((array_size % bias_size) == 0);
38#ifdef USE_NEON
39 float *array_ptr = array_data;
40 float *array_end_ptr = array_ptr + array_size;
41 const auto clamp_min_vec = vdupq_n_f32(clamp_min);
42 const auto clamp_max_vec = vdupq_n_f32(clamp_max);
43 for (; array_ptr != array_end_ptr; array_ptr += bias_size)
44 {
45 int i = 0;
46 for (; i <= bias_size - 16; i += 16)
47 {
48 auto b0 = vld1q_f32(bias_data + i);
49 auto b1 = vld1q_f32(bias_data + i + 4);
50 auto b2 = vld1q_f32(bias_data + i + 8);
51 auto b3 = vld1q_f32(bias_data + i + 12);
52 auto a0 = vld1q_f32(array_ptr + i);
53 auto a1 = vld1q_f32(array_ptr + i + 4);
54 auto a2 = vld1q_f32(array_ptr + i + 8);
55 auto a3 = vld1q_f32(array_ptr + i + 12);
56 auto x0 = vaddq_f32(a0, b0);
57 auto x1 = vaddq_f32(a1, b1);
58 auto x2 = vaddq_f32(a2, b2);
59 auto x3 = vaddq_f32(a3, b3);
60 x0 = vmaxq_f32(clamp_min_vec, x0);
61 x1 = vmaxq_f32(clamp_min_vec, x1);
62 x2 = vmaxq_f32(clamp_min_vec, x2);
63 x3 = vmaxq_f32(clamp_min_vec, x3);
64 x0 = vminq_f32(clamp_max_vec, x0);
65 x1 = vminq_f32(clamp_max_vec, x1);
66 x2 = vminq_f32(clamp_max_vec, x2);
67 x3 = vminq_f32(clamp_max_vec, x3);
68 vst1q_f32(array_ptr + i, x0);
69 vst1q_f32(array_ptr + i + 4, x1);
70 vst1q_f32(array_ptr + i + 8, x2);
71 vst1q_f32(array_ptr + i + 12, x3);
72 }
73 for (; i <= bias_size - 4; i += 4)
74 {
75 auto b = vld1q_f32(bias_data + i);
76 auto a = vld1q_f32(array_ptr + i);
77 auto x = vaddq_f32(a, b);
78 x = vmaxq_f32(clamp_min_vec, x);
79 x = vminq_f32(clamp_max_vec, x);
80 vst1q_f32(array_ptr + i, x);
81 }
82 for (; i < bias_size; i++)
83 {
84 array_ptr[i] =
85 ActivationFunctionWithMinMax(array_ptr[i] + bias_data[i], clamp_min, clamp_max);
86 }
87 }
88#else // not NEON
89 for (int array_offset = 0; array_offset < array_size; array_offset += bias_size)
90 {
91 for (int i = 0; i < bias_size; i++)
92 {
93 array_data[array_offset + i] = ActivationFunctionWithMinMax(
94 array_data[array_offset + i] + bias_data[i], clamp_min, clamp_max);
95 }
96 }
97#endif
98}

References ActivationFunctionWithMinMax().

Referenced by nnfw::cker::optimized::AddBiasAndEvalActivationFunction(), and nnfw::cker::detail::GemmImplUsingEigen::Run().
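
A minimal sketch of a direct call (values are illustrative; array_size must be a multiple of bias_size, as the assert above enforces):

#include <cker/operation/Common.h> // assumed include path

void BiasAndClampSketch()
{
  float bias[4] = {0.5f, -1.0f, 2.0f, 0.0f};
  float data[8] = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f};
  // Adds bias[i % 4] to each element, then clamps the result to [0, 6].
  nnfw::cker::BiasAndClamp(/*clamp_min=*/0.0f, /*clamp_max=*/6.0f,
                           /*bias_size=*/4, bias, /*array_size=*/8, data);
}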

◆ BinaryArithmeticOp() [1/4]

template<BinaryArithmeticOpType op_type>
void nnfw::cker::BinaryArithmeticOp ( const BinaryArithmeticOpParam params,
const Shape input1_shape,
const float *  input1_data,
const Shape input2_shape,
const float *  input2_data,
const Shape output_shape,
float *  output_data 
)
inline

Definition at line 257 of file BinaryArithmeticOps.h.

261{
262 // Supported type is only float now
263 switch (op_type)
264 {
265 case nnfw::cker::BinaryArithmeticOpType::ADD:
266 optimized::Add(params, input1_shape, input1_data, input2_shape, input2_data, output_shape,
267 output_data);
268 break;
269 case nnfw::cker::BinaryArithmeticOpType::MUL:
270 optimized::Mul(params, input1_shape, input1_data, input2_shape, input2_data, output_shape,
271 output_data);
272 break;
273 case nnfw::cker::BinaryArithmeticOpType::SUB:
274 optimized::Sub(params, input1_shape, input1_data, input2_shape, input2_data, output_shape,
275 output_data);
276 break;
277 case nnfw::cker::BinaryArithmeticOpType::DIV:
278 optimized::Div(params, input1_shape, input1_data, input2_shape, input2_data, output_shape,
279 output_data);
280 break;
281 default:
282 assert(false);
283 break;
284 }
285}

References nnfw::cker::optimized::Add(), ADD, nnfw::cker::optimized::Div(), DIV, nnfw::cker::optimized::Mul(), MUL, output_shape, nnfw::cker::optimized::Sub(), and SUB.
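
A minimal sketch of an element-wise ADD on two same-shape float tensors (illustrative; it assumes BinaryArithmeticOpParam carries float_activation_min/float_activation_max bounds, which the optimized float kernels use for clamping):

#include <cker/operation/BinaryArithmeticOps.h> // assumed include path
#include <limits>

void AddFloatSketch()
{
  nnfw::cker::BinaryArithmeticOpParam op_param{};
  op_param.float_activation_min = std::numeric_limits<float>::lowest(); // assumed field
  op_param.float_activation_max = std::numeric_limits<float>::max();    // assumed field

  nnfw::cker::Shape shape(4, 1);
  shape.SetDim(1, 2);
  shape.SetDim(2, 2); // 1x2x2x1

  const float lhs[4] = {1.f, 2.f, 3.f, 4.f};
  const float rhs[4] = {10.f, 20.f, 30.f, 40.f};
  float out[4] = {0.f, 0.f, 0.f, 0.f};
  nnfw::cker::BinaryArithmeticOp<nnfw::cker::BinaryArithmeticOpType::ADD>(
    op_param, shape, lhs, shape, rhs, shape, out); // out == {11, 22, 33, 44}
}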

◆ BinaryArithmeticOp() [2/4]

template<BinaryArithmeticOpType op_type, typename T >
std::enable_if_t<!is_quant8< T >::value &&!std::is_same< T, bool >::value > nnfw::cker::BinaryArithmeticOp ( const BinaryArithmeticOpParam params,
const Shape input1_shape,
const T *  input1_data,
const Shape input2_shape,
const T *  input2_data,
const Shape output_shape,
T *  output_data 
)
inline

Definition at line 213 of file BinaryArithmeticOps.h.

216{
217 reference::BinaryArithmeticOp(params, input1_shape, input1_data, input2_shape, input2_data,
218 output_shape, output_data, GetBinaryArtithmeticFn<op_type, T>());
219}

References nnfw::cker::reference::BinaryArithmeticOp(), and output_shape.

◆ BinaryArithmeticOp() [3/4]

template<BinaryArithmeticOpType op_type, typename T >
std::enable_if_t<!is_quant8< T >::value &&std::is_same< T, bool >::value > nnfw::cker::BinaryArithmeticOp ( const BinaryArithmeticOpParam params,
const Shape input1_shape,
const T *  input1_data,
const Shape input2_shape,
const T *  input2_data,
const Shape output_shape,
T *  output_data 
)
inline

Definition at line 223 of file BinaryArithmeticOps.h.

226{
227 reference::BinaryArithmeticOp(params, input1_shape, input1_data, input2_shape, input2_data,
228 output_shape, output_data, GetBinaryArtithmeticFn<op_type, T>());
229}

References nnfw::cker::reference::BinaryArithmeticOp(), and output_shape.

◆ BinaryArithmeticOp() [4/4]

template<BinaryArithmeticOpType op_type, typename T >
std::enable_if_t< is_quant8< T >::value > nnfw::cker::BinaryArithmeticOp ( const BinaryArithmeticOpParam params,
const Shape input1_shape,
const T *  input1_data,
const Shape input2_shape,
const T *  input2_data,
const Shape output_shape,
T *  output_data 
)
inline

Definition at line 233 of file BinaryArithmeticOps.h.

236{
237 switch (op_type)
238 {
239 case nnfw::cker::BinaryArithmeticOpType::ADD:
240 case nnfw::cker::BinaryArithmeticOpType::SUB:
241 optimized::Add(params, input1_shape, input1_data, input2_shape, input2_data, output_shape,
242 output_data);
243 break;
244 case nnfw::cker::BinaryArithmeticOpType::MUL:
245 optimized::Mul(params, input1_shape, input1_data, input2_shape, input2_data, output_shape,
246 output_data);
247 break;
248 case nnfw::cker::BinaryArithmeticOpType::DIV:
249 throw std::runtime_error{"Quant8 Asymm NYI"};
250 default:
251 assert(false);
252 break;
253 }
254}

References nnfw::cker::optimized::Add(), ADD, DIV, nnfw::cker::optimized::Mul(), MUL, output_shape, and SUB.

◆ BroadcastBinaryArithmeticOp() [1/3]

template<BinaryArithmeticOpType op_type>
void nnfw::cker::BroadcastBinaryArithmeticOp ( BinaryArithmeticOpParam params,
const Shape input1_shape,
const float *  input1_data,
const Shape input2_shape,
const float *  input2_data,
const Shape output_shape,
float *  output_data 
)
inline

Definition at line 325 of file BinaryArithmeticOps.h.

329{
330 if (output_shape.DimensionsCount() > 4)
331 throw std::runtime_error(
332 std::string("cker::BroadcastBinaryArithmeticOp: Unsupported rank size : ") +
333 std::to_string(output_shape.DimensionsCount()));
334
335 // Supported type is only float now
336 switch (op_type)
337 {
338 case nnfw::cker::BinaryArithmeticOpType::ADD:
339 optimized::BroadcastAddDispatch(params, input1_shape, input1_data, input2_shape, input2_data,
340 output_shape, output_data);
341 break;
342 case nnfw::cker::BinaryArithmeticOpType::MUL:
343 optimized::BroadcastMulDispatch(params, input1_shape, input1_data, input2_shape, input2_data,
344 output_shape, output_data);
345 break;
346 case nnfw::cker::BinaryArithmeticOpType::SUB:
347 optimized::BroadcastSubDispatch(params, input1_shape, input1_data, input2_shape, input2_data,
348 output_shape, output_data);
349 break;
350 case nnfw::cker::BinaryArithmeticOpType::DIV:
351 optimized::BroadcastDivDispatch(params, input1_shape, input1_data, input2_shape, input2_data,
352 output_shape, output_data);
353 break;
354 case nnfw::cker::BinaryArithmeticOpType::POW:
355 reference::BroadcastBinaryArithmeticOpSlow<float>(
356 params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data,
357 GetBinaryArtithmeticFn<op_type, float>());
358 break;
359 default:
360 assert(false);
361 break;
362 }
363}

References ADD, nnfw::cker::optimized::BroadcastAddDispatch(), nnfw::cker::optimized::BroadcastDivDispatch(), nnfw::cker::optimized::BroadcastMulDispatch(), nnfw::cker::optimized::BroadcastSubDispatch(), DIV, MUL, output_shape, POW, and SUB.

◆ BroadcastBinaryArithmeticOp() [2/3]

template<BinaryArithmeticOpType op_type, typename T >
std::enable_if_t<!is_quant8< T >::value > nnfw::cker::BroadcastBinaryArithmeticOp ( BinaryArithmeticOpParam params,
const Shape input1_shape,
const T *  input1_data,
const Shape input2_shape,
const T *  input2_data,
const Shape output_shape,
T *  output_data 
)
inline

Definition at line 289 of file BinaryArithmeticOps.h.

292{
293 reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape,
294 input2_data, output_shape, output_data,
295 GetBinaryArtithmeticFn<op_type, T>());
296}

References nnfw::cker::reference::BroadcastBinaryArithmeticOpSlow(), and output_shape.

◆ BroadcastBinaryArithmeticOp() [3/3]

template<BinaryArithmeticOpType op_type, typename T >
std::enable_if_t< is_quant8< T >::value > nnfw::cker::BroadcastBinaryArithmeticOp ( BinaryArithmeticOpParam params,
const Shape input1_shape,
const T *  input1_data,
const Shape input2_shape,
const T *  input2_data,
const Shape output_shape,
T *  output_data 
)
inline

Definition at line 300 of file BinaryArithmeticOps.h.

303{
304 switch (op_type)
305 {
306 case nnfw::cker::BinaryArithmeticOpType::ADD:
307 case nnfw::cker::BinaryArithmeticOpType::SUB:
308 optimized::BroadcastAddDispatch(params, input1_shape, input1_data, input2_shape, input2_data,
309 output_shape, output_data);
310 break;
311 case nnfw::cker::BinaryArithmeticOpType::MUL:
312 optimized::BroadcastMulDispatch(params, input1_shape, input1_data, input2_shape, input2_data,
313 output_shape, output_data);
314 break;
315 case nnfw::cker::BinaryArithmeticOpType::DIV:
316 case nnfw::cker::BinaryArithmeticOpType::POW:
317 throw std::runtime_error{"Quant8 Asymm NYI"};
318 default:
319 assert(false);
320 break;
321 }
322}

References ADD, nnfw::cker::optimized::BroadcastAddDispatch(), nnfw::cker::optimized::BroadcastMulDispatch(), DIV, MUL, output_shape, POW, and SUB.

◆ BroadcastComparison4DSlow()

template<typename T , ComparisonFn< T > F>
void nnfw::cker::BroadcastComparison4DSlow ( const Shape input1_shape,
const T *  input1_data,
const Shape input2_shape,
const T *  input2_data,
const Shape output_shape,
bool *  output_data 
)
inline

Definition at line 123 of file Comparison.h.

126{
127 BroadcastComparison4DSlowImpl<T, F>(input1_shape, input1_data, input2_shape, input2_data,
128 output_shape, output_data);
129}

References output_shape.

◆ BroadcastComparison4DSlowImpl()

template<typename T , ComparisonFn< T > F>
void nnfw::cker::BroadcastComparison4DSlowImpl ( const Shape unextended_input1_shape,
const T *  input1_data,
const Shape unextended_input2_shape,
const T *  input2_data,
const Shape unextended_output_shape,
bool *  output_data 
)
inline

Definition at line 91 of file Comparison.h.

94{
95 assert(unextended_input1_shape.DimensionsCount() <= 4);
96 assert(unextended_input2_shape.DimensionsCount() <= 4);
97 assert(unextended_output_shape.DimensionsCount() <= 4);
98 const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape);
99
100 NdArrayDesc<4> desc1;
101 NdArrayDesc<4> desc2;
102 NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, unextended_input2_shape, &desc1,
103 &desc2);
104
105 for (int b = 0; b < output_shape.Dims(0); ++b)
106 {
107 for (int y = 0; y < output_shape.Dims(1); ++y)
108 {
109 for (int x = 0; x < output_shape.Dims(2); ++x)
110 {
111 for (int c = 0; c < output_shape.Dims(3); ++c)
112 {
113 output_data[Offset(output_shape, b, y, x, c)] =
114 F(input1_data[SubscriptToIndex(desc1, b, y, x, c)],
115 input2_data[SubscriptToIndex(desc2, b, y, x, c)]);
116 }
117 }
118 }
119 }
120}
void NdArrayDescsForElementwiseBroadcast(const Dims< N > &input0_dims, const Dims< N > &input1_dims, NdArrayDesc< N > *desc0_out, NdArrayDesc< N > *desc1_out)
Definition NDArray.h:89
int SubscriptToIndex(const NdArrayDesc< 4 > &desc, int i0, int i1, int i2, int i3)
Definition NDArray.h:54
NdArrayDesc< 4 > desc1
NdArrayDesc< 4 > desc2

References desc1, desc2, nnfw::cker::Shape::DimensionsCount(), NdArrayDescsForElementwiseBroadcast(), Offset(), output_shape, and SubscriptToIndex().

◆ BroadcastComparison4DSlowWithScaling()

template<typename T , ComparisonFn< int32_t > F>
void nnfw::cker::BroadcastComparison4DSlowWithScaling ( ComparisonParams params,
const Shape input1_shape,
const T *  input1_data,
const Shape input2_shape,
const T *  input2_data,
const Shape output_shape,
bool *  output_data 
)
inline

Definition at line 132 of file Comparison.h.

136{
137 assert(input1_shape.DimensionsCount() <= 4);
138 assert(input2_shape.DimensionsCount() <= 4);
139 assert(output_shape.DimensionsCount() <= 4);
140
141 int left_shift = params.left_shift;
142 int32_t input1_offset = params.input1_offset;
143 int32_t input1_multiplier = params.input1_multiplier;
144 int input1_shift = params.input1_shift;
145 int32_t input2_offset = params.input2_offset;
146 int32_t input2_multiplier = params.input2_multiplier;
147 int input2_shift = params.input2_shift;
148
149 NdArrayDesc<4> desc1;
150 NdArrayDesc<4> desc2;
151 NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2);
152
153 for (int b = 0; b < output_shape.Dims(0); ++b)
154 {
155 for (int y = 0; y < output_shape.Dims(1); ++y)
156 {
157 for (int x = 0; x < output_shape.Dims(2); ++x)
158 {
159 for (int c = 0; c < output_shape.Dims(3); ++c)
160 {
161 const int32_t input1_val =
162 input1_offset + input1_data[SubscriptToIndex(desc1, b, y, x, c)];
163 const int32_t input2_val =
164 input2_offset + input2_data[SubscriptToIndex(desc2, b, y, x, c)];
165 const int32_t shifted_input1_val = input1_val * (1 << left_shift);
166 const int32_t shifted_input2_val = input2_val * (1 << left_shift);
167 const int32_t scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp(
168 shifted_input1_val, input1_multiplier, input1_shift);
169 const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp(
170 shifted_input2_val, input2_multiplier, input2_shift);
171 output_data[Offset(output_shape, b, y, x, c)] = F(scaled_input1_val, scaled_input2_val);
172 }
173 }
174 }
175 }
176}
int32_t MultiplyByQuantizedMultiplierSmallerThanOneExp(int32_t x, int32_t quantized_multiplier, int left_shift)
Definition Utils.h:111

References desc1, desc2, nnfw::cker::Shape::DimensionsCount(), nnfw::cker::ComparisonParams::input1_multiplier, nnfw::cker::ComparisonParams::input1_offset, nnfw::cker::ComparisonParams::input1_shift, nnfw::cker::ComparisonParams::input2_multiplier, nnfw::cker::ComparisonParams::input2_offset, nnfw::cker::ComparisonParams::input2_shift, nnfw::cker::ComparisonParams::left_shift, MultiplyByQuantizedMultiplierSmallerThanOneExp(), NdArrayDescsForElementwiseBroadcast(), Offset(), output_shape, and SubscriptToIndex().

◆ BroadcastSelect4DSlow()

template<typename D , typename T >
void nnfw::cker::BroadcastSelect4DSlow ( const Shape input_condition_shape,
const D *  input_condition_data,
const Shape input_x_shape,
const T *  input_x_data,
const Shape input_y_shape,
const T *  input_y_data,
const Shape output_shape,
T *  output_data 
)

Definition at line 63 of file Select.h.

67{
68 assert(input_condition_shape.DimensionsCount() <= 4);
69 assert(input_x_shape.DimensionsCount() <= 4);
70 assert(input_y_shape.DimensionsCount() <= 4);
71 assert(output_shape.DimensionsCount() <= 4);
72
73 const Shape extended_output_shape = Shape::ExtendedShape(4, output_shape);
74
75 NdArrayDesc<4> desc_condition;
76 NdArrayDesc<4> desc_x;
77 NdArrayDesc<4> desc_y;
78 NdArrayDescsForElementwiseBroadcast(input_condition_shape, input_x_shape, input_y_shape,
79 &desc_condition, &desc_x, &desc_y);
80
81 // In Tensorflow, the dimensions are canonically named (batch_number, row,
82 // col, channel), with extents (batches, height, width, depth), with the
83 // trailing dimension changing most rapidly (channels has the smallest
84 // stride, typically 1 element).
85 //
86 // In generated C code, we store arrays with the dimensions reversed. The
87 // first dimension has smallest stride.
88 //
89 // We name our variables by their Tensorflow convention, but generate C code
90 // nesting loops such that the innermost loop has the smallest stride for
91 // the best cache behavior.
92 for (int b = 0; b < extended_output_shape.Dims(0); ++b)
93 {
94 for (int y = 0; y < extended_output_shape.Dims(1); ++y)
95 {
96 for (int x = 0; x < extended_output_shape.Dims(2); ++x)
97 {
98 for (int c = 0; c < extended_output_shape.Dims(3); ++c)
99 {
100 const int condition_index = SubscriptToIndex(desc_condition, b, y, x, c);
101 const int x_index = SubscriptToIndex(desc_x, b, y, x, c);
102 const int y_index = SubscriptToIndex(desc_y, b, y, x, c);
103 output_data[Offset(extended_output_shape, b, y, x, c)] =
104 input_condition_data[condition_index] ? input_x_data[x_index] : input_y_data[y_index];
105 }
106 }
107 }
108 }
109}

References nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), NdArrayDescsForElementwiseBroadcast(), Offset(), output_shape, and SubscriptToIndex().
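
A minimal sketch (illustrative shapes and values): the condition and x operands are 1x2x2x1, while y is a single broadcast value.

#include <cker/operation/Select.h> // assumed include path

void BroadcastSelectSketch()
{
  nnfw::cker::Shape cond_shape(4, 1);
  cond_shape.SetDim(1, 2);
  cond_shape.SetDim(2, 2);         // 1x2x2x1, also used for x and the output
  nnfw::cker::Shape y_shape(4, 1); // 1x1x1x1, broadcast over the output

  const bool cond[4] = {true, false, true, false};
  const float x[4] = {1.f, 2.f, 3.f, 4.f};
  const float y[1] = {-1.f};
  float out[4] = {0.f, 0.f, 0.f, 0.f};
  nnfw::cker::BroadcastSelect4DSlow(cond_shape, cond, cond_shape, x, y_shape, y, cond_shape, out);
  // out == {1, -1, 3, -1}
}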

◆ BroadcastTo()

template<typename T >
void nnfw::cker::BroadcastTo ( const Shape input_shape,
T *  input_data,
const Shape output_shape,
T *  output_data 
)
inline

Definition at line 132 of file BroadcastTo.h.

134{
135 const int input_flatsize = input_shape.FlatSize();
136
137 if (input_shape == output_shape)
138 {
139 memcpy(output_data, input_data, input_flatsize * sizeof(T));
140 return;
141 }
142
143 // Input shape's rank must be no greater than rank of output shape.
144 assert(input_shape.DimensionsCount() <= output_shape.DimensionsCount());
145
146 // It shouldn't be 0.
147 assert(output_shape.DimensionsCount());
148
149 Tensor output_tensor;
150 Tensor input_tensor;
151
152 input_tensor.shape.ReplaceWith(input_shape.DimensionsCount(), input_shape.DimsData());
153 input_tensor.buffer = input_data;
154
155 output_tensor.shape.ReplaceWith(output_shape.DimensionsCount(), output_shape.DimsData());
156 output_tensor.buffer = output_data;
157
158 const Eigen::ThreadPoolDevice &device = *eigen_support::GetThreadPoolDevice();
159
160 // Handle broadcast from Scalar.
161 if (input_flatsize == 0)
162 {
163 functor::FillFunctor<Eigen::ThreadPoolDevice, T>()(device, output_tensor.flat<T>(),
164 input_tensor.scalar<T>());
165 }
166
167 BCast bcast(BCast::FromShape(input_shape), BCast::FromShape(output_shape),
168 /*fewer_dims_optimization=*/true);
169
170 // Predict TRUE.
171 assert(bcast.IsValid());
172 // should be same.
173 assert(BCast::ToShape(bcast.output_shape()) == output_shape);
174
175 functor::BroadcastTo<Eigen::ThreadPoolDevice, T>()(device, output_tensor, output_shape,
176 input_tensor, input_shape, bcast);
177}
int32_t * DimsData()
Definition Shape.h:112

References nnfw::cker::Tensor::buffer, nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::DimsData(), nnfw::cker::Shape::FlatSize(), nnfw::cker::BCast::FromShape(), nnfw::cker::eigen_support::GetThreadPoolDevice(), nnfw::cker::BCastList< N >::IsValid(), nnfw::cker::BCast::output_shape(), output_shape, nnfw::cker::Shape::ReplaceWith(), nnfw::cker::Tensor::scalar(), nnfw::cker::Tensor::shape, and nnfw::cker::BCast::ToShape().

Referenced by nnfw::cker::train::BinaryArithmeticGrad(), and nnfw::cker::train::MeanGrad().
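
A minimal sketch (illustrative): a 1x3 row is broadcast into a 2x3 output. Note that input_data is taken as a non-const pointer.

#include <cker/operation/BroadcastTo.h> // assumed include path

void BroadcastToSketch()
{
  nnfw::cker::Shape in_shape(2, 1);
  in_shape.SetDim(1, 3); // 1x3
  nnfw::cker::Shape out_shape(2, 1);
  out_shape.SetDim(0, 2);
  out_shape.SetDim(1, 3); // 2x3

  float in[3] = {1.f, 2.f, 3.f};
  float out[6] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f};
  nnfw::cker::BroadcastTo(in_shape, in, out_shape, out);
  // Both output rows are copies of the input row.
}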

◆ buildStridedSliceParams()

template<typename T >
StridedSliceParams nnfw::cker::buildStridedSliceParams ( const T *  begin,
const T *  end,
const T *  strides,
const uint32_t  begin_mask,
const uint32_t  end_mask,
const uint32_t  shrink_axis_mask,
const uint8_t  rank 
)
inline

Definition at line 195 of file StridedSlice.h.

198{
199 StridedSliceParams op_params;
200 op_params.start_indices_count = rank;
201 op_params.stop_indices_count = rank;
202 op_params.strides_count = rank;
203
204 for (int i = 0; i < rank; ++i)
205 {
206 op_params.start_indices[i] = begin[i];
207 op_params.stop_indices[i] = end[i];
208 op_params.strides[i] = strides[i];
209
210 assert(op_params.strides[i] != 0);
211 }
212
213 op_params.begin_mask = begin_mask;
214 op_params.ellipsis_mask = 0; // NYI
215 op_params.end_mask = end_mask;
216 op_params.new_axis_mask = 0; // NYI
217 op_params.shrink_axis_mask = shrink_axis_mask;
218
219 assert(sizeof(op_params.begin_mask) * 4 >= rank);
220
221 return op_params;
222}
ShapeIterator end(const Shape &s)
int32_t begin[5]
Definition Slice.cpp:33

References begin, nnfw::cker::StridedSliceParams::begin_mask, nnfw::cker::StridedSliceParams::ellipsis_mask, end(), nnfw::cker::StridedSliceParams::end_mask, nnfw::cker::StridedSliceParams::new_axis_mask, nnfw::cker::StridedSliceParams::shrink_axis_mask, nnfw::cker::StridedSliceParams::start_indices, nnfw::cker::StridedSliceParams::start_indices_count, nnfw::cker::StridedSliceParams::stop_indices, nnfw::cker::StridedSliceParams::stop_indices_count, nnfw::cker::StridedSliceParams::strides, and nnfw::cker::StridedSliceParams::strides_count.
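
A minimal sketch (illustrative): building parameters that slice elements [1, 3) with stride 1 from a rank-1 tensor, with no masks set.

#include <cker/operation/StridedSlice.h> // assumed include path

void BuildStridedSliceParamsSketch()
{
  const int32_t begin[1] = {1};
  const int32_t end[1] = {3};
  const int32_t strides[1] = {1};
  nnfw::cker::StridedSliceParams op_params = nnfw::cker::buildStridedSliceParams(
    begin, end, strides, /*begin_mask=*/0u, /*end_mask=*/0u, /*shrink_axis_mask=*/0u, /*rank=*/1);
  // op_params can now be handed to the StridedSlice kernel together with the input shape.
}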

◆ CalculateLstmGateFloat()

void nnfw::cker::CalculateLstmGateFloat ( const float *  input,
const float *  input_to_gate_weights,
const float *  aux_input,
const float *  aux_input_to_gate_weights,
const float *  output_state,
const float *  recurrent_to_gate_weights,
const float *  cell_state,
const float *  cell_to_gate_weights,
const float *  layer_norm_coefficients,
const float *  gate_bias,
const int  n_batch,
const int  n_input,
const int  n_aux_input,
const int  n_output,
const int  n_cell,
const FusedActivationFunctionType  activation,
float *  gate,
const bool  is_input_all_zeros,
const bool  is_aux_input_all_zeros 
)
inline

Definition at line 62 of file LSTM.h.

72{
73 const bool use_peephole = (cell_to_gate_weights != nullptr);
74 const bool use_layer_norm = (layer_norm_coefficients != nullptr);
75
76 // Initialize scratch buffers with bias for regular lstm or initialize with
77 // zero for layer norm lstm.
78 if (use_layer_norm)
79 {
80 std::fill_n(gate, n_cell * n_batch, 0.0f);
81 }
82 else
83 {
84 VectorBatchVectorAssign(gate_bias, n_cell, n_batch, gate);
85 }
86 // For each batch and cell: compute input_weight * input.
87 // Skip if input is all zeros.
88 if (!is_input_all_zeros)
89 {
90 MatrixBatchVectorMultiplyAccumulate(input_to_gate_weights, n_cell, n_input, input, n_batch,
91 gate, /*result_stride=*/1);
92 }
93 // For each batch and cell: compute aux_input_weight * aux_input.
94 // Skip if auxiliary input is not available or all zeros.
95 if (!is_aux_input_all_zeros)
96 {
97 MatrixBatchVectorMultiplyAccumulate(aux_input_to_gate_weights, n_cell, n_aux_input, aux_input,
98 n_batch, gate, /*result_stride=*/1);
99 }
100 // For each batch and cell: compute recurrent_weight * output_state.
101 MatrixBatchVectorMultiplyAccumulate(recurrent_to_gate_weights, n_cell, n_output, output_state,
102 n_batch, gate, /*result_stride=*/1);
103 // For each batch and cell: compute cell_weight .* cell_state (peephole LSTM)
104 if (use_peephole)
105 {
106 VectorBatchVectorCwiseProductAccumulate(cell_to_gate_weights, n_cell, cell_state, n_batch,
107 gate);
108 }
109 // Do layer normalization (if layer norm LSTM)
110 if (use_layer_norm)
111 {
112 MeanStddevNormalization(gate, gate, n_cell, n_batch);
113 VectorBatchVectorCwiseProduct(layer_norm_coefficients, n_cell, gate, n_batch, gate);
114 VectorBatchVectorAdd(gate_bias, n_cell, n_batch, gate);
115 }
116 // Apply activation
117 ApplyActivationToVector(gate, n_batch * n_cell, activation, gate);
118}
void MeanStddevNormalization(const float *input_vector, float *output_vector, int v_size, int n_batch)
void VectorBatchVectorCwiseProduct(const T *vector, int v_size, const T *batch_vector, int n_batch, T *result)
Definition TensorUtils.h:76
void MatrixBatchVectorMultiplyAccumulate(const int8_t *matrix, const int m_rows, const int m_cols, const int8_t *vector, const float *scaling_factors, int n_batch, float *result, int result_stride)
void ApplyActivationToVector(const float *vector, int v_size, FusedActivationFunctionType activation, float *result)
void VectorBatchVectorAdd(const float *vector, int v_size, int n_batch, float *batch_vector)
Definition TensorUtils.h:39
void VectorBatchVectorCwiseProductAccumulate(const T *vector, int v_size, const T *batch_vector, int n_batch, T *result)
Definition TensorUtils.h:92

References ApplyActivationToVector(), MatrixBatchVectorMultiplyAccumulate(), MeanStddevNormalization(), VectorBatchVectorAdd(), VectorBatchVectorAssign(), VectorBatchVectorCwiseProduct(), and VectorBatchVectorCwiseProductAccumulate().

Referenced by LstmStepFloat().

◆ CalculateLstmOutputFloat()

void nnfw::cker::CalculateLstmOutputFloat ( int  n_batch,
int  n_cell,
int  n_output,
const float *  cell_state,
const float *  output_gate,
FusedActivationFunctionType  activation,
const float *  projection_weights,
const float *  projection_bias,
const float  proj_clip,
float *  output_state,
float *  scratch 
)

Definition at line 183 of file LSTM.h.

187{
188 ApplyActivationToVector(cell_state, n_batch * n_cell, activation, scratch);
189
190 // Define variable for 4th argument to avoid warning
191 // Compiler warning: passing argument 4 to restrict-qualified parameter aliases with argument 2
192 const float *cwise_product_rhs = scratch;
193 VectorVectorCwiseProduct(output_gate, cwise_product_rhs, n_batch * n_cell, scratch);
194
195 const bool use_projection = (projection_weights != nullptr);
196 const bool use_projection_bias = (projection_bias != nullptr);
197
198 if (use_projection)
199 {
200 if (use_projection_bias)
201 {
202 VectorBatchVectorAssign(projection_bias, n_output, n_batch, output_state);
203 }
204 else
205 {
206 std::fill_n(output_state, n_batch * n_output, 0.0f);
207 }
208 MatrixBatchVectorMultiplyAccumulate(projection_weights, n_output, n_cell, scratch, n_batch,
209 output_state, /*result_stride=*/1);
210 if (proj_clip > 0.0f)
211 {
212 CwiseClipping(output_state, n_batch * n_output, proj_clip);
213 }
214 }
215 else
216 {
217 std::copy_n(scratch, n_batch * n_output, output_state);
218 }
219}
void VectorVectorCwiseProduct(const T *__restrict__ vector1, const T *__restrict__ vector2, int v_size, T *__restrict__ result)
Definition TensorUtils.h:52
void VectorBatchVectorAssign(const float *vector, int v_size, int n_batch, float *batch_vector)
Definition TensorUtils.h:44

References ApplyActivationToVector(), CwiseClipping(), MatrixBatchVectorMultiplyAccumulate(), VectorBatchVectorAssign(), and VectorVectorCwiseProduct().

Referenced by LstmStepFloat().

◆ checkMatching()

template<typename... Ts>
bool nnfw::cker::checkMatching ( const Shape shape,
Ts...  check_shapes 
)
inline

Definition at line 268 of file Shape.h.

269{
270 const Shape check_shapes_array[sizeof...(Ts)] = {std::forward<Ts>(check_shapes)...};
271 for (const auto &check_shape : check_shapes_array)
272 {
273 // Check matching of shapes except the case of that two shapes can be scalar
274 if (shape.DimensionsCount() > 1 || check_shape.DimensionsCount() > 1 || shape.FlatSize() != 1 ||
275 check_shape.FlatSize() != 1)
276 {
277 if (shape.DimensionsCount() != check_shape.DimensionsCount())
278 {
279 return false;
280 }
281 for (int i = 0; i < shape.DimensionsCount(); ++i)
282 {
283 if (shape.Dims(i) != check_shape.Dims(i))
284 {
285 return false;
286 }
287 }
288 }
289 }
290 return true;
291}

References nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), and nnfw::cker::Shape::FlatSize().

Referenced by MatchingFlatSize().
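
A minimal sketch (illustrative): verifying that two operand shapes agree before running an element-wise kernel.

#include <cker/Shape.h> // assumed include path

bool ShapesAgreeSketch()
{
  nnfw::cker::Shape a(2, 1);
  a.SetDim(0, 2);
  a.SetDim(1, 3);         // 2x3
  nnfw::cker::Shape b(a); // copy of the same shape
  return nnfw::cker::checkMatching(a, b); // true: ranks and all dimensions match
}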

◆ checkOutputSize()

void nnfw::cker::checkOutputSize ( const StridedSliceParams op_params,
const Shape input_shape,
const Shape output_shape,
uint32_t  rank 
)

Definition at line 224 of file StridedSlice.h.

226{
227 [[maybe_unused]] int32_t shape_size = 0;
228
229 for (uint32_t idx = 0; idx < rank; ++idx)
230 {
231 int32_t stride = op_params.strides[idx];
232 int32_t begin = StartForAxis(op_params, input_shape, idx);
233 int32_t end = StopForAxis(op_params, input_shape, idx, begin);
234
235 // When shrinking an axis, the end position does not matter (and can be
236 // incorrect when negative indexing is used, see Issue #19260). Always use
237 // begin + 1 to generate a length 1 slice, since begin has
238 // already been adjusted for negative indices by StartForAxis.
239 const bool shrink_axis = op_params.shrink_axis_mask & (1 << idx);
240 if (shrink_axis)
241 {
242 end = begin + 1;
243 }
244
245 int32_t dim_shape = std::ceil((end - begin) / static_cast<float>(stride));
246 dim_shape = dim_shape < 0 ? 0 : dim_shape;
247 if (!shrink_axis)
248 {
249 assert(output_shape.Dims(shape_size) == dim_shape);
250 shape_size++;
251 }
252 }
253
254 assert(output_shape.DimensionsCount() == shape_size);
255}
int StopForAxis(const StridedSliceParams &params, const Shape &input_shape, int axis, int start_for_axis)
int StartForAxis(const StridedSliceParams &params, const Shape &input_shape, int axis)

References begin, end(), output_shape, nnfw::cker::StridedSliceParams::shrink_axis_mask, StartForAxis(), StopForAxis(), and nnfw::cker::StridedSliceParams::strides.

◆ Clamp()

int nnfw::cker::Clamp ( const int  v,
const int  lo,
const int  hi 
)
inline

Definition at line 32 of file StridedSlice.h.

33{
34 assert(!(hi < lo));
35 if (hi < v)
36 return hi;
37 if (v < lo)
38 return lo;
39 return v;
40}

Referenced by StartForAxis(), and StopForAxis().

◆ Comparison()

template<ComparisonFn< float > F>
void nnfw::cker::Comparison ( const Shape input1_shape,
const float *  input1_data,
const Shape input2_shape,
const float *  input2_data,
const Shape output_shape,
bool *  output_data 
)
inline

Definition at line 53 of file Comparison.h.

56{
57 ComparisonImpl<float, F>(input1_shape, input1_data, input2_shape, input2_data, output_shape,
58 output_data);
59}

References output_shape.

◆ ComparisonImpl()

template<typename T , ComparisonFn< T > F>
void nnfw::cker::ComparisonImpl ( const Shape input1_shape,
const T *  input1_data,
const Shape input2_shape,
const T *  input2_data,
const Shape output_shape,
bool *  output_data 
)
inline

Definition at line 40 of file Comparison.h.

43{
44 const int64_t flatsize = // number of data....
45 MatchingFlatSize(input1_shape, input2_shape, output_shape);
46 for (int64_t i = 0; i < flatsize; ++i)
47 {
48 output_data[i] = F(input1_data[i], input2_data[i]);
49 }
50}
int MatchingFlatSize(const Dims< N > &dims, const Dims< N > &check_dims_0)
Definition Dims.h:108

References MatchingFlatSize(), and output_shape.
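
A minimal sketch (illustrative): the second template argument is any ComparisonFn<T>, i.e. bool(*)(T, T); here a hypothetical LessThan predicate is supplied.

#include <cker/operation/Comparison.h> // assumed include path

bool LessThan(float lhs, float rhs) { return lhs < rhs; } // hypothetical predicate

void ComparisonImplSketch()
{
  nnfw::cker::Shape shape(1, 4); // rank-1 shape with 4 elements
  const float a[4] = {1.f, 5.f, 2.f, 7.f};
  const float b[4] = {3.f, 3.f, 3.f, 3.f};
  bool out[4] = {false, false, false, false};
  nnfw::cker::ComparisonImpl<float, LessThan>(shape, a, shape, b, shape, out);
  // out == {true, false, true, false}
}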

◆ ComparisonWithScaling()

template<typename T , ComparisonFn< int32_t > F>
void nnfw::cker::ComparisonWithScaling ( ComparisonParams params,
const Shape input1_shape,
const T *  input1_data,
const Shape input2_shape,
const T *  input2_data,
const Shape output_shape,
bool *  output_data 
)
inline

Definition at line 62 of file Comparison.h.

66{
67 int left_shift = params.left_shift;
68 int32_t input1_offset = params.input1_offset;
69 int32_t input1_multiplier = params.input1_multiplier;
70 int input1_shift = params.input1_shift;
71 int32_t input2_offset = params.input2_offset;
72 int32_t input2_multiplier = params.input2_multiplier;
73 int input2_shift = params.input2_shift;
74 const int64_t flatsize = MatchingFlatSize(input1_shape, input2_shape, output_shape);
75 for (int64_t i = 0; i < flatsize; ++i)
76 {
77 const int32_t input1_val = input1_offset + input1_data[i];
78 const int32_t input2_val = input2_offset + input2_data[i];
79 const int32_t shifted_input1_val = input1_val * (1 << left_shift);
80 const int32_t shifted_input2_val = input2_val * (1 << left_shift);
81 const int32_t scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp(
82 shifted_input1_val, input1_multiplier, input1_shift);
83 const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp(
84 shifted_input2_val, input2_multiplier, input2_shift);
85 output_data[i] = F(scaled_input1_val, scaled_input2_val);
86 }
87}

References nnfw::cker::ComparisonParams::input1_multiplier, nnfw::cker::ComparisonParams::input1_offset, nnfw::cker::ComparisonParams::input1_shift, nnfw::cker::ComparisonParams::input2_multiplier, nnfw::cker::ComparisonParams::input2_offset, nnfw::cker::ComparisonParams::input2_shift, nnfw::cker::ComparisonParams::left_shift, MatchingFlatSize(), MultiplyByQuantizedMultiplierSmallerThanOneExp(), and output_shape.

◆ ComputeBatchIndices()

void nnfw::cker::ComputeBatchIndices ( const int32_t  output_batch_size,
const std::vector< int32_t > &  reshape,
const std::vector< int32_t > &  bcast,
std::vector< int32_t > *  out_indices 
)
inline

Definition at line 40 of file BCast.h.

44{
45 // Populates the mapping in out_indices. This algorithm is identical to
46 // the following steps:
47 // - Reshape {0, 1, ..., input_batch_size - 1} to the input shape.
48 // - Broadcast to the output shape.
49 // - Reshape back to a flat 1D vector.
50 out_indices->resize(output_batch_size);
51 int32_t num_output_elements = 1;
52 int32_t num_input_elements = 1;
53 for (int32_t i = reshape.size() - 1; i >= 0; --i)
54 {
55 // Replicate the already populated mapping an additional (dim - 1) times.
56 // If we are broadcasting, just copy the existing mapping.
57 // Otherwise, add another dimension from the input shape.
58 const int32_t dim = std::max(reshape[i], bcast[i]);
59 const int32_t incr = bcast[i] > 1 ? 0 : num_input_elements;
60 for (int32_t k = 0; k < (dim - 1) * num_output_elements; ++k)
61 {
62 (*out_indices)[num_output_elements + k] = (*out_indices)[k] + incr;
63 }
64 num_output_elements *= dim;
65 num_input_elements *= reshape[i];
66 }
67}

Referenced by nnfw::cker::BCastList< N >::BCastList().
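
A minimal sketch (illustrative), following the algorithm above: an input with batch shape {2, 1} broadcast against {1, 3} yields a flattened output batch of 6 positions, each mapped back to its source batch index.

#include <cker/operation/BCast.h> // assumed include path
#include <cstdint>
#include <vector>

void ComputeBatchIndicesSketch()
{
  std::vector<int32_t> reshape = {2, 1}; // input batch shape
  std::vector<int32_t> bcast = {1, 3};   // broadcast multipliers per dimension
  std::vector<int32_t> out_indices;
  nnfw::cker::ComputeBatchIndices(/*output_batch_size=*/6, reshape, bcast, &out_indices);
  // out_indices == {0, 0, 0, 1, 1, 1}: the first three output rows read input row 0,
  // the last three read input row 1.
}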

◆ ComputeInterpolationValues() [1/2]

void nnfw::cker::ComputeInterpolationValues ( const float  value,
const float  scale,
const bool  half_pixel_centers,
int32_t  input_size,
float *  scaled_value,
int32_t *  lower_bound,
int32_t *  upper_bound 
)
inline

Definition at line 100 of file ResizeBilinear.h.

104{
105 if (half_pixel_centers)
106 {
107 *scaled_value = (value + 0.5f) * scale - 0.5f;
108 }
109 else
110 {
111 *scaled_value = value * scale;
112 }
113 float scaled_value_floor = std::floor(*scaled_value);
114 *lower_bound = std::max(static_cast<int32_t>(scaled_value_floor), static_cast<int32_t>(0));
115 *upper_bound = std::min(static_cast<int32_t>(std::ceil(*scaled_value)), input_size - 1);
116}

Referenced by ResizeBilinear(), ResizeBilinearGeneric(), and ResizeBilinearGenericSmallChannel().
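
A minimal sketch (illustrative numbers): with half-pixel centers, output row 1 at scale 2.0 maps to input coordinate (1 + 0.5) * 2 - 0.5 = 2.5, so bilinear interpolation reads rows 2 and 3.

#include <cker/operation/ResizeBilinear.h> // assumed include path

void InterpolationValuesSketch()
{
  float scaled = 0.0f;
  int32_t lower = 0;
  int32_t upper = 0;
  nnfw::cker::ComputeInterpolationValues(/*value=*/1.0f, /*scale=*/2.0f,
                                         /*half_pixel_centers=*/true,
                                         /*input_size=*/8, &scaled, &lower, &upper);
  // scaled == 2.5f, lower == 2, upper == 3
}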

◆ ComputeInterpolationValues() [2/2]

void nnfw::cker::ComputeInterpolationValues ( const int32_t  value,
const int32_t  scale_10,
const bool  half_pixel_centers,
int32_t  input_size,
int32_t *  scaled_value,
int32_t *  lower_bound,
int32_t *  upper_bound 
)
inline

Definition at line 268 of file ResizeBilinear.h.

272{
273 if (half_pixel_centers)
274 {
275 *scaled_value = value * scale_10 + scale_10 / 2 - (1 << 9);
276 }
277 else
278 {
279 *scaled_value = value * scale_10;
280 }
281 *lower_bound = std::max(*scaled_value / (1 << 10), 0);
282 *upper_bound = std::min(*scaled_value / (1 << 10) + 1, input_size - 1);
283}

◆ Concatenation()

template<typename Scalar >
void nnfw::cker::Concatenation ( const ConcatenationParams params,
const Shape *const *  input_shapes,
const Scalar *const *  input_data,
const Shape output_shape,
Scalar *  output_data 
)
inline

Definition at line 33 of file Concatenation.h.

36{
37 int axis = params.axis;
38 int inputs_count = params.inputs_count;
39 const int concat_dimensions = output_shape.DimensionsCount();
40 assert(axis < concat_dimensions);
41
42 [[maybe_unused]] int64_t concat_size = 0;
43 for (int i = 0; i < inputs_count; i++)
44 {
45 assert(input_shapes[i]->DimensionsCount() == concat_dimensions);
46 for (int j = 0; j < concat_dimensions; j++)
47 {
48 if (j != axis)
49 {
50 [[maybe_unused]] auto dim_checked = MatchingDim(*input_shapes[i], j, output_shape, j);
51 }
52 }
53 concat_size += input_shapes[i]->Dims(axis);
54 }
55 assert(concat_size == output_shape.Dims(axis));
56 int64_t outer_size = 1;
57 for (int i = 0; i < axis; ++i)
58 {
59 outer_size *= output_shape.Dims(i);
60 }
61 // For all input arrays,
62 // FlatSize() = outer_size * Dims(axis) * base_inner_size;
63 int64_t base_inner_size = 1;
64 for (int i = axis + 1; i < concat_dimensions; ++i)
65 {
66 base_inner_size *= output_shape.Dims(i);
67 }
68
69 Scalar *output_ptr = output_data;
70 for (int k = 0; k < outer_size; k++)
71 {
72 for (int i = 0; i < inputs_count; ++i)
73 {
74 const int copy_size = input_shapes[i]->Dims(axis) * base_inner_size;
75 memcpy(output_ptr, input_data[i] + k * copy_size, copy_size * sizeof(Scalar));
76 output_ptr += copy_size;
77 }
78 }
79}

References nnfw::cker::ConcatenationParams::axis, nnfw::cker::Shape::Dims(), nnfw::cker::ConcatenationParams::inputs_count, MatchingDim(), and output_shape.
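
A minimal sketch (illustrative): two 1x2 float tensors concatenated along axis 1 into a 1x4 output.

#include <cker/operation/Concatenation.h> // assumed include path

void ConcatenationSketch()
{
  nnfw::cker::ConcatenationParams params{};
  params.axis = 1;
  params.inputs_count = 2;

  nnfw::cker::Shape in_shape(2, 1);
  in_shape.SetDim(1, 2); // 1x2
  nnfw::cker::Shape out_shape(2, 1);
  out_shape.SetDim(1, 4); // 1x4

  const float in0[2] = {1.f, 2.f};
  const float in1[2] = {3.f, 4.f};
  const nnfw::cker::Shape *shapes[2] = {&in_shape, &in_shape};
  const float *inputs[2] = {in0, in1};
  float out[4] = {0.f, 0.f, 0.f, 0.f};
  nnfw::cker::Concatenation(params, shapes, inputs, out_shape, out);
  // out == {1, 2, 3, 4}
}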

◆ ConcatenationWithScaling()

void nnfw::cker::ConcatenationWithScaling ( const ConcatenationParams params,
const Shape *const *  input_shapes,
const uint8_t *const *  input_data,
const Shape output_shape,
uint8_t *  output_data 
)
inline

Definition at line 83 of file Concatenation.h.

87{
88 int axis = params.axis;
89 const int32_t *input_zeropoint = params.input_zeropoint;
90 const float *input_scale = params.input_scale;
91 int inputs_count = params.inputs_count;
92 const int32_t output_zeropoint = params.output_zeropoint;
93 const float output_scale = params.output_scale;
94
95 const int concat_dimensions = output_shape.DimensionsCount();
96 assert(axis <= concat_dimensions);
97
98 [[maybe_unused]] int64_t concat_size = 0;
99 for (int i = 0; i < inputs_count; i++)
100 {
101 assert(input_shapes[i]->DimensionsCount() == concat_dimensions);
102 for (int j = 0; j < concat_dimensions; j++)
103 {
104 if (j != axis)
105 {
106 assert(input_shapes[i]->Dims(j) == output_shape.Dims(j));
107 }
108 }
109 concat_size += input_shapes[i]->Dims(axis);
110 }
111 assert(concat_size == output_shape.Dims(axis));
112 int64_t outer_size = 1;
113 for (int i = 0; i < axis; ++i)
114 {
115 outer_size *= output_shape.Dims(i);
116 }
117 // For all input arrays,
118 // FlatSize() = outer_size * Dims(axis) * base_inner_size;
119 int64_t base_inner_size = 1;
120 for (int i = axis + 1; i < concat_dimensions; ++i)
121 {
122 base_inner_size *= output_shape.Dims(i);
123 }
124
125 const float inverse_output_scale = 1.f / output_scale;
126 uint8_t *output_ptr = output_data;
127 for (int k = 0; k < outer_size; k++)
128 {
129 for (int i = 0; i < inputs_count; ++i)
130 {
131 const int copy_size = input_shapes[i]->Dims(axis) * base_inner_size;
132 const uint8_t *input_ptr = input_data[i] + k * copy_size;
133 if (input_zeropoint[i] == output_zeropoint && input_scale[i] == output_scale)
134 {
135 memcpy(output_ptr, input_ptr, copy_size);
136 }
137 else
138 {
139 const float scale = input_scale[i] * inverse_output_scale;
140 const float bias = -input_zeropoint[i] * scale;
141 for (int j = 0; j < copy_size; ++j)
142 {
143 const int32_t value =
144 static_cast<int32_t>(std::round(input_ptr[j] * scale + bias)) + output_zeropoint;
145 output_ptr[j] = static_cast<uint8_t>(std::max(std::min(255, value), 0));
146 }
147 }
148 output_ptr += copy_size;
149 }
150 }
151}

References nnfw::cker::ConcatenationParams::axis, nnfw::cker::Shape::Dims(), nnfw::cker::ConcatenationParams::input_scale, nnfw::cker::ConcatenationParams::input_zeropoint, nnfw::cker::ConcatenationParams::inputs_count, nnfw::cker::ConcatenationParams::output_scale, output_shape, and nnfw::cker::ConcatenationParams::output_zeropoint.

Referenced by onert::backend::cpu::ops::ConcatLayer::concatenationQuant8().

◆ CopyDimsToDesc()

template<int N>
void nnfw::cker::CopyDimsToDesc ( const Shape input_shape,
NdArrayDesc< N > *  desc_out 
)
inline

Definition at line 277 of file Utils.h.

278{
279 int desc_stride = 1;
280 for (int i = N - 1; i >= 0; --i)
281 {
282 desc_out->extents[i] = input_shape.Dims(i);
283 desc_out->strides[i] = desc_stride;
284 desc_stride *= input_shape.Dims(i);
285 }
286}

References nnfw::cker::Shape::Dims(), nnfw::cker::NdArrayDesc< N >::extents, and nnfw::cker::NdArrayDesc< N >::strides.

◆ CopyMultipleTimes()

template<typename T , typename M >
void nnfw::cker::CopyMultipleTimes ( const T *  in_data,
int32_t  in_size,
M  multiplier,
T *  out_data 
)

Definition at line 29 of file Tile.h.

30{
31 for (M i = 0; i < multiplier; ++i)
32 {
33 const T *in_end = in_data + in_size;
34 T *new_out_data = std::copy(in_data, in_end, out_data);
35 in_data = out_data;
36 out_data = new_out_data;
37 }
38}

Referenced by TileOneDimension().
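
A small sketch (hypothetical buffers) of this tiling helper: after the first pass, each subsequent pass copies the block it just wrote, so the source pattern is repeated multiplier times.

int32_t src[] = {1, 2, 3};
int32_t dst[9];
nnfw::cker::CopyMultipleTimes(src, 3, 3, dst);
// dst == {1, 2, 3, 1, 2, 3, 1, 2, 3}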

◆ Cos()

void nnfw::cker::Cos ( const Shape input_shape,
const float *  input_data,
const Shape output_shape,
float *  output_data 
)
inline

Definition at line 41 of file Elementwise.h.

43{
44 const int size = MatchingFlatSize(input_shape, output_shape);
45 for (int i = 0; i < size; i++)
46 {
47 output_data[i] = std::cos(input_data[i]);
48 }
49}

References MatchingFlatSize(), output_shape, and size.

◆ CountLeadingZeros()

int nnfw::cker::CountLeadingZeros ( uint32_t  integer_input)
inline

Definition at line 152 of file Utils.h.

153{
154 const uint32_t one_in_leading_positive = 1U << 31;
155 int leading_zeros = 0;
156 while (integer_input < one_in_leading_positive)
157 {
158 integer_input <<= 1;
159 ++leading_zeros;
160 }
161 return leading_zeros;
162}

Referenced by GetInvSqrtQuantizedMultiplierExp().
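
For reference, a couple of expected results (note the loop does not terminate for an input of 0, so callers pass strictly positive values):

nnfw::cker::CountLeadingZeros(1u);          // 31
nnfw::cker::CountLeadingZeros(0x80000000u); // 0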

◆ CwiseClipping()

void nnfw::cker::CwiseClipping ( float *  vector,
const int  v_size,
const float  clipping_value 
)
inline

Definition at line 34 of file TensorUtils.h.

35{
36 NEON_OR_PORTABLE(CwiseClipping, vector, v_size, clipping_value);
37}

References CwiseClipping(), and NEON_OR_PORTABLE.

Referenced by CalculateLstmOutputFloat(), CwiseClipping(), and UpdateLstmCellFloat().

◆ DepthToSpace()

template<typename T >
void nnfw::cker::DepthToSpace ( const Shape unextended_input_shape,
const T *  input_data,
const Shape unextended_output_shape,
T *  output_data,
int32_t  block_size 
)
inline

Definition at line 30 of file DepthToSpace.h.

32{
33 assert(unextended_input_shape.DimensionsCount() <= 4);
34 assert(unextended_output_shape.DimensionsCount() <= 4);
35 const Shape input_shape = Shape::ExtendedShape(4, unextended_input_shape);
36 const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape);
37
38 const int input_depth = input_shape.Dims(3);
39 const int input_width = input_shape.Dims(2);
40 const int input_height = input_shape.Dims(1);
41
42 const int output_depth = output_shape.Dims(3);
43 const int batch_size = output_shape.Dims(0);
44
45 // Number of contiguous values that we can copy in one iteration.
46 const int stride = block_size * output_depth;
47
48 for (int batch = 0; batch < batch_size; ++batch)
49 {
50 for (int in_h = 0; in_h < input_height; ++in_h)
51 {
52 const T *input_ptr = input_data + Offset(input_shape, batch, in_h, 0, 0);
53 for (int offset_h = 0; offset_h < block_size; ++offset_h)
54 {
55 const T *src = input_ptr;
56 for (int in_w = 0; in_w < input_width; ++in_w)
57 {
58 memcpy(output_data, src, stride * sizeof(T));
59 output_data += stride;
60 src += input_depth;
61 }
62 input_ptr += stride;
63 }
64 }
65 }
66}

References nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), Offset(), and output_shape.
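
A minimal sketch (hypothetical data, GetShape() from this page): a 1x1x2x4 input with block_size 2 becomes a 1x2x4x1 output, each depth-4 pixel expanding into a 2x2 spatial block.

float in[] = {1, 2, 3, 4, 5, 6, 7, 8};
float out[8];
nnfw::cker::DepthToSpace(nnfw::cker::GetShape({1, 1, 2, 4}), in,
                         nnfw::cker::GetShape({1, 2, 4, 1}), out, /*block_size=*/2);
// out == {1, 2, 5, 6, 3, 4, 7, 8}  (output rows [1 2 5 6] and [3 4 7 8])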

◆ DepthwiseConv()

template<typename T , typename TS >
void nnfw::cker::DepthwiseConv ( const DepthwiseConvParams params,
const Shape input_shape,
const T *  input_data,
const Shape filter_shape,
const T *  filter_data,
const Shape bias_shape,
const TS *  bias_data,
const Shape output_shape,
T *  output_data,
ruy::Context *  ruy_context 
)
inline

Definition at line 124 of file DepthwiseConv.h.

128{
129 assert(input_shape.DimensionsCount() == 4);
130 assert(filter_shape.DimensionsCount() == 4);
131 assert(output_shape.DimensionsCount() == 4);
132
133 int thread_count = HowManyConvThreads(output_shape, filter_shape);
134
135 // NOTE Borrow RuyContext to get max_num_threads setting
136 // TODO Define and use max_num_threads for CPU backend
137 const auto max_threads = (ruy_context == nullptr) ? 1 : ruy_context->max_num_threads();
138
139 thread_count = std::max(1, std::min(thread_count, max_threads));
140 // Cap the number of threads to 2 for float path to avoid regression in
141 // performance (b/132294857).
142 if (std::is_floating_point<T>::value)
143 {
144 thread_count = std::min(thread_count, 2);
145 }
146
147 const int output_batches = output_shape.Dims(0);
148 const int output_height = output_shape.Dims(1);
149
150 if (thread_count == 1)
151 {
152 optimized::DepthwiseConvImpl(params, input_shape, input_data, filter_shape, filter_data,
153 bias_shape, bias_data, output_shape, output_data, 0, output_height,
154 1);
155 return;
156 }
157
158 int thread_dim, thread_dim_size;
159 if (MultithreadAlongBatches(thread_count, output_batches))
160 {
161 thread_dim = 0;
162 thread_dim_size = output_batches;
163 }
164 else
165 {
166 thread_dim = 1;
167 thread_dim_size = output_height;
168 }
169
170 std::vector<DepthwiseConvWorkerTask<T, TS>> tasks;
171 // TODO(b/131746020) don't create new heap allocations every time.
172 // At least we make it a single heap allocation by using reserve().
173 tasks.reserve(thread_count);
174 int thread_start = 0;
175 for (int i = 0; i < thread_count; ++i)
176 {
177 int thread_end = thread_start + (thread_dim_size - thread_start) / (thread_count - i);
178 tasks.emplace_back(params, input_shape, input_data, filter_shape, filter_data, bias_shape,
179 bias_data, output_shape, output_data, thread_start, thread_end, thread_dim);
180 thread_start = thread_end;
181 }
182 cpu_backend_threadpool::Execute(tasks.size(), tasks.data(), ruy_context);
183}

References nnfw::cker::optimized::DepthwiseConvImpl(), nnfw::cker::Shape::DimensionsCount(), nnfw::cker::cpu_backend_threadpool::Execute(), HowManyConvThreads(), MultithreadAlongBatches(), and output_shape.

◆ DepthwiseConvOp()

void nnfw::cker::DepthwiseConvOp ( const DepthwiseConvParams params,
const Shape input_shape,
const float *  input_data,
const Shape filter_shape,
const float *  filter_data,
const Shape bias_shape,
const float *  bias_data,
float *  padded_filter_data,
bool  pad_filter,
float *  filter_buffers_data,
const Shape output_shape,
float *  output_data 
)

Definition at line 185 of file DepthwiseConv.h.

190{
191 if (params.stride_height != params.stride_width)
192 throw std::runtime_error("Not support different length strides");
193
194 if (params.dilation_height_factor != 1 || params.dilation_width_factor != 1)
195 throw std::runtime_error{"Not support dilation other than 1."};
196
197 const int batch = MatchingDim(input_shape, 0, output_shape, 0);
198 const int input_depth = input_shape.Dims(3);
199 const int output_depth = output_shape.Dims(3);
200 const int input_height = input_shape.Dims(1);
201 const int input_width = input_shape.Dims(2);
202 const int filter_height = filter_shape.Dims(1);
203 const int filter_width = filter_shape.Dims(2);
204 const int output_height = output_shape.Dims(1);
205 const int output_width = output_shape.Dims(2);
206 const int stride = params.stride_height;
207 const int depth_multiplier = params.depth_multiplier;
208 const int pad_height = params.padding_values.height;
209 const int pad_width = params.padding_values.width;
210 const float activation_min = params.float_activation_min;
211 const float activation_max = params.float_activation_max;
212
213 depthwise_conv_op::LaunchDepthwiseConvOp<Eigen::ThreadPoolDevice, float>()(
214 batch, input_height, input_width, input_depth, filter_height, filter_width, depth_multiplier,
215 stride, pad_height, pad_width, output_height, output_width, output_depth, input_data,
216 filter_data, padded_filter_data, pad_filter, filter_buffers_data, output_data);
217
218 if (bias_data != nullptr)
219 {
220 bias_op::biasHelper<float>(bias_shape, bias_data, output_shape, output_data, activation_min,
221 activation_max);
222 }
223}

References nnfw::cker::DepthwiseConvParams::depth_multiplier, nnfw::cker::DepthwiseConvParams::dilation_height_factor, nnfw::cker::DepthwiseConvParams::dilation_width_factor, nnfw::cker::Shape::Dims(), nnfw::cker::DepthwiseConvParams::float_activation_max, nnfw::cker::DepthwiseConvParams::float_activation_min, nnfw::cker::PaddingValues::height, MatchingDim(), output_shape, nnfw::cker::DepthwiseConvParams::padding_values, nnfw::cker::DepthwiseConvParams::stride_height, nnfw::cker::DepthwiseConvParams::stride_width, and nnfw::cker::PaddingValues::width.

Referenced by onert::backend::cpu::ops::DepthwiseConvolutionLayer::convFloat32().

◆ Dequantize() [1/3]

void nnfw::cker::Dequantize ( const Shape input_shape,
const int16_t *  input_data,
const Shape output_shape,
float *  output_data,
const float  scale,
const int32_t  zero_point 
)
inline

Definition at line 115 of file Dequantize.h.

118{
119 const int flat_size = MatchingFlatSize(input_shape, output_shape);
120
121 int i = 0;
122#ifdef USE_NEON
123 const float32x4_t scale_dup = vdupq_n_f32(static_cast<float>(scale));
124 const float32x4_t zero_times_scale_dup = vdupq_n_f32(static_cast<float>(-zero_point * scale));
125 for (; i <= flat_size - 8; i += 8)
126 {
127 const int16x4_t input_s16_low = vld1_s16(input_data + i);
128 const int16x4_t input_s16_high = vld1_s16(input_data + i + 4);
129 const int32x4_t val_low = vmovl_s16(input_s16_low);
130 const int32x4_t val_high = vmovl_s16(input_s16_high);
131
132 float32x4_t result_low, result_high;
133 ScaleWithNewZeroPoint(val_low, scale_dup, zero_times_scale_dup, &result_low);
134 ScaleWithNewZeroPoint(val_high, scale_dup, zero_times_scale_dup, &result_high);
135
136 vst1q_f32(output_data + i, result_low);
137 vst1q_f32(output_data + i + 4, result_high);
138 }
139#endif // NEON
140 for (; i < flat_size; ++i)
141 {
142 const int32_t val = input_data[i];
143 const float result = static_cast<float>(scale * (val - zero_point));
144 output_data[i] = result;
145 }
146}

References MatchingFlatSize(), and output_shape.

◆ Dequantize() [2/3]

void nnfw::cker::Dequantize ( const Shape input_shape,
const int8_t *  input_data,
const Shape output_shape,
float *  output_data,
const float  scale,
const int32_t  zero_point 
)
inline

Definition at line 80 of file Dequantize.h.

83{
84 const int flat_size = MatchingFlatSize(input_shape, output_shape);
85
86 int i = 0;
87#ifdef USE_NEON
88 const float32x4_t scale_dup = vdupq_n_f32(static_cast<float>(scale));
89 const float32x4_t zero_times_scale_dup = vdupq_n_f32(static_cast<float>(-zero_point * scale));
90 for (; i <= flat_size - 8; i += 8)
91 {
92 const int8x8_t input_s8 = vld1_s8(input_data + i);
93 const int16x8_t input_s16 = vmovl_s8(input_s8);
94 const int16x4_t input_s16_low = vget_low_s16(input_s16);
95 const int16x4_t input_s16_high = vget_high_s16(input_s16);
96 const int32x4_t val_low = vmovl_s16(input_s16_low);
97 const int32x4_t val_high = vmovl_s16(input_s16_high);
98
99 float32x4_t result_low, result_high;
100 ScaleWithNewZeroPoint(val_low, scale_dup, zero_times_scale_dup, &result_low);
101 ScaleWithNewZeroPoint(val_high, scale_dup, zero_times_scale_dup, &result_high);
102
103 vst1q_f32(output_data + i, result_low);
104 vst1q_f32(output_data + i + 4, result_high);
105 }
106#endif // NEON
107 for (; i < flat_size; ++i)
108 {
109 const int32_t val = input_data[i];
110 const float result = static_cast<float>(scale * (val - zero_point));
111 output_data[i] = result;
112 }
113}

References MatchingFlatSize(), and output_shape.

◆ Dequantize() [3/3]

void nnfw::cker::Dequantize ( const Shape input_shape,
const uint8_t *  input_data,
const Shape output_shape,
float *  output_data,
const float  scale,
const int32_t  zero_point 
)
inline

Definition at line 44 of file Dequantize.h.

47{
48 const int flat_size = MatchingFlatSize(input_shape, output_shape);
49
50 int i = 0;
51#ifdef USE_NEON
52 const float32x4_t scale_dup = vdupq_n_f32(static_cast<float>(scale));
53 const float32x4_t zero_times_scale_dup = vdupq_n_f32(static_cast<float>(-zero_point * scale));
54 for (; i <= flat_size - 8; i += 8)
55 {
56 const uint8x8_t input_u8 = vld1_u8(input_data + i);
57 const uint16x8_t input_u16 = vmovl_u8(input_u8);
58 const int16x8_t input_s16 = vreinterpretq_s16_u16(input_u16);
59 const int16x4_t input_s16_low = vget_low_s16(input_s16);
60 const int16x4_t input_s16_high = vget_high_s16(input_s16);
61 const int32x4_t val_low = vmovl_s16(input_s16_low);
62 const int32x4_t val_high = vmovl_s16(input_s16_high);
63
64 float32x4_t result_low, result_high;
65 ScaleWithNewZeroPoint(val_low, scale_dup, zero_times_scale_dup, &result_low);
66 ScaleWithNewZeroPoint(val_high, scale_dup, zero_times_scale_dup, &result_high);
67
68 vst1q_f32(output_data + i, result_low);
69 vst1q_f32(output_data + i + 4, result_high);
70 }
71#endif // NEON
72 for (; i < flat_size; ++i)
73 {
74 const int32_t val = input_data[i];
75 const float result = static_cast<float>(scale * (val - zero_point));
76 output_data[i] = result;
77 }
78}

References MatchingFlatSize(), and output_shape.
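
A minimal sketch of the uint8 overload (hypothetical quantization parameters): each output is scale * (value - zero_point); the NEON and scalar paths produce the same result.

uint8_t q[] = {128, 130, 126, 255};
float f[4];
nnfw::cker::Dequantize(nnfw::cker::GetShape({4}), q, nnfw::cker::GetShape({4}), f,
                       /*scale=*/0.5f, /*zero_point=*/128);
// f == {0.0, 1.0, -1.0, 63.5}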

◆ ELU()

void nnfw::cker::ELU ( const Shape input_shape,
const float *  input_data,
const Shape output_shape,
float *  output_data 
)
inline

Definition at line 30 of file ELU.h.

32{
33 const int flat_size = MatchingFlatSize(input_shape, output_shape);
34 for (int i = 0; i < flat_size; ++i)
35 {
36 const float val = input_data[i];
37 output_data[i] = val < 0.0 ? std::exp(val) - 1 : val;
38 }
39}

References MatchingFlatSize(), and output_shape.

Referenced by onert::backend::cpu::ops::ElementwiseActivationLayer::configure().

◆ end()

◆ EqualFn()

template<typename T >
bool nnfw::cker::EqualFn ( T  lhs,
T  rhs 
)
inline

Definition at line 30 of file Comparison.h.

30{ return lhs == rhs; }

◆ Erf()

void nnfw::cker::Erf ( const Shape input_shape,
const float *  input_data,
const Shape output_shape,
float *  output_data 
)
inline

Definition at line 29 of file Erf.h.

31{
32 const int size = MatchingFlatSize(input_shape, output_shape);
33 for (int i = 0; i < size; i++)
34 {
35 output_data[i] = std::erf(input_data[i]);
36 }
37}

References MatchingFlatSize(), output_shape, and size.

◆ Exp()

void nnfw::cker::Exp ( const Shape input_shape,
const float *  input_data,
const Shape output_shape,
float *  output_data 
)
inline

Definition at line 30 of file Exp.h.

32{
33 const int size = MatchingFlatSize(input_shape, output_shape);
34 for (int i = 0; i < size; i++)
35 {
36 output_data[i] = std::exp(input_data[i]);
37 }
38}

References MatchingFlatSize(), output_shape, and size.

◆ Fill() [1/2]

template<typename T >
void nnfw::cker::Fill ( const T *  value_data,
const Shape output_shape,
T *  output_data 
)
inline

Definition at line 28 of file Fill.h.

29{
30 int output_size = output_shape.FlatSize();
31 for (int i = 0; i < output_size; i++)
32 {
33 output_data[i] = *value_data;
34 }
35}

References output_shape.

◆ Fill() [2/2]

template<typename Device , class Distribution >
void nnfw::cker::Fill ( random::PhiloxRandom  random,
Tensor *  output 
)

Definition at line 64 of file StatelessRandomUniform.h.

65{
66 // Build distribution
67 typedef typename Distribution::ResultElementType T;
68
69 auto flat = output->flat<T>();
70 // Reuse the compute kernels from the stateful random ops
71 functor::FillPhiloxRandom<Device, Distribution>()(random, flat.data(), flat.size(),
72 Distribution());
73}

◆ FlatSizeSkipDim()

int nnfw::cker::FlatSizeSkipDim ( const Shape shape,
int  skip_dim 
)
inline

Definition at line 253 of file Shape.h.

254{
255 const int dims_count = shape.DimensionsCount();
256 assert(skip_dim >= 0 && skip_dim < dims_count);
257 const auto *dims_data = shape.DimsData();
258 int flat_size = 1;
259 for (int i = 0; i < dims_count; ++i)
260 {
261 flat_size *= (i == skip_dim) ? 1 : dims_data[i];
262 }
263 return flat_size;
264}

References nnfw::cker::Shape::DimensionsCount(), and nnfw::cker::Shape::DimsData().

Referenced by nnfw::cker::optimized_integer_ops::depthwise_conv::DepthwiseConvGeneral(), nnfw::cker::optimized::depthwise_conv::DepthwiseConvGeneral(), nnfw::cker::optimized::DepthwiseConvImpl(), FullyConnected(), FullyConnectedSparseWeight16x1(), FullyConnectedSparseWeightRandom(), nnfw::cker::optimized_integer_ops::HowManyConvThreads(), MapAsMatrixWithLastDimAsRows(), MatchingFlatSizeSkipDim(), nnfw::cker::train::MSE(), and nnfw::cker::train::MSEGrad().
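
For example (hypothetical shape), skipping the middle dimension of a 2x3x4 shape:

nnfw::cker::Shape s = nnfw::cker::GetShape({2, 3, 4});
int n = nnfw::cker::FlatSizeSkipDim(s, 1); // 2 * 4 == 8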

◆ Floor()

void nnfw::cker::Floor ( const Shape input_shape,
const float *  input_data,
const Shape output_shape,
float *  output_data 
)
inline

Definition at line 90 of file Elementwise.h.

92{
93 const int flat_size = MatchingFlatSize(input_shape, output_shape);
94
95 for (int i = 0; i < flat_size; i++)
96 {
97 output_data[i] = std::floor(input_data[i]);
98 }
99}

References MatchingFlatSize(), and output_shape.

◆ FloorDivBroadcast()

template<typename T >
void nnfw::cker::FloorDivBroadcast ( const Shape unextended_input1_shape,
const T *  input1_data,
const Shape unextended_input2_shape,
const T *  input2_data,
const Shape unextended_output_shape,
T *  output_data 
)
inline

Definition at line 30 of file FloorDiv.h.

33{
34 assert(unextended_input1_shape.DimensionsCount() <= 4);
35 assert(unextended_input2_shape.DimensionsCount() <= 4);
36 assert(unextended_output_shape.DimensionsCount() <= 4);
37 const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape);
38
39 NdArrayDesc<4> desc1;
40 NdArrayDesc<4> desc2;
41 NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, unextended_input2_shape, &desc1,
42 &desc2);
43
44 for (int b = 0; b < output_shape.Dims(0); ++b)
45 {
46 for (int y = 0; y < output_shape.Dims(1); ++y)
47 {
48 for (int x = 0; x < output_shape.Dims(2); ++x)
49 {
50 for (int c = 0; c < output_shape.Dims(3); ++c)
51 {
52 auto out_idx = Offset(output_shape, b, y, x, c);
53 auto in1_idx = SubscriptToIndex(desc1, b, y, x, c);
54 auto in2_idx = SubscriptToIndex(desc2, b, y, x, c);
55 auto in1_val = input1_data[in1_idx];
56 auto in2_val = input2_data[in2_idx];
57 output_data[out_idx] = std::floor(
58 std::divides<double>()(static_cast<double>(in1_val), static_cast<double>(in2_val)));
59 }
60 }
61 }
62 }
63}

References desc1, desc2, nnfw::cker::Shape::DimensionsCount(), NdArrayDescsForElementwiseBroadcast(), Offset(), output_shape, and SubscriptToIndex().

◆ FloorDivElementwise()

template<typename T >
void nnfw::cker::FloorDivElementwise ( const Shape shape,
const T *  input1_data,
const T *  input2_data,
T *  output_data 
)
inline

Definition at line 66 of file FloorDiv.h.

68{
69
70 int num_elements = shape.FlatSize();
71
72 for (int t = 0; t < num_elements; t++)
73 {
74 output_data[t] = std::floor(std::divides<double>()(static_cast<double>(input1_data[t]),
75 static_cast<double>(input2_data[t])));
76 }
77}

References nnfw::cker::Shape::FlatSize().

◆ FloorModBroadcast()

template<typename T >
void nnfw::cker::FloorModBroadcast ( const Shape unextended_input1_shape,
const T *  input1_data,
const Shape unextended_input2_shape,
const T *  input2_data,
const Shape unextended_output_shape,
T *  output_data 
)
inline

Definition at line 34 of file FloorMod.h.

37{
38 struct FloatMod
39 {
40 float operator()(const float lhs, const float rhs) const { return std::fmod(lhs, rhs); }
41 };
42
43 using ModFunc =
44 typename std::conditional<std::is_integral<T>::value, std::modulus<T>, FloatMod>::type;
45
46 if (unextended_output_shape.DimensionsCount() > 4)
47 throw std::runtime_error(std::string("cker::FloorModBroadcast: Unsupported rank size : ") +
48 std::to_string(unextended_output_shape.DimensionsCount()));
49 const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape);
50
51 NdArrayDesc<4> desc1;
52 NdArrayDesc<4> desc2;
53 NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, unextended_input2_shape, &desc1,
54 &desc2);
55
56 for (int b = 0; b < output_shape.Dims(0); ++b)
57 {
58 for (int y = 0; y < output_shape.Dims(1); ++y)
59 {
60 for (int x = 0; x < output_shape.Dims(2); ++x)
61 {
62 for (int c = 0; c < output_shape.Dims(3); ++c)
63 {
64 auto out_idx = Offset(output_shape, b, y, x, c);
65 auto in1_idx = SubscriptToIndex(desc1, b, y, x, c);
66 auto in2_idx = SubscriptToIndex(desc2, b, y, x, c);
67 auto in1_val = input1_data[in1_idx];
68 auto in2_val = input2_data[in2_idx];
69
70 ModFunc mod_func;
71 T trunc_mod = mod_func(in1_val, in2_val);
72 output_data[out_idx] = (trunc_mod != 0) && ((in2_val < 0) != (trunc_mod < 0))
73 ? (trunc_mod + in2_val)
74 : trunc_mod;
75 }
76 }
77 }
78 }
79}

References desc1, desc2, nnfw::cker::Shape::DimensionsCount(), NdArrayDescsForElementwiseBroadcast(), Offset(), output_shape, and SubscriptToIndex().

◆ FloorModElementwise()

template<typename T >
void nnfw::cker::FloorModElementwise ( const Shape shape,
const T *  input1_data,
const T *  input2_data,
T *  output_data 
)
inline

Definition at line 82 of file FloorMod.h.

84{
85 struct FloatMod
86 {
87 float operator()(const float lhs, const float rhs) const { return std::fmod(lhs, rhs); }
88 };
89
90 using ModFunc =
91 typename std::conditional<std::is_integral<T>::value, std::modulus<T>, FloatMod>::type;
92
93 int num_elements = shape.FlatSize();
94 for (int t = 0; t < num_elements; t++)
95 {
96 ModFunc mod_func;
97 auto in1_val = input1_data[t];
98 auto in2_val = input2_data[t];
99 T trunc_mod = mod_func(in1_val, in2_val);
100 output_data[t] =
101 (trunc_mod != 0) && ((in2_val < 0) != (trunc_mod < 0)) ? (trunc_mod + in2_val) : trunc_mod;
102 }
103}

References nnfw::cker::Shape::FlatSize().
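
A minimal sketch (hypothetical data) of the floor-mod adjustment: unlike the truncated % operator, the result takes the sign of the divisor.

int32_t a[] = {7, -7, 7, -7};
int32_t b[] = {3, 3, -3, -3};
int32_t out[4];
nnfw::cker::FloorModElementwise(nnfw::cker::GetShape({4}), a, b, out);
// out == {1, 2, -2, -1}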

◆ FullyConnected() [1/2]

void nnfw::cker::FullyConnected ( const FullyConnectedParams params,
const Shape input_shape,
const float *  input_data,
const Shape weights_shape,
const float *  weights_data,
const Shape ,
const float *  bias_data,
const Shape ,
float *  output_data 
)
inline

Definition at line 98 of file FullyConnected.h.

102{
103 int total_input_size = input_shape.FlatSize();
104 int input_size = weights_shape.Dims(1);
105 const int batch_size = total_input_size / input_size;
106 const int num_units = weights_shape.Dims(0);
107
108 // Output = bias if bias tensor exists.
109 if (bias_data)
110 {
111 VectorBatchVectorAssign(bias_data, num_units, batch_size, output_data);
112 }
113 else
114 {
115 ZeroVector(output_data, batch_size * num_units);
116 }
117
118 // Compute output += weight * input
119 MatrixBatchVectorMultiplyAccumulate(weights_data, num_units, input_size, input_data, batch_size,
120 output_data, /*result_stride=*/1);
121
122 if (params.activation != FusedActivationFunctionType::kNone)
123 {
124 // Apply activation function
125 ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data);
126 }
127}

References nnfw::cker::FullyConnectedParams::activation, ApplyActivationToVector(), nnfw::cker::Shape::Dims(), nnfw::cker::Shape::FlatSize(), kNone, MatrixBatchVectorMultiplyAccumulate(), VectorBatchVectorAssign(), and ZeroVector().

Referenced by onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedFloat32(), and onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedQuant8().
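
A minimal sketch of the float overload (hypothetical data; the two unnamed Shape parameters are unused here, so default-constructed shapes are passed): weights are laid out as [num_units, input_size] and the output is W * x + b per batch.

using namespace nnfw::cker;

float x[] = {1, 2};             // 1 batch, 2 inputs
float w[] = {1, 0, 0, 1, 1, 1}; // 3 units x 2 inputs
float b[] = {0.5f, 0.5f, 0.5f};
float y[3];

FullyConnectedParams params{};
params.activation = FusedActivationFunctionType::kNone;

FullyConnected(params, GetShape({1, 2}), x, GetShape({3, 2}), w, Shape{}, b, Shape{}, y);
// y == {1.5, 2.5, 3.5}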

◆ FullyConnected() [2/2]

void nnfw::cker::FullyConnected ( const FullyConnectedParams params,
const Shape input_shape,
const uint8_t *  input_data,
const Shape filter_shape,
const uint8_t *  filter_data,
const Shape bias_shape,
const int32_t *  bias_data,
const Shape output_shape,
uint8_t *  output_data 
)
inline

Definition at line 131 of file FullyConnected.h.

136{
137 const int32_t input_offset = params.input_offset;
138 const int32_t filter_offset = params.weights_offset;
139 const int32_t output_offset = params.output_offset;
140 const int32_t output_multiplier = params.output_multiplier;
141 const int output_shift = params.output_shift;
142 const int32_t output_activation_min = params.quantized_activation_min;
143 const int32_t output_activation_max = params.quantized_activation_max;
144 assert(filter_shape.DimensionsCount() >= 2);
145 assert(output_shape.DimensionsCount() >= 1);
146
147 assert(output_activation_min <= output_activation_max);
148 // TODO(benoitjacob): This really should be:
149 // const int batches = ArraySize(output_dims, 1);
150 // but the current --variable_batch hack consists in overwriting the 3rd
151 // dimension with the runtime batch size, as we don't keep track for each
152 // array of which dimension is the batch dimension in it.
153 const int output_dim_count = output_shape.DimensionsCount();
154 const int filter_dim_count = filter_shape.DimensionsCount();
155 const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
156 const int output_depth =
157 MatchingDim(filter_shape, filter_dim_count - 2, output_shape, output_dim_count - 1);
158 const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
159 for (int b = 0; b < batches; ++b)
160 {
161 for (int out_c = 0; out_c < output_depth; ++out_c)
162 {
163 int32_t acc = 0;
164 for (int d = 0; d < accum_depth; ++d)
165 {
166 int32_t input_val = input_data[b * accum_depth + d];
167 int32_t filter_val = filter_data[out_c * accum_depth + d];
168 acc += (filter_val + filter_offset) * (input_val + input_offset);
169 }
170 if (bias_data)
171 {
172 acc += bias_data[out_c];
173 }
174 acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
175 acc += output_offset;
176 acc = std::max(acc, output_activation_min);
177 acc = std::min(acc, output_activation_max);
178 output_data[out_c + output_depth * b] = static_cast<uint8_t>(acc);
179 }
180 }
181}

References nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), FlatSizeSkipDim(), nnfw::cker::FullyConnectedParams::input_offset, MatchingDim(), MultiplyByQuantizedMultiplier(), nnfw::cker::FullyConnectedParams::output_multiplier, nnfw::cker::FullyConnectedParams::output_offset, output_shape, nnfw::cker::FullyConnectedParams::output_shift, nnfw::cker::FullyConnectedParams::quantized_activation_max, nnfw::cker::FullyConnectedParams::quantized_activation_min, and nnfw::cker::FullyConnectedParams::weights_offset.

◆ FullyConnectedHybrid()

void nnfw::cker::FullyConnectedHybrid ( const FullyConnectedParams params,
const Shape input_shape,
const float *  input_data,
const Shape filter_shape,
const int8_t *  filter_data,
const Shape ,
const float *  bias_data,
const Shape output_shape,
float *  output_data,
FCTempArena temp_arena,
ruy::Context *  ruy_context 
)
inline

Definition at line 183 of file FullyConnected.h.

189{
190 int total_input_size = input_shape.FlatSize();
191 const int input_size = filter_shape.Dims(1);
192 const int batch_size = total_input_size / input_size;
193 const int num_units = filter_shape.Dims(0);
194
195 // Output = bias if bias tensor exists.
196 if (bias_data)
197 {
198 VectorBatchVectorAssign(bias_data, num_units, batch_size, output_data);
199 }
200 else
201 {
202 ZeroVector(output_data, batch_size * num_units);
203 }
204
205 // Save matrix multiplication computation for all zero input.
206 if (IsZeroVector(input_data, total_input_size))
207 {
208 ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data);
209 return;
210 }
211
212 // Quantize input from float to uint8 + quantization params (scaling factor).
213 float unused_min, unused_max;
214 float *scaling_factors_ptr = temp_arena.scaling_factors.data();
215 int8_t *quant_data = temp_arena.input_quantized.data();
216
217 // Quantize each batch independently.
218 for (int b = 0; b < batch_size; ++b)
219 {
220 const int offset = b * input_size;
221 SymmetricQuantizeFloats(input_data + offset, input_size, quant_data + offset, &unused_min,
222 &unused_max, &scaling_factors_ptr[b]);
223 // Incorporate scaling of the filter.
224 scaling_factors_ptr[b] *= params.weights_scale;
225 }
226
227// Compute output += weight * quantized_input
228#ifdef USE_RUY_GEMV
229 auto output_size = output_shape.FlatSize();
230 temp_arena.accum_scratch.resize(output_size);
231 int32_t *scratch = temp_arena.accum_scratch.data();
232 MatrixBatchVectorMultiplyAccumulate(filter_data, num_units, input_size, quant_data,
233 scaling_factors_ptr, batch_size, scratch, output_data,
234 /*result_stride=*/1, ruy_context);
235#else
236 MatrixBatchVectorMultiplyAccumulate(filter_data, num_units, input_size, quant_data,
237 scaling_factors_ptr, batch_size, output_data,
238 /*result_stride=*/1);
239#endif
240
241 // Apply activation function to floats.
242 if (params.activation != FusedActivationFunctionType::kNone)
243 {
244 // Apply activation function
245 ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data);
246 }
247 return;
248}

References nnfw::cker::FCTempArena::accum_scratch, nnfw::cker::FullyConnectedParams::activation, ApplyActivationToVector(), nnfw::cker::Shape::Dims(), nnfw::cker::Shape::FlatSize(), nnfw::cker::FCTempArena::input_quantized, IsZeroVector(), kNone, MatrixBatchVectorMultiplyAccumulate(), offset(), output_shape, nnfw::cker::FCTempArena::scaling_factors, SymmetricQuantizeFloats(), VectorBatchVectorAssign(), nnfw::cker::FullyConnectedParams::weights_scale, and ZeroVector().

Referenced by onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedHybrid().

◆ FullyConnectedSparseWeight16x1()

void nnfw::cker::FullyConnectedSparseWeight16x1 ( const FullyConnectedParams params,
const Shape input_shape,
const float *  input_data,
const Shape weights_shape,
const float *  weights_data,
const Shape bias_shape,
const float *  bias_data,
const Shape output_shape,
float *  output_data,
const uint16_t *  w1_segments,
const uint16_t *  w1_indices 
)
inline

Definition at line 57 of file FullyConnectedSparse16x1.h.

62{
63 assert(weights_shape.DimensionsCount() == 2);
64 assert(output_shape.DimensionsCount() == 2);
65
66 const int output_dims_count = output_shape.DimensionsCount();
67 const int weights_dims_count = weights_shape.DimensionsCount();
68 const int batches = FlatSizeSkipDim(output_shape, output_dims_count - 1);
69 const int output_depth =
70 MatchingDim(weights_shape, weights_dims_count - 2, output_shape, output_dims_count - 1);
71 const int accum_depth = weights_shape.Dims(weights_dims_count - 1);
72
73 if (bias_data)
74 {
75 VectorBatchVectorAssign(bias_data, output_depth, batches, output_data);
76 }
77 else
78 {
79 ZeroVector(output_data, batches * output_depth);
80 }
81 for (int b = 0; b < batches; ++b)
82 {
83 int depth_size = output_depth / 16;
84 for (int idx_0 = 0; idx_0 < depth_size; ++idx_0)
85#ifdef USE_NEON
86 {
87 float *__restrict y;
88 y = &output_data[b * output_depth + idx_0 * 16];
89 /* keep y[0..15] in registers for duration of inner loop */
90 float32x4_t y0_3 = vld1q_f32(&y[0]);
91 float32x4_t y4_7 = vld1q_f32(&y[4]);
92 float32x4_t y8_11 = vld1q_f32(&y[8]);
93 float32x4_t y12_15 = vld1q_f32(&y[12]);
94 for (auto pw1 = w1_segments[idx_0]; pw1 < w1_segments[idx_0 + 1]; ++pw1)
95 {
96 auto idx_1 = w1_indices[pw1];
97 float32x4_t xj = vld1q_dup_f32(&input_data[b * accum_depth + idx_1]);
98 float32x4_t wvec;
99
100 wvec = vld1q_f32(&weights_data[0]);
101 y0_3 = vmlaq_f32(y0_3, wvec, xj);
102 wvec = vld1q_f32(&weights_data[4]);
103 y4_7 = vmlaq_f32(y4_7, wvec, xj);
104 wvec = vld1q_f32(&weights_data[8]);
105 y8_11 = vmlaq_f32(y8_11, wvec, xj);
106 wvec = vld1q_f32(&weights_data[12]);
107 y12_15 = vmlaq_f32(y12_15, wvec, xj);
108
109 weights_data += 16;
110 }
111 /* save y[0..15] back to memory */
112 vst1q_f32(&y[0], y0_3);
113 vst1q_f32(&y[4], y4_7);
114 vst1q_f32(&y[8], y8_11);
115 vst1q_f32(&y[12], y12_15);
116 }
117#else
118 {
119 for (auto pw1 = w1_segments[idx_0]; pw1 < w1_segments[idx_0 + 1]; ++pw1)
120 {
121 float *__restrict y;
122 float xj;
123 auto idx_1 = w1_indices[pw1];
124 xj = input_data[b * accum_depth + idx_1];
125 y = &output_data[b * output_depth + idx_0 * 16];
126 y[0] += weights_data[0] * xj;
127 y[1] += weights_data[1] * xj;
128 y[2] += weights_data[2] * xj;
129 y[3] += weights_data[3] * xj;
130 y[4] += weights_data[4] * xj;
131 y[5] += weights_data[5] * xj;
132 y[6] += weights_data[6] * xj;
133 y[7] += weights_data[7] * xj;
134 y[8] += weights_data[8] * xj;
135 y[9] += weights_data[9] * xj;
136 y[10] += weights_data[10] * xj;
137 y[11] += weights_data[11] * xj;
138 y[12] += weights_data[12] * xj;
139 y[13] += weights_data[13] * xj;
140 y[14] += weights_data[14] * xj;
141 y[15] += weights_data[15] * xj;
142 weights_data += 16;
143 }
144 }
145#endif
146 }
147 if (params.activation != FusedActivationFunctionType::kNone)
148 {
149 // Apply activation function
150 ApplyActivationToVector(output_data, batches * output_depth, params.activation, output_data);
151 }
152}

References nnfw::cker::FullyConnectedParams::activation, ApplyActivationToVector(), nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), FlatSizeSkipDim(), kNone, MatchingDim(), output_shape, VectorBatchVectorAssign(), and ZeroVector().

Referenced by onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedSparseWeight().

◆ FullyConnectedSparseWeightRandom()

void nnfw::cker::FullyConnectedSparseWeightRandom ( const FullyConnectedParams params,
const Shape input_shape,
const float *  input_data,
const Shape weights_shape,
const float *  weights_data,
const Shape bias_shape,
const float *  bias_data,
const Shape output_shape,
float *  output_data,
const uint16_t *  w1_segments,
const uint16_t *  w1_indices 
)
inline

Definition at line 250 of file FullyConnected.h.

255{
256
257 assert(weights_shape.DimensionsCount() == 2);
258 assert(output_shape.DimensionsCount() == 2);
259
260 const int output_dims_count = output_shape.DimensionsCount();
261 const int weights_dims_count = weights_shape.DimensionsCount();
262 const int batches = FlatSizeSkipDim(output_shape, output_dims_count - 1);
263 const int output_depth =
264 MatchingDim(weights_shape, weights_dims_count - 2, output_shape, output_dims_count - 1);
265 const int accum_depth = weights_shape.Dims(weights_dims_count - 1);
266
267 if (bias_data)
268 {
269 VectorBatchVectorAssign(bias_data, output_depth, batches, output_data);
270 }
271 else
272 {
273 ZeroVector(output_data, batches * output_depth);
274 }
275 for (int b = 0; b < batches; ++b)
276 {
277 for (int idx_0 = 0; idx_0 < output_depth; ++idx_0)
278 {
279 for (int pw1 = w1_segments[idx_0]; pw1 < w1_segments[idx_0 + 1]; ++pw1)
280 {
281 int idx_1 = w1_indices[pw1];
282 output_data[b * output_depth + idx_0] +=
283 weights_data[pw1] * input_data[b * accum_depth + idx_1];
284 }
285 }
286 }
287 if (params.activation != FusedActivationFunctionType::kNone)
288 {
289 // Apply activation function
290 ApplyActivationToVector(output_data, batches * output_depth, params.activation, output_data);
291 }
292}

References nnfw::cker::FullyConnectedParams::activation, ApplyActivationToVector(), nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), FlatSizeSkipDim(), kNone, MatchingDim(), output_shape, VectorBatchVectorAssign(), and ZeroVector().

Referenced by onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedSparseWeight().

◆ Gather()

template<typename T , typename CoordsT = int32_t>
void nnfw::cker::Gather ( const GatherParams op_params,
const Shape input_shape,
const T *  input_data,
const Shape coords_shape,
const CoordsT *  coords_data,
const Shape ,
T *  output_data 
)
inline

Definition at line 31 of file Gather.h.

34{
35 int axis = op_params.axis;
36 if (axis < 0)
37 {
38 axis += input_shape.DimensionsCount();
39 }
40 assert(axis >= 0);
41 assert(axis < input_shape.DimensionsCount());
42 const int axis_size = input_shape.Dims(axis);
43 const int coords_count = coords_shape.FlatSize();
44
45 int outer_size = 1;
46 for (int i = 0; i < axis; ++i)
47 {
48 outer_size *= input_shape.Dims(i);
49 }
50
51 int inner_size = 1;
52 for (int i = axis + 1; i < input_shape.DimensionsCount(); ++i)
53 {
54 inner_size *= input_shape.Dims(i);
55 }
56
57 for (int outer = 0; outer < outer_size; ++outer)
58 {
59 for (int i = 0; i < coords_count; ++i)
60 {
61 assert(coords_data[i] >= 0);
62 assert(coords_data[i] < axis_size);
63 std::memcpy(output_data + (outer * coords_count + i) * inner_size,
64 input_data + (outer * axis_size + coords_data[i]) * inner_size,
65 sizeof(T) * inner_size);
66 }
67 }
68}

References nnfw::cker::GatherParams::axis, nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), and nnfw::cker::Shape::FlatSize().
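
A minimal sketch (hypothetical data): gathering rows 2 and 0 of a {3, 2} tensor along axis 0; the unused Shape parameter is passed as a default-constructed shape.

using namespace nnfw::cker;

float data[] = {10, 20, 30, 40, 50, 60}; // shape {3, 2}
int32_t coords[] = {2, 0};
float out[4];

GatherParams op{};
op.axis = 0;

Gather(op, GetShape({3, 2}), data, GetShape({2}), coords, Shape{}, out);
// out == {50, 60, 10, 20}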

◆ GenerateKey()

void nnfw::cker::GenerateKey ( Tensor  seed,
random::PhiloxRandom::Key *  out_key,
random::PhiloxRandom::ResultType *  out_counter 
)

Definition at line 37 of file StatelessRandomUniform.h.

39{
40 // Grab the two seeds
41 uint32_t seed0;
42 uint32_t seed1;
43
44 const auto seed_vals = seed.flat<int32_t>();
45
46 seed0 = seed_vals(0);
47 seed1 = seed_vals(1);
48 // Scramble the seeds so that the user doesn't need to worry about which
49 // part of the seed needs to be strong.
50 (*out_key)[0] = 0x3ec8f720;
51 (*out_key)[1] = 0x02461e29;
52 (*out_counter)[0] = static_cast<uint32_t>(seed0);
53 (*out_counter)[1] = (*out_counter)[3] = 0;
54 (*out_counter)[2] = static_cast<uint32_t>(seed1);
55 const auto mix = random::PhiloxRandom(*out_counter, *out_key)();
56 (*out_key)[0] = mix[0];
57 (*out_key)[1] = mix[1];
58 (*out_counter)[0] = (*out_counter)[1] = 0;
59 (*out_counter)[2] = mix[2];
60 (*out_counter)[3] = mix[3];
61}

References nnfw::cker::Tensor::flat().

Referenced by StatelessRandomUniform().

◆ GetIndexRange()

void nnfw::cker::GetIndexRange ( int  spatial_index_dim,
int  block_shape_dim,
int  input_dim,
int  output_dim,
int *  start_index,
int *  end_index 
)
inline

Definition at line 37 of file BatchToSpaceND.h.

39{
40 // (*start_index) * block_shape_dim is effectively rounded up to the next
41 // multiple of block_shape_dim by the integer division.
42 *start_index = std::max(0, (-spatial_index_dim + block_shape_dim - 1) / block_shape_dim);
43 // Similarly, (*end_index) * block_shape_dim is rounded up too (note that
44 // end_index is exclusive).
45 *end_index =
46 std::min(input_dim, (output_dim - spatial_index_dim + block_shape_dim - 1) / block_shape_dim);
47}

Referenced by BatchToSpaceND().

◆ GetInvSqrtQuantizedMultiplierExp()

void nnfw::cker::GetInvSqrtQuantizedMultiplierExp ( int32_t  input,
int  reverse_shift,
int32_t *  output_inv_sqrt,
int *  output_shift 
)
inline

Definition at line 164 of file Utils.h.

166{
167 assert(input >= 0);
168 if (input <= 1)
169 {
170 // Handle the input value 1 separately to avoid overflow in that case
171 // in the general computation below (b/143972021). Also handle 0 as if it
172 // were a 1. 0 is an invalid input here (divide by zero) and 1 is a valid
173 // but rare/unrealistic input value. We can expect both to occur in some
174 // incompletely trained models, but probably not in fully trained models.
175 *output_inv_sqrt = std::numeric_limits<std::int32_t>::max();
176 *output_shift = 0;
177 return;
178 }
179 assert(input > 1);
180 *output_shift = 11;
181 while (input >= (1 << 29))
182 {
183 input /= 4;
184 ++*output_shift;
185 }
186 const unsigned max_left_shift_bits = CountLeadingZeros(static_cast<uint32_t>(input)) - 1;
187 const unsigned max_left_shift_bit_pairs = max_left_shift_bits / 2;
188 const unsigned left_shift_bit_pairs = max_left_shift_bit_pairs - 1;
189 *output_shift -= left_shift_bit_pairs;
190 input <<= 2 * left_shift_bit_pairs;
191 assert(input >= (1 << 27));
192 assert(input < (1 << 29));
193 using gemmlowp::FixedPoint;
194 using gemmlowp::Rescale;
195 using gemmlowp::SaturatingRoundingMultiplyByPOT;
196 // Using 3 integer bits gives us enough room for the internal arithmetic in
197 // this Newton-Raphson iteration.
198 using F3 = FixedPoint<int32_t, 3>;
199 using F0 = FixedPoint<int32_t, 0>;
200 const F3 fixedpoint_input = F3::FromRaw(input >> 1);
201 const F3 fixedpoint_half_input = SaturatingRoundingMultiplyByPOT<-1>(fixedpoint_input);
202 const F3 fixedpoint_half_three =
203 GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F3, (1 << 28) + (1 << 27), 1.5);
204 // Newton-Raphson iteration
205 // Naive unoptimized starting guess: x = 1
206 F3 x = F3::One();
207 // Naive unoptimized number of iterations: 5
208 for (int i = 0; i < 5; i++)
209 {
210 const F3 x3 = Rescale<3>(x * x * x);
211 x = Rescale<3>(fixedpoint_half_three * x - fixedpoint_half_input * x3);
212 }
213 const F0 fixedpoint_half_sqrt_2 =
214 GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F0, 1518500250, std::sqrt(2.) / 2.);
215 x = x * fixedpoint_half_sqrt_2;
216 *output_inv_sqrt = x.raw();
217 if (*output_shift < 0)
218 {
219 *output_inv_sqrt <<= -*output_shift;
220 *output_shift = 0;
221 }
222 // Convert right shift (right is positive) to left shift.
223 *output_shift *= reverse_shift;
224}

References CountLeadingZeros().

Referenced by L2NormalizeQuant8().

◆ GetShape()

Shape nnfw::cker::GetShape ( const std::vector< int32_t > &  data)
inline

Definition at line 235 of file Shape.h.

235{ return Shape(data.size(), data.data()); }

◆ GetSize()

template<typename T >
int nnfw::cker::GetSize ( T  start,
T  limit,
T  delta 
)
inline

Definition at line 30 of file Range.h.

31{
32 if (!((start > limit && delta < 0) || (start < limit && delta > 0)))
33 {
34 throw std::runtime_error("Range: invalid input values");
35 }
36
37 int size = (std::is_integral<T>::value
38 ? ((std::abs(limit - start) + std::abs(delta) - 1) / std::abs(delta))
39 : std::ceil(std::abs((limit - start) / delta)));
40 return size;
41}

References size.
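
Two worked examples (hypothetical ranges), one integral and one floating-point:

nnfw::cker::GetSize(0, 10, 3);          // 4 elements: {0, 3, 6, 9}
nnfw::cker::GetSize(5.0f, 1.0f, -2.0f); // 2 elements: {5.0, 3.0}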

◆ GreaterEqualFn()

template<typename T >
bool nnfw::cker::GreaterEqualFn ( T  lhs,
T  rhs 
)
inline

Definition at line 33 of file Comparison.h.

33{ return lhs >= rhs; }

◆ GreaterFn()

template<typename T >
bool nnfw::cker::GreaterFn ( T  lhs,
T  rhs 
)
inline

Definition at line 32 of file Comparison.h.

32{ return lhs > rhs; }

◆ HowManyConvThreads()

int nnfw::cker::HowManyConvThreads ( const Shape output_shape,
const Shape filter_shape 
)
inline

Definition at line 81 of file DepthwiseConv.h.

82{
83 // How many scalar multiplications are needed to make it worth using one
84 // more thread
85 static constexpr int kMinMulPerThread = 1 << 13; // 8k
86 const int filter_height = filter_shape.Dims(1);
87 const int filter_width = filter_shape.Dims(2);
88 const int num_muls = output_shape.FlatSize() * filter_height * filter_width;
89 // Try to avoid real runtime divisions if possible by dividing by a
90 // compile-time constant.
91 int thread_count = std::max(1, num_muls / kMinMulPerThread);
92 return thread_count;
93}

References nnfw::cker::Shape::Dims(), and output_shape.

Referenced by DepthwiseConv().

◆ InitTensorDataForReduce()

template<typename T >
bool nnfw::cker::InitTensorDataForReduce ( const Shape shape,
const T  init_value,
T *  data 
)
inline

Definition at line 208 of file Reduce.h.

209{
210 const auto dims = shape.DimsData();
211 const auto num_dims = shape.DimensionsCount();
212 size_t num_elements = 1;
213 for (int idx = 0; idx < num_dims; ++idx)
214 {
215 size_t current = static_cast<size_t>(dims[idx]);
216 // Overflow prevention.
217 if (num_elements > std::numeric_limits<size_t>::max() / current)
218 {
219 return false;
220 }
221 num_elements *= current;
222 }
223 for (size_t idx = 0; idx < num_elements; ++idx)
224 {
225 data[idx] = init_value;
226 }
227 return true;
228}

References nnfw::cker::Shape::DimensionsCount(), and nnfw::cker::Shape::DimsData().

Referenced by nnfw::cker::ReduceMean::PrepareforReduce(), and nnfw::cker::Reduce::ReduceGeneric().

◆ InstanceNorm()

void nnfw::cker::InstanceNorm ( const InstanceNormParams params,
const Shape input_shape,
const float *  input_data,
const Shape gamma_shape,
const float *  gamma_data,
const Shape beta_shape,
const float *  beta_data,
const Shape output_shape,
float *  output_data 
)
inline

Definition at line 31 of file InstanceNorm.h.

35{
36 const int32_t batches = MatchingDim(input_shape, 0, output_shape, 0);
37 const int32_t heights = MatchingDim(input_shape, 1, output_shape, 1);
38 const int32_t widths = MatchingDim(input_shape, 2, output_shape, 2);
39 const int32_t channels = MatchingDim(input_shape, 3, output_shape, 3);
40 const float output_activation_min = params.float_activation_min;
41 const float output_activation_max = params.float_activation_max;
42
43 assert(output_activation_min <= output_activation_max);
44
45 for (int32_t batch = 0; batch < batches; batch++)
46 {
47 for (int32_t channel = 0; channel < channels; channel++)
48 {
49 double sum = 0.0f;
50 double square_sum = 0.0f;
51 int32_t size = heights * widths;
52
53 for (int32_t height = 0; height < heights; height++)
54 {
55 for (int32_t width = 0; width < widths; width++)
56 {
57 double input_val = input_data[Offset(input_shape, batch, height, width, channel)];
58 sum += input_val;
59 square_sum += (input_val * input_val);
60 }
61 }
62
63 double mean = sum / size;
64 double var = square_sum / size - mean * mean;
65
66 double gamma = gamma_data[channel];
67 double beta = beta_data[channel];
68
69 double a = gamma / (std::sqrt(var + params.epsilon));
70 double b = -mean * a + beta;
71
72 for (int32_t height = 0; height < heights; height++)
73 {
74 for (int32_t width = 0; width < widths; width++)
75 {
76 double input_value = input_data[Offset(output_shape, batch, height, width, channel)];
77 double output_value = input_value * a + b;
78 output_data[Offset(output_shape, batch, height, width, channel)] =
79 ActivationFunctionWithMinMax((float)output_value, output_activation_min,
80 output_activation_max);
81 }
82 }
83 }
84 }
85}

References ActivationFunctionWithMinMax(), nnfw::cker::InstanceNormParams::epsilon, nnfw::cker::InstanceNormParams::float_activation_max, nnfw::cker::InstanceNormParams::float_activation_min, MatchingDim(), Offset(), output_shape, and size.

◆ IsZeroVector()

bool nnfw::cker::IsZeroVector ( const float *  vector,
int  v_size 
)
inline

◆ L2NormalizeFloat32()

void nnfw::cker::L2NormalizeFloat32 ( const Shape input_shape,
const float *  input_data,
const Shape output_shape,
float *  output_data 
)

Definition at line 30 of file L2Normalize.h.

32{
33 float epsilon = 1e-6;
34 const int trailing_dim = input_shape.DimensionsCount() - 1;
35 const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
36 const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
37 for (int i = 0; i < outer_size; ++i)
38 {
39 float squared_l2_norm = 0;
40 for (int c = 0; c < depth; ++c)
41 {
42 const float val = input_data[c];
43 squared_l2_norm += val * val;
44 }
45 float l2_norm = std::sqrt(squared_l2_norm);
46 l2_norm = std::max(l2_norm, epsilon);
47 for (int c = 0; c < depth; ++c)
48 {
49 *output_data = *input_data / l2_norm;
50 ++output_data;
51 ++input_data;
52 }
53 }
54}

References nnfw::cker::Shape::DimensionsCount(), MatchingDim(), MatchingFlatSizeSkipDim(), and output_shape.

Referenced by onert::backend::cpu::ops::L2NormLayer::run().
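
A minimal sketch (hypothetical data): each slice along the trailing dimension is divided by its L2 norm, clamped below by 1e-6.

float in[] = {3, 4};
float out[2];
nnfw::cker::L2NormalizeFloat32(nnfw::cker::GetShape({1, 2}), in,
                               nnfw::cker::GetShape({1, 2}), out);
// out == {0.6, 0.8}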

◆ L2NormalizeQuant8()

void nnfw::cker::L2NormalizeQuant8 ( L2NormParams params,
const Shape input_shape,
const uint8_t *  input_data,
const Shape output_shape,
uint8_t *  output_data 
)

Definition at line 56 of file L2Normalize.h.

58{
59 const int trailing_dim = input_shape.DimensionsCount() - 1;
60 const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
61 const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
62 const int32_t input_zero_point = params.input_zero_point;
63
64 for (int i = 0; i < outer_size; ++i)
65 {
66 int32_t square_l2_norm = 0;
67 for (int c = 0; c < depth; c++)
68 {
69 // Note that input_data advances by depth in the second pass below.
70 int32_t diff = input_data[c] - input_zero_point;
71 square_l2_norm += diff * diff;
72 }
73 int32_t inv_l2norm_multiplier;
74 int inv_l2norm_shift;
75 GetInvSqrtQuantizedMultiplierExp(square_l2_norm, -1, &inv_l2norm_multiplier, &inv_l2norm_shift);
76 for (int c = 0; c < depth; c++)
77 {
78 int32_t diff = *input_data - input_zero_point;
79 int32_t rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOneExp(
80 128 * diff, inv_l2norm_multiplier, inv_l2norm_shift);
81 int32_t unclamped_output_val = 128 + rescaled_diff;
82 int32_t output_val = std::min(static_cast<int32_t>(255),
83 std::max(static_cast<int32_t>(0), unclamped_output_val));
84 *output_data = static_cast<uint8_t>(output_val);
85 ++input_data;
86 ++output_data;
87 }
88 }
89}

References nnfw::cker::Shape::DimensionsCount(), GetInvSqrtQuantizedMultiplierExp(), nnfw::cker::L2NormParams::input_zero_point, MatchingDim(), MatchingFlatSizeSkipDim(), MultiplyByQuantizedMultiplierSmallerThanOneExp(), and output_shape.

Referenced by onert::backend::cpu::ops::L2NormLayer::run().

◆ LeakyReLU()

void nnfw::cker::LeakyReLU ( const LeakyReluParams params,
const Shape input_shape,
const float *  input_data,
const Shape output_shape,
float *  output_data 
)
inline

Definition at line 31 of file LeakyReLU.h.

33{
34 const int flat_size = MatchingFlatSize(input_shape, output_shape);
35
36 for (int i = 0; i < flat_size; i++)
37 {
38 const float val = input_data[i];
39 // Note that alpha might be > 1 or < 0, so we don't use std::max here.
40 output_data[i] = val > 0 ? val : val * params.alpha;
41 }
42}

References nnfw::cker::LeakyReluParams::alpha, MatchingFlatSize(), and output_shape.

Referenced by onert::backend::cpu::ops::ElementwiseActivationLayer::configure().
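
A minimal sketch (hypothetical alpha): negative inputs are scaled by params.alpha, positive inputs pass through unchanged.

nnfw::cker::LeakyReluParams p{};
p.alpha = 0.1f;
float in[] = {-2.0f, 3.0f};
float out[2];
nnfw::cker::LeakyReLU(p, nnfw::cker::GetShape({2}), in, nnfw::cker::GetShape({2}), out);
// out == {-0.2, 3.0}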

◆ LessEqualFn()

template<typename T >
bool nnfw::cker::LessEqualFn ( T  lhs,
T  rhs 
)
inline

Definition at line 35 of file Comparison.h.

35{ return lhs <= rhs; }

◆ LessFn()

template<typename T >
bool nnfw::cker::LessFn ( T  lhs,
T  rhs 
)
inline

Definition at line 34 of file Comparison.h.

34{ return lhs < rhs; }

◆ Log()

void nnfw::cker::Log ( const Shape input_shape,
const float *  input_data,
const Shape output_shape,
float *  output_data 
)
inline

Definition at line 80 of file Elementwise.h.

82{
83 const int size = MatchingFlatSize(input_shape, output_shape);
84 for (int i = 0; i < size; i++)
85 {
86 output_data[i] = std::log(input_data[i]);
87 }
88}

References MatchingFlatSize(), output_shape, and size.

◆ LogicalAndBroadcast()

template<typename T >
void nnfw::cker::LogicalAndBroadcast ( const Shape unextended_input1_shape,
const T *  input1_data,
const Shape unextended_input2_shape,
const T *  input2_data,
const Shape unextended_output_shape,
T *  output_data 
)
inline

Definition at line 30 of file LogicalAnd.h.

33{
34 assert(unextended_input1_shape.DimensionsCount() <= 4);
35 assert(unextended_input2_shape.DimensionsCount() <= 4);
36 assert(unextended_output_shape.DimensionsCount() <= 4);
37 const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape);
38
39 NdArrayDesc<4> desc1;
40 NdArrayDesc<4> desc2;
41 NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, unextended_input2_shape, &desc1,
42 &desc2);
43
44 for (int b = 0; b < output_shape.Dims(0); ++b)
45 {
46 for (int y = 0; y < output_shape.Dims(1); ++y)
47 {
48 for (int x = 0; x < output_shape.Dims(2); ++x)
49 {
50 for (int c = 0; c < output_shape.Dims(3); ++c)
51 {
52 auto out_idx = Offset(output_shape, b, y, x, c);
53 auto in1_idx = SubscriptToIndex(desc1, b, y, x, c);
54 auto in2_idx = SubscriptToIndex(desc2, b, y, x, c);
55 auto in1_val = input1_data[in1_idx];
56 auto in2_val = input2_data[in2_idx];
57 output_data[out_idx] = in1_val && in2_val;
58 }
59 }
60 }
61 }
62}

References desc1, desc2, nnfw::cker::Shape::DimensionsCount(), NdArrayDescsForElementwiseBroadcast(), Offset(), output_shape, and SubscriptToIndex().

◆ LogicalAndElementwise()

template<typename T >
void nnfw::cker::LogicalAndElementwise ( const Shape shape,
const T *  input1_data,
const T *  input2_data,
T *  output_data 
)
inline

Definition at line 65 of file LogicalAnd.h.

67{
68
69 int num_elements = shape.FlatSize();
70
71 for (int t = 0; t < num_elements; t++)
72 {
73 output_data[t] = input1_data[t] && input2_data[t];
74 }
75}

References nnfw::cker::Shape::FlatSize().

◆ LogicalNot()

void nnfw::cker::LogicalNot ( const Shape input_shape,
const bool *  input_data,
const Shape output_shape,
bool *  output_data 
)
inline

Definition at line 28 of file LogicalNot.h.

30{
31 const int size = MatchingFlatSize(input_shape, output_shape);
32 for (int i = 0; i < size; i++)
33 {
34 output_data[i] = !input_data[i];
35 }
36}

References MatchingFlatSize(), output_shape, and size.

◆ LogicalOrBroadcast()

template<typename T >
void nnfw::cker::LogicalOrBroadcast ( const Shape unextended_input1_shape,
const T *  input1_data,
const Shape unextended_input2_shape,
const T *  input2_data,
const Shape unextended_output_shape,
T *  output_data 
)
inline

Definition at line 30 of file LogicalOr.h.

33{
34 assert(unextended_input1_shape.DimensionsCount() <= 4);
35 assert(unextended_input2_shape.DimensionsCount() <= 4);
36 assert(unextended_output_shape.DimensionsCount() <= 4);
37 const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape);
38
39 NdArrayDesc<4> desc1;
40 NdArrayDesc<4> desc2;
41 NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, unextended_input2_shape, &desc1,
42 &desc2);
43
44 for (int b = 0; b < output_shape.Dims(0); ++b)
45 {
46 for (int y = 0; y < output_shape.Dims(1); ++y)
47 {
48 for (int x = 0; x < output_shape.Dims(2); ++x)
49 {
50 for (int c = 0; c < output_shape.Dims(3); ++c)
51 {
52 auto out_idx = Offset(output_shape, b, y, x, c);
53 auto in1_idx = SubscriptToIndex(desc1, b, y, x, c);
54 auto in2_idx = SubscriptToIndex(desc2, b, y, x, c);
55 auto in1_val = input1_data[in1_idx];
56 auto in2_val = input2_data[in2_idx];
57 output_data[out_idx] = in1_val || in2_val;
58 }
59 }
60 }
61 }
62}

References desc1, desc2, nnfw::cker::Shape::DimensionsCount(), NdArrayDescsForElementwiseBroadcast(), Offset(), output_shape, and SubscriptToIndex().

◆ LogicalOrElementwise()

template<typename T >
void nnfw::cker::LogicalOrElementwise ( const Shape shape,
const T *  input1_data,
const T *  input2_data,
T *  output_data 
)
inline

Definition at line 65 of file LogicalOr.h.

67{
68
69 int num_elements = shape.FlatSize();
70
71 for (int t = 0; t < num_elements; t++)
72 {
73 output_data[t] = input1_data[t] || input2_data[t];
74 }
75}

References nnfw::cker::Shape::FlatSize().

◆ Logistic()

void nnfw::cker::Logistic ( const Shape input_shape,
const float *  input_data,
const Shape output_shape,
float *  output_data 
)
inline

Definition at line 32 of file Logistic.h.

34{
35 auto input_map = MapAsVector(input_data, input_shape);
36 auto output_map = MapAsVector(output_data, output_shape);
37
38 output_map.array() = input_map.array().unaryExpr(Eigen::internal::scalar_logistic_op<float>());
39}

References MapAsVector(), and output_shape.

Referenced by onert::backend::cpu::ops::ElementwiseActivationLayer::configure().

◆ LogSoftmax() [1/2]

void nnfw::cker::LogSoftmax ( const SoftmaxParams params,
const Shape input_shape,
const float *  input_data,
const Shape output_shape,
float *  output_data 
)
inline

Definition at line 34 of file LogSoftMax.h.

36{
37 const int rank = input_shape.DimensionsCount();
38 const int axis = (params.axis < 0) ? params.axis + rank : params.axis;
39 const double beta = params.beta;
40 const int depth = MatchingDim(input_shape, axis, output_shape, axis);
41
42 int outer_size = 1;
43 for (int i = 0; i < axis; ++i)
44 {
45 outer_size *= input_shape.Dims(i);
46 }
47
48 int inner_size = 1;
49 for (int i = axis + 1; i < rank; ++i)
50 {
51 inner_size *= input_shape.Dims(i);
52 }
53
54 for (int i = 0; i < outer_size; ++i)
55 {
56 for (int j = 0; j < inner_size; ++j)
57 {
58 float max = std::numeric_limits<float>::lowest();
59 for (int c = 0; c < depth; ++c)
60 {
61 max = std::max(max, input_data[(i * depth + c) * inner_size]);
62 }
63
64 float sum = 0.f;
65 for (int c = 0; c < depth; ++c)
66 {
67 sum += std::exp((input_data[(i * depth + c) * inner_size + j] - max) * beta);
68 }
69
70 const float log_sum = std::log(sum);
71 for (int c = 0; c < depth; ++c)
72 {
73 output_data[(i * depth + c) * inner_size + j] =
74 (input_data[(i * depth + c) * inner_size + j] - max) * beta - log_sum;
75 }
76 }
77 }
78}

References nnfw::cker::SoftmaxParams::axis, nnfw::cker::SoftmaxParams::beta, nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), MatchingDim(), and output_shape.

Referenced by onert::backend::cpu::ops::LogSoftMaxLayer::logsoftmaxFloat32(), and onert::backend::cpu::ops::LogSoftMaxLayer::logsoftmaxQuant8().
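
For each slice along axis the loop computes output_c = beta * (x_c - max) - log(sum_k exp(beta * (x_k - max))). A standalone sketch of that reduction for one row with beta = 1 (plain C++, not a call into cker):

#include <algorithm>
#include <cmath>
#include <cstdio>

// Log-softmax of one row, mirroring the inner loops above with beta = 1.
int main()
{
  const float x[3] = {1.f, 2.f, 3.f};
  const float max = *std::max_element(x, x + 3);

  float sum = 0.f;
  for (float v : x)
    sum += std::exp(v - max);
  const float log_sum = std::log(sum);

  for (float v : x)
    std::printf("%f ", (v - max) - log_sum); // ~ -2.4076 -1.4076 -0.4076
  return 0;
}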

◆ LogSoftmax() [2/2]

void nnfw::cker::LogSoftmax ( const SoftmaxParams params,
float  input_scale,
const Shape input_shape,
const uint8_t *  input_data,
const Shape output_shape,
uint8_t *  output_data 
)
inline

Definition at line 80 of file LogSoftMax.h.

82{
83 const int rank = input_shape.DimensionsCount();
84 const int axis = (params.axis < 0) ? params.axis + rank : params.axis;
85 const double beta = params.beta;
86 const int depth = MatchingDim(input_shape, axis, output_shape, axis);
87
88 const int32_t clamp_max = std::numeric_limits<uint8_t>::max();
89 const int32_t clamp_min = std::numeric_limits<uint8_t>::min();
90
91 int outer_size = 1;
92 for (int i = 0; i < axis; ++i)
93 {
94 outer_size *= input_shape.Dims(i);
95 }
96
97 int inner_size = 1;
98 for (int i = axis + 1; i < rank; ++i)
99 {
100 inner_size *= input_shape.Dims(i);
101 }
102
103 for (int i = 0; i < outer_size; ++i)
104 {
105 for (int j = 0; j < inner_size; ++j)
106 {
107 uint8_t max_val = std::numeric_limits<uint8_t>::min();
108 for (int c = 0; c < depth; ++c)
109 {
110 max_val = std::max(max_val, input_data[(i * depth + c) * inner_size]);
111 }
112
113 float sum_exp = 0.0f;
114 const int32_t max_uint8 = std::numeric_limits<uint8_t>::max();
115 const float *table_offset = &params.table[max_uint8 - max_val];
116 for (int c = 0; c < depth; ++c)
117 {
118 sum_exp += table_offset[input_data[(i * depth + c) * inner_size]];
119 }
120 const float log_sum_exp = std::log(sum_exp);
121
122 const float scale = input_scale / params.scale;
123 const float precomputed = (input_scale * max_val * beta + log_sum_exp) / params.scale;
124 for (int c = 0; c < depth; ++c)
125 {
126 const float log_prob =
127 scale * input_data[(i * depth + c) * inner_size] * beta - precomputed;
128 const int32_t prob_quantized = std::rint(log_prob) + params.zero_point;
129 output_data[(i * depth + c) * inner_size] =
130 static_cast<uint8_t>(std::max(std::min(clamp_max, prob_quantized), clamp_min));
131 }
132 }
133 }
134}

References nnfw::cker::SoftmaxParams::axis, nnfw::cker::SoftmaxParams::beta, nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), MatchingDim(), output_shape, nnfw::cker::SoftmaxParams::scale, nnfw::cker::SoftmaxParams::table, and nnfw::cker::SoftmaxParams::zero_point.

◆ LoopCondition()

bool nnfw::cker::LoopCondition ( int  index,
int  stop,
int  stride 
)
inline

Definition at line 187 of file StridedSlice.h.

188{
189 // True when we have reached the end of an axis and should loop.
190 return stride > 0 ? index >= stop : index <= stop;
191}

Referenced by StridedSlice().

◆ LstmStepFloat()

void nnfw::cker::LstmStepFloat ( const float *  input_ptr,
const float *  input_to_input_weights_ptr,
const float *  input_to_forget_weights_ptr,
const float *  input_to_cell_weights_ptr,
const float *  input_to_output_weights_ptr,
const float *  aux_input_ptr,
const float *  aux_input_to_input_weights_ptr,
const float *  aux_input_to_forget_weights_ptr,
const float *  aux_input_to_cell_weights_ptr,
const float *  aux_input_to_output_weights_ptr,
const float *  recurrent_to_input_weights_ptr,
const float *  recurrent_to_forget_weights_ptr,
const float *  recurrent_to_cell_weights_ptr,
const float *  recurrent_to_output_weights_ptr,
const float *  cell_to_input_weights_ptr,
const float *  cell_to_forget_weights_ptr,
const float *  cell_to_output_weights_ptr,
const float *  input_layer_norm_coefficients_ptr,
const float *  forget_layer_norm_coefficients_ptr,
const float *  cell_layer_norm_coefficients_ptr,
const float *  output_layer_norm_coefficients_ptr,
const float *  input_gate_bias_ptr,
const float *  forget_gate_bias_ptr,
const float *  cell_gate_bias_ptr,
const float *  output_gate_bias_ptr,
const float *  projection_weights_ptr,
const float *  projection_bias_ptr,
const LSTMParams params,
int  n_batch,
int  n_cell,
int  n_input,
int  n_aux_input,
int  n_output,
int  output_batch_leading_dim,
float *  output_state_ptr,
float *  cell_state_ptr,
float *  scratch0,
float *  scratch1,
float *  scratch2,
float *  scratch3,
float *  output_ptr 
)
inline

Definition at line 285 of file LSTM.h.

303{
304 // Since we have already checked that weights are all there or none, we can
305 // check the existence of only one to get the condition.
306 const bool use_cifg = (input_to_input_weights_ptr == nullptr);
307
308 // Make named scratch buffers.
309 float *input_gate_scratch = scratch0;
310 float *forget_gate_scratch = scratch1;
311 float *cell_gate_scratch = scratch2;
312 float *output_gate_scratch = scratch3;
313
314 // Check if inputs are all zeros so we can skip some computations.
315 const bool is_input_all_zeros = IsZeroVector(input_ptr, n_batch * n_input);
316 const bool is_aux_input_all_zeros =
317 (aux_input_ptr == nullptr || IsZeroVector(aux_input_ptr, n_batch * n_aux_input));
318 if (!use_cifg)
319 {
320 // Calculate the input gate. (If not CIFG.)
321 CalculateLstmGateFloat(input_ptr, input_to_input_weights_ptr, aux_input_ptr,
322 aux_input_to_input_weights_ptr, output_state_ptr,
323 recurrent_to_input_weights_ptr, cell_state_ptr,
324 cell_to_input_weights_ptr, input_layer_norm_coefficients_ptr,
325 input_gate_bias_ptr, n_batch, n_input, n_aux_input, n_output, n_cell,
326 /*activation=kTfLiteActSigmoid*/ FusedActivationFunctionType::kSigmoid,
327 input_gate_scratch, is_input_all_zeros, is_aux_input_all_zeros);
328 }
329 // Calculate the forget gate.
330 CalculateLstmGateFloat(input_ptr, input_to_forget_weights_ptr, aux_input_ptr,
331 aux_input_to_forget_weights_ptr, output_state_ptr,
332 recurrent_to_forget_weights_ptr, cell_state_ptr,
333 cell_to_forget_weights_ptr, forget_layer_norm_coefficients_ptr,
334 forget_gate_bias_ptr, n_batch, n_input, n_aux_input, n_output, n_cell,
335 /*activation=kTfLiteActSigmoid*/ FusedActivationFunctionType::kSigmoid,
336 forget_gate_scratch, is_input_all_zeros, is_aux_input_all_zeros);
337 // Calculate the cell update gate.
338 CalculateLstmGateFloat(
339 input_ptr, input_to_cell_weights_ptr, aux_input_ptr, aux_input_to_cell_weights_ptr,
340 output_state_ptr, recurrent_to_cell_weights_ptr, /*cell_state=*/nullptr,
341 /*cell_to_gate_weights=*/nullptr, cell_layer_norm_coefficients_ptr, cell_gate_bias_ptr, n_batch,
342 n_input, n_aux_input, n_output, n_cell, params->activation, cell_gate_scratch,
343 is_input_all_zeros, is_aux_input_all_zeros);
344 // Update the cell state.
345 UpdateLstmCellFloat(n_batch, n_cell, cell_state_ptr, input_gate_scratch, forget_gate_scratch,
346 cell_gate_scratch, use_cifg, params->cell_clip);
347 // Calculate output gate.
348 CalculateLstmGateFloat(input_ptr, input_to_output_weights_ptr, aux_input_ptr,
349 aux_input_to_output_weights_ptr, output_state_ptr,
350 recurrent_to_output_weights_ptr, cell_state_ptr,
351 cell_to_output_weights_ptr, output_layer_norm_coefficients_ptr,
352 output_gate_bias_ptr, n_batch, n_input, n_aux_input, n_output, n_cell,
353 /*activation=kTfLiteActSigmoid*/ FusedActivationFunctionType::kSigmoid,
354 output_gate_scratch, is_input_all_zeros, is_aux_input_all_zeros);
355 // Update the output state.
356 CalculateLstmOutputFloat(n_batch, n_cell, n_output, cell_state_ptr, output_gate_scratch,
357 params->activation, projection_weights_ptr, projection_bias_ptr,
358 params->proj_clip, output_state_ptr, scratch2);
359 // Copy output state to the output. Note that the output's rows may not be
360 // contiguous (output_batch_leading_dim != n_output).
361 for (int b = 0; b < n_batch; b++)
362 {
363 std::copy_n(output_state_ptr + b * n_output, n_output,
364 output_ptr + b * output_batch_leading_dim);
365 }
366}

References nnfw::cker::LSTMParams::activation, CalculateLstmGateFloat(), CalculateLstmOutputFloat(), nnfw::cker::LSTMParams::cell_clip, IsZeroVector(), kSigmoid, nnfw::cker::LSTMParams::proj_clip, and UpdateLstmCellFloat().

◆ MapAsMatrixWithLastDimAsRows()

template<typename Scalar >
MatrixMap< Scalar > nnfw::cker::MapAsMatrixWithLastDimAsRows ( Scalar *  data,
const Shape shape 
)

Definition at line 60 of file Utils.h.

61{
62 const int dims_count = shape.DimensionsCount();
63 const int rows = shape.Dims(dims_count - 1);
64 const int cols = FlatSizeSkipDim(shape, dims_count - 1);
65 return MatrixMap<Scalar>(data, rows, cols);
66}

References nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), and FlatSizeSkipDim().

Referenced by nnfw::cker::train::AveragePool2DGrad(), AveragePool< float >(), nnfw::cker::train::CategoricalCrossEntropy(), nnfw::cker::train::CategoricalCrossEntropyGrad(), nnfw::cker::train::FullyConnectedBiasGrad(), nnfw::cker::train::MaxPool2D(), nnfw::cker::train::MaxPool2DGrad(), MaxPool< float >(), nnfw::cker::train::MeanGrad(), and Softmax().

◆ MapAsVector()

template<typename Scalar >
VectorMap< Scalar > nnfw::cker::MapAsVector ( Scalar *  data,
const Shape shape 
)

Definition at line 43 of file Utils.h.

44{
45 const int size = shape.FlatSize();
46 return VectorMap<Scalar>(data, size, 1);
47}

References nnfw::cker::Shape::FlatSize(), and size.

Referenced by Abs(), nnfw::cker::train::BinaryArithmeticGrad(), nnfw::cker::train::CategoricalCrossEntropy(), Logistic(), ReLU(), ReLU6(), nnfw::cker::train::ReLU6Grad(), nnfw::cker::train::ReLUGrad(), and Tanh().

◆ MatchingDim() [1/2]

int nnfw::cker::MatchingDim ( const Shape shape1,
int  index1,
const Shape shape2,
int  index2 
)
inline

Definition at line 220 of file Shape.h.

222{
223 assert(shape1.Dims(index1) == shape2.Dims(index2));
224 return shape1.Dims(index1);
225}

References nnfw::cker::Shape::Dims().

Referenced by AveragePool16(), nnfw::cker::train::AveragePool2DGrad(), AveragePool32(), AveragePool< float >(), AveragePool< int8_t >(), nnfw::cker::train::backpropFilter(), nnfw::cker::train::backpropInput(), Concatenation(), nnfw::cker::reference::Conv(), nnfw::cker::multithreaded::Conv(), nnfw::cker::reference::Conv(), nnfw::cker::reference::Conv(), nnfw::cker::train::ConvFilterGrad(), nnfw::cker::train::ConvInputGrad(), nnfw::cker::optimized_integer_ops::depthwise_conv::DepthwiseConvGeneral(), nnfw::cker::optimized::depthwise_conv::DepthwiseConvGeneral(), nnfw::cker::reference_integer_ops::DepthwiseConvHybridPerChannel(), nnfw::cker::optimized::DepthwiseConvImpl(), DepthwiseConvOp(), nnfw::cker::reference_integer_ops::DepthwiseConvPerChannel(), nnfw::cker::optimized_integer_ops::DepthwiseConvWithRounding(), nnfw::cker::optimized::DepthwiseConvWithRounding(), nnfw::cker::optimized::DilatedIm2col(), FullyConnected(), FullyConnectedSparseWeight16x1(), FullyConnectedSparseWeightRandom(), nnfw::cker::reference::HybridConvPerChannel(), nnfw::cker::optimized::Im2col(), InstanceNorm(), L2NormalizeFloat32(), L2NormalizeQuant8(), LogSoftmax(), LogSoftmax(), MatchingDim(), nnfw::cker::train::MaxPool2D(), nnfw::cker::train::MaxPool2DGrad(), MaxPool< float >(), MaxPool< uint8_t >(), RankOneSelect(), ResizeBilinear(), ResizeBilinear(), ResizeBilinear(), RmsNorm(), RoPE(), nnfw::cker::reference::Softmax(), Softmax(), SplitV(), TransposeConv(), and nnfw::cker::reference::TransposeImpl().

◆ MatchingDim() [2/2]

template<typename... Args>
int nnfw::cker::MatchingDim ( const Shape shape1,
int  index1,
const Shape shape2,
int  index2,
Args...  args 
)

Definition at line 228 of file Shape.h.

230{
231 assert(shape1.Dims(index1) == shape2.Dims(index2));
232 return MatchingDim(shape1, index1, args...);
233}

References nnfw::cker::Shape::Dims(), and MatchingDim().

◆ MatchingElementsSize()

int nnfw::cker::MatchingElementsSize ( const Shape shape,
const Shape check_shape_0,
const Shape check_shape_1 
)
inline

Definition at line 333 of file Shape.h.

335{
336 const int size_1 = shape.FlatSize();
337 [[maybe_unused]] const int size_2 = check_shape_0.FlatSize();
338 [[maybe_unused]] const int size_3 = check_shape_1.FlatSize();
339 assert(size_1 == size_2);
340 assert(size_2 == size_3);
341 return size_1;
342}

References nnfw::cker::Shape::FlatSize().

Referenced by nnfw::cker::optimized::Add(), nnfw::cker::optimized::Add(), nnfw::cker::reference::BinaryArithmeticOp(), nnfw::cker::reference::BinaryArithmeticOp(), nnfw::cker::reference::BinaryArithmeticOp(), nnfw::cker::reference::BinaryArithmeticOp(), nnfw::cker::optimized::Div(), nnfw::cker::optimized::Mul(), nnfw::cker::optimized::Mul(), and nnfw::cker::optimized::Sub().

◆ MatchingFlatSize()

template<typename... Ts>
int nnfw::cker::MatchingFlatSize ( const Shape shape,
Ts...  check_shapes 
)
inline

Definition at line 297 of file Shape.h.

298{
299 UNUSED_ALL{check_shapes...};
300 assert(checkMatching(shape, std::forward<Ts>(check_shapes)...));
301 return shape.FlatSize();
302}

References checkMatching(), and nnfw::cker::Shape::FlatSize().

Referenced by ComparisonImpl(), ComparisonWithScaling(), Cos(), Dequantize(), Dequantize(), Dequantize(), ELU(), Erf(), Exp(), Floor(), LeakyReLU(), Log(), LogicalNot(), Neg(), powImpl(), Quantize(), Quantize(), Quantize(), Quantize(), Round(), Rsqrt(), Select(), Sin(), Softmax(), nnfw::cker::train::SoftMaxGrad(), Sqrt(), and Square().

◆ MatchingFlatSizeSkipDim() [1/2]

int nnfw::cker::MatchingFlatSizeSkipDim ( const Shape shape,
int  skip_dim,
const Shape check_shape_0 
)
inline

Definition at line 304 of file Shape.h.

306{
307 const int dims_count = shape.DimensionsCount();
308 for (int i = 0; i < dims_count; ++i)
309 {
310 if (i != skip_dim)
311 {
312 assert(shape.Dims(i) == check_shape_0.Dims(i));
313 }
314 }
315 return FlatSizeSkipDim(shape, skip_dim);
316}

References nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), and FlatSizeSkipDim().

Referenced by L2NormalizeFloat32(), L2NormalizeQuant8(), MatchingFlatSizeSkipDim(), RankOneSelect(), nnfw::cker::reference::Softmax(), and Softmax().

◆ MatchingFlatSizeSkipDim() [2/2]

int nnfw::cker::MatchingFlatSizeSkipDim ( const Shape shape,
int  skip_dim,
const Shape check_shape_0,
const Shape check_shape_1 
)
inline

Definition at line 318 of file Shape.h.

321{
322 const int dims_count = shape.DimensionsCount();
323 for (int i = 0; i < dims_count; ++i)
324 {
325 if (i != skip_dim)
326 {
327 assert(shape.Dims(i) == check_shape_0.Dims(i));
328 }
329 }
330 return MatchingFlatSizeSkipDim(shape, skip_dim, check_shape_1);
331}

References nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), and MatchingFlatSizeSkipDim().

◆ MatrixBandPart()

template<typename T >
void nnfw::cker::MatrixBandPart ( const T  num_lower_diags,
const T  num_upper_diags,
const Shape input_shape,
const float *  input_data,
const Shape output_shape,
float *  output_data 
)

Definition at line 30 of file MatrixBandPart.h.

32{
33 auto last_dim = input_shape.DimensionsCount() - 1;
34
35 T batch_num = 1;
36 for (int dim = 0; dim < input_shape.DimensionsCount() - 2; dim++)
37 {
38 batch_num *= input_shape.Dims(dim);
39 }
40
41 const T row_num = input_shape.Dims(last_dim - 1);
42 const T col_num = input_shape.Dims(last_dim);
43
44 if (!(num_lower_diags <= row_num))
45 throw std::runtime_error(
46 "MatrixBandPart : num_lower must be negative or less or equal to number of rows");
47
48 if (!(num_upper_diags <= col_num))
49 throw std::runtime_error(
50 "MatrixBandPart : num_upper must be negative or less or equal to number of columns");
51
52 std::fill(output_data, output_data + output_shape.FlatSize(), 0); // output matrix init
53
54 // reference code, without multithreading
55 for (T batch = 0; batch < batch_num; ++batch)
56 {
57 for (T row = 0; row < row_num; ++row)
58 {
59 auto output = output_data + (batch * row_num * col_num + row * col_num);
60 auto input = input_data + (batch * row_num * col_num + row * col_num);
61
62 const T band_start =
63 num_lower_diags < 0 ? 0 : std::min(col_num, std::max(T{0}, row - num_lower_diags));
64 const T band_end = num_upper_diags < 0
65 ? col_num
66 : std::min(static_cast<T>(col_num), row + num_upper_diags + 1);
67
68 for (T band_idx = band_start; band_idx < band_end; band_idx++)
69 {
70 output[band_idx] = input[band_idx];
71 }
72 }
73 }
74}

References nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), and output_shape.
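
A hedged usage sketch: num_lower_diags and num_upper_diags bound how many sub- and super-diagonals are kept (a negative value means unbounded), and everything outside the band is zeroed. The include path and the initializer-list Shape constructor are assumptions.

#include <cker/operation/MatrixBandPart.h> // assumed include path

#include <cstdio>

int main()
{
  using nnfw::cker::Shape;

  const Shape shape{3, 3}; // one 3x3 matrix; initializer-list ctor assumed
  const float in[9] = {1, 2, 3,
                       4, 5, 6,
                       7, 8, 9};
  float out[9] = {};

  // Keep one sub-diagonal and no super-diagonals.
  nnfw::cker::MatrixBandPart<int>(/*num_lower_diags=*/1, /*num_upper_diags=*/0, shape, in, shape,
                                  out);

  for (int r = 0; r < 3; ++r)
    std::printf("%g %g %g\n", out[r * 3], out[r * 3 + 1], out[r * 3 + 2]);
  // 1 0 0
  // 4 5 0
  // 0 8 9
  return 0;
}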

◆ MatrixBatchVectorMultiplyAccumulate() [1/3]

void nnfw::cker::MatrixBatchVectorMultiplyAccumulate ( const float *  matrix,
int  m_rows,
int  m_cols,
const float *  vector,
int  n_batch,
float *  result,
int  result_stride 
)
inline

Definition at line 136 of file TensorUtils.h.

139{
140 NEON_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols, vector, n_batch,
141 result, result_stride);
142}

References MatrixBatchVectorMultiplyAccumulate(), and NEON_OR_PORTABLE.
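
A sketch of the accumulate semantics (row-major matrix, one result slot per row and batch, updated in place); the include path is an assumption, and the result buffer must already contain the values to accumulate into.

#include <cker/TensorUtils.h> // assumed include path

#include <cstdio>

int main()
{
  // 2x3 matrix (row-major), one batch of a 3-vector; result is accumulated in place.
  const float matrix[6] = {1, 2, 3,
                           4, 5, 6};
  const float vector[3] = {1, 1, 1};
  float result[2] = {10, 20}; // existing contents are kept and added to

  nnfw::cker::MatrixBatchVectorMultiplyAccumulate(matrix, /*m_rows=*/2, /*m_cols=*/3, vector,
                                                  /*n_batch=*/1, result, /*result_stride=*/1);

  std::printf("%g %g\n", result[0], result[1]); // 16 35
  return 0;
}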

◆ MatrixBatchVectorMultiplyAccumulate() [2/3]

void nnfw::cker::MatrixBatchVectorMultiplyAccumulate ( const int8_t *  matrix,
const int  m_rows,
const int  m_cols,
const int8_t *  vector,
const float *  scaling_factors,
int  n_batch,
float *  result,
int  result_stride 
)
inline

Definition at line 127 of file TensorUtils.h.

131{
132 NEON_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols, vector,
133 scaling_factors, n_batch, result, result_stride);
134}

References MatrixBatchVectorMultiplyAccumulate(), and NEON_OR_PORTABLE.

Referenced by CalculateLstmGateFloat(), CalculateLstmOutputFloat(), FullyConnected(), FullyConnectedHybrid(), MatrixBatchVectorMultiplyAccumulate(), MatrixBatchVectorMultiplyAccumulate(), and MatrixBatchVectorMultiplyAccumulate().

◆ MatrixBatchVectorMultiplyAccumulate() [3/3]

void nnfw::cker::MatrixBatchVectorMultiplyAccumulate ( const int8_t *  matrix,
const int  m_rows,
const int  m_cols,
const int8_t *  vectors,
const float *  scaling_factors,
int  n_batch,
int32_t *  scratch,
float *  result,
int  result_stride,
ruy::Context *  ruy_context 
)
inline

Definition at line 144 of file TensorUtils.h.

149{
150 NEON_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols, vectors,
151 scaling_factors, n_batch, scratch, result, result_stride, ruy_context);
152}

References MatrixBatchVectorMultiplyAccumulate(), and NEON_OR_PORTABLE.

◆ Max()

template<typename T >
void nnfw::cker::Max ( const Shape unextended_input1_shape,
const T *  input1_data,
const Shape unextended_input2_shape,
const T *  input2_data,
const Shape unextended_output_shape,
T *  output_data 
)
inline

Definition at line 82 of file MaxMin.h.

85{
86 MaximumMinimumBroadcast4DSlow<T>(unextended_input1_shape, input1_data, unextended_input2_shape,
87 input2_data, unextended_output_shape, output_data,
88 MaximumOp::template op<T>);
89}

◆ MaximumMinimumBroadcast4DSlow()

template<typename T , typename Op >
void nnfw::cker::MaximumMinimumBroadcast4DSlow ( const Shape unextended_input1_shape,
const T *  input1_data,
const Shape unextended_input2_shape,
const T *  input2_data,
const Shape unextended_output_shape,
T *  output_data,
Op  op 
)
inline

Definition at line 47 of file MaxMin.h.

50{
51 assert(unextended_input1_shape.DimensionsCount() <= 4);
52 assert(unextended_input2_shape.DimensionsCount() <= 4);
53 assert(unextended_output_shape.DimensionsCount() <= 4);
54 const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape);
55
56 NdArrayDesc<4> desc1;
57 NdArrayDesc<4> desc2;
58 NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, unextended_input2_shape, &desc1,
59 &desc2);
60
61 for (int b = 0; b < output_shape.Dims(0); ++b)
62 {
63 for (int y = 0; y < output_shape.Dims(1); ++y)
64 {
65 for (int x = 0; x < output_shape.Dims(2); ++x)
66 {
67 for (int c = 0; c < output_shape.Dims(3); ++c)
68 {
69 auto out_idx = Offset(output_shape, b, y, x, c);
70 auto in1_idx = SubscriptToIndex(desc1, b, y, x, c);
71 auto in2_idx = SubscriptToIndex(desc2, b, y, x, c);
72 auto in1_val = input1_data[in1_idx];
73 auto in2_val = input2_data[in2_idx];
74 output_data[out_idx] = op(in1_val, in2_val);
75 }
76 }
77 }
78 }
79}

References desc1, desc2, nnfw::cker::Shape::DimensionsCount(), NdArrayDescsForElementwiseBroadcast(), Offset(), output_shape, and SubscriptToIndex().

◆ MaxPool()

template<typename T >
void nnfw::cker::MaxPool ( const PoolParams ,
const Shape ,
const T *  ,
const Shape ,
T *   
)

Definition at line 34 of file MaxPool.h.

35{
36 static_assert(std::is_integral<T>::value || std::is_floating_point<T>::value,
37 "cker::MaxPool : This function supports only integer or floating point");
38 throw std::runtime_error("cker::MaxPool : Unsupported data type");
39}

◆ MaxPool< float >()

template<>
void nnfw::cker::MaxPool< float > ( const PoolParams params,
const Shape input_shape,
const float *  input_data,
const Shape output_shape,
float *  output_data 
)

Definition at line 42 of file MaxPool.h.

44{
45 assert(input_shape.DimensionsCount() == 4);
46 assert(output_shape.DimensionsCount() == 4);
47 const int batches = MatchingDim(input_shape, 0, output_shape, 0);
48 const int input_height = input_shape.Dims(1);
49 const int input_width = input_shape.Dims(2);
50 const int output_height = output_shape.Dims(1);
51 const int output_width = output_shape.Dims(2);
52 const int stride_height = params.stride_height;
53 const int stride_width = params.stride_width;
54
55 const auto in_mat = MapAsMatrixWithLastDimAsRows(input_data, input_shape);
56 auto out_mat = MapAsMatrixWithLastDimAsRows(output_data, output_shape);
57 // Prefill the output to minimum representable float value
58 out_mat.setConstant(std::numeric_limits<float>::lowest());
59 for (int b = 0; b < batches; ++b)
60 {
61 for (int h = 0; h < input_height; ++h)
62 {
63 for (int w = 0; w < input_width; ++w)
64 {
65 // (h_start, h_end) * (w_start, w_end) is the range that the input
66 // vector projects to.
67 int hpad = h + params.padding_values.height;
68 int wpad = w + params.padding_values.width;
69 int h_start =
70 (hpad < params.filter_height) ? 0 : (hpad - params.filter_height) / stride_height + 1;
71 int h_end = std::min(hpad / stride_height + 1, output_height);
72 int w_start =
73 (wpad < params.filter_width) ? 0 : (wpad - params.filter_width) / stride_width + 1;
74 int w_end = std::min(wpad / stride_width + 1, output_width);
75 // take the elementwise max
76 for (int ph = h_start; ph < h_end; ++ph)
77 {
78 for (int pw = w_start; pw < w_end; ++pw)
79 {
80 int out_offset = NodeOffset(b, ph, pw, output_height, output_width);
81 out_mat.col(out_offset) =
82 out_mat.col(out_offset)
83 .cwiseMax(in_mat.col(NodeOffset(b, h, w, input_height, input_width)));
84 }
85 }
86 }
87 }
88 }
89 const int flat_size = output_shape.FlatSize();
90 for (int i = 0; i < flat_size; ++i)
91 {
92 output_data[i] = ActivationFunctionWithMinMax(output_data[i], params.float_activation_min,
93 params.float_activation_max);
94 }
95}

References ActivationFunctionWithMinMax(), nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), nnfw::cker::PoolParams::filter_height, nnfw::cker::PoolParams::filter_width, nnfw::cker::PoolParams::float_activation_max, nnfw::cker::PoolParams::float_activation_min, nnfw::cker::PaddingValues::height, MapAsMatrixWithLastDimAsRows(), MatchingDim(), NodeOffset(), output_shape, nnfw::cker::PoolParams::padding_values, nnfw::cker::PoolParams::stride_height, nnfw::cker::PoolParams::stride_width, and nnfw::cker::PaddingValues::width.
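
A hedged end-to-end sketch: a 2x2 window with stride 1 and no padding over a 1x2x2x1 NHWC tensor. The field names come from the References list above; the include path, the initializer-list Shape constructor, and zero-initializing PoolParams are assumptions.

#include <cker/operation/MaxPool.h> // assumed include path

#include <cstdio>
#include <limits>

int main()
{
  using namespace nnfw::cker;

  PoolParams params{}; // zero-initialized; only the fields below are set
  params.stride_height = 1;
  params.stride_width = 1;
  params.filter_height = 2;
  params.filter_width = 2;
  params.padding_values.height = 0;
  params.padding_values.width = 0;
  params.float_activation_min = std::numeric_limits<float>::lowest(); // no clamping
  params.float_activation_max = std::numeric_limits<float>::max();

  const Shape in_shape{1, 2, 2, 1}; // NHWC; initializer-list ctor assumed
  const Shape out_shape{1, 1, 1, 1};
  const float in[4] = {1.f, 4.f, 3.f, 2.f};
  float out[1] = {};

  MaxPool<float>(params, in_shape, in, out_shape, out);
  std::printf("%g\n", out[0]); // 4
  return 0;
}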

◆ MaxPool< uint8_t >()

template<>
void nnfw::cker::MaxPool< uint8_t > ( const PoolParams params,
const Shape input_shape,
const uint8_t *  input_data,
const Shape output_shape,
uint8_t *  output_data 
)

Definition at line 98 of file MaxPool.h.

100{
101
102 // Here, and in other pooling ops, in order to maintain locality of reference,
103 // to minimize some recalculations, and to load into NEON vector registers, we
104 // use an inner loop down the depth. Since depths can be large and hence we
105 // would need arbitrarily large temporary storage, we divide the work up into
106 // depth tranches just within the batch loop.
107 static constexpr int kPoolingAccTrancheSize = 256;
108
109 assert(params.quantized_activation_min <= params.quantized_activation_max);
110 assert(input_shape.DimensionsCount() == 4);
111 assert(output_shape.DimensionsCount() == 4);
112 const int batches = MatchingDim(input_shape, 0, output_shape, 0);
113 const int depth = MatchingDim(input_shape, 3, output_shape, 3);
114 const int input_height = input_shape.Dims(1);
115 const int input_width = input_shape.Dims(2);
116 const int output_height = output_shape.Dims(1);
117 const int output_width = output_shape.Dims(2);
118 const int stride_height = params.stride_height;
119 const int stride_width = params.stride_width;
120
121 uint8_t acc[kPoolingAccTrancheSize];
122 for (int batch = 0; batch < batches; ++batch)
123 {
124 // We proceed through the depth in tranches (see comment above). The
125 // depth_base is the depth at the beginning of the tranche. The
126 // tranche_depth is the depth dimension of the tranche.
127 for (int depth_base = 0; depth_base < depth; depth_base += kPoolingAccTrancheSize)
128 {
129 const int tranche_depth = std::min(depth - depth_base, kPoolingAccTrancheSize);
130 for (int out_y = 0; out_y < output_height; ++out_y)
131 {
132 for (int out_x = 0; out_x < output_width; ++out_x)
133 {
134 const int in_x_origin = (out_x * stride_width) - params.padding_values.width;
135 const int in_y_origin = (out_y * stride_height) - params.padding_values.height;
136 const int filter_x_start = std::max(0, -in_x_origin);
137 const int filter_x_end = std::min(params.filter_width, input_width - in_x_origin);
138 const int filter_y_start = std::max(0, -in_y_origin);
139 const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin);
140 memset(acc, 0, tranche_depth * sizeof(acc[0]));
141 const uint8_t *input_ptr =
142 input_data + depth_base +
143 depth * (in_x_origin + input_width * (in_y_origin + input_height * batch));
144 for (int fy = filter_y_start; fy < filter_y_end; fy++)
145 {
146 const uint8_t *input_row_ptr = input_ptr + depth * (fy * input_width + filter_x_start);
147 for (int fx = filter_x_start; fx < filter_x_end; fx++)
148 {
149 const uint8_t *input_channel_ptr = input_row_ptr;
150 int channel = 0;
151#ifdef USE_NEON
152 for (; channel <= tranche_depth - 16; channel += 16)
153 {
154 uint8x16_t acc_reg = vld1q_u8(acc + channel);
155 uint8x16_t input_reg = vld1q_u8(input_channel_ptr);
156 input_channel_ptr += 16;
157 acc_reg = vmaxq_u8(acc_reg, input_reg);
158 vst1q_u8(acc + channel, acc_reg);
159 }
160
161 for (; channel <= tranche_depth - 8; channel += 8)
162 {
163 uint8x8_t acc_reg = vld1_u8(acc + channel);
164 uint8x8_t input_reg = vld1_u8(input_channel_ptr);
165 input_channel_ptr += 8;
166 acc_reg = vmax_u8(acc_reg, input_reg);
167 vst1_u8(acc + channel, acc_reg);
168 }
169#endif
170 for (; channel < tranche_depth; ++channel)
171 {
172 acc[channel] = std::max(acc[channel], *input_channel_ptr++);
173 }
174 input_row_ptr += depth;
175 }
176 }
177 uint8_t *output_ptr = output_data + Offset(output_shape, batch, out_y, out_x, depth_base);
178 int channel = 0;
179#ifdef USE_NEON
180 for (; channel <= tranche_depth - 16; channel += 16)
181 {
182 uint8x16_t a = vld1q_u8(acc + channel);
183 a = vminq_u8(a, vdupq_n_u8(params.quantized_activation_max));
184 a = vmaxq_u8(a, vdupq_n_u8(params.quantized_activation_min));
185 vst1q_u8(output_ptr + channel, a);
186 }
187 for (; channel <= tranche_depth - 8; channel += 8)
188 {
189 uint8x8_t a = vld1_u8(acc + channel);
190 a = vmin_u8(a, vdup_n_u8(params.quantized_activation_max));
191 a = vmax_u8(a, vdup_n_u8(params.quantized_activation_min));
192 vst1_u8(output_ptr + channel, a);
193 }
194#endif
195 for (; channel < tranche_depth; ++channel)
196 {
197 uint8_t a = acc[channel];
198 a = std::max<uint8_t>(a, params.quantized_activation_min);
199 a = std::min<uint8_t>(a, params.quantized_activation_max);
200 output_ptr[channel] = static_cast<uint8_t>(a);
201 }
202 }
203 }
204 }
205 }
206}

References nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), nnfw::cker::PoolParams::filter_height, nnfw::cker::PoolParams::filter_width, nnfw::cker::PaddingValues::height, MatchingDim(), Offset(), output_shape, nnfw::cker::PoolParams::padding_values, nnfw::cker::PoolParams::quantized_activation_max, nnfw::cker::PoolParams::quantized_activation_min, nnfw::cker::PoolParams::stride_height, nnfw::cker::PoolParams::stride_width, and nnfw::cker::PaddingValues::width.

◆ Mean()

template<typename In , typename Out >
void nnfw::cker::Mean ( const Shape input_shape,
const In *  input_data,
const Shape output_shape,
Out *  output_data,
const std::vector< int > &  axes 
)

Definition at line 211 of file ReduceMean.h.

213{
214 assert(input_shape.DimensionsCount() > 0);
215 ReduceMean m_obj;
216 m_obj.ReduceOp<In, Out>(input_shape, input_data, output_shape, output_data, axes, true, (Out)0,
217 mean_reducer);
218}

References nnfw::cker::Shape::DimensionsCount(), mean_reducer(), output_shape, and nnfw::cker::ReduceMean::ReduceOp().

Referenced by onert::backend::cpu::ops::MeanLayer::MeanFloat32().
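
A small sketch reducing over the height and width axes; the keep-dims output shape, the include path, and the initializer-list Shape constructor are assumptions, and the default-constructed ReduceMean used internally is assumed to need no extra preparation.

#include <cker/operation/ReduceMean.h> // assumed include path

#include <cstdio>
#include <vector>

int main()
{
  using nnfw::cker::Shape;

  const Shape in_shape{1, 2, 2, 1};  // initializer-list ctor assumed
  const Shape out_shape{1, 1, 1, 1}; // axes 1 and 2 reduced, dims kept
  const float in[4] = {1.f, 2.f, 3.f, 4.f};
  float out[1] = {};

  nnfw::cker::Mean<float, float>(in_shape, in, out_shape, out, std::vector<int>{1, 2});
  std::printf("%g\n", out[0]); // 2.5
  return 0;
}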

◆ mean_reducer()

template<typename Out , typename In >
Out nnfw::cker::mean_reducer ( const Out  data1,
const In  data2,
int  normalizer 
)

Definition at line 41 of file ReduceMean.h.

42{
43 return data1 + static_cast<Out>(data2) / normalizer;
44}

Referenced by Mean().

◆ MeanAxis1And2()

template<typename In , typename Out >
void nnfw::cker::MeanAxis1And2 ( const Shape input_shape,
const In *  input_data,
const Shape output_shape,
Out *  output_data 
)

Definition at line 233 of file ReduceMean.h.

235{
236 assert(input_shape.DimensionsCount() == 4);
237 assert(output_shape.DimensionsCount() == 4);
238
239 const int output_batch = output_shape.Dims(0);
240 const int output_depth = output_shape.Dims(3);
241
242 const int input_height = input_shape.Dims(1);
243 const int input_width = input_shape.Dims(2);
244
245 for (int out_b = 0; out_b < output_batch; ++out_b)
246 {
247 for (int out_d = 0; out_d < output_depth; ++out_d)
248 {
249 float value = 0;
250 for (int in_h = 0; in_h < input_height; ++in_h)
251 {
252 for (int in_w = 0; in_w < input_width; ++in_w)
253 {
254 value += input_data[Offset(input_shape, out_b, in_h, in_w, out_d)];
255 }
256 }
257 output_data[Offset(output_shape, out_b, 0, 0, out_d)] = value / (input_width * input_height);
258 }
259 }
260}

References nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), Offset(), and output_shape.

Referenced by onert::backend::cpu::ops::MeanLayer::MeanFloat32().

◆ MeanQ8Asymm()

template<typename In , typename Out >
void nnfw::cker::MeanQ8Asymm ( const Shape input_shape,
const In *  input_data,
float  input_scale,
int32_t  input_offset,
const Shape output_shape,
Out *  output_data,
float  output_scale,
int32_t  output_offset,
const std::vector< int > &  axes 
)

Definition at line 221 of file ReduceMean.h.

224{
225 assert(input_shape.DimensionsCount() > 0);
226 ReduceMean m_obj;
227 m_obj.ReduceOp<In, Out>(input_shape, input_data, input_scale, input_offset, output_shape,
228 output_data, output_scale, output_offset, axes, true, (Out)0,
229 sum_reducer);
230}

References nnfw::cker::Shape::DimensionsCount(), output_shape, nnfw::cker::ReduceMean::ReduceOp(), and sum_reducer().

Referenced by onert::backend::cpu::ops::MeanLayer::MeanQuant8().

◆ MeanStddevNormalization()

void nnfw::cker::MeanStddevNormalization ( const float *  input_vector,
float *  output_vector,
int  v_size,
int  n_batch 
)
inline

Definition at line 154 of file TensorUtils.h.

156{
157 PortableMeanStddevNormalization(input_vector, output_vector, v_size, n_batch);
158}

References PortableMeanStddevNormalization().

Referenced by CalculateLstmGateFloat().
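
Each batch row of length v_size is shifted to zero mean and scaled to unit standard deviation (the portable implementation's population variance is assumed). A sketch, with the include path also assumed:

#include <cker/TensorUtils.h> // assumed include path

#include <cstdio>

int main()
{
  // One batch of four values; the output row has zero mean and unit standard deviation.
  const float in[4] = {1.f, 2.f, 3.f, 4.f};
  float out[4] = {};

  nnfw::cker::MeanStddevNormalization(in, out, /*v_size=*/4, /*n_batch=*/1);

  // mean = 2.5, stddev = sqrt(1.25) ~= 1.118, so out ~= {-1.342, -0.447, 0.447, 1.342}
  for (float v : out)
    std::printf("%f ", v);
  return 0;
}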

◆ Min()

template<typename T >
void nnfw::cker::Min ( const Shape unextended_input1_shape,
const T *  input1_data,
const Shape unextended_input2_shape,
const T *  input2_data,
const Shape unextended_output_shape,
T *  output_data 
)
inline

Definition at line 92 of file MaxMin.h.

95{
96 MaximumMinimumBroadcast4DSlow<T>(unextended_input1_shape, input1_data, unextended_input2_shape,
97 input2_data, unextended_output_shape, output_data,
98 MinimumOp::template op<T>);
99}

◆ MultiplyByQuantizedMultiplier()

int32_t nnfw::cker::MultiplyByQuantizedMultiplier ( int32_t  x,
int32_t  quantized_multiplier,
int  shift 
)
inline

Definition at line 96 of file Utils.h.

97{
98 int left_shift = shift > 0 ? shift : 0;
99 int right_shift = shift > 0 ? 0 : -shift;
100 return gemmlowp::RoundingDivideByPOT(
101 gemmlowp::SaturatingRoundingDoublingHighMul(x * (1 << left_shift), quantized_multiplier),
102 right_shift);
103}

Referenced by nnfw::cker::reference::Conv(), nnfw::cker::reference::Conv(), nnfw::cker::optimized::depthwise_conv::DepthwiseConvGeneral(), nnfw::cker::reference_integer_ops::DepthwiseConvPerChannel(), FullyConnected(), nnfw::cker::optimized::MulElementwise(), nnfw::cker::optimized::MulElementwise(), nnfw::cker::optimized::MulSimpleBroadcast(), nnfw::cker::optimized::quant8_mul(), Quantize(), Requantize< int8_t, uint8_t >(), and Requantize< uint8_t, int8_t >().
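
This follows the usual gemmlowp convention: a real scale M is stored as a Q0.31 multiplier in [2^30, 2^31) plus a power-of-two shift, so M = (quantized_multiplier / 2^31) * 2^shift. A worked sketch (the include path is an assumption):

#include <cker/Utils.h> // assumed include path

#include <cstdint>
#include <cstdio>

int main()
{
  // For M = 0.375: 0.375 = 0.75 * 2^-1, so quantized_multiplier = round(0.75 * 2^31), shift = -1.
  const std::int32_t quantized_multiplier = 1610612736; // 0.75 * 2^31
  const int shift = -1;

  const std::int32_t y =
    nnfw::cker::MultiplyByQuantizedMultiplier(100, quantized_multiplier, shift);
  std::printf("%d\n", y); // 38 (100 * 0.375 = 37.5, rounded)
  return 0;
}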

◆ MultiplyByQuantizedMultiplierGreaterThanOne()

int32_t nnfw::cker::MultiplyByQuantizedMultiplierGreaterThanOne ( int32_t  x,
int32_t  quantized_multiplier,
int  left_shift 
)
inline

Definition at line 105 of file Utils.h.

107{
108 return gemmlowp::SaturatingRoundingDoublingHighMul(x * (1 << left_shift), quantized_multiplier);
109}

◆ MultiplyByQuantizedMultiplierSmallerThanOneExp()

int32_t nnfw::cker::MultiplyByQuantizedMultiplierSmallerThanOneExp ( int32_t  x,
int32_t  quantized_multiplier,
int  left_shift 
)
inline

Definition at line 111 of file Utils.h.

114{
115 return gemmlowp::RoundingDivideByPOT(
116 gemmlowp::SaturatingRoundingDoublingHighMul(x, quantized_multiplier), -left_shift);
117}

Referenced by nnfw::cker::optimized::AddElementwise(), nnfw::cker::optimized::AddElementwise(), nnfw::cker::optimized::AddScalarBroadcast(), BroadcastComparison4DSlowWithScaling(), ComparisonWithScaling(), L2NormalizeQuant8(), and nnfw::cker::optimized::quant8_sum().

◆ MultithreadAlongBatches()

bool nnfw::cker::MultithreadAlongBatches ( int  thread_count,
int  batches 
)
inline

Definition at line 95 of file DepthwiseConv.h.

96{
97 assert(thread_count >= 2);
98 // If there are fewer batch entries than the number of threads we want to use,
99 // then better do intra-batch-entry multithreading.
100 if (batches < thread_count)
101 {
102 return false;
103 }
104 // If there are at least 2 batch entries to be handed to each thread, then
105 // it's safe to proceed with batch-wise multithreading: each thread will have
106 // approximately equal number of batch entries to handle, so the load
107 // balancing will be reasonable, and the amount to which the load is not
108 // perfectly balanced will be offset by the inherent advantages of
109 // batch-wise multithreading (each thread is more efficient thanks to working
110 // on larger buffers with less boundary-handling overhead).
111 if (batches >= 2 * thread_count)
112 {
113 return true;
114 }
115 // In the limit case were there are at least 1 but not much more than 1
116 // batch entries per thread, it may be a good idea to do per-batch
117 // multithreading if the number of batch entries is a multiple of the number
118 // of threads, so that each thread will have the same number of batch entries
119 // to process.
120 return ((batches % thread_count) == 0);
121}

Referenced by DepthwiseConv().

◆ NdArrayDescsForElementwiseBroadcast() [1/2]

template<int N>
void nnfw::cker::NdArrayDescsForElementwiseBroadcast ( const Shape input0_shape,
const Shape input1_shape,
const Shape input2_shape,
NdArrayDesc< N > *  desc0_out,
NdArrayDesc< N > *  desc1_out,
NdArrayDesc< N > *  desc2_out 
)
inline

Definition at line 329 of file Utils.h.

332{
333 assert(desc0_out != nullptr);
334 assert(desc1_out != nullptr);
335 assert(desc2_out != nullptr);
336
337 auto extended_input0_shape = Shape::ExtendedShape(N, input0_shape);
338 auto extended_input1_shape = Shape::ExtendedShape(N, input1_shape);
339 auto extended_input2_shape = Shape::ExtendedShape(N, input2_shape);
340
341 // Copy dims to desc, calculating strides.
342 CopyDimsToDesc<N>(extended_input0_shape, desc0_out);
343 CopyDimsToDesc<N>(extended_input1_shape, desc1_out);
344 CopyDimsToDesc<N>(extended_input2_shape, desc2_out);
345
346 // Walk over each dimension. If the extents are equal do nothing.
347 // Otherwise, set the desc with extent 1 to have extent equal to the other and
348 // stride 0.
349 for (int i = 0; i < N; ++i)
350 {
351 const int extent0 = extended_input0_shape.Dims(i);
352 const int extent1 = extended_input1_shape.Dims(i);
353 const int extent2 = extended_input2_shape.Dims(i);
354
355 int extent = extent0;
356 if (extent1 != 1)
357 extent = extent1;
358 if (extent2 != 1)
359 extent = extent2;
360
361 assert(extent0 == 1 || extent0 == extent);
362 assert(extent1 == 1 || extent1 == extent);
363 assert(extent2 == 1 || extent2 == extent);
364
365 if (!(extent0 == extent1 && extent1 == extent2))
366 {
367 if (extent0 == 1)
368 {
369 desc0_out->strides[i] = 0;
370 desc0_out->extents[i] = extent;
371 }
372 if (extent1 == 1)
373 {
374 desc1_out->strides[i] = 0;
375 desc1_out->extents[i] = extent;
376 }
377 if (extent2 == 1)
378 {
379 desc2_out->strides[i] = 0;
380 desc2_out->extents[i] = extent;
381 }
382 }
383 }
384}

References nnfw::cker::NdArrayDesc< N >::extents, and nnfw::cker::NdArrayDesc< N >::strides.

◆ NdArrayDescsForElementwiseBroadcast() [2/2]

template<int N>
void nnfw::cker::NdArrayDescsForElementwiseBroadcast ( const Shape input0_shape,
const Shape input1_shape,
NdArrayDesc< N > *  desc0_out,
NdArrayDesc< N > *  desc1_out 
)
inline

Definition at line 290 of file Utils.h.

292{
293 assert(desc0_out != nullptr);
294 assert(desc1_out != nullptr);
295
296 auto extended_input0_shape = Shape::ExtendedShape(N, input0_shape);
297 auto extended_input1_shape = Shape::ExtendedShape(N, input1_shape);
298
299 // Copy dims to desc, calculating strides.
300 CopyDimsToDesc<N>(extended_input0_shape, desc0_out);
301 CopyDimsToDesc<N>(extended_input1_shape, desc1_out);
302
303 // Walk over each dimension. If the extents are equal do nothing.
304 // Otherwise, set the desc with extent 1 to have extent equal to the other and
305 // stride 0.
306 for (int i = 0; i < N; ++i)
307 {
308 const int extent0 = extended_input0_shape.Dims(i);
309 const int extent1 = extended_input1_shape.Dims(i);
310 if (extent0 != extent1)
311 {
312 if (extent0 == 1)
313 {
314 desc0_out->strides[i] = 0;
315 desc0_out->extents[i] = extent1;
316 }
317 else
318 {
319 assert(extent1 == 1);
320 desc1_out->strides[i] = 0;
321 desc1_out->extents[i] = extent0;
322 }
323 }
324 }
325}

References nnfw::cker::NdArrayDesc< N >::extents, and nnfw::cker::NdArrayDesc< N >::strides.

Referenced by nnfw::cker::reference::BroadcastBinaryArithmeticOpSlow(), nnfw::cker::reference::BroadcastBinaryArithmeticOpSlow(), nnfw::cker::reference::BroadcastBinaryArithmeticOpSlow(), nnfw::cker::reference::BroadcastBinaryArithmeticOpSlow(), nnfw::cker::reference::BroadcastBinaryArithmeticOpSlow(), BroadcastComparison4DSlowImpl(), BroadcastComparison4DSlowWithScaling(), BroadcastSelect4DSlow(), FloorDivBroadcast(), FloorModBroadcast(), LogicalAndBroadcast(), LogicalOrBroadcast(), and MaximumMinimumBroadcast4DSlow().
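
A sketch of how the descriptors drive broadcasting: a dimension of extent 1 gets stride 0, so SubscriptToIndex() keeps returning the same element along that axis. The include path and the initializer-list Shape constructor are assumptions.

#include <cker/Utils.h> // assumed include path

#include <cstdio>

int main()
{
  using namespace nnfw::cker;

  // Broadcast a [2,1] operand against a [1,3] operand (both written as 4-D shapes here).
  const Shape a_shape{1, 1, 2, 1}; // initializer-list ctor assumed
  const Shape b_shape{1, 1, 1, 3};

  NdArrayDesc<4> desc_a;
  NdArrayDesc<4> desc_b;
  NdArrayDescsForElementwiseBroadcast(a_shape, b_shape, &desc_a, &desc_b);

  // desc_a now has stride 0 on the last axis and desc_b has stride 0 on axis 2,
  // so a's index depends only on y and b's index only on x.
  for (int y = 0; y < 2; ++y)
    for (int x = 0; x < 3; ++x)
      std::printf("a[%d] b[%d]\n", SubscriptToIndex(desc_a, 0, 0, y, x),
                  SubscriptToIndex(desc_b, 0, 0, y, x));
  return 0;
}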

◆ Neg()

template<typename T >
void nnfw::cker::Neg ( const Shape input_shape,
const T *  input_data,
const Shape output_shape,
T *  output_data 
)
inline

Definition at line 70 of file Elementwise.h.

72{
73 const int size = MatchingFlatSize(input_shape, output_shape);
74 for (int i = 0; i < size; i++)
75 {
76 output_data[i] = -input_data[i];
77 }
78}

References MatchingFlatSize(), output_shape, and size.

◆ NextIndex()

bool nnfw::cker::NextIndex ( const int  num_dims,
const int *  dims,
int *  current 
)
inline

Definition at line 387 of file Utils.h.

388{
389 if (num_dims == 0)
390 {
391 return false;
392 }
393 assert(dims != nullptr);
394 assert(current != nullptr);
395 int carry = 1;
396 for (int idx = num_dims - 1; idx >= 0; --idx)
397 {
398 int current_val = current[idx] + carry;
399 assert(dims[idx] >= current_val);
400 if (dims[idx] == current_val)
401 {
402 current[idx] = 0;
403 }
404 else
405 {
406 current[idx] = current_val;
407 carry = 0;
408 break;
409 }
410 }
411 return (carry == 0);
412}

Referenced by ReduceImpl(), ReduceMeanImpl(), ReduceSumQuantImpl(), and SqDiffImpl().
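
A sketch of the odometer-style iteration this helper provides (the include path is an assumption): current is advanced in row-major order and false is returned once the index space wraps around.

#include <cker/Utils.h> // assumed include path

#include <cstdio>

int main()
{
  // Iterate over a [2, 3] index space, starting from {0, 0}.
  const int dims[2] = {2, 3};
  int current[2] = {0, 0};

  do
  {
    std::printf("(%d, %d)\n", current[0], current[1]);
  } while (nnfw::cker::NextIndex(2, dims, current));
  // Visits (0,0) (0,1) (0,2) (1,0) (1,1) (1,2), then returns false.
  return 0;
}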

◆ NodeOffset()

int nnfw::cker::NodeOffset ( int  b,
int  h,
int  w,
int  height,
int  width 
)
inline

Definition at line 147 of file Utils.h.

148{
149 return (b * height + h) * width + w;
150}

Referenced by nnfw::cker::train::AveragePool2DGrad(), AveragePool< float >(), nnfw::cker::train::MaxPool2D(), and MaxPool< float >().

◆ NotEqualFn()

template<typename T >
bool nnfw::cker::NotEqualFn ( T  lhs,
T  rhs 
)
inline

Definition at line 31 of file Comparison.h.

31{ return lhs != rhs; }

◆ Offset() [1/2]

int nnfw::cker::Offset ( const Shape shape,
int *  index 
)
inline

Definition at line 248 of file Shape.h.

249{
250 return Offset(shape, index[0], index[1], index[2], index[3]);
251}

References Offset().

◆ Offset() [2/2]

int nnfw::cker::Offset ( const Shape shape,
int  i0,
int  i1,
int  i2,
int  i3 
)
inline

Definition at line 237 of file Shape.h.

238{
239 assert(shape.DimensionsCount() == 4);
240 const int *dims_data = shape.DimsDataUpTo4D();
241 assert(i0 >= 0 && i0 < dims_data[0]);
242 assert(i1 >= 0 && i1 < dims_data[1]);
243 assert(i2 >= 0 && i2 < dims_data[2]);
244 assert(i3 >= 0 && i3 < dims_data[3]);
245 return ((i0 * dims_data[1] + i1) * dims_data[2] + i2) * dims_data[3] + i3;
246}

References nnfw::cker::Shape::DimensionsCount(), and nnfw::cker::Shape::DimsDataUpTo4D().

Referenced by AveragePool16(), AveragePool32(), AveragePool< int8_t >(), BatchToSpaceND(), nnfw::cker::reference::BroadcastBinaryArithmeticOpSlow(), nnfw::cker::reference::BroadcastBinaryArithmeticOpSlow(), nnfw::cker::reference::BroadcastBinaryArithmeticOpSlow(), nnfw::cker::reference::BroadcastBinaryArithmeticOpSlow(), nnfw::cker::reference::BroadcastBinaryArithmeticOpSlow(), BroadcastComparison4DSlowImpl(), BroadcastComparison4DSlowWithScaling(), BroadcastSelect4DSlow(), nnfw::cker::reference::Conv(), nnfw::cker::reference::Conv(), nnfw::cker::reference::Conv(), DepthToSpace(), nnfw::cker::reference_integer_ops::DepthwiseConvHybridPerChannel(), nnfw::cker::reference_integer_ops::DepthwiseConvPerChannel(), nnfw::cker::optimized::DilatedIm2col(), nnfw::cker::optimized::ExtractPatchIntoBufferColumn(), FloorDivBroadcast(), FloorModBroadcast(), nnfw::cker::reference::HybridConvPerChannel(), InstanceNorm(), LogicalAndBroadcast(), LogicalOrBroadcast(), MaximumMinimumBroadcast4DSlow(), MaxPool< uint8_t >(), MeanAxis1And2(), Offset(), ResizeBilinear(), ResizeBilinearGeneric(), ResizeBilinearGenericSmallChannel(), ResizeBilinearKernel2x2(), RmsNorm(), RoPE(), Slice(), SpaceToBatchND(), SpaceToDepth(), StridedSlice(), TransposeConv(), and nnfw::cker::reference::TransposeImpl().
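
Offset() is plain row-major flattening of a 4-D index. A short sketch (include path and Shape construction assumed):

#include <cker/Shape.h> // assumed include path

#include <cstdio>

int main()
{
  using nnfw::cker::Shape;

  // Row-major NHWC flattening: ((i0*D1 + i1)*D2 + i2)*D3 + i3.
  const Shape shape{2, 3, 4, 5}; // initializer-list ctor assumed
  std::printf("%d\n", nnfw::cker::Offset(shape, 1, 2, 3, 4)); // ((1*3+2)*4+3)*5+4 = 119
  return 0;
}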

◆ OneHot()

template<typename T , typename TI >
void nnfw::cker::OneHot ( const int32_t  depth,
const T  on_value,
const T  off_value,
int32_t  axis,
const Shape indices_shape,
const TI *  indices_data,
const Shape ,
T *  output_data 
)

Definition at line 29 of file OneHot.h.

31{
32 if (axis == -1)
33 axis = indices_shape.DimensionsCount();
34
35 // prefix_dim_size == # of elements before the axis
36 // depth == # of elements per axis
37 // suffix_dim_size == # of elements after the axis
38 int prefix_dim_size = 1;
39 for (int i = 0; i < axis; ++i)
40 {
41 prefix_dim_size *= indices_shape.Dims(i);
42 }
43 const int suffix_dim_size = indices_shape.FlatSize() / prefix_dim_size;
44
45 // View the indices as a matrix of size:
46 // prefix_dim_size x suffix_dim_size
47 // View the output as a matrix of size:
48 // prefix_dim_size x depth x suffix_dim_size
49 // Then the output is:
50 // output(i, j, k) == (indices(i, k) == j) ? on : off
51 for (int i = 0; i < prefix_dim_size; ++i)
52 {
53 for (int j = 0; j < depth; ++j)
54 {
55 for (int k = 0; k < suffix_dim_size; ++k, ++output_data)
56 {
57 *output_data =
58 static_cast<int>(indices_data[i * suffix_dim_size + k]) == j ? on_value : off_value;
59 }
60 }
61 }
62}

References nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), and nnfw::cker::Shape::FlatSize().
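
A sketch with a rank-1 index tensor and axis = -1, so the one-hot dimension is appended last; the include path and the initializer-list Shape constructor are assumptions.

#include <cker/operation/OneHot.h> // assumed include path

#include <cstdint>
#include <cstdio>

int main()
{
  using nnfw::cker::Shape;

  const Shape indices_shape{3}; // initializer-list ctor assumed
  const Shape output_shape{3, 3};
  const std::int32_t indices[3] = {0, 2, 1};
  float out[9] = {};

  nnfw::cker::OneHot<float, std::int32_t>(/*depth=*/3, /*on_value=*/1.f, /*off_value=*/0.f,
                                          /*axis=*/-1, indices_shape, indices, output_shape, out);

  for (int r = 0; r < 3; ++r)
    std::printf("%g %g %g\n", out[r * 3], out[r * 3 + 1], out[r * 3 + 2]);
  // 1 0 0
  // 0 0 1
  // 0 1 0
  return 0;
}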

◆ operator<<()

std::ostream & nnfw::cker::operator<< ( std::ostream &  os,
const Shape shape 
)
inline

Definition at line 486 of file Utils.h.

487{
488 using std::begin;
489 using std::end;
490
491 std::string formatted =
492 std::accumulate(begin(shape), end(shape), std::string{"["},
493 [](std::string joined, ShapeIterator::value_type dim) {
494 return std::move(joined).append(std::to_string(dim)).append(",");
495 });
496
497 if (formatted.back() == '[')
498 {
499 formatted.push_back(']');
500 }
501 else
502 {
503 formatted.back() = ']';
504 }
505
506 os << formatted;
507 return os;
508}

References begin, and end().

◆ optimized_ops_preload_l1_keep()

template<typename T >
void nnfw::cker::optimized_ops_preload_l1_keep ( const T *  ptr)

Definition at line 455 of file Utils.h.

456{
457#ifdef __GNUC__
458 // builtin offered by GCC-compatible compilers including clang
459 __builtin_prefetch(ptr, /* 0 means read */ 0, /* 3 means high locality */ 3);
460#else
461 (void)ptr;
462#endif
463}

Referenced by Transpose2D().

◆ Pack()

template<typename Scalar >
void nnfw::cker::Pack ( const PackParams params,
const Scalar *const *  input_data,
const Shape output_shape,
Scalar *  output_data 
)
inline

Definition at line 30 of file Pack.h.

32{
33 const int dimensions = output_shape.DimensionsCount();
34 int axis = params.axis;
35 int inputs_count = params.inputs_count;
36
37 int outer_size = 1;
38 for (int i = 0; i < axis; i++)
39 {
40 outer_size *= output_shape.Dims(i);
41 }
42 int copy_size = 1;
43 for (int i = params.axis + 1; i < dimensions; i++)
44 {
45 copy_size *= output_shape.Dims(i);
46 }
47
48 for (int i = 0; i < inputs_count; ++i)
49 {
50 for (int k = 0; k < outer_size; k++)
51 {
52 const Scalar *input_ptr = input_data[i] + copy_size * k;
53 int loc = k * inputs_count * copy_size + i * copy_size;
54 memcpy(output_data + loc, input_ptr, copy_size * sizeof(Scalar));
55 }
56 }
57}

References nnfw::cker::PackParams::axis, nnfw::cker::PackParams::inputs_count, and output_shape.

◆ Pad()

template<typename T >
void nnfw::cker::Pad ( const int32_t *  padding_data,
int32_t  pad_rank,
const Shape input_shape,
const T *  input_data,
const Shape output_shape,
T *  output_data,
const T *  constant_value_data 
)
inline

List of padding information

Definition at line 30 of file Pad.h.

33{
34 // Note, this is pad with mode=`CONSTANT`: it doesn't support `REFLECT` and `SYMMETRIC`
35 // TODO: come up with more subtle solution that uses subtensors like arm compute
36 // TODO: Check if it works for all layouts
37
38 using PaddingInfo = std::pair<int32_t, int32_t>;
40 using PaddingList = std::vector<PaddingInfo>;
41
42 const T constant_value = constant_value_data ? *constant_value_data : 0;
43 assert(output_shape.DimensionsCount() == input_shape.DimensionsCount());
44
45 PaddingList padding_list(pad_rank);
46 for (int32_t n = 0; n < pad_rank; ++n)
47 {
48 const int32_t *from = padding_data + (n * 2);
49 padding_list[n] = {from[0], from[1]};
50 }
51 for (int32_t i = 0; i < pad_rank; ++i)
52 {
53 assert(output_shape.Dims(i) ==
54 input_shape.Dims(i) + padding_list[i].first + padding_list[i].second);
55 }
56 /* Use pad_rank since given input/output shapes are expanded to 4d before calling all cker
57 functions:
58 1. to prevent access violation in padding_list;
59 2. handling as 4d is slower than as 2d/3d.
60 */
61 switch (pad_rank)
62 {
63 case 0:
64 case 1:
65 {
66 const int32_t in_row_len = input_shape.Dims(0);
67 std::fill_n(output_data, padding_list[0].first, constant_value);
68 std::memcpy(output_data + padding_list[0].first, input_data, in_row_len * sizeof(T));
69 std::fill_n(output_data + padding_list[0].first + in_row_len, padding_list[0].second,
70 constant_value);
71 break;
72 }
73 case 2: // HW
74 {
75 const int32_t in_row_len = input_shape.Dims(1);
76 const int32_t out_row_size = output_shape.Dims(1);
77
78 // prepend padding rows
79 std::fill_n(output_data, padding_list[0].first * out_row_size, constant_value);
80
81 const auto r_h_inp_lim = input_shape.Dims(0) + padding_list[0].first;
82 for (auto i = padding_list[0].first, j = 0; i < r_h_inp_lim; ++i, ++j)
83 {
84 auto out_offset = i * out_row_size;
85 const auto in_offset = j * in_row_len;
86
87 // prepend padding values
88 std::fill_n(output_data + out_offset, padding_list[1].first, constant_value);
89
90 out_offset += padding_list[1].first;
91
92 // copy a row of input data
93 memcpy(output_data + out_offset, input_data + in_offset, in_row_len * sizeof(T));
94
95 out_offset += in_row_len;
96
97 // append padding values
98 std::fill_n(output_data + out_offset, padding_list[1].second, constant_value);
99 }
100
101 // append padding rows
102 std::fill_n(output_data + r_h_inp_lim * out_row_size, padding_list[0].second * out_row_size,
103 constant_value);
104 break;
105 }
106 case 3: // HWC
107 {
108 const int32_t in_row_len = input_shape.Dims(2);
109 const int32_t out_row_size = output_shape.Dims(2);
110 const auto plain_size = out_row_size * output_shape.Dims(1);
111
112 // prepend padding plains
113 std::fill_n(output_data, padding_list[0].first * plain_size, constant_value);
114
115 const auto r_h_inp_lim = input_shape.Dims(0) + padding_list[0].first;
116 for (auto i = padding_list[0].first, i_inp = 0; i < r_h_inp_lim; ++i, ++i_inp)
117 {
118 const auto out_w_offset = (i * output_shape.Dims(1) + 0) * output_shape.Dims(2);
119
120 // prepend padding rows
121 std::fill_n(output_data + out_w_offset, padding_list[1].first * out_row_size,
122 constant_value);
123
124 const auto r_w_inp_lim = input_shape.Dims(1) + padding_list[1].first;
125 for (auto j = padding_list[1].first, j_inp = 0; j < r_w_inp_lim; ++j, ++j_inp)
126 {
127 auto out_offset = (i * output_shape.Dims(1) + j) * output_shape.Dims(2);
128 const auto in_offset = (i_inp * input_shape.Dims(1) + j_inp) * input_shape.Dims(2);
129
130 // prepend padding values
131 std::fill_n(output_data + out_offset, padding_list[2].first, constant_value);
132
133 out_offset += padding_list[2].first;
134
135 // copy a row of input data
136 memcpy(output_data + out_offset, input_data + in_offset, in_row_len * sizeof(T));
137
138 out_offset += in_row_len;
139
140 // append padding values
141 std::fill_n(output_data + out_offset, padding_list[2].second, constant_value);
142 }
143
144 // append padding rows
145 std::fill_n(output_data + out_w_offset + r_w_inp_lim * out_row_size,
146 padding_list[1].second * out_row_size, constant_value);
147 }
148
149 // append padding plains
150 std::fill_n(output_data + r_h_inp_lim * plain_size, padding_list[0].second * plain_size,
151 constant_value);
152 break;
153 }
154 case 4:
155 {
156 auto get_offset = [](const Shape &shape, int32_t n, int32_t h, int32_t w) -> int32_t {
157 return ((n * shape.Dims(1) + h) * shape.Dims(2) + w) * shape.Dims(3);
158 };
159 const int32_t in_row_len = input_shape.Dims(3);
160 const int32_t out_row_size = output_shape.Dims(3);
161 const auto plain_size = out_row_size * output_shape.Dims(2);
162 const auto parallelepiped_size = plain_size * output_shape.Dims(1);
163
164 // prepend padding parallelepipeds
165 std::fill_n(output_data, padding_list[0].first * parallelepiped_size, constant_value);
166
167 const auto r_b_inp_lim = input_shape.Dims(0) + padding_list[0].first;
168 for (auto i = padding_list[0].first, i_inp = 0; i < r_b_inp_lim; ++i, ++i_inp)
169 {
170 const auto out_h_offset = get_offset(output_shape, i, 0, 0);
171 // prepend padding plains
172 std::fill_n(output_data + out_h_offset, padding_list[1].first * plain_size, constant_value);
173
174 const auto r_h_inp_lim = input_shape.Dims(1) + padding_list[1].first;
175 for (auto j = padding_list[1].first, j_inp = 0; j < r_h_inp_lim; ++j, ++j_inp)
176 {
177 const auto out_w_offset = get_offset(output_shape, i, j, 0);
178
179 // prepend padding rows
180 std::fill_n(output_data + out_w_offset, padding_list[2].first * out_row_size,
181 constant_value);
182
183 const auto r_w_inp_lim = input_shape.Dims(2) + padding_list[2].first;
184 for (auto k = padding_list[2].first, k_inp = 0; k < r_w_inp_lim; ++k, ++k_inp)
185 {
186 auto out_c_offset = get_offset(output_shape, i, j, k);
187 const auto in_offset = get_offset(input_shape, i_inp, j_inp, k_inp);
188
189 // prepend padding values
190 std::fill_n(output_data + out_c_offset, padding_list[3].first, constant_value);
191
192 out_c_offset += padding_list[3].first;
193
194 // copy a row of input data
195 memcpy(output_data + out_c_offset, input_data + in_offset, in_row_len * sizeof(T));
196
197 out_c_offset += in_row_len;
198
199 // append padding values
200 std::fill_n(output_data + out_c_offset, padding_list[3].second, constant_value);
201 }
202
203 // append padding rows
204 std::fill_n(output_data + out_w_offset + r_w_inp_lim * out_row_size,
205 padding_list[2].second * out_row_size, constant_value);
206 }
207
208 // append padding plains
209 std::fill_n(output_data + out_h_offset + r_h_inp_lim * plain_size,
210 padding_list[1].second * plain_size, constant_value);
211 }
212 // append padding parallelepipeds
213 std::fill_n(output_data + r_b_inp_lim * parallelepiped_size,
214 padding_list[0].second * parallelepiped_size, constant_value);
215 break;
216 }
217 default:
218 throw std::runtime_error("Padding for rank > 4 NYI");
219 break;
220 }
221}

References nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), and output_shape.
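
A rank-2 (HW) sketch that surrounds a 2x2 matrix with one row and one column of the constant value on every side; passing nullptr for constant_value_data pads with zero. The include path and the initializer-list Shape constructor are assumptions.

#include <cker/operation/Pad.h> // assumed include path

#include <cstdint>
#include <cstdio>

int main()
{
  using nnfw::cker::Shape;

  const std::int32_t padding[4] = {1, 1,  // before/after dim 0
                                   1, 1}; // before/after dim 1
  const Shape in_shape{2, 2}; // initializer-list ctor assumed
  const Shape out_shape{4, 4};
  const float in[4] = {1.f, 2.f, 3.f, 4.f};
  float out[16] = {};

  nnfw::cker::Pad<float>(padding, /*pad_rank=*/2, in_shape, in, out_shape, out,
                         /*constant_value_data=*/nullptr); // nullptr -> pad with 0

  for (int r = 0; r < 4; ++r)
    std::printf("%g %g %g %g\n", out[r * 4], out[r * 4 + 1], out[r * 4 + 2], out[r * 4 + 3]);
  // 0 0 0 0
  // 0 1 2 0
  // 0 3 4 0
  // 0 0 0 0
  return 0;
}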

◆ PopulateSoftmaxLookupTable()

void nnfw::cker::PopulateSoftmaxLookupTable ( float *  table,
float  input_scale,
float  beta 
)
inline

Definition at line 148 of file SoftMax.h.

149{
150 const float scale = -input_scale * beta;
151 const int32_t max_uint8 = std::numeric_limits<uint8_t>::max();
152 for (int32_t val = 0; val <= max_uint8; ++val)
153 {
154 table[max_uint8 - val] = expf(scale * val);
155 }
156}

Referenced by onert::backend::cpu::ops::SoftMaxLayer::configure().
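
A minimal usage sketch (not part of the library sources; the include path is an assumption, since this page only names the file SoftMax.h). The table holds one entry per possible uint8 input, with table[255 - v] = exp(-input_scale * beta * v), and can be reused across invocations that share the same scale and beta:

#include <cker/operation/SoftMax.h> // assumed include path

void PrepareSoftmaxTable(float input_scale, float beta)
{
  static float table[256]; // one entry per possible uint8 input value
  nnfw::cker::PopulateSoftmaxLookupTable(table, input_scale, beta);
  // The filled table can now be passed to the quantized softmax routines that consume it.
}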

◆ PortableApplyActivationToVector()

void nnfw::cker::PortableApplyActivationToVector ( const float *  vector,
int  v_size,
FusedActivationFunctionType  activation,
float *  result 
)
inline

Definition at line 103 of file PortableTensorUtils.h.

105{
106 auto activation_func = ActivationFunctor(activation);
107 for (int v = 0; v < v_size; v++)
108 {
109 *result++ = (activation_func)(*vector++);
110 }
111}

Referenced by ApplyActivationToVector().

◆ PortableAsymmetricQuantizeFloats()

void nnfw::cker::PortableAsymmetricQuantizeFloats ( const float *  values,
const int  size,
int8_t *  quantized_values,
float *  scaling_factor,
int32_t *  offset 
)
inline

Definition at line 147 of file PortableTensorUtils.h.

150{
151 /* Copied from TensorFlow PortableAsymmetricQuantizeFloats */
152 const int32_t kMinScale = -128;
153 const int32_t kMaxScale = 127;
154 const double qmin_double = kMinScale;
155 const double qmax_double = kMaxScale;
156 const auto minmax = std::minmax_element(values, values + size);
157 const double rmin = static_cast<double>(std::min(0.0f, *minmax.first));
158 const double rmax = static_cast<double>(std::max(0.0f, *minmax.second));
159 if (rmin == rmax)
160 {
161 memset(quantized_values, 0, size * sizeof(int8_t));
162 *scaling_factor = 1;
163 *offset = 0;
164 return;
165 }
166 else
167 {
168 double scale = (rmax - rmin) / (qmax_double - qmin_double);
169 const double zero_point_from_min = qmin_double - rmin / scale;
170 const double zero_point_from_max = qmax_double - rmax / scale;
171 const double zero_point_from_min_error = std::abs(qmin_double) + std::abs(rmin / scale);
172 const double zero_point_from_max_error = std::abs(qmax_double) + std::abs(rmax / scale);
173 const double zero_point_double = zero_point_from_min_error < zero_point_from_max_error
174 ? zero_point_from_min
175 : zero_point_from_max;
176 int8_t nudged_zero_point = 0;
177 if (zero_point_double <= qmin_double)
178 {
179 nudged_zero_point = kMinScale;
180 }
181 else if (zero_point_double >= qmax_double)
182 {
183 nudged_zero_point = kMaxScale;
184 }
185 else
186 {
187 nudged_zero_point = static_cast<int8_t>(round(zero_point_double));
188 }
189 *scaling_factor = scale;
190 *offset = nudged_zero_point;
191 }
192 const float scaling_factor_inv = 1.0f / *scaling_factor;
193 for (int i = 0; i < size; ++i)
194 {
195 const int32_t quantized_value =
196 static_cast<int32_t>(std::round(*offset + values[i] * scaling_factor_inv));
197 quantized_values[i] = std::min(kMaxScale, std::max(kMinScale, quantized_value));
198 }
199}

References offset(), and size.

Referenced by onert::backend::cpu::ops::DepthwiseConvolutionLayer::convQ8iHybridPerChannel().
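
A minimal usage sketch (the include path is an assumption; this page only names PortableTensorUtils.h). The routine maps the range [min(0, rmin), max(0, rmax)] onto the int8 range [-128, 127] and returns the chosen scale and nudged zero point:

#include <cker/PortableTensorUtils.h> // assumed include path
#include <cstdint>
#include <cstdio>

int main()
{
  const float values[4] = {-1.0f, 0.0f, 0.5f, 2.0f};
  int8_t quantized[4];
  float scaling_factor = 0.0f;
  int32_t offset = 0;
  nnfw::cker::PortableAsymmetricQuantizeFloats(values, 4, quantized, &scaling_factor, &offset);
  // For this input the range [-1, 2] gives scale = 3/255 and a zero point of -43.
  std::printf("scale=%f zero_point=%d\n", scaling_factor, offset);
}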

◆ PortableCwiseClipping()

template<typename T >
void nnfw::cker::PortableCwiseClipping ( T *  vector,
const int  v_size,
const T  clipping_value 
)

Definition at line 63 of file PortableTensorUtils.h.

64{
65 for (int i = 0; i < v_size; i++)
66 {
67 vector[i] = std::max(std::min(clipping_value, vector[i]), static_cast<T>(-clipping_value));
68 }
69}

◆ PortableIsZeroVector()

bool nnfw::cker::PortableIsZeroVector ( const float *  vector,
int  v_size 
)
inline

Definition at line 93 of file PortableTensorUtils.h.

94{
95 for (int i = 0; i < v_size; ++i)
96 {
97 if (*vector++ != 0.0f)
98 return false;
99 }
100 return true;
101}

◆ PortableMatrixBatchVectorMultiplyAccumulate() [1/3]

void nnfw::cker::PortableMatrixBatchVectorMultiplyAccumulate ( const float *  matrix,
int  m_rows,
int  m_cols,
const float *  vector,
int  n_batch,
float *  result,
int  result_stride 
)
inline

Definition at line 242 of file PortableTensorUtils.h.

245{
246 float *result_in_batch = result;
247 for (int b = 0; b < n_batch; b++)
248 {
249 const float *matrix_ptr = matrix;
250 for (int r = 0; r < m_rows; r++)
251 {
252 float dot_prod = 0.0f;
253 const float *vector_in_batch = vector + b * m_cols;
254 for (int c = 0; c < m_cols; c++)
255 {
256 dot_prod += *matrix_ptr++ * *vector_in_batch++;
257 }
258 *result_in_batch += dot_prod;
259 result_in_batch += result_stride;
260 }
261 }
262}
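
A minimal usage sketch (the include path is an assumption). Note that result is accumulated into, so it must be initialized before the call; starting from zero yields a plain matrix-vector product:

#include <cker/PortableTensorUtils.h> // assumed include path

int main()
{
  const float matrix[6] = {1, 2, 3,   // row 0
                           4, 5, 6};  // row 1
  const float vector[3] = {1, 0, -1};
  float result[2] = {0.0f, 0.0f};     // accumulator, pre-initialized to zero
  nnfw::cker::PortableMatrixBatchVectorMultiplyAccumulate(matrix, /*m_rows=*/2, /*m_cols=*/3,
                                                          vector, /*n_batch=*/1, result,
                                                          /*result_stride=*/1);
  // result == {-2.0f, -2.0f}
}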

◆ PortableMatrixBatchVectorMultiplyAccumulate() [2/3]

void nnfw::cker::PortableMatrixBatchVectorMultiplyAccumulate ( const int8_t *__restrict__  matrix,
const int  m_rows,
const int  m_cols,
const int8_t *__restrict__  vector,
const float *  scaling_factors,
int  n_batch,
int32_t *  ,
float *__restrict__  result,
int  result_stride,
ruy::Context *   
)
inline

Definition at line 231 of file PortableTensorUtils.h.

237{
238 PortableMatrixBatchVectorMultiplyAccumulate(matrix, m_rows, m_cols, vector, scaling_factors,
239 n_batch, result, result_stride);
240}

References PortableMatrixBatchVectorMultiplyAccumulate().

◆ PortableMatrixBatchVectorMultiplyAccumulate() [3/3]

void nnfw::cker::PortableMatrixBatchVectorMultiplyAccumulate ( const int8_t *__restrict__  matrix,
const int  m_rows,
const int  m_cols,
const int8_t *__restrict__  vectors,
const float *  scaling_factors,
int  n_batch,
float *__restrict__  result,
int  result_stride 
)
inline

Definition at line 201 of file PortableTensorUtils.h.

207{
208 int batch, row, col;
209 for (batch = 0; batch < n_batch; ++batch, vectors += m_cols)
210 {
211 const float batch_scaling_factor = scaling_factors[batch];
212 // Get the address of the first row.
213 const int8_t *row_ptr = matrix;
214 for (row = 0; row < m_rows; ++row, result += result_stride)
215 {
216 // Initialize the dot product sum for the row to 0.
217 int32_t dotprod = 0;
218#if defined(__GNUC__)
219 // Prefetch the row to cache.
220 __builtin_prefetch(row_ptr, 0 /* prefetch for read */, 3 /* temporal locality */);
221#endif
222 for (col = 0; col < m_cols; ++col, ++row_ptr)
223 {
224 dotprod += (*row_ptr) * (vectors[col]);
225 } // for col
226 *result += (dotprod * batch_scaling_factor);
227 } // for row
228 } // for batch
229}

Referenced by PortableMatrixBatchVectorMultiplyAccumulate().

◆ PortableMeanStddevNormalization()

void nnfw::cker::PortableMeanStddevNormalization ( const float *  input_vector,
float *  output_vector,
int  v_size,
int  n_batch 
)
inline

Definition at line 264 of file PortableTensorUtils.h.

266{
267 for (int batch = 0; batch < n_batch; ++batch)
268 {
269 float sum = 0.0f;
270 for (int i = 0; i < v_size; ++i)
271 {
272 sum += input_vector[i];
273 }
274 const float mean = sum / v_size;
275 float sum_diff_sq = 0.0f;
276 for (int i = 0; i < v_size; ++i)
277 {
278 const float diff = input_vector[i] - mean;
279 sum_diff_sq += diff * diff;
280 }
281 const float variance = sum_diff_sq / v_size;
282 constexpr float kNormalizationConstant = 1e-8f;
283 const float stddev_inv = 1.0f / std::sqrt(variance + kNormalizationConstant);
284 for (int i = 0; i < v_size; ++i)
285 {
286 output_vector[i] = (input_vector[i] - mean) * stddev_inv;
287 }
288 input_vector += v_size;
289 output_vector += v_size;
290 }
291}

Referenced by MeanStddevNormalization().
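
A minimal usage sketch (the include path is an assumption). Each batch row of length v_size is independently shifted to zero mean and scaled to unit variance, with the 1e-8 stabilizer shown in the listing above:

#include <cker/PortableTensorUtils.h> // assumed include path

int main()
{
  const float input[4] = {1.0f, 3.0f,    // batch 0: mean 2, stddev 1
                          -2.0f, 2.0f};  // batch 1: mean 0, stddev 2
  float output[4];
  nnfw::cker::PortableMeanStddevNormalization(input, output, /*v_size=*/2, /*n_batch=*/2);
  // output is approximately {-1, 1, -1, 1}
}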

◆ PortableSub1Vector()

void nnfw::cker::PortableSub1Vector ( const float *  vector,
int  v_size,
float *  result 
)
inline

Definition at line 113 of file PortableTensorUtils.h.

114{
115 for (int v = 0; v < v_size; v++)
116 {
117 *result++ = 1.0f - *vector++;
118 }
119}

◆ PortableSymmetricQuantizeFloats()

void nnfw::cker::PortableSymmetricQuantizeFloats ( const float *  values,
const int  size,
int8_t *  quantized_values,
float *  min_value,
float *  max_value,
float *  scaling_factor 
)
inline

Definition at line 121 of file PortableTensorUtils.h.

124{
125 auto minmax = std::minmax_element(values, values + size);
126 *min_value = *minmax.first;
127 *max_value = *minmax.second;
128 const int kScale = 127;
129 const float range = std::max(std::abs(*min_value), std::abs(*max_value));
130 if (range == 0)
131 {
132 memset(quantized_values, 0, size * sizeof(int8_t));
133 *scaling_factor = 1;
134 return;
135 }
136 *scaling_factor = range / kScale;
137 const float scaling_factor_inv = kScale / range;
138 for (int i = 0; i < size; ++i)
139 {
140 const int32_t quantized_value =
141 static_cast<int32_t>(std::round(values[i] * scaling_factor_inv));
142 // Clamp: just in case some odd numeric offset.
143 quantized_values[i] = std::min(kScale, std::max(-kScale, quantized_value));
144 }
145}

References size.
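
A minimal usage sketch (the include path is an assumption). The scale is max(|min|, |max|) / 127 and no zero point is produced, so 0.0f always quantizes to 0:

#include <cker/PortableTensorUtils.h> // assumed include path
#include <cstdint>

int main()
{
  const float values[4] = {-2.54f, 0.0f, 1.0f, 2.54f};
  int8_t quantized[4];
  float min_value, max_value, scaling_factor;
  nnfw::cker::PortableSymmetricQuantizeFloats(values, 4, quantized, &min_value, &max_value,
                                              &scaling_factor);
  // scaling_factor == 2.54f / 127 == 0.02f, so quantized is approximately {-127, 0, 50, 127}
}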

◆ PortableVectorBatchVectorAdd()

void nnfw::cker::PortableVectorBatchVectorAdd ( const float *  vector,
int  v_size,
int  n_batch,
float *  batch_vector 
)
inline

Definition at line 80 of file PortableTensorUtils.h.

82{
83 for (int b = 0; b < n_batch; b++)
84 {
85 for (int i = 0; i < v_size; ++i)
86 {
87 batch_vector[i] += vector[i];
88 }
89 batch_vector += v_size;
90 }
91}

Referenced by VectorBatchVectorAdd().

◆ PortableVectorBatchVectorAssign()

void nnfw::cker::PortableVectorBatchVectorAssign ( const float *  vector,
int  v_size,
int  n_batch,
float *  batch_vector 
)
inline

Definition at line 71 of file PortableTensorUtils.h.

73{
74 for (int b = 0; b < n_batch; b++)
75 {
76 memcpy(batch_vector + b * v_size, vector, v_size * sizeof(float));
77 }
78}

Referenced by VectorBatchVectorAssign().

◆ PortableZeroVector()

void nnfw::cker::PortableZeroVector ( float *  vector,
int  v_size 
)
inline

Definition at line 293 of file PortableTensorUtils.h.

293{ std::fill_n(vector, v_size, 0); }

Referenced by ZeroVector().

◆ powImpl()

template<typename T >
void nnfw::cker::powImpl ( const Shape input1_shape,
const T *  input1_data,
const Shape input2_shape,
const T *  input2_data,
const Shape output_shape,
T *  output_data 
)
inline

Definition at line 31 of file Pow.h.

33{
34 const int flat_size = MatchingFlatSize(input1_shape, input2_shape, output_shape);
35 for (int i = 0; i < flat_size; ++i)
36 {
37 output_data[i] = std::pow(input1_data[i], input2_data[i]);
38 }
39}

References MatchingFlatSize(), and output_shape.

Referenced by onert::backend::cpu::ops::PowLayer::powFloat32().

◆ ProcessBroadcastShapes()

bool nnfw::cker::ProcessBroadcastShapes ( const Shape shape0,
const Shape shape1,
BinaryArithmeticOpParam params 
)
inline

Definition at line 109 of file BinaryArithmeticOps.h.

111{
112 const int dims_count = std::max(shape0.DimensionsCount(), shape1.DimensionsCount());
113
114 params->broadcast_category = BroadcastableOpCategory::kGenericBroadcast;
115 Shape scalar_shape(dims_count, 1);
116
117 auto extended_shape0 = Shape::ExtendedShape(dims_count, shape0);
118 auto extended_shape1 = Shape::ExtendedShape(dims_count, shape1);
119
120 // Check for "exact" match, implicitly accepting any scalar shapes.
121 if (extended_shape0 == extended_shape1)
122 {
123 params->broadcast_category = BroadcastableOpCategory::kNonBroadcast;
124 return false;
125 }
126
127 for (int i = dims_count - 1; i >= 0; --i)
128 {
129 if (extended_shape0.Dims(i) == extended_shape1.Dims(i))
130 {
131 continue;
132 }
133 else if (extended_shape0.Dims(i) == 1)
134 {
135 params->broadcast_category = BroadcastableOpCategory::kFirstInputBroadcastsFast;
136 break;
137 }
138 else if (extended_shape1.Dims(i) == 1)
139 {
140 params->broadcast_category = BroadcastableOpCategory::kSecondInputBroadcastsFast;
141 break;
142 }
143 else
144 {
145 // This case is erroneous: there is a dimension that does not match and
146 // is not a broadcast from one shape to the other.
147 params->broadcast_category = BroadcastableOpCategory::kGenericBroadcast;
148 return true;
149 }
150 }
151
152 if (params->broadcast_category != BroadcastableOpCategory::kFirstInputBroadcastsFast &&
153 params->broadcast_category != BroadcastableOpCategory::kSecondInputBroadcastsFast)
154 {
155 return false;
156 }
157
158 // From this point it is assumed contractually that corresponding dimensions
159 // in shape0 and shape1 are either (a) equal or (b) one or other equals 1.
160 const bool swap_inputs =
161 params->broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast;
162 const Shape *shape_a = swap_inputs ? &extended_shape1 : &extended_shape0;
163 const Shape *shape_b = swap_inputs ? &extended_shape0 : &extended_shape1;
164
165 int i = dims_count - 1;
166 params->broadcast_shape[0] = 1;
167 params->broadcast_shape[1] = 1;
168 params->broadcast_shape[2] = 1;
169 params->broadcast_shape[3] = 1;
170 params->broadcast_shape[4] = 1;
171 // y_0 is greedy: include dims if both or neither equal 1: in other words,
172 // test for equality rather than (shape_a->Dims(i) != 1).
173 while (i >= 0 && shape_a->Dims(i) == shape_b->Dims(i))
174 {
175 params->broadcast_shape[4] *= shape_b->Dims(i);
176 --i;
177 }
178 // Here either input_a or input_b has dim of 1 (if i >= 0). If it is input_b
179 // that has the unit dimension, the next two loops are not entered.
180 while (i >= 0 && shape_a->Dims(i) == 1)
181 {
182 params->broadcast_shape[3] *= shape_b->Dims(i);
183 --i;
184 }
185 while (i >= 0 && shape_a->Dims(i) == shape_b->Dims(i))
186 {
187 params->broadcast_shape[2] *= shape_a->Dims(i);
188 --i;
189 }
190 // Here either input_a or input_b has dim of 1 (if i >= 0).
191 while (i >= 0 && shape_b->Dims(i) == 1)
192 {
193 params->broadcast_shape[1] *= shape_a->Dims(i);
194 --i;
195 }
196 while (i >= 0 && shape_a->Dims(i) == shape_b->Dims(i))
197 {
198 params->broadcast_shape[0] *= shape_b->Dims(i);
199 --i;
200 }
201
202 // Rarer case is when the broadcast dimensions cannot be handled by a fivefold
203 // loop.
204 if (i >= 0)
205 {
206 params->broadcast_category = BroadcastableOpCategory::kGenericBroadcast;
207 }
208 return true;
209}

References nnfw::cker::BinaryArithmeticOpParam::broadcast_category, nnfw::cker::BinaryArithmeticOpParam::broadcast_shape, nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), kFirstInputBroadcastsFast, kGenericBroadcast, kNonBroadcast, and kSecondInputBroadcastsFast.
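
To illustrate the fivefold decomposition, the following standalone sketch (not a library call) reproduces the same greedy scan for two already right-aligned shapes in the "first input broadcasts" case, where the dims of a are 1 wherever the two shapes differ:

#include <array>
#include <cstdio>
#include <vector>

std::array<int, 5> FivefoldScan(const std::vector<int> &a, const std::vector<int> &b)
{
  std::array<int, 5> bs{1, 1, 1, 1, 1};
  int i = static_cast<int>(a.size()) - 1;
  while (i >= 0 && a[i] == b[i]) { bs[4] *= b[i]; --i; } // trailing equal run
  while (i >= 0 && a[i] == 1)    { bs[3] *= b[i]; --i; } // run broadcast from a
  while (i >= 0 && a[i] == b[i]) { bs[2] *= a[i]; --i; } // middle equal run
  while (i >= 0 && b[i] == 1)    { bs[1] *= a[i]; --i; } // run broadcast from b
  while (i >= 0 && a[i] == b[i]) { bs[0] *= b[i]; --i; } // leading equal run
  return bs;
}

int main()
{
  // a = [1, 1, 3, 4] broadcast against b = [2, 5, 3, 4]:
  // the trailing equal run 3*4 = 12 goes to bs[4], the broadcast run 2*5 = 10 to bs[3].
  const auto bs = FivefoldScan({1, 1, 3, 4}, {2, 5, 3, 4});
  std::printf("%d %d %d %d %d\n", bs[0], bs[1], bs[2], bs[3], bs[4]); // 1 1 1 10 12
}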

◆ Quantize() [1/5]

void nnfw::cker::Quantize ( const int32_t *  multiplier,
const int32_t *  shift,
int32_t  channel_size,
int32_t  total_size,
int32_t  output_zp,
int32_t  output_min,
int32_t  output_max,
int32_t *  scratch,
int8_t *  output 
)
inline

Definition at line 207 of file Quantize.h.

210{
211 // Here we're trying to quantize the raw accumulators:
212 // output_channels
213 // data data data data data
214 // rows data data data data data
215 // data data data data data
216 // ....
217 //
218 // In order to minimize the reload of the multipliers & shifts, once we load
219 // the multipliers & shifts, we load & quantize the raw accumulators for every
220 // row.
221#ifdef USE_NEON
222 const int32x4_t output_offset_vec = vdupq_n_s32(output_zp);
223 const int32x4_t output_activation_min_vec = vdupq_n_s32(output_min);
224 const int32x4_t output_activation_max_vec = vdupq_n_s32(output_max);
225 const int32x4_t zeros = vdupq_n_s32(0);
226#endif
227
228 assert(total_size % channel_size == 0);
229 const int32_t rows = total_size / channel_size;
230
231 int c = 0;
232
233#ifdef USE_NEON
234 using gemmlowp::RoundingDivideByPOT;
235 for (; c <= channel_size - 8; c += 8)
236 {
237 int32x4_t out_shift_1 = vld1q_s32(shift + c);
238 int32x4_t out_shift_2 = vld1q_s32(shift + c + 4);
239 int32x4_t left_shift_1 = vmaxq_s32(out_shift_1, zeros);
240 int32x4_t left_shift_2 = vmaxq_s32(out_shift_2, zeros);
241
242 // Right shift will be performed as left shift with negative values.
243 int32x4_t right_shift_1 = vminq_s32(out_shift_1, zeros);
244 int32x4_t right_shift_2 = vminq_s32(out_shift_2, zeros);
245
246 int32x4_t out_mul_1 = vld1q_s32(multiplier + c);
247 int32x4_t out_mul_2 = vld1q_s32(multiplier + c + 4);
248 for (int n = 0; n < rows; ++n)
249 {
250 int loc = n * channel_size + c;
251 int32x4_t acc_1 = vld1q_s32(scratch + loc);
252 int32x4_t acc_2 = vld1q_s32(scratch + loc + 4);
253
254 // Saturating Rounding Doubling High Mul.
255 acc_1 = vshlq_s32(acc_1, left_shift_1);
256 acc_1 = vqrdmulhq_s32(acc_1, out_mul_1);
257 acc_2 = vshlq_s32(acc_2, left_shift_2);
258 acc_2 = vqrdmulhq_s32(acc_2, out_mul_2);
259
260 // Rounding Dividing By POT.
261 acc_1 = vrshlq_s32(acc_1, right_shift_1);
262 acc_2 = vrshlq_s32(acc_2, right_shift_2);
263
264 // Add the output offset.
265 acc_1 = vaddq_s32(acc_1, output_offset_vec);
266 acc_2 = vaddq_s32(acc_2, output_offset_vec);
267
268 // Apply the activation function.
269 acc_1 = vmaxq_s32(acc_1, output_activation_min_vec);
270 acc_1 = vminq_s32(acc_1, output_activation_max_vec);
271 acc_2 = vmaxq_s32(acc_2, output_activation_min_vec);
272 acc_2 = vminq_s32(acc_2, output_activation_max_vec);
273
274 // Saturating cast to int8 and store to destination.
275 const int16x4_t acc_s16_1 = vqmovn_s32(acc_1);
276 const int16x4_t acc_s16_2 = vqmovn_s32(acc_2);
277 const int16x8_t res_s16 = vcombine_s16(acc_s16_1, acc_s16_2);
278 const int8x8_t res_s8 = vqmovn_s16(res_s16);
279 vst1_s8(output + loc, res_s8);
280 }
281 }
282
283#endif // USE_NEON
284 // Handle leftover values, one by one. This is very slow.
285 for (; c < channel_size; c++)
286 {
287 for (int n = 0; n < rows; ++n)
288 {
289 int loc = n * channel_size + c;
290 int32_t acc = scratch[loc];
291 acc = MultiplyByQuantizedMultiplier(acc, multiplier[c], shift[c]);
292 acc += output_zp;
293 acc = std::max(acc, output_min);
294 acc = std::min(acc, output_max);
295 output[loc] = static_cast<int8_t>(acc);
296 }
297 }
298}

References MultiplyByQuantizedMultiplier().

◆ Quantize() [2/5]

template<>
void nnfw::cker::Quantize ( const Shape input_shape,
const float *  input_data,
const Shape output_shape,
int16_t *  output_data,
const float  scale,
const int32_t  zero_point 
)
inline

Definition at line 156 of file Quantize.h.

158{
159 const int flat_size = MatchingFlatSize(input_shape, output_shape);
160 static constexpr int32_t min_val = std::numeric_limits<int16_t>::min();
161 static constexpr int32_t max_val = std::numeric_limits<int16_t>::max();
162
163 int i = 0;
164#ifdef USE_NEON
165 const float32x4_t reverse_scale_dup = vdupq_n_f32(1.0f / scale);
166 const int32x4_t zero_point_dup = vdupq_n_s32(zero_point);
167 const int32x4_t min_val_dup = vdupq_n_s32(min_val);
168 const int32x4_t max_val_dup = vdupq_n_s32(max_val);
169
170 for (; i <= flat_size - 8; i += 8)
171 {
172 const float *src_data_ptr = input_data + i;
173 float32x4_t input_val_0 = vld1q_f32(src_data_ptr);
174 float32x4_t input_val_1 = vld1q_f32(src_data_ptr + 4);
175
176 input_val_0 = vmulq_f32(input_val_0, reverse_scale_dup);
177 input_val_1 = vmulq_f32(input_val_1, reverse_scale_dup);
178
179 int32x4_t casted_val_0 = RoundToNearest(input_val_0);
180 int32x4_t casted_val_1 = RoundToNearest(input_val_1);
181
182 casted_val_0 = vaddq_s32(casted_val_0, zero_point_dup);
183 casted_val_1 = vaddq_s32(casted_val_1, zero_point_dup);
184
185 // Clamp the values to fit the target type's range.
186 casted_val_0 = vmaxq_s32(casted_val_0, min_val_dup);
187 casted_val_1 = vmaxq_s32(casted_val_1, min_val_dup);
188 casted_val_0 = vminq_s32(casted_val_0, max_val_dup);
189 casted_val_1 = vminq_s32(casted_val_1, max_val_dup);
190
191 const int16x4_t narrowed_val_0 = vmovn_s32(casted_val_0);
192 const int16x4_t narrowed_val_1 = vmovn_s32(casted_val_1);
193 vst1_s16(output_data + i, narrowed_val_0);
194 vst1_s16(output_data + i + 4, narrowed_val_1);
195 }
196#endif // NEON
197
198 for (; i < flat_size; ++i)
199 {
200 const float val = input_data[i];
201 const int32_t unclamped = static_cast<int32_t>(round(val / scale)) + zero_point;
202 const int32_t clamped = std::min(std::max(unclamped, min_val), max_val);
203 output_data[i] = clamped;
204 }
205}

References MatchingFlatSize(), output_shape, and RoundToNearest().

◆ Quantize() [3/5]

template<>
void nnfw::cker::Quantize ( const Shape input_shape,
const float *  input_data,
const Shape output_shape,
int8_t *  output_data,
const float  scale,
const int32_t  zero_point 
)
inline

Definition at line 50 of file Quantize.h.

52{
53 const int flat_size = MatchingFlatSize(input_shape, output_shape);
54 static constexpr int32_t min_val = std::numeric_limits<int8_t>::min();
55 static constexpr int32_t max_val = std::numeric_limits<int8_t>::max();
56
57 int i = 0;
58#ifdef USE_NEON
59 const float32x4_t reverse_scale_dup = vdupq_n_f32(1.0f / scale);
60 const int32x4_t zero_point_dup = vdupq_n_s32(zero_point);
61 const int32x4_t min_val_dup = vdupq_n_s32(min_val);
62 const int32x4_t max_val_dup = vdupq_n_s32(max_val);
63
64 for (; i <= flat_size - 8; i += 8)
65 {
66 const float *src_data_ptr = input_data + i;
67 float32x4_t input_val_0 = vld1q_f32(src_data_ptr);
68 float32x4_t input_val_1 = vld1q_f32(src_data_ptr + 4);
69
70 input_val_0 = vmulq_f32(input_val_0, reverse_scale_dup);
71 input_val_1 = vmulq_f32(input_val_1, reverse_scale_dup);
72
73 int32x4_t casted_val_0 = RoundToNearest(input_val_0);
74 int32x4_t casted_val_1 = RoundToNearest(input_val_1);
75
76 casted_val_0 = vaddq_s32(casted_val_0, zero_point_dup);
77 casted_val_1 = vaddq_s32(casted_val_1, zero_point_dup);
78
79 // Clamp the values to fit the target type's range.
80 casted_val_0 = vmaxq_s32(casted_val_0, min_val_dup);
81 casted_val_1 = vmaxq_s32(casted_val_1, min_val_dup);
82 casted_val_0 = vminq_s32(casted_val_0, max_val_dup);
83 casted_val_1 = vminq_s32(casted_val_1, max_val_dup);
84
85 const int16x4_t narrowed_val_0 = vmovn_s32(casted_val_0);
86 const int16x4_t narrowed_val_1 = vmovn_s32(casted_val_1);
87 const int16x8_t combined_val = vcombine_s16(narrowed_val_0, narrowed_val_1);
88 const int8x8_t combined_val_narrowed = vmovn_s16(combined_val);
89 vst1_s8(output_data + i, combined_val_narrowed);
90 }
91#endif // NEON
92
93 for (; i < flat_size; ++i)
94 {
95 const float val = input_data[i];
96 const int32_t unclamped = static_cast<int32_t>(round(val / scale)) + zero_point;
97 const int32_t clamped = std::min(std::max(unclamped, min_val), max_val);
98 output_data[i] = clamped;
99 }
100}

References MatchingFlatSize(), output_shape, and RoundToNearest().

◆ Quantize() [4/5]

template<>
void nnfw::cker::Quantize ( const Shape input_shape,
const float *  input_data,
const Shape output_shape,
uint8_t *  output_data,
const float  scale,
const int32_t  zero_point 
)
inline

Definition at line 103 of file Quantize.h.

105{
106 const int flat_size = MatchingFlatSize(input_shape, output_shape);
107 static constexpr int32_t min_val = std::numeric_limits<uint8_t>::min();
108 static constexpr int32_t max_val = std::numeric_limits<uint8_t>::max();
109
110 int i = 0;
111#ifdef USE_NEON
112 const float32x4_t reverse_scale_dup = vdupq_n_f32(1.0f / scale);
113 const int32x4_t zero_point_dup = vdupq_n_s32(zero_point);
114 const int32x4_t min_val_dup = vdupq_n_s32(min_val);
115 const int32x4_t max_val_dup = vdupq_n_s32(max_val);
116
117 for (; i <= flat_size - 8; i += 8)
118 {
119 const float *src_data_ptr = input_data + i;
120 float32x4_t input_val_0 = vld1q_f32(src_data_ptr);
121 float32x4_t input_val_1 = vld1q_f32(src_data_ptr + 4);
122
123 input_val_0 = vmulq_f32(input_val_0, reverse_scale_dup);
124 input_val_1 = vmulq_f32(input_val_1, reverse_scale_dup);
125
126 int32x4_t casted_val_0 = RoundToNearest(input_val_0);
127 int32x4_t casted_val_1 = RoundToNearest(input_val_1);
128
129 casted_val_0 = vaddq_s32(casted_val_0, zero_point_dup);
130 casted_val_1 = vaddq_s32(casted_val_1, zero_point_dup);
131
132 // Clamp the values to fit the target type's range.
133 casted_val_0 = vmaxq_s32(casted_val_0, min_val_dup);
134 casted_val_1 = vmaxq_s32(casted_val_1, min_val_dup);
135 casted_val_0 = vminq_s32(casted_val_0, max_val_dup);
136 casted_val_1 = vminq_s32(casted_val_1, max_val_dup);
137
138 const uint16x4_t narrowed_val_0 = vqmovun_s32(casted_val_0);
139 const uint16x4_t narrowed_val_1 = vqmovun_s32(casted_val_1);
140 const uint16x8_t combined_val = vcombine_u16(narrowed_val_0, narrowed_val_1);
141 const uint8x8_t combined_val_narrowed = vmovn_u16(combined_val);
142 vst1_u8(output_data + i, combined_val_narrowed);
143 }
144#endif // NEON
145
146 for (; i < flat_size; ++i)
147 {
148 const float val = input_data[i];
149 const int32_t unclamped = static_cast<int32_t>(round(val / scale)) + zero_point;
150 const int32_t clamped = std::min(std::max(unclamped, min_val), max_val);
151 output_data[i] = clamped;
152 }
153}

References MatchingFlatSize(), output_shape, and RoundToNearest().

◆ Quantize() [5/5]

template<typename InputT , typename OutputT >
void nnfw::cker::Quantize ( const Shape input_shape,
const InputT *  input_data,
const Shape output_shape,
OutputT *  output_data,
const float  output_scale,
const int32_t  output_offset 
)
inline

Definition at line 34 of file Quantize.h.

36{
37 const int flat_size = MatchingFlatSize(input_shape, output_shape);
38 int min_val = std::numeric_limits<OutputT>::min();
39 int max_val = std::numeric_limits<OutputT>::max();
40
41 for (int i = 0; i < flat_size; i++)
42 {
43 int32_t unclamped = static_cast<int32_t>(round(input_data[i] / output_scale)) + output_offset;
44 int32_t clamped = std::min(std::max(unclamped, min_val), max_val);
45 output_data[i] = clamped;
46 }
47}

References MatchingFlatSize(), and output_shape.

Referenced by onert::backend::cpu::ops::affineQuantize(), and nnfw::cker::optimized_integer_ops::depthwise_conv::DepthwiseConvGeneral().
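
The per-element arithmetic of the generic overload can be summarized by the following standalone sketch (not a library call): q = clamp(round(x / output_scale) + output_offset, type_min, type_max):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <limits>

int8_t QuantizeOne(float x, float output_scale, int32_t output_offset)
{
  const int32_t min_val = std::numeric_limits<int8_t>::min();
  const int32_t max_val = std::numeric_limits<int8_t>::max();
  const int32_t unclamped = static_cast<int32_t>(std::round(x / output_scale)) + output_offset;
  return static_cast<int8_t>(std::min(std::max(unclamped, min_val), max_val));
}

int main()
{
  std::printf("%d\n", QuantizeOne(0.5f, 0.02f, -3)); // round(25) - 3 == 22
}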

◆ QuantizeMultiplier()

void nnfw::cker::QuantizeMultiplier ( double  double_multiplier,
int32_t *  quantized_multiplier,
int *  shift 
)
inline

Definition at line 48 of file Utils.h.

49{
50 if (double_multiplier == 0.)
51 {
52 *quantized_multiplier = 0;
53 *shift = 0;
54 return;
55 }
56
57 const double q = std::frexp(double_multiplier, shift);
58 auto q_fixed = static_cast<int64_t>(round(q * (1ll << 31)));
59
60 assert(q_fixed <= (1ll << 31));
61 if (q_fixed == (1ll << 31))
62 {
63 q_fixed /= 2;
64 ++*shift;
65 }
66 assert(q_fixed <= std::numeric_limits<int32_t>::max());
67 // A shift amount smaller than -31 would cause all bits to be shifted out
68 // and thus all results would be zero. We implement that instead with
69 // q_fixed==0, so as to avoid hitting issues with right-shift
70 // operations with shift amounts greater than 31. Note that this happens
71 // roughly when abs(double_multiplier) < 2^-31 and the present handling means
72 // that we're effectively flushing tiny double_multiplier's to zero.
73 // We could conceivably handle values in the range (roughly) [32, 63]
74 // as 'denormals' i.e. (shift==0, q_fixed < 2^30). In that point of view
75 // the present handling is just doing 'flush denormals to zero'. We could
76 // reconsider and actually generate nonzero denormals if a need arises.
77 if (*shift < -31)
78 {
79 *shift = 0;
80 q_fixed = 0;
81 }
82 *quantized_multiplier = static_cast<int32_t>(q_fixed);
83}

Referenced by QuantizeMultiplierSmallerThanOneExp().
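
A worked usage sketch (the include path is an assumption; this page names Utils.h). Since 0.2 == 0.8 * 2^-2, the routine returns the Q31 representation of 0.8 together with shift = -2:

#include <cker/Utils.h> // assumed include path
#include <cstdint>

int main()
{
  int32_t quantized_multiplier = 0;
  int shift = 0;
  nnfw::cker::QuantizeMultiplier(0.2, &quantized_multiplier, &shift);
  // quantized_multiplier == round(0.8 * 2^31) == 1717986918, shift == -2
}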

◆ QuantizeMultiplierSmallerThanOneExp()

void nnfw::cker::QuantizeMultiplierSmallerThanOneExp ( double  double_multiplier,
int32_t *  quantized_multiplier,
int *  left_shift 
)
inline

Definition at line 85 of file Utils.h.

87{
88 assert(double_multiplier < 1.0);
89 assert(double_multiplier > 0.0);
90 int shift;
91 QuantizeMultiplier(double_multiplier, quantized_multiplier, &shift);
92 assert(shift <= 0);
93 *left_shift = shift;
94}

References QuantizeMultiplier().

◆ QuantizeSoftmaxOutput()

template<typename T >
int32_t nnfw::cker::QuantizeSoftmaxOutput ( float  prob_rescaled,
int32_t  zero_point 
)
inline

Definition at line 134 of file SoftMax.h.

135{
136 const int32_t prob_rnd = static_cast<int32_t>(std::round(prob_rescaled));
137 return prob_rnd + zero_point;
138}

◆ QuantizeSoftmaxOutput< uint8_t >()

template<>
int32_t nnfw::cker::QuantizeSoftmaxOutput< uint8_t > ( float  prob_rescaled,
int32_t   
)
inline

Definition at line 142 of file SoftMax.h.

143{
144 return static_cast<int32_t>(prob_rescaled + 0.5f);
145}

◆ Range()

template<typename T >
void nnfw::cker::Range ( const T *  start_data,
const T *  limit_data,
const T *  delta_data,
T *  output_data 
)
inline

Definition at line 44 of file Range.h.

45{
46 const T start_value = *start_data;
47 const T delta_value = *delta_data;
48 const T limit_value = *limit_data;
49
50 const int num_elements = GetSize<T>(start_value, limit_value, delta_value);
51 T value = start_value;
52
53 for (int i = 0; i < num_elements; ++i)
54 {
55 output_data[i] = value;
56 value += delta_value;
57 }
58}

◆ RankOneSelect()

template<typename D , typename T >
void nnfw::cker::RankOneSelect ( const Shape input_condition_shape,
const D *  input_condition_data,
const Shape input_x_shape,
const T *  input_x_data,
const Shape input_y_shape,
const T *  input_y_data,
const Shape output_shape,
T *  output_data 
)

Definition at line 45 of file Select.h.

48{
49 const int64_t outer_size = input_condition_shape.FlatSize();
50 assert(MatchingDim(input_x_shape, 0, input_y_shape, 0, output_shape, 0) == outer_size);
51 const int64_t inner_size = MatchingFlatSizeSkipDim(input_x_shape, 0, input_y_shape, output_shape);
52
53 int64_t offset = 0;
54 for (int64_t i = 0; i < outer_size; i++)
55 {
56 const T *input_data = (input_condition_data[i] != 0) ? input_x_data : input_y_data;
57 memcpy(output_data + offset, input_data + offset, inner_size * sizeof(T));
58 offset += inner_size;
59 }
60}

References nnfw::cker::Shape::FlatSize(), MatchingDim(), MatchingFlatSizeSkipDim(), offset(), and output_shape.
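
A minimal usage sketch (the include path and the Shape constructor taking a dims array are assumptions based on this page). The condition holds one entry per outer row, and whole inner rows are copied from either x or y:

#include <cker/operation/Select.h> // assumed include path
#include <cstdint>

int main()
{
  const int32_t cond_dims[1] = {2};
  const int32_t data_dims[2] = {2, 3};
  const nnfw::cker::Shape cond_shape(1, cond_dims); // assumed (count, dims) constructor
  const nnfw::cker::Shape data_shape(2, data_dims);
  const bool condition[2] = {true, false};
  const float x[6] = {1, 1, 1, 2, 2, 2};
  const float y[6] = {9, 9, 9, 8, 8, 8};
  float out[6];
  nnfw::cker::RankOneSelect(cond_shape, condition, data_shape, x, data_shape, y, data_shape, out);
  // out == {1, 1, 1, 8, 8, 8}: row 0 taken from x, row 1 from y
}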

◆ ReducedOutputOffset()

size_t nnfw::cker::ReducedOutputOffset ( const int  num_dims,
const int *  dims,
const int *  index,
const int  num_axis,
const int *  axis 
)
inline

Definition at line 420 of file Utils.h.

422{
423 if (num_dims == 0)
424 {
425 return 0;
426 }
427
428 assert(dims != nullptr);
429 assert(index != nullptr);
430
431 size_t offset = 0;
432 for (int idx = 0; idx < num_dims; ++idx)
433 {
434 // if we need to skip this axis
435 bool is_axis = false;
436 if (axis != nullptr)
437 {
438 for (int axis_idx = 0; axis_idx < num_axis; ++axis_idx)
439 {
440 if (idx == axis[axis_idx])
441 {
442 is_axis = true;
443 break;
444 }
445 }
446 }
447 if (!is_axis)
448 {
449 offset = offset * static_cast<size_t>(dims[idx]) + static_cast<size_t>(index[idx]);
450 }
451 }
452 return offset;
453}

References offset().

Referenced by ReduceImpl(), ReduceMeanImpl(), and ReduceSumQuantImpl().
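
A worked sketch (the include path is an assumption; this page names Utils.h). For a 3-D input with dims {2, 3, 4}, index {1, 2, 3} and a reduction over axis 1, the middle coordinate is skipped, so the output offset is computed over dims {2, 4} only:

#include <cker/Utils.h> // assumed include path
#include <cstddef>

int main()
{
  const int dims[3] = {2, 3, 4};
  const int index[3] = {1, 2, 3};
  const int axis[1] = {1};
  const std::size_t out_offset = nnfw::cker::ReducedOutputOffset(3, dims, index, 1, axis);   // 1 * 4 + 3 == 7
  const std::size_t in_offset = nnfw::cker::ReducedOutputOffset(3, dims, index, 0, nullptr); // (1 * 3 + 2) * 4 + 3 == 23
  (void)out_offset;
  (void)in_offset;
}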

◆ ReduceImpl()

template<typename In , typename Out >
bool nnfw::cker::ReduceImpl ( const In *  input_data,
const Shape input_shape,
const Shape ,
const int *  axis,
const int  num_axis,
int *  input_iter,
Out   reducer(const Out current, const In in),
Out *  output_data 
)
inline

Definition at line 118 of file Reduce.h.

121{
122 const auto input_dims = input_shape.DimsData();
123 const auto input_num_dims = input_shape.DimensionsCount();
124
125 // Reset input iterator.
126 if (num_axis == 1 && axis[0] == input_num_dims - 1)
127 {
128 int input_size = 1;
129 int reduce_size = 0;
130 for (int idx = 0; idx < input_num_dims - 1; idx++)
131 {
132 input_size *= input_dims[idx];
133 }
134 reduce_size = input_dims[input_num_dims - 1];
135 for (int idx = 0; idx < input_size; idx++)
136 {
137 for (int r_idx = 0; r_idx < reduce_size; r_idx++)
138 {
139 if (r_idx == 0)
140 {
141 output_data[idx] = input_data[idx * reduce_size];
142 }
143 else
144 {
145 output_data[idx] = reducer(output_data[idx], input_data[idx * reduce_size + r_idx]);
146 }
147 }
148 }
149 return true;
150 }
151
152 for (int idx = 0; idx < input_num_dims; ++idx)
153 {
154 input_iter[idx] = 0;
155 }
156 // Iterate through input_data.
157 do
158 {
159 size_t input_offset = ReducedOutputOffset(input_num_dims, input_dims, input_iter, 0, nullptr);
160 size_t output_offset =
161 ReducedOutputOffset(input_num_dims, input_dims, input_iter, num_axis, axis);
162 output_data[output_offset] = reducer(output_data[output_offset], input_data[input_offset]);
163 } while (NextIndex(input_num_dims, input_dims, input_iter));
164 return true;
165}

References nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::DimsData(), NextIndex(), and ReducedOutputOffset().
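
A usage sketch of the fast path (a single reduction over the innermost axis). The include path, the Shape constructor taking a dims array, and the explicit template arguments are assumptions based on this page:

#include <cker/operation/Reduce.h> // assumed include path
#include <cstdint>

static float SumReducer(const float current, const float in) { return current + in; }

int main()
{
  const int32_t dims[2] = {2, 3};
  const nnfw::cker::Shape input_shape(2, dims); // assumed (count, dims) constructor
  const float input[6] = {1, 2, 3, 4, 5, 6};
  int input_iter[2] = {0, 0};
  const int axis[1] = {1}; // reduce the innermost axis
  float output[2];         // the fast path seeds each row with its first element
  nnfw::cker::ReduceImpl<float, float>(input, input_shape, input_shape, axis, 1, input_iter,
                                       SumReducer, output);
  // output == {6, 15}
}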

◆ ReduceMeanImpl()

template<typename In , typename Out >
bool nnfw::cker::ReduceMeanImpl ( const In *  input_data,
const Shape input_shape,
const int *  axis,
const int  num_axis,
int *  input_iter,
Out   reducer(const Out current, const In in, int normalizer),
Out *  output_data 
)
inline

Definition at line 52 of file ReduceMean.h.

56{
57 const auto input_dims = input_shape.DimsData();
58 const auto input_num_dims = input_shape.DimensionsCount();
59 int normalizer = 1;
60 // Reset input iterator.
61 for (int idx = 0; idx < input_num_dims; ++idx)
62 {
63 input_iter[idx] = 0;
64 }
65 // Compute number of output elements
66 for (int idx = 0; idx < num_axis; ++idx)
67 {
68 normalizer *= input_dims[axis[idx]];
69 }
70 // Iterate through input_data.
71 do
72 {
73 size_t input_offset = ReducedOutputOffset(input_num_dims, input_dims, input_iter, 0, nullptr);
74 size_t output_offset =
75 ReducedOutputOffset(input_num_dims, input_dims, input_iter, num_axis, axis);
76 output_data[output_offset] =
77 reducer(output_data[output_offset], input_data[input_offset], normalizer);
78 } while (NextIndex(input_num_dims, input_dims, input_iter));
79 return true;
80}

References nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::DimsData(), NextIndex(), and ReducedOutputOffset().

◆ ReduceSumQuantImpl()

template<typename In >
size_t nnfw::cker::ReduceSumQuantImpl ( const In *  input_data,
const Shape input_shape,
const int *  axis,
const int  num_axis,
int *  input_iter,
int   reducer(const int current, const In in),
int *  temp_sum 
)
inline

Definition at line 83 of file ReduceMean.h.

86{
87 const auto input_dims = input_shape.DimsData();
88 const auto input_num_dims = input_shape.DimensionsCount();
89 size_t normalizer = 1;
90 // Reset input iterator.
91 for (int idx = 0; idx < input_num_dims; ++idx)
92 {
93 input_iter[idx] = 0;
94 }
95 // Compute number of output elements
96 for (int idx = 0; idx < num_axis; ++idx)
97 {
98 normalizer *= input_dims[axis[idx]];
99 }
100 // Iterate through input_data.
101 do
102 {
103 size_t input_offset = ReducedOutputOffset(input_num_dims, input_dims, input_iter, 0, nullptr);
104 size_t output_offset =
105 ReducedOutputOffset(input_num_dims, input_dims, input_iter, num_axis, axis);
106 temp_sum[output_offset] = reducer(temp_sum[output_offset], input_data[input_offset]);
107 } while (NextIndex(input_num_dims, input_dims, input_iter));
108 return normalizer;
109}

References nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::DimsData(), NextIndex(), and ReducedOutputOffset().

◆ ReLU()

void nnfw::cker::ReLU ( const Shape input_shape,
const float *  input_data,
const Shape output_shape,
float *  output_data 
)
inline

Definition at line 32 of file ReLU.h.

34{
35 const auto input_map = MapAsVector(input_data, input_shape);
36 auto output_map = MapAsVector(output_data, output_shape);
37 output_map = input_map.cwiseMax(0.0f);
38}

References MapAsVector(), and output_shape.

Referenced by onert::backend::cpu::ops::ElementwiseActivationLayer::configure().

◆ ReLU6()

void nnfw::cker::ReLU6 ( const Shape input_shape,
const float *  input_data,
const Shape output_shape,
float *  output_data 
)
inline

Definition at line 32 of file ReLU6.h.

34{
35 const auto input_map = MapAsVector(input_data, input_shape);
36 auto output_map = MapAsVector(output_data, output_shape);
37
38 if (output_shape != input_shape)
39 throw std::runtime_error{"cker::ReLU6: input and output shapes do not match."};
40
41 output_map = input_map.cwiseMax(0.0f).cwiseMin(6.0f);
42}

References MapAsVector(), and output_shape.

Referenced by onert::backend::cpu::ops::ElementwiseActivationLayer::configure().

◆ Requantize()

template<typename input_type , typename output_type >
void nnfw::cker::Requantize ( const input_type *  input_data,
int32_t  size,
int32_t  effective_scale_multiplier,
int32_t  effective_scale_shift,
int32_t  input_zeropoint,
int32_t  output_zeropoint,
output_type *  output_data 
)
inline

Definition at line 301 of file Quantize.h.

304{
305 assert(!"Requantize: not supported type. It shouldn't reach here.");
306 UNUSED_ALL(input_data, size, effective_scale_multiplier, effective_scale_shift, input_zeropoint,
307 output_zeropoint, output_data);
308}

References size.

◆ Requantize< int8_t, uint8_t >()

template<>
void nnfw::cker::Requantize< int8_t, uint8_t > ( const int8_t *  input_data,
int32_t  size,
int32_t  effective_scale_multiplier,
int32_t  effective_scale_shift,
int32_t  input_zeropoint,
int32_t  output_zeropoint,
uint8_t *  output_data 
)
inline

Definition at line 379 of file Quantize.h.

383{
384 static constexpr int32_t kMinOutput = std::numeric_limits<uint8_t>::min();
385 static constexpr int32_t kMaxOutput = std::numeric_limits<uint8_t>::max();
386
387 int i = 0;
388#ifdef USE_NEON
389 // Constants.
390 const int32x4_t input_zero_point_dup = vdupq_n_s32(-input_zeropoint);
391 const int32x4_t output_zero_point_dup = vdupq_n_s32(output_zeropoint);
392 const int32x4_t min_val_dup = vdupq_n_s32(kMinOutput);
393 const int32x4_t max_val_dup = vdupq_n_s32(kMaxOutput);
394
395 for (; i <= size - 16; i += 16)
396 {
397 const int8x16_t input_vec = vld1q_s8(input_data + i);
398 const int16x8_t first_half = vmovl_s8(vget_low_s8(input_vec));
399 const int16x8_t second_half = vmovl_s8(vget_high_s8(input_vec));
400 int32x4x4_t input;
401 input.val[0] = vmovl_s16(vget_low_s16(first_half));
402 input.val[1] = vmovl_s16(vget_high_s16(first_half));
403 input.val[2] = vmovl_s16(vget_low_s16(second_half));
404 input.val[3] = vmovl_s16(vget_high_s16(second_half));
405 input.val[0] = vaddq_s32(input.val[0], input_zero_point_dup);
406 input.val[1] = vaddq_s32(input.val[1], input_zero_point_dup);
407 input.val[2] = vaddq_s32(input.val[2], input_zero_point_dup);
408 input.val[3] = vaddq_s32(input.val[3], input_zero_point_dup);
409
410 int32x4x4_t result =
411 MultiplyByQuantizedMultiplier4Rows(input, effective_scale_multiplier, effective_scale_shift);
412
413 result.val[0] = vaddq_s32(result.val[0], output_zero_point_dup);
414 result.val[1] = vaddq_s32(result.val[1], output_zero_point_dup);
415 result.val[2] = vaddq_s32(result.val[2], output_zero_point_dup);
416 result.val[3] = vaddq_s32(result.val[3], output_zero_point_dup);
417 result.val[0] = vmaxq_s32(vminq_s32(result.val[0], max_val_dup), min_val_dup);
418 result.val[1] = vmaxq_s32(vminq_s32(result.val[1], max_val_dup), min_val_dup);
419 result.val[2] = vmaxq_s32(vminq_s32(result.val[2], max_val_dup), min_val_dup);
420 result.val[3] = vmaxq_s32(vminq_s32(result.val[3], max_val_dup), min_val_dup);
421
422 const uint32x4_t result_val_1_unsigned = vreinterpretq_u32_s32(result.val[0]);
423 const uint32x4_t result_val_2_unsigned = vreinterpretq_u32_s32(result.val[1]);
424 const uint32x4_t result_val_3_unsigned = vreinterpretq_u32_s32(result.val[2]);
425 const uint32x4_t result_val_4_unsigned = vreinterpretq_u32_s32(result.val[3]);
426
427 const uint16x4_t narrowed_val_1 = vqmovn_u32(result_val_1_unsigned);
428 const uint16x4_t narrowed_val_2 = vqmovn_u32(result_val_2_unsigned);
429 const uint16x4_t narrowed_val_3 = vqmovn_u32(result_val_3_unsigned);
430 const uint16x4_t narrowed_val_4 = vqmovn_u32(result_val_4_unsigned);
431 const uint16x8_t output_first_half = vcombine_u16(narrowed_val_1, narrowed_val_2);
432 const uint16x8_t output_second_half = vcombine_u16(narrowed_val_3, narrowed_val_4);
433 const uint8x8_t narrowed_first_half = vqmovn_u16(output_first_half);
434 const uint8x8_t narrowed_second_half = vqmovn_u16(output_second_half);
435 const uint8x16_t narrowed_result = vcombine_u8(narrowed_first_half, narrowed_second_half);
436 vst1q_u8(output_data + i, narrowed_result);
437 }
438
439#endif
440 for (; i < size; ++i)
441 {
442 const int32_t input = input_data[i] - input_zeropoint;
443 const int32_t output =
444 MultiplyByQuantizedMultiplier(input, effective_scale_multiplier, effective_scale_shift) +
445 output_zeropoint;
446 const int32_t clamped_output = std::max(std::min(output, kMaxOutput), kMinOutput);
447 output_data[i] = static_cast<uint8_t>(clamped_output);
448 }
449}

References MultiplyByQuantizedMultiplier(), and size.

Referenced by onert::backend::cpu::ops::QuantizeLayer::run().
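
A usage sketch for re-quantizing int8 data to uint8 when both tensors share the same scale and only the zero point shifts by 128 (include paths are assumptions based on this page). The effective multiplier and shift are produced by QuantizeMultiplier from the ratio input_scale / output_scale:

#include <cker/Utils.h>              // assumed include path (QuantizeMultiplier)
#include <cker/operation/Quantize.h> // assumed include path (Requantize)
#include <cstdint>

int main()
{
  const int8_t input[4] = {-128, -1, 0, 127};
  uint8_t output[4];
  int32_t effective_multiplier = 0;
  int effective_shift = 0;
  // effective scale = input_scale / output_scale; here both scales are equal, so it is 1.0
  nnfw::cker::QuantizeMultiplier(1.0, &effective_multiplier, &effective_shift);
  nnfw::cker::Requantize<int8_t, uint8_t>(input, 4, effective_multiplier, effective_shift,
                                          /*input_zeropoint=*/0, /*output_zeropoint=*/128, output);
  // output == {0, 127, 128, 255}
}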

◆ Requantize< uint8_t, int8_t >()

template<>
void nnfw::cker::Requantize< uint8_t, int8_t > ( const uint8_t *  input_data,
int32_t  size,
int32_t  effective_scale_multiplier,
int32_t  effective_scale_shift,
int32_t  input_zeropoint,
int32_t  output_zeropoint,
int8_t *  output_data 
)
inline

Definition at line 311 of file Quantize.h.

315{
316 static constexpr int32_t kMinOutput = std::numeric_limits<int8_t>::min();
317 static constexpr int32_t kMaxOutput = std::numeric_limits<int8_t>::max();
318
319 int i = 0;
320#ifdef USE_NEON
321 // Constants.
322 const int32x4_t input_zero_point_dup = vdupq_n_s32(-input_zeropoint);
323 const int32x4_t output_zero_point_dup = vdupq_n_s32(output_zeropoint);
324 const int32x4_t min_val_dup = vdupq_n_s32(kMinOutput);
325 const int32x4_t max_val_dup = vdupq_n_s32(kMaxOutput);
326
327 for (; i <= size - 16; i += 16)
328 {
329 const uint8x16_t input_vec = vld1q_u8(input_data + i);
330 const uint16x8_t first_half = vmovl_u8(vget_low_u8(input_vec));
331 const uint16x8_t second_half = vmovl_u8(vget_high_u8(input_vec));
332 int32x4x4_t input;
333 input.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(first_half)));
334 input.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(first_half)));
335 input.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(second_half)));
336 input.val[3] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(second_half)));
337 input.val[0] = vaddq_s32(input.val[0], input_zero_point_dup);
338 input.val[1] = vaddq_s32(input.val[1], input_zero_point_dup);
339 input.val[2] = vaddq_s32(input.val[2], input_zero_point_dup);
340 input.val[3] = vaddq_s32(input.val[3], input_zero_point_dup);
341
342 int32x4x4_t result =
343 MultiplyByQuantizedMultiplier4Rows(input, effective_scale_multiplier, effective_scale_shift);
344
345 result.val[0] = vaddq_s32(result.val[0], output_zero_point_dup);
346 result.val[1] = vaddq_s32(result.val[1], output_zero_point_dup);
347 result.val[2] = vaddq_s32(result.val[2], output_zero_point_dup);
348 result.val[3] = vaddq_s32(result.val[3], output_zero_point_dup);
349 result.val[0] = vmaxq_s32(vminq_s32(result.val[0], max_val_dup), min_val_dup);
350 result.val[1] = vmaxq_s32(vminq_s32(result.val[1], max_val_dup), min_val_dup);
351 result.val[2] = vmaxq_s32(vminq_s32(result.val[2], max_val_dup), min_val_dup);
352 result.val[3] = vmaxq_s32(vminq_s32(result.val[3], max_val_dup), min_val_dup);
353
354 const int16x4_t narrowed_val_1 = vqmovn_s32(result.val[0]);
355 const int16x4_t narrowed_val_2 = vqmovn_s32(result.val[1]);
356 const int16x4_t narrowed_val_3 = vqmovn_s32(result.val[2]);
357 const int16x4_t narrowed_val_4 = vqmovn_s32(result.val[3]);
358 const int16x8_t output_first_half = vcombine_s16(narrowed_val_1, narrowed_val_2);
359 const int16x8_t output_second_half = vcombine_s16(narrowed_val_3, narrowed_val_4);
360 const int8x8_t narrowed_first_half = vqmovn_s16(output_first_half);
361 const int8x8_t narrowed_second_half = vqmovn_s16(output_second_half);
362 const int8x16_t narrowed_result = vcombine_s8(narrowed_first_half, narrowed_second_half);
363 vst1q_s8(output_data + i, narrowed_result);
364 }
365
366#endif
367 for (; i < size; ++i)
368 {
369 const int32_t input = input_data[i] - input_zeropoint;
370 const int32_t output =
371 MultiplyByQuantizedMultiplier(input, effective_scale_multiplier, effective_scale_shift) +
372 output_zeropoint;
373 const int32_t clamped_output = std::max(std::min(output, kMaxOutput), kMinOutput);
374 output_data[i] = static_cast<int8_t>(clamped_output);
375 }
376}

References MultiplyByQuantizedMultiplier(), and size.

Referenced by onert::backend::cpu::ops::QuantizeLayer::run().

◆ ResizeBilinear() [1/3]

void nnfw::cker::ResizeBilinear ( const ResizeBilinearParams op_params,
const Shape unextended_input_shape,
const int8_t *  input_data,
const Shape unextended_output_shape,
int8_t *  output_data 
)
inline

Definition at line 285 of file ResizeBilinear.h.

288{
289 // If half_pixel_centers is True, align_corners must be False.
290 assert(!op_params.half_pixel_centers || !op_params.align_corners);
291 assert(unextended_input_shape.DimensionsCount() <= 4);
292 assert(unextended_output_shape.DimensionsCount() <= 4);
293 const Shape input_shape = Shape::ExtendedShape(4, unextended_input_shape);
294 const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape);
295
296 const int32_t batches = MatchingDim(input_shape, 0, output_shape, 0);
297 const int32_t input_height = input_shape.Dims(1);
298 const int32_t input_width = input_shape.Dims(2);
299 const int32_t depth = MatchingDim(input_shape, 3, output_shape, 3);
300
301 const int32_t output_height = op_params.output_height;
302 const int32_t output_width = op_params.output_width;
303
304 int32_t height_scale_10 = ((1 << 10) * input_height + output_height / 2) / output_height;
305 int32_t width_scale_10 = ((1 << 10) * input_width + output_width / 2) / output_width;
306 if (op_params.align_corners && output_height > 1)
307 {
308 height_scale_10 =
309 ((1 << 10) * (input_height - 1) + (output_height - 1) / 2) / (output_height - 1);
310 }
311 if (op_params.align_corners && output_width > 1)
312 {
313 width_scale_10 = ((1 << 10) * (input_width - 1) + (output_width - 1) / 2) / (output_width - 1);
314 }
315
316 for (int b = 0; b < batches; ++b)
317 {
318 for (int y = 0; y < output_height; ++y)
319 {
320 int32_t input_y, y0, y1;
321 ComputeInterpolationValues(y, height_scale_10, op_params.half_pixel_centers, input_height,
322 &input_y, &y0, &y1);
323 for (int x = 0; x < output_width; ++x)
324 {
325 int32_t input_x, x0, x1;
326 ComputeInterpolationValues(x, width_scale_10, op_params.half_pixel_centers, input_width,
327 &input_x, &x0, &x1);
328 for (int c = 0; c < depth; ++c)
329 {
330 const int64_t output_20_ll =
331 static_cast<int64_t>(input_data[Offset(input_shape, b, y0, x0, c)]) *
332 ((1 << 10) - (input_y - (1 << 10) * y0)) * ((1 << 10) - (input_x - (1 << 10) * x0));
333 const int64_t output_20_lu =
334 static_cast<int64_t>(input_data[Offset(input_shape, b, y1, x0, c)]) *
335 (input_y - (1 << 10) * y0) * ((1 << 10) - (input_x - (1 << 10) * x0));
336 const int64_t output_20_rl =
337 static_cast<int64_t>(input_data[Offset(input_shape, b, y0, x1, c)]) *
338 ((1 << 10) - (input_y - (1 << 10) * y0)) * (input_x - (1 << 10) * x0);
339 const int64_t output_20_ru =
340 static_cast<int64_t>(input_data[Offset(input_shape, b, y1, x1, c)]) *
341 (input_y - (1 << 10) * y0) * (input_x - (1 << 10) * x0);
342 const int64_t output_20 = output_20_ll + output_20_lu + output_20_rl + output_20_ru;
343 const int64_t round = (output_20 > 0) ? (1 << 19) : -(1 << 19);
344 const int8_t interpolation = static_cast<int8_t>((output_20 + round) / (1 << 20));
345 output_data[Offset(output_shape, b, y, x, c)] = interpolation;
346 }
347 }
348 }
349 }
350}

References nnfw::cker::ResizeBilinearParams::align_corners, ComputeInterpolationValues(), nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), nnfw::cker::ResizeBilinearParams::half_pixel_centers, MatchingDim(), Offset(), nnfw::cker::ResizeBilinearParams::output_height, output_shape, and nnfw::cker::ResizeBilinearParams::output_width.

◆ ResizeBilinear() [2/3]

void nnfw::cker::ResizeBilinear ( ResizeBilinearParams params,
const Shape input_shape,
const float *  input_data,
const Shape output_shape,
float *  output_data 
)

Definition at line 213 of file ResizeBilinear.h.

215{
216 int32_t batches = static_cast<int32_t>(MatchingDim(input_shape, 0, output_shape, 0));
217 int32_t input_height = input_shape.Dims(1);
218 int32_t input_width = input_shape.Dims(2);
219 int32_t depth = static_cast<int32_t>(MatchingDim(input_shape, 3, output_shape, 3));
220
221 // Specialize for 2x2 upsample.
222 if (!params.align_corners && !params.half_pixel_centers &&
223 params.output_height == 2 * input_height && params.output_width == 2 * input_width)
224 {
225 ResizeBilinear2x2(batches, input_height, input_width, depth, params.output_height,
226 params.output_width, input_shape, input_data, output_shape, output_data);
227 }
228 else
229 {
230 float height_scale = static_cast<float>(input_height) / params.output_height;
231 float width_scale = static_cast<float>(input_width) / params.output_width;
232 if (params.align_corners && params.output_height > 1)
233 {
234 height_scale = static_cast<float>(input_height - 1) / (params.output_height - 1);
235 }
236 if (params.align_corners && params.output_width > 1)
237 {
238 width_scale = static_cast<float>(input_width - 1) / (params.output_width - 1);
239 }
240
241 ResizeBilinearGeneric(batches, input_height, input_width, depth, params.output_height,
242 params.output_width, height_scale, width_scale, input_shape, input_data,
243 output_data, params.half_pixel_centers);
244 }
245}

References nnfw::cker::ResizeBilinearParams::align_corners, nnfw::cker::Shape::Dims(), nnfw::cker::ResizeBilinearParams::half_pixel_centers, MatchingDim(), nnfw::cker::ResizeBilinearParams::output_height, output_shape, nnfw::cker::ResizeBilinearParams::output_width, ResizeBilinear2x2(), and ResizeBilinearGeneric().

Referenced by onert::backend::cpu::ops::ResizeBilinearLayer::run().
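
A usage sketch of the exact-2x upsampling case, which takes the specialized ResizeBilinear2x2 path because align_corners and half_pixel_centers are both false and the output is twice the input in each spatial dimension. The include path and the Shape constructor taking a dims array are assumptions based on this page:

#include <cker/operation/ResizeBilinear.h> // assumed include path
#include <cstdint>

int main()
{
  const int32_t in_dims[4] = {1, 2, 2, 1}; // NHWC
  const int32_t out_dims[4] = {1, 4, 4, 1};
  const nnfw::cker::Shape input_shape(4, in_dims); // assumed (count, dims) constructor
  const nnfw::cker::Shape output_shape(4, out_dims);
  const float input[4] = {0.0f, 1.0f, 2.0f, 3.0f};
  float output[16];

  nnfw::cker::ResizeBilinearParams params;
  params.output_height = 4;
  params.output_width = 4;
  params.align_corners = false;
  params.half_pixel_centers = false;
  nnfw::cker::ResizeBilinear(params, input_shape, input, output_shape, output);
  // Each input pixel lands in the top-left corner of its 2x2 output block;
  // the remaining entries are averages of neighbouring pixels.
}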

◆ ResizeBilinear() [3/3]

void nnfw::cker::ResizeBilinear ( ResizeBilinearParams params,
const Shape input_shape,
const uint8_t *  input_data,
const Shape output_shape,
uint8_t *  output_data 
)

Definition at line 247 of file ResizeBilinear.h.

249{
250 int32_t batches = MatchingDim(input_shape, 0, output_shape, 0);
251 int32_t input_height = input_shape.Dims(1);
252 int32_t input_width = input_shape.Dims(2);
253 int32_t depth = MatchingDim(input_shape, 3, output_shape, 3);
254
255 float height_scale = (params.align_corners && params.output_height > 1)
256 ? (static_cast<float>(input_height - 1) / (params.output_height - 1))
257 : (static_cast<float>(input_height) / params.output_height);
258
259 float width_scale = (params.align_corners && params.output_width > 1)
260 ? (static_cast<float>(input_width - 1) / (params.output_width - 1))
261 : (static_cast<float>(input_width) / params.output_width);
262
263 ResizeBilinearGenericSmallChannel<uint8_t>(
264 batches, input_height, input_width, depth, params.output_height, params.output_width,
265 height_scale, width_scale, input_shape, input_data, output_data, params.half_pixel_centers);
266}

References nnfw::cker::ResizeBilinearParams::align_corners, nnfw::cker::Shape::Dims(), nnfw::cker::ResizeBilinearParams::half_pixel_centers, MatchingDim(), nnfw::cker::ResizeBilinearParams::output_height, output_shape, and nnfw::cker::ResizeBilinearParams::output_width.

◆ ResizeBilinear2x2()

void nnfw::cker::ResizeBilinear2x2 ( int32_t  batches,
int32_t  input_height,
int32_t  input_width,
int32_t  depth,
int32_t  output_height,
int32_t  output_width,
const Shape input_shape,
const float *  input_data,
const Shape output_shape,
float *  output_data 
)
inline

Definition at line 69 of file ResizeBilinear.h.

73{
74 for (int b = 0; b < batches; b++)
75 {
76 for (int y0 = 0, y = 0; y <= output_height - 2; y += 2, y0++)
77 {
78 for (int x0 = 0, x = 0; x <= output_width - 2; x += 2, x0++)
79 {
80 int32_t x1 = std::min(x0 + 1, input_width - 1);
81 int32_t y1 = std::min(y0 + 1, input_height - 1);
82 ResizeBilinearKernel2x2(x0, x1, y0, y1, x, y, depth, b, input_shape, input_data,
83 output_shape, output_data);
84 }
85 }
86 }
87}

References output_shape, and ResizeBilinearKernel2x2().

Referenced by ResizeBilinear().

◆ ResizeBilinearGeneric()

void nnfw::cker::ResizeBilinearGeneric ( int32_t  batches,
int32_t  input_height,
int32_t  input_width,
int32_t  depth,
int32_t  output_height,
int32_t  output_width,
float  height_scale,
float  width_scale,
const Shape input_shape,
const float *  input_data,
float *  output_data,
const bool  half_pixel_centers 
)
inline

Definition at line 118 of file ResizeBilinear.h.

123{
124 memset(output_data, 0, batches * output_height * output_width * depth * sizeof(float));
125
126 int32_t output_offset = 0;
127 for (int b = 0; b < batches; ++b)
128 {
129 for (int y = 0; y < output_height; ++y)
130 {
131 float input_y;
132 int32_t y0, y1;
133 ComputeInterpolationValues(y, height_scale, half_pixel_centers, input_height, &input_y, &y0,
134 &y1);
135 for (int x = 0; x < output_width; ++x)
136 {
137 float input_x;
138 int32_t x0, x1;
139 ComputeInterpolationValues(x, width_scale, half_pixel_centers, input_width, &input_x, &x0,
140 &x1);
141 float *output_ptr = &output_data[output_offset];
142
143 // Run kernel on the 4 corners of the bilinear resize algorithm.
144 int32_t input_offset = Offset(input_shape, b, y0, x0, 0);
145 float scale = (1 - (input_y - y0)) * (1 - (input_x - x0));
146 const float *input_ptr = &input_data[input_offset];
147 ResizeBilinearKernel(input_ptr, depth, scale, output_ptr);
148
149 input_offset = Offset(input_shape, b, y0, x1, 0);
150 scale = (1 - (input_y - y0)) * (input_x - x0);
151 input_ptr = &input_data[input_offset];
152 ResizeBilinearKernel(input_ptr, depth, scale, output_ptr);
153
154 input_offset = Offset(input_shape, b, y1, x0, 0);
155 scale = (input_y - y0) * (1 - (input_x - x0));
156 input_ptr = &input_data[input_offset];
157 ResizeBilinearKernel(input_ptr, depth, scale, output_ptr);
158
159 input_offset = Offset(input_shape, b, y1, x1, 0);
160 scale = (input_y - y0) * (input_x - x0);
161 input_ptr = &input_data[input_offset];
162 ResizeBilinearKernel(input_ptr, depth, scale, output_ptr);
163
164 output_offset += depth;
165 }
166 }
167 }
168}

References ComputeInterpolationValues(), Offset(), and ResizeBilinearKernel().

Referenced by ResizeBilinear().

◆ ResizeBilinearGenericSmallChannel()

template<typename T >
void nnfw::cker::ResizeBilinearGenericSmallChannel ( int32_t  batches,
int32_t  input_height,
int32_t  input_width,
int32_t  depth,
int32_t  output_height,
int32_t  output_width,
float  height_scale,
float  width_scale,
const Shape input_shape,
const T *  input_data,
T *  output_data,
const bool  half_pixel_centers 
)
inline

Definition at line 171 of file ResizeBilinear.h.

177{
178 T *output_ptr = &output_data[0];
179 for (int b = 0; b < batches; ++b)
180 {
181 for (int y = 0; y < output_height; ++y)
182 {
183 float input_y;
184 int32_t y0, y1;
185 ComputeInterpolationValues(y, height_scale, half_pixel_centers, input_height, &input_y, &y0,
186 &y1);
187 for (int x = 0; x < output_width; ++x)
188 {
189 float input_x;
190 int32_t x0, x1;
191 ComputeInterpolationValues(x, width_scale, half_pixel_centers, input_width, &input_x, &x0,
192 &x1);
193
194 int32_t input_offset[4] = {
195 Offset(input_shape, b, y0, x0, 0), Offset(input_shape, b, y0, x1, 0),
196 Offset(input_shape, b, y1, x0, 0), Offset(input_shape, b, y1, x1, 0)};
197 float scale[4] = {(1 - (input_y - y0)) * (1 - (input_x - x0)),
198 (1 - (input_y - y0)) * (input_x - x0),
199 (input_y - y0) * (1 - (input_x - x0)), (input_y - y0) * (input_x - x0)};
200
201 for (int d = 0; d < depth; d++)
202 {
203 const T *input_ptr = &input_data[d];
204 *output_ptr++ = static_cast<T>(
205 input_ptr[input_offset[0]] * scale[0] + input_ptr[input_offset[1]] * scale[1] +
206 input_ptr[input_offset[2]] * scale[2] + input_ptr[input_offset[3]] * scale[3]);
207 }
208 }
209 }
210 }
211}

References ComputeInterpolationValues(), and Offset().

◆ ResizeBilinearKernel()

void nnfw::cker::ResizeBilinearKernel ( const float *  input_ptr,
int32_t  depth,
float  scale,
float *  output_ptr 
)
inline

Definition at line 89 of file ResizeBilinear.h.

91{
92 for (int32_t i = 0; i < depth; i++)
93 {
94 *output_ptr += *input_ptr * scale;
95 output_ptr++;
96 input_ptr++;
97 }
98}

Referenced by ResizeBilinearGeneric().

◆ ResizeBilinearKernel2x2()

void nnfw::cker::ResizeBilinearKernel2x2 ( int32_t  x0,
int32_t  x1,
int32_t  y0,
int32_t  y1,
int32_t  x,
int32_t  y,
int32_t  depth,
int32_t  batch,
const Shape input_shape,
const float *  input_data,
const Shape output_shape,
float *  output_data 
)
inline

Definition at line 30 of file ResizeBilinear.h.

34{
35 const int32_t input_width = input_shape.Dims(2);
36 const int32_t output_width = output_shape.Dims(2);
37
38 const int32_t input_x_offset = (x1 - x0) * depth;
39 const int32_t input_y_offset = (y1 - y0) * depth * input_width;
40 const int32_t output_x_offset = depth;
41 const int32_t output_y_offset = depth * output_width;
42
43 for (int ch = 0; ch < depth; ch++)
44 {
45 const int32_t input_offset = Offset(input_shape, batch, y0, x0, ch);
46
47 float x0y0 = input_data[input_offset];
48 float x1y0 = input_data[input_offset + input_x_offset];
49 float x0y1 = input_data[input_offset + input_y_offset];
50 float x1y1 = input_data[input_offset + input_x_offset + input_y_offset];
51
52 // Top left corner.
53 const int32_t output_offset = Offset(output_shape, batch, y, x, ch);
54 output_data[output_offset] = x0y0;
55
56 // Top right corner.
57 output_data[output_offset + output_x_offset] = (x0y0 + x1y0) / 2;
58
59 // Bottom left corner.
60 float output = (x0y0 + x0y1) / 2;
61 output_data[output_offset + output_y_offset] = output;
62
63 // Bottom right corner.
64 output_data[output_offset + output_x_offset + output_y_offset] =
65 (output + ((x1y0 + x1y1) / 2)) / 2;
66 }
67}

References nnfw::cker::Shape::Dims(), Offset(), and output_shape.

Referenced by ResizeBilinear2x2().

◆ ResolveAxis()

bool nnfw::cker::ResolveAxis ( const int  num_dims,
const std::vector< int > &  axes,
int *  out_axis,
int *  out_num_axis 
)
inline

Definition at line 169 of file Reduce.h.

171{
172 auto num_axis = axes.size();
173 auto axis = axes.data();
174
175 *out_num_axis = 0; // Just in case.
176 // Short-circuit axis resolution for scalars; the axis will go unused.
177 if (num_dims == 0)
178 {
179 return true;
180 }
181 // O(n^2) is fine since out_num_axis should be really small, mostly <= 4
182 for (size_t idx = 0; idx < num_axis; ++idx)
183 {
184 // Handle negative index. A positive index 'p_idx' can be represented as a
185 // negative index 'n_idx' as: n_idx = p_idx-num_dims
186 // e.g., for num_dims=3, [0, 1, 2] is the same as [-3, -2, -1]
187 int current = axis[idx] < 0 ? (axis[idx] + num_dims) : axis[idx];
188 assert(current >= 0 && current < num_dims);
189 bool is_dup = false;
190 for (int j = 0; j < *out_num_axis; ++j)
191 {
192 if (out_axis[j] == current)
193 {
194 is_dup = true;
195 break;
196 }
197 }
198 if (!is_dup)
199 {
200 out_axis[*out_num_axis] = current;
201 *out_num_axis += 1;
202 }
203 }
204 return true;
205}

Referenced by nnfw::cker::ReduceMean::PrepareforReduce(), nnfw::cker::Reduce::QuantizedMeanOrSum(), and nnfw::cker::Reduce::ReduceGeneric().
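
A minimal usage sketch (editor's illustration; the <cker/operation/Reduce.h> include path and build setup are assumptions, not taken from this reference). It shows how a negative axis is normalized and how duplicates are dropped:

#include <cker/operation/Reduce.h> // assumed header location of ResolveAxis
#include <cassert>

int main()
{
  // For a rank-3 tensor, axes {-1, 2} both name the last dimension,
  // so only one resolved axis is produced.
  int out_axis[4];
  int out_num_axis = 0;
  const bool ok = nnfw::cker::ResolveAxis(3, {-1, 2}, out_axis, &out_num_axis);
  assert(ok);
  assert(out_num_axis == 1 && out_axis[0] == 2);
  return 0;
}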

◆ Reverse()

template<typename Scalar >
void nnfw::cker::Reverse ( int  axis,
const Shape input_shape,
const Scalar *  input_data,
const Shape ,
Scalar *  output_data 
)

Definition at line 31 of file Reverse.h.

33{
34 int outer_size = 1;
35 for (int i = 0; i < axis; ++i)
36 {
37 outer_size *= input_shape.Dims(i);
38 }
39
40 int copy_size = 1;
41 for (int i = axis + 1; i < input_shape.DimensionsCount(); ++i)
42 {
43 copy_size *= input_shape.Dims(i);
44 }
45
46 const int dims_at_axis = input_shape.Dims(axis);
47 for (int i = 0; i < outer_size; ++i)
48 {
49 for (int j = 0; j < dims_at_axis; ++j)
50 {
51 const int start_pos = (i * dims_at_axis + j) * copy_size;
52 Scalar *output_ptr = output_data + start_pos;
53 int loc = (i * dims_at_axis + dims_at_axis - j - 1) * copy_size;
54 memcpy(output_ptr, input_data + loc, copy_size * sizeof(Scalar));
55 }
56 }
57}

References nnfw::cker::Shape::DimensionsCount(), and nnfw::cker::Shape::Dims().

Referenced by nnfw::cker::BCastList< N >::BCastList().

◆ RmsNorm()

void nnfw::cker::RmsNorm ( const RmsNormParams params,
const Shape input_shape,
const float *  input_data,
const Shape gamma_shape,
const float *  gamma_data,
const Shape output_shape,
float *  output_data 
)
inline

Definition at line 32 of file RmsNorm.h.

35{
36 bool single_gamma = gamma_shape.DimensionsCount() == 1 && gamma_shape.Dims(0) == 1;
37
38 if (input_shape.DimensionsCount() == 4)
39 {
40 const int32_t batches = MatchingDim(input_shape, 0, output_shape, 0);
41 const int32_t heights = MatchingDim(input_shape, 1, output_shape, 1);
42 const int32_t widths = MatchingDim(input_shape, 2, output_shape, 2);
43 const int32_t channels = MatchingDim(input_shape, 3, output_shape, 3);
44
45 for (int32_t batch = 0; batch < batches; batch++)
46 {
47 for (int32_t height = 0; height < heights; height++)
48 {
49 for (int32_t width = 0; width < widths; width++)
50 {
51 // normalize over last-axis
52 double square_sum = 0.0f;
53 for (int32_t channel = 0; channel < channels; channel++)
54 {
55 double input_val = input_data[Offset(input_shape, batch, height, width, channel)];
56 square_sum += (input_val * input_val);
57 }
58 double rms = std::sqrt((square_sum / channels) + params.epsilon);
59 for (int32_t channel = 0; channel < channels; channel++)
60 {
61 double gamma = (single_gamma ? gamma_data[0] : gamma_data[channel]);
62 output_data[Offset(output_shape, batch, height, width, channel)] =
63 gamma * (input_data[Offset(input_shape, batch, height, width, channel)] / rms);
64 }
65 }
66 }
67 }
68 }
69 else if (input_shape.DimensionsCount() == 3)
70 {
71 const int32_t heights = MatchingDim(input_shape, 0, output_shape, 0);
72 const int32_t widths = MatchingDim(input_shape, 1, output_shape, 1);
73 const int32_t channels = MatchingDim(input_shape, 2, output_shape, 2);
74
75 for (int32_t height = 0; height < heights; height++)
76 {
77 for (int32_t width = 0; width < widths; width++)
78 {
79 // normalize over last-axis
80 double square_sum = 0.0f;
81 for (int32_t channel = 0; channel < channels; channel++)
82 {
83 double input_val = input_data[(height * widths + width) * channels + channel];
84 square_sum += (input_val * input_val);
85 }
86 double rms = std::sqrt((square_sum / channels) + params.epsilon);
87 for (int32_t channel = 0; channel < channels; channel++)
88 {
89 double gamma = (single_gamma ? gamma_data[0] : gamma_data[channel]);
90 output_data[(height * widths + width) * channels + channel] =
91 gamma * (input_data[(height * widths + width) * channels + channel] / rms);
92 }
93 }
94 }
95 }
96 else
97 {
98 throw std::runtime_error("cker::RmsNorm: Unsupported input shape");
99 }
100}

References nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), nnfw::cker::RmsNormParams::epsilon, MatchingDim(), Offset(), and output_shape.

Referenced by onert::backend::cpu::ops::RmsNormLayer::run().
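
For every spatial position the kernel applies RMS normalization over the channel (last) axis: rms = sqrt(mean(x^2) + epsilon) and out = gamma * x / rms. A standalone sketch of that per-position formula (editor's illustration with made-up values, independent of the cker API):

#include <cmath>
#include <cstdio>
#include <vector>

int main()
{
  // One position with 4 channels, mirroring the inner loops of RmsNorm().
  const std::vector<double> x = {1.0, -2.0, 3.0, -4.0};
  const std::vector<double> gamma = {0.5, 0.5, 0.5, 0.5};
  const double epsilon = 1e-6;

  double square_sum = 0.0;
  for (double v : x)
    square_sum += v * v;
  const double rms = std::sqrt(square_sum / x.size() + epsilon);

  for (size_t c = 0; c < x.size(); ++c)
    std::printf("out[%zu] = %f\n", c, gamma[c] * (x[c] / rms));
  return 0;
}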

◆ RoPE()

template<typename T >
void nnfw::cker::RoPE ( const RoPEMode  mode,
const Shape input_shape,
const T *  input_data,
const Shape sin_table_shape,
const T *  sin_table_data,
const Shape cos_table_shape,
const T *  cos_table_data,
const Shape output_shape,
T *  output_data 
)
inline

Definition at line 32 of file RoPE.h.

36{
37 if (input_shape.Dims(3) != sin_table_shape.Dims(3))
38 throw std::runtime_error("the dimension(3) of input and sin_table do not match");
39
40 if (input_shape.Dims(3) != cos_table_shape.Dims(3))
41 throw std::runtime_error("the dimension(3) of input and cos_table do not match");
42
43 const int32_t i0_n = MatchingDim(input_shape, 0, output_shape, 0);
44 const int32_t i1_n = MatchingDim(input_shape, 1, output_shape, 1);
45 const int32_t i2_n = MatchingDim(input_shape, 2, output_shape, 2);
46 const int32_t i3_n = MatchingDim(input_shape, 3, output_shape, 3);
47
48 if (i3_n % 2 != 0)
49 throw std::runtime_error("i3_n must be even number");
50
51 if (mode == RoPEMode::kGptNeox)
52 {
53 for (int32_t i0 = 0; i0 < i0_n; ++i0)
54 {
55 for (int32_t i1 = 0; i1 < i1_n; ++i1)
56 {
57 for (int32_t i2 = 0; i2 < i2_n; ++i2)
58 {
59 for (int32_t i3 = 0; i3 < i3_n / 2; ++i3)
60 {
61 const int32_t offset = Offset(input_shape, i0, i1, i2, i3);
62 const T x0 = input_data[offset];
63 const T x1 = input_data[offset + i3_n / 2];
64
65 output_data[offset] = x0 * cos_table_data[i3] - x1 * sin_table_data[i3];
66 output_data[offset + i3_n / 2] =
67 x0 * sin_table_data[i3 + i3_n / 2] + x1 * cos_table_data[i3 + i3_n / 2];
68 }
69 }
70 }
71 }
72 }
73 else
74 {
75 throw std::runtime_error("Unsupported RoPE mode");
76 }
77}

References nnfw::cker::Shape::Dims(), kGptNeox, MatchingDim(), offset(), Offset(), and output_shape.
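
In kGptNeox mode the last axis of length d is split in half and each pair (x[i], x[i + d/2]) is rotated using the sin/cos tables, exactly as in the loop above. A standalone sketch of that rotation (editor's illustration; the table angles are made up, real tables come from the model):

#include <cmath>
#include <cstdio>
#include <vector>

int main()
{
  const int d = 4; // last-axis length, must be even
  std::vector<float> x = {1.0f, 2.0f, 3.0f, 4.0f};
  std::vector<float> sin_t(d), cos_t(d), out(d);
  for (int i = 0; i < d; ++i)
  {
    const float theta = 0.1f * i; // placeholder angles
    sin_t[i] = std::sin(theta);
    cos_t[i] = std::cos(theta);
  }

  // Same indexing as the kGptNeox branch of RoPE() above.
  for (int i = 0; i < d / 2; ++i)
  {
    const float x0 = x[i];
    const float x1 = x[i + d / 2];
    out[i] = x0 * cos_t[i] - x1 * sin_t[i];
    out[i + d / 2] = x0 * sin_t[i + d / 2] + x1 * cos_t[i + d / 2];
  }

  for (int i = 0; i < d; ++i)
    std::printf("out[%d] = %f\n", i, out[i]);
  return 0;
}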

◆ Round()

void nnfw::cker::Round ( const Shape input_shape,
const float *  input_data,
const Shape output_shape,
float *  output_data 
)
inline

Definition at line 65 of file Round.h.

67{
68 const int flat_size = MatchingFlatSize(input_shape, output_shape);
69 for (int i = 0; i < flat_size; ++i)
70 {
71 // Note that this implementation matches that of TensorFlow's tf.round
72 // and corresponds to the banker's rounding method.
73 // cfenv (for fesetround) is not yet supported universally on Android, so
74 // using a work around.
75 output_data[i] = RoundToNearest(input_data[i]);
76 }
77}

References MatchingFlatSize(), output_shape, and RoundToNearest().

◆ round_nearest()

float nnfw::cker::round_nearest ( float  value)

Definition at line 29 of file ReduceMean.h.

30{
31 if (value < 0)
32 {
33 return static_cast<float>(static_cast<int>(value - 0.5f));
34 }
35 else
36 {
37 return static_cast<float>(static_cast<int>(value + 0.5f));
38 }
39}

Referenced by nnfw::cker::ReduceMean::ReduceOp().

◆ RoundToNearest()

float nnfw::cker::RoundToNearest ( float  value)
inline

Definition at line 31 of file Round.h.

32{
33 auto floor_val = std::floor(value);
34 auto diff = value - floor_val;
35 if ((diff < 0.5f) || ((diff == 0.5f) && (static_cast<int>(floor_val) % 2 == 0)))
36 {
37 return floor_val;
38 }
39 else
40 {
41 return floor_val = floor_val + 1.0f;
42 }
43}

Referenced by Quantize(), Quantize(), Quantize(), and Round().
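
A few sample inputs make the round-half-to-even ("banker's rounding") behaviour concrete: ties go to the nearest even integer, so 0.5 rounds to 0 while 1.5 and 2.5 both round to 2. Editor's sketch (the helper repeats the logic of RoundToNearest() only so the snippet is self-contained):

#include <cmath>
#include <cstdio>

static float RoundHalfToEven(float value)
{
  const float floor_val = std::floor(value);
  const float diff = value - floor_val;
  if (diff < 0.5f || (diff == 0.5f && static_cast<int>(floor_val) % 2 == 0))
    return floor_val;
  return floor_val + 1.0f;
}

int main()
{
  std::printf("%g %g %g\n", RoundHalfToEven(0.5f), // 0
              RoundHalfToEven(1.5f),               // 2
              RoundHalfToEven(2.5f));              // 2
  return 0;
}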

◆ Rsqrt()

void nnfw::cker::Rsqrt ( const Shape input_shape,
const float *  input_data,
const Shape output_shape,
float *  output_data 
)
inline

Definition at line 59 of file Elementwise.h.

61{
62 const int size = MatchingFlatSize(input_shape, output_shape);
63 for (int i = 0; i < size; i++)
64 {
65 output_data[i] = 1.f / std::sqrt(input_data[i]);
66 }
67}

References MatchingFlatSize(), output_shape, and size.

◆ Select()

template<typename D , typename T >
void nnfw::cker::Select ( const Shape input_condition_shape,
const D *  input_condition_data,
const Shape input_x_shape,
const T *  input_x_data,
const Shape input_y_shape,
const T *  input_y_data,
const Shape output_shape,
T *  output_data 
)

Definition at line 32 of file Select.h.

35{
36 const int64_t flatsize =
37 MatchingFlatSize(input_condition_shape, input_x_shape, input_y_shape, output_shape);
38 for (int64_t i = 0; i < flatsize; ++i)
39 {
40 output_data[i] = (input_condition_data[i] != 0) ? input_x_data[i] : input_y_data[i];
41 }
42}

References MatchingFlatSize(), and output_shape.

◆ Sin()

void nnfw::cker::Sin ( const Shape input_shape,
const float *  input_data,
const Shape output_shape,
float *  output_data 
)
inline

Definition at line 31 of file Elementwise.h.

33{
34 const int size = MatchingFlatSize(input_shape, output_shape);
35 for (int i = 0; i < size; i++)
36 {
37 output_data[i] = std::sin(input_data[i]);
38 }
39}

References MatchingFlatSize(), output_shape, and size.

◆ Slice() [1/2]

template<typename T >
void nnfw::cker::Slice ( const SliceParams op_params,
const Shape input_shape,
const T *  input_data,
T *  output_data 
)
inline

Definition at line 72 of file Slice.h.

74{
75 SequentialTensorWriter<T> writer(input_data, output_data);
76 return Slice(op_params, input_shape, &writer);
77}
void Slice(const SliceParams &op_params, const Shape &input_shape, SequentialTensorWriter< T > *writer)
Definition Slice.h:31

References Slice().

◆ Slice() [2/2]

template<typename T >
void nnfw::cker::Slice ( const SliceParams op_params,
const Shape input_shape,
SequentialTensorWriter< T > *  writer 
)
inline

Definition at line 31 of file Slice.h.

33{
34 // TODO(dkalenichenko): This op only supports 4D tensors or smaller.
35 assert(op_params.begin_count <= 4);
36 assert(op_params.size_count <= 4);
37
38 const int begin_count = op_params.begin_count;
39 const int size_count = op_params.size_count;
40 // We front-pad the begin and size vectors.
41 const int start_b = 4 - begin_count > 0 ? 0 : op_params.begin[0];
42 const int stop_b = (4 - size_count > 0 || op_params.size[0] == -1) ? input_shape.Dims(0)
43 : start_b + op_params.size[0];
44 const int start_h = begin_count < 3 ? 0 : op_params.begin[begin_count - 3];
45 const int stop_h = (size_count < 3 || op_params.size[size_count - 3] == -1)
46 ? input_shape.Dims(1)
47 : start_h + op_params.size[size_count - 3];
48 const int start_w = begin_count < 2 ? 0 : op_params.begin[begin_count - 2];
49 const int stop_w = (size_count < 2 || op_params.size[size_count - 2] == -1)
50 ? input_shape.Dims(2)
51 : start_w + op_params.size[size_count - 2];
52 const int start_d = begin_count < 1 ? 0 : op_params.begin[begin_count - 1];
53 const int stop_d = (size_count < 1 || op_params.size[size_count - 1] == -1)
54 ? input_shape.Dims(3)
55 : start_d + op_params.size[size_count - 1];
56
57 for (int in_b = start_b; in_b < stop_b; ++in_b)
58 {
59 for (int in_h = start_h; in_h < stop_h; ++in_h)
60 {
61 for (int in_w = start_w; in_w < stop_w; ++in_w)
62 {
63 const int len = stop_d - start_d;
64 if (len > 0)
65 writer->WriteN(Offset(input_shape, in_b, in_h, in_w, start_d), len);
66 }
67 }
68 }
69}
void WriteN(int position, int len)
Definition Utils.h:475
int8_t size_count
Definition Slice.cpp:34
int8_t begin_count
Definition Slice.cpp:32

References nnfw::cker::SliceParams::begin, nnfw::cker::SliceParams::begin_count, begin_count, nnfw::cker::Shape::Dims(), Offset(), nnfw::cker::SliceParams::size, nnfw::cker::SliceParams::size_count, size_count, and nnfw::cker::SequentialTensorWriter< T >::WriteN().

Referenced by Slice().

◆ Softmax() [1/3]

void nnfw::cker::Softmax ( const float *  in,
const int  input_size,
const int  batch_size,
const float  beta,
float *  out 
)
inline

Definition at line 79 of file SoftMax.h.

81{
82 assert(input_size > 0);
83
84 // For each batch
85 for (int b = 0; b < batch_size; b++)
86 {
87 // Find the max coeff.
88 float max_coeff = in[0];
89 for (int i = 1; i < input_size; i++)
90 {
91 if (in[i] > max_coeff)
92 max_coeff = in[i];
93 }
94
95 // Compute the normalized sum of exps.
96 float exp_sum = 0.0;
97 for (int i = 0; i < input_size; i++)
98 {
99 out[i] = std::exp((in[i] - max_coeff) * beta);
100 exp_sum += out[i];
101 }
102
103 // Divide by the sum of exps.
104 float reciprocal_sum_exp = 1.f / exp_sum;
105 for (int i = 0; i < input_size; i++)
106 {
107 out[i] *= reciprocal_sum_exp;
108 }
109
110 // Advance in and out pointers for the next batch.
111 in += input_size;
112 out += input_size;
113 }
114}

Referenced by onert::backend::cpu::ops::SoftMaxLayer::softmaxFloat32().
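
A minimal call sketch for this pointer-based overload (editor's illustration; the <cker/operation/SoftMax.h> include path is an assumption). Each batch row of `in` is normalized independently:

#include <cker/operation/SoftMax.h> // assumed header location of Softmax
#include <cstdio>

int main()
{
  const float in[2 * 3] = {1.0f, 2.0f, 3.0f,  // batch 0
                           0.0f, 0.0f, 0.0f}; // batch 1
  float out[2 * 3];
  nnfw::cker::Softmax(in, /*input_size=*/3, /*batch_size=*/2, /*beta=*/1.0f, out);

  for (int i = 0; i < 6; ++i)
    std::printf("%f ", out[i]); // each group of 3 values sums to 1
  std::printf("\n");
  return 0;
}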

◆ Softmax() [2/3]

void nnfw::cker::Softmax ( const SoftmaxParams params,
const Shape input_shape,
const float *  input_data,
const Shape output_shape,
float *  output_data 
)
inline

Definition at line 116 of file SoftMax.h.

118{
119 // Validate that the shapes of input and output are the same
120 MatchingFlatSize(input_shape, output_shape);
121
122 const auto in_mat = MapAsMatrixWithLastDimAsRows(input_data, input_shape);
123 auto out_mat = MapAsMatrixWithLastDimAsRows(output_data, output_shape);
124 // Compute the exponential first, removing the max coefficient for numerical
125 // stability.
126 out_mat = (in_mat.rowwise() - in_mat.colwise().maxCoeff()).array() * params.beta;
127 // We are separating out the exp function so that exp can be vectorized.
128 out_mat = out_mat.array().exp();
129 // Normalize to get the activations.
130 Eigen::Array<float, 1, Eigen::Dynamic> scale = out_mat.array().colwise().sum().inverse();
131 out_mat.array().rowwise() *= scale;
132}

References nnfw::cker::SoftmaxParams::beta, MapAsMatrixWithLastDimAsRows(), MatchingFlatSize(), and output_shape.

◆ Softmax() [3/3]

template<typename In , typename Out >
void nnfw::cker::Softmax ( const SoftmaxParams params,
const Shape input_shape,
const In *  input_data,
const Shape output_shape,
Out *  output_data 
)
inline

Definition at line 159 of file SoftMax.h.

161{
162 const int trailing_dim = input_shape.DimensionsCount() - 1;
163 const int excluding_last_dim = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
164 const int last_dim = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
165
166 const int32_t clamp_max = std::numeric_limits<Out>::max();
167 const int32_t clamp_min = std::numeric_limits<Out>::min();
168 for (int i = 0; i < excluding_last_dim; ++i)
169 {
170 int32_t max_val = std::numeric_limits<In>::min();
171 // Find max quantized value.
172 for (int j = 0; j < last_dim; ++j)
173 {
174 max_val = std::max(max_val, static_cast<int32_t>(input_data[j]));
175 }
176
177 float sum_exp = 0.0f;
178 const int32_t max_uint8 = std::numeric_limits<uint8_t>::max();
179 const float *table_offset = &params.table[max_uint8 - max_val];
180 // Calculate normalizer sum(exp(x)).
181 for (int j = 0; j < last_dim; ++j)
182 {
183 sum_exp += table_offset[input_data[j]];
184 }
185
186 const float inv_sum_exp = 1.0f / (sum_exp * params.scale);
187 // Normalize and quantize probabilities.
188 for (int j = 0; j < last_dim; ++j)
189 {
190 const float prob_rescaled = table_offset[input_data[j]] * inv_sum_exp;
191 const int32_t prob_quantized = QuantizeSoftmaxOutput<Out>(prob_rescaled, params.zero_point);
192 output_data[j] = static_cast<Out>(std::max(std::min(clamp_max, prob_quantized), clamp_min));
193 }
194 input_data += last_dim;
195 output_data += last_dim;
196 }
197}

References nnfw::cker::Shape::DimensionsCount(), MatchingDim(), MatchingFlatSizeSkipDim(), output_shape, nnfw::cker::SoftmaxParams::scale, nnfw::cker::SoftmaxParams::table, and nnfw::cker::SoftmaxParams::zero_point.

◆ SpaceToBatchND()

template<typename T >
void nnfw::cker::SpaceToBatchND ( const SpaceToBatchParams params,
const Shape unextended_input_shape,
const T *  input_data,
const Shape unextended_block_shape_shape,
const int32_t *  block_shape_data,
const Shape unextended_padding_shape,
const int32_t *  paddings_data,
const Shape unextended_output_shape,
T *  output_data 
)
inline

Definition at line 31 of file SpaceToBatchND.h.

36{
37 assert(unextended_input_shape.DimensionsCount() <= 4);
38 assert(unextended_output_shape.DimensionsCount() <= 4);
39 const Shape input_shape = Shape::ExtendedShape(4, unextended_input_shape);
40 const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape);
41
42 const int depth = input_shape.Dims(3);
43 const int input_width = input_shape.Dims(2);
44 const int input_height = input_shape.Dims(1);
45 const int input_batch_size = input_shape.Dims(0);
46
47 const int output_width = output_shape.Dims(2);
48 const int output_height = output_shape.Dims(1);
49 const int output_batch_size = output_shape.Dims(0);
50
51 const int block_shape_height = block_shape_data[0];
52 const int block_shape_width = block_shape_data[1];
53 const int padding_top = paddings_data[0];
54 const int padding_left = paddings_data[2];
55
56 // For uint8 quantized, the correct padding "zero value" is the output offset.
57 const int32_t pad_value = params.output_offset;
58
59 for (int out_b = 0; out_b < output_batch_size; ++out_b)
60 {
61 int input_batch = out_b % input_batch_size;
62 int shift_w = (out_b / input_batch_size) % block_shape_width;
63 int shift_h = (out_b / input_batch_size) / block_shape_width;
64 for (int out_h = 0; out_h < output_height; ++out_h)
65 {
66 for (int out_w = 0; out_w < output_width; ++out_w)
67 {
68 T *out = output_data + Offset(output_shape, out_b, out_h, out_w, 0);
69 if (out_h * block_shape_height + shift_h < padding_top ||
70 out_h * block_shape_height + shift_h >= padding_top + input_height ||
71 out_w * block_shape_width + shift_w < padding_left ||
72 out_w * block_shape_width + shift_w >= padding_left + input_width)
73 {
74 // This may not execute correctly when pad_value != 0 and T != uint8.
75 memset(out, pad_value, depth * sizeof(T));
76 }
77 else
78 {
79 const T *in =
80 input_data + Offset(input_shape, input_batch,
81 (out_h * block_shape_height + shift_h) - padding_top,
82 (out_w * block_shape_width + shift_w) - padding_left, 0);
83 memcpy(out, in, depth * sizeof(T));
84 }
85 }
86 }
87 }
88}

References nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), Offset(), nnfw::cker::SpaceToBatchParams::output_offset, and output_shape.

◆ SpaceToDepth()

template<typename T >
void nnfw::cker::SpaceToDepth ( const SpaceToDepthParams params,
const Shape unextended_input_shape,
const T *  input_data,
const Shape unextended_output_shape,
T *  output_data 
)
inline

Definition at line 30 of file SpaceToDepth.h.

32{
33 assert(unextended_input_shape.DimensionsCount() <= 4);
34 assert(unextended_output_shape.DimensionsCount() <= 4);
35 const Shape input_shape = Shape::ExtendedShape(4, unextended_input_shape);
36 const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape);
37
38 const int output_depth = output_shape.Dims(3);
39 const int output_width = output_shape.Dims(2);
40 const int output_height = output_shape.Dims(1);
41
42 const int input_depth = input_shape.Dims(3);
43 const int batch_size = input_shape.Dims(0);
44
45 // Number of contiguous values that we can copy in one iteration.
46 const int stride = params.block_size * input_depth;
47
48 for (int batch = 0; batch < batch_size; ++batch)
49 {
50 for (int out_h = 0; out_h < output_height; ++out_h)
51 {
52 T *output_ptr = output_data + Offset(output_shape, batch, out_h, 0, 0);
53 for (int offset_h = 0; offset_h < params.block_size; ++offset_h)
54 {
55 T *dst = output_ptr;
56 for (int out_w = 0; out_w < output_width; ++out_w)
57 {
58 memcpy(dst, input_data, stride * sizeof(T));
59 input_data += stride;
60 dst += output_depth;
61 }
62 output_ptr += stride;
63 }
64 }
65 }
66}

References nnfw::cker::SpaceToDepthParams::block_size, nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), Offset(), and output_shape.

◆ Split()

template<typename Scalar >
void nnfw::cker::Split ( const SplitParams params,
const Shape input_shape,
const Scalar *  input_data,
const Shape output_shape,
Scalar *const *  output_data 
)

Definition at line 30 of file Split.h.

32{
33 const int split_dimensions = input_shape.DimensionsCount();
34 int axis = params.axis < 0 ? params.axis + split_dimensions : params.axis;
35 int outputs_count = params.num_split;
36
37 int64_t outer_size = 1;
38 for (int i = 0; i < axis; ++i)
39 {
40 outer_size *= input_shape.Dims(i);
41 }
42 // For all output arrays,
43 // FlatSize() = outer_size * Dims(axis) * base_inner_size;
44 int64_t base_inner_size = 1;
45 for (int i = axis + 1; i < split_dimensions; ++i)
46 {
47 base_inner_size *= input_shape.Dims(i);
48 }
49
50 const Scalar *input_ptr = input_data;
51 for (int k = 0; k < outer_size; k++)
52 {
53 for (int i = 0; i < outputs_count; ++i)
54 {
55 const int copy_size = output_shape.Dims(axis) * base_inner_size;
56 memcpy(output_data[i] + k * copy_size, input_ptr, copy_size * sizeof(Scalar));
57 input_ptr += copy_size;
58 }
59 }
60}

References nnfw::cker::SplitParams::axis, nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), nnfw::cker::SplitParams::num_split, and output_shape.
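
A usage sketch (editor's illustration; the <cker/operation/Split.h> include path is an assumption, and Shape/SplitParams are presumed to be pulled in by it). The documented Shape::ReplaceWith() is used to build the shapes:

#include <cker/operation/Split.h> // assumed header location of Split
#include <cstdint>

int main()
{
  // Split a [2, 4] tensor into two [2, 2] tensors along axis 1.
  const int32_t in_dims[] = {2, 4};
  const int32_t out_dims[] = {2, 2};
  nnfw::cker::Shape input_shape, output_shape;
  input_shape.ReplaceWith(2, in_dims);
  output_shape.ReplaceWith(2, out_dims);

  nnfw::cker::SplitParams params{};
  params.axis = 1;
  params.num_split = 2;

  const float input[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  float out0[4], out1[4];
  float *outputs[] = {out0, out1};

  nnfw::cker::Split(params, input_shape, input, output_shape, outputs);
  // out0 = {0, 1, 4, 5}, out1 = {2, 3, 6, 7}
  return 0;
}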

◆ SplitV()

template<typename Scalar >
void nnfw::cker::SplitV ( const SplitVParams params,
const Shape input_shape,
const Scalar *  input_data,
std::vector< nnfw::cker::Shape > &  output_shapes,
Scalar *const *  output_data 
)

Definition at line 30 of file SplitV.h.

32{
33 const int split_dimensions = input_shape.DimensionsCount();
34 int axis = params.axis < 0 ? params.axis + split_dimensions : params.axis;
35 int outputs_count = params.num_split;
36
37 for (int i = 0; i < outputs_count; i++)
38 {
39 // TFLITE_DCHECK_EQ(output_shapes[i]->DimensionsCount(), split_dimensions);
40 for (int j = 0; j < split_dimensions; j++)
41 {
42 if (j != axis)
43 {
44 MatchingDim(output_shapes[i], j, input_shape, j);
45 }
46 }
47 }
48
49 int64_t outer_size = 1;
50 for (int i = 0; i < axis; ++i)
51 {
52 outer_size *= input_shape.Dims(i);
53 }
54 // For all output arrays,
55 // FlatSize() = outer_size * Dims(axis) * base_inner_size;
56 int64_t base_inner_size = 1;
57 for (int i = axis + 1; i < split_dimensions; ++i)
58 {
59 base_inner_size *= input_shape.Dims(i);
60 }
61
62 const Scalar *input_ptr = input_data;
63 int copy_size = 0;
64 for (int k = 0; k < outer_size; k++)
65 {
66 for (int i = 0; i < outputs_count; ++i)
67 {
68 copy_size = output_shapes[i].Dims(axis) * base_inner_size;
69 memcpy(output_data[i] + k * copy_size, input_ptr, copy_size * sizeof(Scalar));
70 input_ptr += copy_size;
71 }
72 }
73}

References nnfw::cker::SplitVParams::axis, nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), MatchingDim(), and nnfw::cker::SplitVParams::num_split.

◆ SqDiff()

template<typename T >
void nnfw::cker::SqDiff ( const Shape input1_shape,
const T *  input1_data,
const Shape input2_shape,
const T *  input2_data,
const Shape output_shape,
T *  output_data 
)

Definition at line 63 of file SqDiff.h.

65{
66 assert(input1_shape.DimensionsCount() > 0 && input2_shape.DimensionsCount() > 0 &&
67 output_shape.DimensionsCount() > 0);
68 int outRank = output_shape.DimensionsCount();
69
70 switch (outRank)
71 {
72 case 4:
73 SQDIFF(4);
74 break;
75
76 case 3:
77 SQDIFF(3);
78 break;
79
80 case 2:
81 SQDIFF(2);
82 break;
83
84 case 1:
85 SQDIFF(1);
86 break;
87
88 default:
89 throw std::runtime_error("Support up to 4-D tensors at present");
90 break;
91 }
92}
#define SQDIFF(N)
Definition SqDiff.h:29

References nnfw::cker::Shape::DimensionsCount(), output_shape, and SQDIFF.

Referenced by onert::backend::cpu::ops::SqDiffLayer::SqDiffFloat32().

◆ SqDiffImpl()

template<typename T , int N>
void nnfw::cker::SqDiffImpl ( const Shape input1_shape,
const T *  input1_data,
const Shape input2_shape,
const T *  input2_data,
const Shape output_shape,
T *  output_data,
NdArrayDesc< N > *  desc1_in,
NdArrayDesc< N > *  desc2_in,
NdArrayDesc< N > *  desc_out 
)

Definition at line 40 of file SqDiff.h.

43{
44 std::vector<int> input_iter;
45 input_iter.resize(N);
46 const auto output_dims = output_shape.DimsData();
47
48 // Copy dims to desc, calculating strides.
49 CopyDimsToDesc<N>(output_shape, desc_out);
50 NdArrayDescsForElementwiseBroadcast<N>(input1_shape, input2_shape, desc1_in, desc2_in);
51
52 do
53 {
54 int input1_indx = SubscriptToIndexGeneric(desc1_in, input_iter.data());
55 int input2_indx = SubscriptToIndexGeneric(desc2_in, input_iter.data());
56 int output_indx = SubscriptToIndexGeneric(desc_out, input_iter.data());
57 output_data[output_indx] = (input1_data[input1_indx] - input2_data[input2_indx]) *
58 (input1_data[input1_indx] - input2_data[input2_indx]);
59 } while (NextIndex(N, output_dims, input_iter.data()));
60}
int SubscriptToIndexGeneric(const NdArrayDesc< N > *desc, int *iter)
Definition Utils.h:264

References NextIndex(), output_shape, and SubscriptToIndexGeneric().

◆ Sqrt()

void nnfw::cker::Sqrt ( const Shape input_shape,
const float *  input_data,
const Shape output_shape,
float *  output_data 
)
inline

Definition at line 101 of file Elementwise.h.

103{
104 const int flat_size = MatchingFlatSize(input_shape, output_shape);
105
106 for (int i = 0; i < flat_size; i++)
107 {
108 output_data[i] = std::sqrt(input_data[i]);
109 }
110}

References MatchingFlatSize(), and output_shape.

◆ Square()

void nnfw::cker::Square ( const Shape input_shape,
const float *  input_data,
const Shape output_shape,
float *  output_data 
)
inline

Definition at line 112 of file Elementwise.h.

114{
115 const int flat_size = MatchingFlatSize(input_shape, output_shape);
116
117 for (int i = 0; i < flat_size; i++)
118 {
119 output_data[i] = input_data[i] * input_data[i];
120 }
121}

References MatchingFlatSize(), and output_shape.

◆ StartForAxis()

int nnfw::cker::StartForAxis ( const StridedSliceParams params,
const Shape input_shape,
int  axis 
)
inline

Definition at line 83 of file StridedSlice.h.

84{
85 const auto begin_mask = params.begin_mask;
86 const auto *start_indices = params.start_indices;
87 const auto *strides = params.strides;
88 // Begin with the specified index.
89 int start = start_indices[axis];
90
91 // begin_mask override
92 if (begin_mask & 1 << axis)
93 {
94 if (strides[axis] > 0)
95 {
96 // Forward iteration - use the first element. These values will get
97 // clamped below (Note: We could have set them to 0 and axis_size-1, but
98 // use lowest() and max() to maintain symmetry with StopForAxis())
99 start = std::numeric_limits<int>::lowest();
100 }
101 else
102 {
103 // Backward iteration - use the last element.
104 start = std::numeric_limits<int>::max();
105 }
106 }
107
108 // Handle negative indices
109 int axis_size = input_shape.Dims(axis);
110 if (start < 0)
111 {
112 start += axis_size;
113 }
114
115 // Clamping
116 start = Clamp(start, 0, axis_size - 1);
117
118 return start;
119}
int Clamp(const int v, const int lo, const int hi)

References nnfw::cker::StridedSliceParams::begin_mask, Clamp(), nnfw::cker::Shape::Dims(), nnfw::cker::StridedSliceParams::start_indices, and nnfw::cker::StridedSliceParams::strides.

Referenced by checkOutputSize(), and StridedSlice().

◆ StatelessRandomUniform()

void nnfw::cker::StatelessRandomUniform ( const Shape shape_shape,
const int32_t *  shape_data,
const Shape seed_shape,
const int32_t *  seed_data,
const Shape output_shape,
float *  output_data 
)
inline

Definition at line 75 of file StatelessRandomUniform.h.

78{
79 Tensor shape_t;
80 Tensor seed_t;
81
82 shape_t.shape.ReplaceWith(shape_shape.DimensionsCount(), shape_shape.DimsData());
83 shape_t.buffer = (void *)shape_data;
84
85 seed_t.shape.ReplaceWith(seed_shape.DimensionsCount(), seed_shape.DimsData());
86 seed_t.buffer = (void *)seed_data;
87
88 Tensor output_t;
89 output_t.shape.ReplaceWith(output_shape.DimensionsCount(), output_shape.DimsData());
90 output_t.buffer = output_data;
91
92 random::PhiloxRandom::Key key;
93 random::PhiloxRandom::ResultType counter;
94
95 GenerateKey(seed_t, &key, &counter);
96
97 Fill<Eigen::ThreadPoolDevice, random::UniformDistribution<random::PhiloxRandom, float>>(
98 random::PhiloxRandom(counter, key), &output_t);
99}
void ReplaceWith(int dimensions_count, const int32_t *dims_data)
Definition Shape.h:130
void GenerateKey(Tensor seed, random::PhiloxRandom::Key *out_key, random::PhiloxRandom::ResultType *out_counter)

References nnfw::cker::Tensor::buffer, nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::DimsData(), GenerateKey(), output_shape, nnfw::cker::Shape::ReplaceWith(), and nnfw::cker::Tensor::shape.

Referenced by onert::backend::cpu::ops::StatelessRandomUniformLayer::StatelessRandomUniformFloat32().

◆ StopForAxis()

int nnfw::cker::StopForAxis ( const StridedSliceParams params,
const Shape input_shape,
int  axis,
int  start_for_axis 
)
inline

Definition at line 126 of file StridedSlice.h.

128{
129 const auto end_mask = params.end_mask;
130 const auto shrink_axis_mask = params.shrink_axis_mask;
131 const auto *stop_indices = params.stop_indices;
132 const auto *strides = params.strides;
133
134 // Begin with the specified index
135 const bool shrink_axis = shrink_axis_mask & (1 << axis);
136 int stop = stop_indices[axis];
137
138 // When shrinking an axis, the end position does not matter (and can be
139 // incorrect when negative indexing is used, see Issue #19260). Always use
140 // start_for_axis + 1 to generate a length 1 slice, since start_for_axis has
141 // already been adjusted for negative indices.
142 if (shrink_axis)
143 {
144 stop = start_for_axis + 1;
145 }
146
147 // end_mask override
148 if (end_mask & (1 << axis))
149 {
150 if (strides[axis] > 0)
151 {
152 // Forward iteration - use the last element. These values will get
153 // clamped below
154 stop = std::numeric_limits<int>::max();
155 }
156 else
157 {
158 // Backward iteration - use the first element.
159 stop = std::numeric_limits<int>::lowest();
160 }
161 }
162
163 // Handle negative indices
164 const int axis_size = input_shape.Dims(axis);
165 if (stop < 0)
166 {
167 stop += axis_size;
168 }
169
170 // Clamping
171 // Because the end index points one past the last element, we need slightly
172 // different clamping ranges depending on the direction.
173 if (strides[axis] > 0)
174 {
175 // Forward iteration
176 stop = Clamp(stop, 0, axis_size);
177 }
178 else
179 {
180 // Backward iteration
181 stop = Clamp(stop, -1, axis_size - 1);
182 }
183
184 return stop;
185}

References Clamp(), nnfw::cker::Shape::Dims(), nnfw::cker::StridedSliceParams::end_mask, nnfw::cker::StridedSliceParams::shrink_axis_mask, nnfw::cker::StridedSliceParams::stop_indices, and nnfw::cker::StridedSliceParams::strides.

Referenced by checkOutputSize(), and StridedSlice().
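
Worked numbers for the negative-index and clamping rules used by StartForAxis() and StopForAxis(), for a forward slice (stride > 0) over an axis of size 5. Editor's sketch that reproduces just that arithmetic:

#include <algorithm>
#include <cstdio>

int main()
{
  const int axis_size = 5;

  int start = -2;                                       // user-specified begin index
  if (start < 0) start += axis_size;                    // -2 -> 3
  start = std::min(std::max(start, 0), axis_size - 1);  // clamp to [0, axis_size - 1]

  int stop = -1;                                        // user-specified end index
  if (stop < 0) stop += axis_size;                      // -1 -> 4
  stop = std::min(std::max(stop, 0), axis_size);        // clamp to [0, axis_size]

  std::printf("start=%d stop=%d\n", start, stop);       // start=3 stop=4
  return 0;
}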

◆ StridedSlice()

template<typename T >
void nnfw::cker::StridedSlice ( const StridedSliceParams op_params,
const Shape unextended_input_shape,
const T *  input_data,
const Shape unextended_output_shape,
T *  output_data 
)
inline

Definition at line 258 of file StridedSlice.h.

260{
261 assert(unextended_input_shape.DimensionsCount() <= 4);
262 assert(unextended_output_shape.DimensionsCount() <= 4);
263
264 bool optimize = true;
265 int st_count = op_params.strides_count;
266 for (int idx = 0; idx < st_count - 1; idx++)
267 {
268 const int axis_size = unextended_input_shape.Dims(idx);
269 const int start = StartForAxis(op_params, unextended_input_shape, idx);
270 const int stop = StopForAxis(op_params, unextended_input_shape, idx, start);
271 if ((axis_size != 1) && (start != 0 || stop != 0))
272 {
273 optimize = false;
274 break;
275 }
276 }
277
278 if (optimize)
279 {
280 if (op_params.strides[st_count - 1] == 1)
281 {
282 const int start = StartForAxis(op_params, unextended_input_shape, st_count - 1);
283 const int end = StopForAxis(op_params, unextended_input_shape, st_count - 1, start);
284
285 for (int idx = 0; idx < end - start; idx++)
286 {
287 output_data[idx] = input_data[idx + start];
288 }
289 return;
290 }
291 }
292
293 // Note that the output_shape is not used herein.
294 StridedSliceParams params_copy = op_params;
295
296 const Shape input_shape = Shape::ExtendedShape(4, unextended_input_shape);
297 const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape);
298
299 // Reverse and pad to 4 dimensions because that is what the runtime code
300 // requires (ie. all shapes must be 4D and are given backwards).
301 StridedSlicePadIndices(&params_copy, 4);
302
303 const int start_b = StartForAxis(params_copy, input_shape, 0);
304 const int stop_b = StopForAxis(params_copy, input_shape, 0, start_b);
305 const int start_h = StartForAxis(params_copy, input_shape, 1);
306 const int stop_h = StopForAxis(params_copy, input_shape, 1, start_h);
307 const int start_w = StartForAxis(params_copy, input_shape, 2);
308 const int stop_w = StopForAxis(params_copy, input_shape, 2, start_w);
309 const int start_d = StartForAxis(params_copy, input_shape, 3);
310 const int stop_d = StopForAxis(params_copy, input_shape, 3, start_d);
311
312 T *out_ptr = output_data;
313 for (int in_b = start_b; !LoopCondition(in_b, stop_b, params_copy.strides[0]);
314 in_b += params_copy.strides[0])
315 {
316 for (int in_h = start_h; !LoopCondition(in_h, stop_h, params_copy.strides[1]);
317 in_h += params_copy.strides[1])
318 {
319 for (int in_w = start_w; !LoopCondition(in_w, stop_w, params_copy.strides[2]);
320 in_w += params_copy.strides[2])
321 {
322 for (int in_d = start_d; !LoopCondition(in_d, stop_d, params_copy.strides[3]);
323 in_d += params_copy.strides[3])
324 {
325 *out_ptr++ = input_data[Offset(input_shape, in_b, in_h, in_w, in_d)];
326 }
327 }
328 }
329 }
330}
bool LoopCondition(int index, int stop, int stride)
void StridedSlicePadIndices(StridedSliceParams *p, int dim_count)

References nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), end(), LoopCondition(), Offset(), output_shape, StartForAxis(), StopForAxis(), StridedSlicePadIndices(), nnfw::cker::StridedSliceParams::strides, and nnfw::cker::StridedSliceParams::strides_count.

◆ StridedSlicePadIndices()

void nnfw::cker::StridedSlicePadIndices ( StridedSliceParams p,
int  dim_count 
)
inline

Definition at line 42 of file StridedSlice.h.

43{
44 // Add indices and mask bits to fully include extra dimensions
45 assert(dim_count <= 4);
46 assert(dim_count >= p->start_indices_count);
47 assert(p->start_indices_count == p->stop_indices_count);
48 assert(p->stop_indices_count == p->strides_count);
49
50 const int pad_count = dim_count - p->start_indices_count;
51
52 // Pad indices at start, so move arrays by pad_count.
53 for (int i = p->start_indices_count - 1; i >= 0; --i)
54 {
55 p->strides[i + pad_count] = p->strides[i];
56 p->start_indices[i + pad_count] = p->start_indices[i];
57 p->stop_indices[i + pad_count] = p->stop_indices[i];
58 }
59 for (int i = 0; i < pad_count; ++i)
60 {
61 p->start_indices[i] = 0;
62 p->stop_indices[i] = 1;
63 p->strides[i] = 1;
64 }
65
66 // Pad masks with 0s or 1s as required.
67 p->shrink_axis_mask <<= pad_count;
68 p->ellipsis_mask <<= pad_count;
69 p->new_axis_mask <<= pad_count;
70 p->begin_mask <<= pad_count;
71 p->end_mask <<= pad_count;
72 p->begin_mask |= (1 << pad_count) - 1;
73 p->end_mask |= (1 << pad_count) - 1;
74
75 p->start_indices_count = dim_count;
76 p->stop_indices_count = dim_count;
77 p->strides_count = dim_count;
78}

References nnfw::cker::StridedSliceParams::begin_mask, nnfw::cker::StridedSliceParams::ellipsis_mask, nnfw::cker::StridedSliceParams::end_mask, nnfw::cker::StridedSliceParams::new_axis_mask, nnfw::cker::StridedSliceParams::shrink_axis_mask, nnfw::cker::StridedSliceParams::start_indices, nnfw::cker::StridedSliceParams::start_indices_count, nnfw::cker::StridedSliceParams::stop_indices, nnfw::cker::StridedSliceParams::stop_indices_count, nnfw::cker::StridedSliceParams::strides, and nnfw::cker::StridedSliceParams::strides_count.

Referenced by StridedSlice().

◆ Sub1Vector()

void nnfw::cker::Sub1Vector ( const float *  vector,
int  v_size,
float *  result 
)
inline

Definition at line 115 of file TensorUtils.h.

116{
117 NEON_OR_PORTABLE(Sub1Vector, vector, v_size, result);
118}

References NEON_OR_PORTABLE, and Sub1Vector().

Referenced by Sub1Vector(), and UpdateLstmCellFloat().

◆ SubscriptToIndex()

int nnfw::cker::SubscriptToIndex ( const NdArrayDesc< 4 > &  desc,
int  i0,
int  i1,
int  i2,
int  i3 
)
inline

◆ SubscriptToIndexGeneric()

template<int N>
int nnfw::cker::SubscriptToIndexGeneric ( const NdArrayDesc< N > *  desc,
int *  iter 
)
inline

Definition at line 264 of file Utils.h.

265{
266 int ret_indx = 0;
267 for (size_t idx = 0; idx < static_cast<size_t>(N); idx++)
268 {
269 assert(iter[idx] >= 0 && iter[idx] < desc->extents[idx]);
270 ret_indx += iter[idx] * desc->strides[idx];
271 }
272
273 return ret_indx;
274}

References nnfw::cker::NdArrayDesc< N >::extents, and nnfw::cker::NdArrayDesc< N >::strides.

Referenced by SqDiffImpl().

◆ sum_reducer()

template<typename In >
int nnfw::cker::sum_reducer ( const int  data1,
const In  data2 
)

Definition at line 46 of file ReduceMean.h.

47{
48 return data1 + static_cast<int>(data2);
49}

Referenced by MeanQ8Asymm().

◆ SymmetricQuantizeFloats()

void nnfw::cker::SymmetricQuantizeFloats ( const float *  values,
const int  size,
int8_t *  quantized_values,
float *  min,
float *  max,
float *  scaling_factor 
)
inline

Definition at line 120 of file TensorUtils.h.

122{
123 return NEON_OR_PORTABLE(SymmetricQuantizeFloats, values, size, quantized_values, min, max,
124 scaling_factor);
125}

References NEON_OR_PORTABLE, size, and SymmetricQuantizeFloats().

Referenced by FullyConnectedHybrid(), and SymmetricQuantizeFloats().

◆ Tanh()

void nnfw::cker::Tanh ( const Shape input_shape,
const float *  input_data,
const Shape output_shape,
float *  output_data 
)
inline

Definition at line 31 of file Tanh.h.

33{
34 auto input_map = MapAsVector(input_data, input_shape);
35 auto output_map = MapAsVector(output_data, output_shape);
36 output_map.array() = input_map.array().tanh();
37}

References MapAsVector(), and output_shape.

Referenced by onert::backend::cpu::ops::ElementwiseActivationLayer::configure().

◆ TFLITE_COMPARISON_OP() [1/6]

nnfw::cker::TFLITE_COMPARISON_OP ( Equal  )

◆ TFLITE_COMPARISON_OP() [2/6]

nnfw::cker::TFLITE_COMPARISON_OP ( Greater  )

◆ TFLITE_COMPARISON_OP() [3/6]

nnfw::cker::TFLITE_COMPARISON_OP ( GreaterEqual  )

◆ TFLITE_COMPARISON_OP() [4/6]

nnfw::cker::TFLITE_COMPARISON_OP ( Less  )

◆ TFLITE_COMPARISON_OP() [5/6]

nnfw::cker::TFLITE_COMPARISON_OP ( LessEqual  )

◆ TFLITE_COMPARISON_OP() [6/6]

nnfw::cker::TFLITE_COMPARISON_OP ( NotEqual  )

◆ TileOneDimension()

template<typename T , typename M >
std::pair< int, int > nnfw::cker::TileOneDimension ( const Shape in_dimensions,
const T *  in_data,
const M *  multipliers,
T *  out_data,
int  dimension 
)

Definition at line 41 of file Tile.h.

43{
44 const int dimension_size = in_dimensions.Dims(dimension);
45 if (dimension == in_dimensions.DimensionsCount() - 1)
46 {
47 CopyMultipleTimes(in_data, dimension_size, multipliers[dimension], out_data);
48 return std::make_pair(dimension_size,
49 dimension_size * static_cast<int>(multipliers[dimension]));
50 }
51 int total_stride_size = 0, total_tiled_stride_size = 0;
52 const T *copy_from_data = in_data;
53 T *copy_to_data = out_data;
54 for (int i = 0; i < dimension_size; ++i)
55 {
56 int stride_size = 0, tiled_stride_size = 0;
57 std::tie(stride_size, tiled_stride_size) =
58 TileOneDimension(in_dimensions, copy_from_data, multipliers, copy_to_data, dimension + 1);
59 copy_from_data += stride_size;
60 copy_to_data += tiled_stride_size;
61 total_stride_size += stride_size;
62 total_tiled_stride_size += tiled_stride_size;
63 }
64 CopyMultipleTimes(out_data, total_tiled_stride_size, multipliers[dimension] - 1,
65 out_data + total_tiled_stride_size);
66 return std::make_pair(total_stride_size,
67 static_cast<int>(total_tiled_stride_size * multipliers[dimension]));
68}
void CopyMultipleTimes(const T *in_data, int32_t in_size, M multiplier, T *out_data)
Definition Tile.h:29

References CopyMultipleTimes(), nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), and TileOneDimension().

Referenced by TileOneDimension().

◆ To32Bit()

template<typename TensorType >
TTypes< typename TensorType::Scalar, TensorType::NumIndices >::Tensor32Bit nnfw::cker::To32Bit ( TensorType  in)

Definition at line 178 of file Tensor.h.

179{
180 typedef typename TTypes<typename TensorType::Scalar, TensorType::NumIndices>::Tensor32Bit RetType;
181 return RetType(in.data(), To32BitDims(in.dimensions()));
182}
Eigen::DSizes< Index32, DSizes::count > To32BitDims(const DSizes &in)
Definition Tensor.h:166
Eigen::TensorMap< Eigen::Tensor< T, NDIMS, Eigen::RowMajor, int >, Eigen::Aligned > Tensor32Bit
Definition Tensor.h:43

References To32BitDims().

Referenced by nnfw::cker::functor::BroadcastTo< Device, T >::DoBCast32Bit().

◆ To32BitDims()

template<typename DSizes >
Eigen::DSizes< Index32, DSizes::count > nnfw::cker::To32BitDims ( const DSizes &  in)

Definition at line 166 of file Tensor.h.

167{
168 Eigen::DSizes<Index32, DSizes::count> out;
169 for (int i = 0; i < DSizes::count; ++i)
170 {
171 out[i] = in[i];
172 }
173 return out;
174}

Referenced by To32Bit().

◆ Transpose()

template<typename T >
void nnfw::cker::Transpose ( const TransposeParams unshrunk_params,
const Shape unshrunk_input_shape,
const T *  input_data,
const Shape unshrunk_output_shape,
T *  output_data 
)

Definition at line 509 of file Transpose.h.

511{
512 const int output_size = unshrunk_output_shape.DimensionsCount();
513 assert(unshrunk_input_shape.DimensionsCount() <= 4);
514 assert(output_size <= 4);
515 assert(output_size == unshrunk_params.perm_count);
516
517 Shape shrunk_input_shape = Shape(unshrunk_input_shape);
518
519 Shape shrunk_output_shape = Shape(unshrunk_output_shape);
520
521 TransposeParams shrunk_params = unshrunk_params;
522
523 // Reduce any dimensions that have one size. A lower-rank transpose usually
524 // performs better since memory access patterns are improved.
525 RemoveOneSizeDimensions(&shrunk_input_shape, &shrunk_output_shape, &shrunk_params);
526
527 // Handle identity cases.
528 // TODO(b/140779653): Add an optimization pass in the conversion process to
529 // remove transpose op nodes where they do nothing like the below one.
530 bool identical = true;
531 for (int i = 0; i < shrunk_params.perm_count; ++i)
532
533 {
534 if (shrunk_params.perm[i] != i)
535
536 {
537 identical = false;
538 break;
539 }
540 }
541 if (identical)
542 {
543 memcpy(output_data, input_data, unshrunk_input_shape.FlatSize() * sizeof(T));
544 return;
545 }
546
547 // Reduce dimensions by flattening.
548 if (shrunk_params.perm[0] == 0 && output_size >= 3)
549
550 {
551 Shape non_flatten_input_shape;
552 Shape non_flatten_output_shape;
553 TransposeParams non_flatten_params;
554 const int total_size = shrunk_input_shape.FlatSize();
555
556 const int non_flatten_size =
557 Flatten(shrunk_input_shape, shrunk_output_shape, shrunk_params,
558
559 &non_flatten_input_shape, &non_flatten_output_shape, &non_flatten_params);
560 assert(non_flatten_params.perm[0] != 0);
561
562 for (int i = 0; i < total_size; i += non_flatten_size)
563 {
564 TransposeImpl(non_flatten_params, non_flatten_input_shape, input_data + i,
565 non_flatten_output_shape, output_data + i);
566 }
567 return;
568 }
569
570 // Call non-flattened case.
571 TransposeImpl(shrunk_params, shrunk_input_shape, input_data, shrunk_output_shape,
572
573 output_data);
574}
void TransposeImpl(const TransposeParams &params, const Shape &unextended_input_shape, const T *input_data, const Shape &unextended_output_shape, T *output_data)
Definition Transpose.h:33

References nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::FlatSize(), nnfw::cker::TransposeParams::perm, nnfw::cker::TransposeParams::perm_count, and TransposeImpl().

Referenced by onert::backend::cpu::ops::TransposeLayer::transpose().
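
A call sketch for a simple 2-D permutation (editor's illustration; the <cker/operation/Transpose.h> include path is an assumption). Only the documented perm/perm_count fields of TransposeParams are set:

#include <cker/operation/Transpose.h> // assumed header location of Transpose
#include <cstdint>

int main()
{
  // Transpose a [2, 3] matrix into [3, 2].
  const int32_t in_dims[] = {2, 3};
  const int32_t out_dims[] = {3, 2};
  nnfw::cker::Shape input_shape, output_shape;
  input_shape.ReplaceWith(2, in_dims);
  output_shape.ReplaceWith(2, out_dims);

  nnfw::cker::TransposeParams params{};
  params.perm_count = 2;
  params.perm[0] = 1;
  params.perm[1] = 0;

  const float input[6] = {0, 1, 2, 3, 4, 5};
  float output[6];
  nnfw::cker::Transpose(params, input_shape, input, output_shape, output);
  // output = {0, 3, 1, 4, 2, 5}
  return 0;
}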

◆ Transpose2D()

template<typename T >
void nnfw::cker::Transpose2D ( const Shape input_shape,
const T *  input_data,
const Shape output_shape,
T *  output_data 
)
inline

Definition at line 297 of file Transpose.h.

299{
300 assert(input_shape.DimensionsCount() == 2);
301 assert(output_shape.DimensionsCount() == 2);
302
303 const int d0 = input_shape.DimsData()[0];
304 const int d1 = input_shape.DimsData()[1];
305 const int kLines = 4;
306 const int kSkipSize = (kLines - 1) * d1;
307
308 const T *input = input_data;
309
310 int i = 0;
311 for (; i <= d0 - kLines; i += kLines)
312 {
313 T *output = output_data + i;
314
315 const T *input_ptr = input;
316 optimized_ops_preload_l1_keep(input_ptr);
317 input_ptr += d1;
318 optimized_ops_preload_l1_keep(input_ptr);
319 input_ptr += d1;
320 optimized_ops_preload_l1_keep(input_ptr);
321 input_ptr += d1;
322 optimized_ops_preload_l1_keep(input_ptr);
323
324 int j = 0;
325 for (; j <= d1 - kLines; j += kLines)
326 {
327 input_ptr = input;
328 const T a00 = input_ptr[0];
329 const T a01 = input_ptr[1];
330 const T a02 = input_ptr[2];
331 const T a03 = input_ptr[3];
332 input_ptr += d1;
333 const T a10 = input_ptr[0];
334 const T a11 = input_ptr[1];
335 const T a12 = input_ptr[2];
336 const T a13 = input_ptr[3];
337 input_ptr += d1;
338 const T a20 = input_ptr[0];
339 const T a21 = input_ptr[1];
340 const T a22 = input_ptr[2];
341 const T a23 = input_ptr[3];
342 input_ptr += d1;
343 const T a30 = input_ptr[0];
344 const T a31 = input_ptr[1];
345 const T a32 = input_ptr[2];
346 const T a33 = input_ptr[3];
347
348 output[0] = a00;
349 output[1] = a10;
350 output[2] = a20;
351 output[3] = a30;
352 output += d0;
353
354 output[0] = a01;
355 output[1] = a11;
356 output[2] = a21;
357 output[3] = a31;
358 output += d0;
359
360 output[0] = a02;
361 output[1] = a12;
362 output[2] = a22;
363 output[3] = a32;
364 output += d0;
365
366 output[0] = a03;
367 output[1] = a13;
368 output[2] = a23;
369 output[3] = a33;
370 output += d0;
371
372 input += kLines;
373 }
374 if (j == d1)
375 {
376 input += kSkipSize;
377 }
378 else
379 {
380 for (int p = 0; p < kLines; ++p)
381 {
382 for (int q = 0; q < d1 - j; ++q)
383 {
384 *(output + q * d0 + p) = *(input + p * d1 + q);
385 }
386 }
387 input += (d1 - j) + kSkipSize;
388 }
389 }
390 for (; i < d0; ++i)
391 {
392 T *output = output_data + i;
393 for (int j = 0; j < d1; ++j)
394 {
395 *output = *input;
396 output += d0;
397 ++input;
398 }
399 }
400}
void optimized_ops_preload_l1_keep(const T *ptr)
Definition Utils.h:455

References nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::DimsData(), optimized_ops_preload_l1_keep(), and output_shape.

Referenced by TransposeImpl().

◆ Transpose3D()

template<typename T >
void nnfw::cker::Transpose3D ( const TransposeParams params,
const Shape input_shape,
const T *  input_data,
const Shape ,
T *  output_data 
)
inline

Definition at line 405 of file Transpose.h.

407{
408 int s2, s3;
409 s2 = input_shape.Dims(1);
410 s3 = input_shape.Dims(2);
411
412 int p1 = 0;
413 int p2 = 0;
414 int p3 = 0;
415
416 if (params.perm[0] == 2)
417 {
418 p1 = 1;
419 }
420 else if (params.perm[1] == 2)
421 {
422 p2 = 1;
423 }
424 else
425 {
426 p3 = 1;
427 }
428
429 if (params.perm[0] == 1)
430 {
431 p1 = s3;
432 }
433 else if (params.perm[1] == 1)
434 {
435 p2 = s3;
436 }
437 else
438 {
439 p3 = s3;
440 }
441
442 if (params.perm[0] == 0)
443 {
444 p1 = s2 * s3;
445 }
446 else if (params.perm[1] == 0)
447 {
448 p2 = s2 * s3;
449 }
450 else
451 {
452 p3 = s2 * s3;
453 }
454
455 int o_s[3];
456 o_s[0] = input_shape.Dims(params.perm[0]);
457 o_s[1] = input_shape.Dims(params.perm[1]);
458 o_s[2] = input_shape.Dims(params.perm[2]);
459
460 for (int i1 = 0; i1 < o_s[0]; ++i1)
461 {
462 for (int i2 = 0; i2 < o_s[1]; ++i2)
463 {
464 for (int i3 = 0; i3 < o_s[2]; ++i3)
465 {
466 const int i = i1 * p1 + i2 * p2 + i3 * p3;
467 const int o = i1 * o_s[1] * o_s[2] + i2 * o_s[2] + i3;
468 output_data[o] = input_data[i];
469 }
470 }
471 }
472}

References nnfw::cker::Shape::Dims(), and nnfw::cker::TransposeParams::perm.

Referenced by TransposeImpl().

◆ TransposeConv()

void nnfw::cker::TransposeConv ( const TransposeConvParams params,
const Shape input_shape,
const float *  input_data,
const Shape filter_shape,
const float *  filter_data,
const Shape output_shape,
float *  output_data 
)
inline

Definition at line 30 of file TransposeConv.h.

33{
34
35 const int stride_width = params.stride_width;
36 const int stride_height = params.stride_height;
37 const int pad_width = params.padding_values.width;
38 const int pad_height = params.padding_values.height;
39
40 assert(input_shape.DimensionsCount() == 4);
41 assert(filter_shape.DimensionsCount() == 4);
42 assert(output_shape.DimensionsCount() == 4);
43
44 const int batches = MatchingDim(input_shape, 0, output_shape, 0);
45 const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
46 const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
47 const int input_height = input_shape.Dims(1);
48 const int input_width = input_shape.Dims(2);
49 const int filter_height = filter_shape.Dims(1);
50 const int filter_width = filter_shape.Dims(2);
51 const int output_height = output_shape.Dims(1);
52 const int output_width = output_shape.Dims(2);
53
54 // Although transpose convolution simplifies to convolution with transposed
55 // weights for strides of 1, non-unitary striding complicates matters. To
56 // keep this reference implementation as clear as possible, we use a
57 // "scatter" access pattern, where we loop through all the input elements,
58 // computing their influence on the output, rather than looping through the
59 // output elements in the typical "gather" access pattern of a conv. We
60 // therefore must initialize the output array to zero.
61 const int num_elements = output_shape.FlatSize();
62 for (int i = 0; i < num_elements; i++)
63 {
64 output_data[i] = 0.0f;
65 }
66
67 // Loop through input elements one at a time.
68 for (int batch = 0; batch < batches; ++batch)
69 {
70 for (int in_y = 0; in_y < input_height; ++in_y)
71 {
72 for (int in_x = 0; in_x < input_width; ++in_x)
73 {
74 for (int in_channel = 0; in_channel < input_depth; ++in_channel)
75 {
76 // Loop through the output elements it will influence
77 const int out_x_origin = (in_x * stride_width) - pad_width;
78 const int out_y_origin = (in_y * stride_height) - pad_height;
79 for (int filter_y = 0; filter_y < filter_height; ++filter_y)
80 {
81 for (int filter_x = 0; filter_x < filter_width; ++filter_x)
82 {
83 for (int out_channel = 0; out_channel < output_depth; ++out_channel)
84 {
85 // Compute output element location
86 const int out_x = out_x_origin + filter_x;
87 const int out_y = out_y_origin + filter_y;
88 // We cannot accumulate out of bounds
89 if ((out_x >= 0) && (out_x < output_width) && (out_y >= 0) &&
90 (out_y < output_height))
91 {
92 float input_value =
93 input_data[Offset(input_shape, batch, in_y, in_x, in_channel)];
94 float filter_value =
95 filter_data[Offset(filter_shape, out_channel, filter_y, filter_x, in_channel)];
96 output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] +=
97 input_value * filter_value;
98 }
99 }
100 }
101 }
102 }
103 }
104 }
105 }
106}
PaddingValues padding_values
Definition Types.h:333

References nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), nnfw::cker::PaddingValues::height, MatchingDim(), Offset(), output_shape, nnfw::cker::TransposeConvParams::padding_values, nnfw::cker::TransposeConvParams::stride_height, nnfw::cker::TransposeConvParams::stride_width, and nnfw::cker::PaddingValues::width.

◆ TransposeImpl()

template<typename T >
void nnfw::cker::TransposeImpl ( const TransposeParams params,
const Shape input_shape,
const T *  input_data,
const Shape output_shape,
T *  output_data 
)

Definition at line 475 of file Transpose.h.

477{
478 const int dims_cnt = input_shape.DimensionsCount();
479
480 int dim0, dim1;
481 if (IsTranspose2DApplicable(params, input_shape, &dim0, &dim1))
482 {
483 Transpose2D(Shape({dim0, dim1}), input_data, Shape({dim1, dim0}), output_data);
484 return;
485 }
486
487 // TODO(b/141217325): notably Eigen is better suited for
488 // larger inputs whereas Transpose3D is generally
489 // better for smaller ones.
490 //
491 // E.g. on Nexus 5, Eigen is better for size 96^3 and up
492 // and Transpose3D is better for 72^3 and down.
493 //
494 // 96^3 is not mobile-friendly for certain usecases
495 // (e.g. model used in beam search for seq2seq) but is in others.
496 // Consider tradeoffs.
497 if (dims_cnt == 3)
498 {
499 Transpose3D(params, input_shape, input_data, output_shape, output_data);
500 return;
501 }
502
503 // Reroute to the reference version if an optimized method for the given data
504 // is not available.
505 reference::Transpose(params, input_shape, input_data, output_shape, output_data);
506}
void Transpose3D(const TransposeParams &params, const Shape &input_shape, const T *input_data, const Shape &, T *output_data)
Definition Transpose.h:405
void Transpose2D(const Shape &input_shape, const T *input_data, const Shape &output_shape, T *output_data)
Definition Transpose.h:297

References nnfw::cker::Shape::DimensionsCount(), output_shape, nnfw::cker::reference::Transpose(), Transpose2D(), and Transpose3D().

Referenced by Transpose().

◆ Unpack()

template<typename Scalar >
void nnfw::cker::Unpack ( const UnpackParams params,
const Shape input_shape,
const Scalar *  input_data,
const Shape output_shape,
Scalar *const *  output_datas 
)

Definition at line 30 of file Unpack.h.

32{
33 const int dimensions = input_shape.DimensionsCount();
34 const int outputs_count = params.num_split;
35
36 int outer_size = 1;
37 for (int i = 0; i < params.axis; i++)
38 {
39 outer_size *= input_shape.Dims(i);
40 }
41 int copy_size = 1;
42 for (int i = params.axis + 1; i < dimensions; i++)
43 {
44 copy_size *= input_shape.Dims(i);
45 }
46 assert(output_shape.FlatSize() == copy_size * outer_size);
47
48 for (int i = 0; i < outputs_count; ++i)
49 {
50 for (int k = 0; k < outer_size; k++)
51 {
52 Scalar *output_ptr = output_datas[i] + copy_size * k;
53 int loc = k * outputs_count * copy_size + i * copy_size;
54 memcpy(output_ptr, input_data + loc, copy_size * sizeof(Scalar));
55 }
56 }
57}

References nnfw::cker::UnpackParams::axis, nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), nnfw::cker::UnpackParams::num_split, and output_shape.
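
A minimal usage sketch: splitting a [2, 3] float tensor into two [3] tensors along axis 0. Only the params.axis and params.num_split fields referenced above are used; the header path is an assumption.

#include <cker/operation/Unpack.h> // header path assumed

void UnpackExample()
{
  nnfw::cker::UnpackParams params{};
  params.axis = 0;
  params.num_split = 2;

  const float input[6] = {1, 2, 3, 4, 5, 6}; // shape [2, 3]
  float out0[3], out1[3];
  float *outputs[] = {out0, out1};

  nnfw::cker::Unpack<float>(params, nnfw::cker::Shape({2, 3}), input,
                            nnfw::cker::Shape({3}), outputs);
  // out0 == {1, 2, 3}, out1 == {4, 5, 6}
}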

◆ UpdateLstmCellFloat()

void nnfw::cker::UpdateLstmCellFloat ( int  n_batch,
int  n_cell,
float *  cell_state,
const float *  input_gate,
float *  forget_gate,
const float *  cell_gate,
bool  use_cifg,
float  clip 
)

Definition at line 135 of file LSTM.h.

137{
138 // Define variable for 4th argument to avoid warning
139 // Compiler warning: passing argument 4 to restrict-qualified parameter aliases with argument 2
140 const float *cwise_product_rhs = cell_state;
141 VectorVectorCwiseProduct(forget_gate, cwise_product_rhs, n_batch * n_cell, cell_state);
142
143 if (use_cifg)
144 {
145 // With CIFG, input_gate = 1-forget_gate. Use the forget_gate array as
146 // scratch, as input_gate array is not allocated in this case. (Be careful
147 // not to write to the scratch before reading the forget gate data.)
148 float *scratch = forget_gate;
149 Sub1Vector(forget_gate, n_batch * n_cell, scratch);
150 VectorVectorCwiseProductAccumulate(cell_gate, scratch, n_batch * n_cell, cell_state);
151 }
152 else
153 {
154 VectorVectorCwiseProductAccumulate(cell_gate, input_gate, n_batch * n_cell, cell_state);
155 }
156 if (clip > 0.0f)
157 {
158 CwiseClipping(cell_state, n_batch * n_cell, clip);
159 }
160}
void Sub1Vector(const float *vector, int v_size, float *result)
void VectorVectorCwiseProductAccumulate(const T *__restrict__ vector1, const T *__restrict__ vector2, int v_size, T *__restrict__ result)
Definition TensorUtils.h:64

References CwiseClipping(), Sub1Vector(), VectorVectorCwiseProduct(), and VectorVectorCwiseProductAccumulate().

Referenced by LstmStepFloat().
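
Element-wise, this computes cell_state = forget_gate * cell_state + input_gate * cell_gate (with input_gate replaced by 1 - forget_gate when use_cifg is set), followed by clipping when clip > 0. A minimal sketch with one batch and two cells; the header path is assumed.

#include <cker/operation/LSTM.h> // header path assumed

void UpdateLstmCellFloatExample()
{
  float cell_state[2] = {0.5f, -1.0f};
  const float input_gate[2] = {0.2f, 0.8f};
  float forget_gate[2] = {0.9f, 0.1f};
  const float cell_gate[2] = {1.0f, 1.0f};

  nnfw::cker::UpdateLstmCellFloat(/*n_batch=*/1, /*n_cell=*/2, cell_state, input_gate,
                                  forget_gate, cell_gate, /*use_cifg=*/false, /*clip=*/0.0f);
  // cell_state == {0.9 * 0.5 + 0.2 * 1.0, 0.1 * -1.0 + 0.8 * 1.0} == {0.65, 0.7}
}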

◆ ValidateGemmParams()

template<typename AccumScalar , typename DstScalar , QuantizationFlavor quantization_flavor>
void nnfw::cker::ValidateGemmParams ( const GemmParams< AccumScalar, DstScalar, quantization_flavor > &  params)

Definition at line 544 of file Types.h.

546{
547 // Guard consistency of the quantized multiplier fields.
548 if (quantization_flavor == QuantizationFlavor::kFloatingPoint)
549 {
550 assert(!params.multiplier_fixedpoint);
551 assert(!params.multiplier_exponent);
552 assert(!params.multiplier_fixedpoint_perchannel);
553 assert(!params.multiplier_exponent_perchannel);
554 }
555 else if (quantization_flavor == QuantizationFlavor::kIntegerWithUniformMultiplier &&
556 !std::is_same<DstScalar, int32_t>::value)
557 {
558 assert(params.multiplier_fixedpoint);
559 // Nothing to check about multiplier_exponent
560 assert(!params.multiplier_fixedpoint_perchannel);
561 assert(!params.multiplier_exponent_perchannel);
562 }
563 else if (quantization_flavor == QuantizationFlavor::kIntegerWithPerRowMultiplier &&
564 !std::is_same<DstScalar, int32_t>::value)
565 {
566 assert(!params.multiplier_fixedpoint);
567 assert(!params.multiplier_exponent);
 568 assert(params.multiplier_fixedpoint_perchannel);
 569 assert(params.multiplier_exponent_perchannel);
570 }
571 else
572 {
573 // For the get raw accumulator case, we should make sure none of the
574 // quantization params are set.
575 assert(!params.multiplier_fixedpoint);
576 assert(!params.multiplier_exponent);
577 assert(!params.multiplier_fixedpoint_perchannel);
578 assert(!params.multiplier_exponent_perchannel);
579 }
580}
AccumScalar multiplier_fixedpoint
Definition Types.h:513
const int * multiplier_exponent_perchannel
Definition Types.h:529
const AccumScalar * multiplier_fixedpoint_perchannel
Definition Types.h:521

References kFloatingPoint, kIntegerWithPerRowMultiplier, and kIntegerWithUniformMultiplier.
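
A minimal sketch of the floating-point case, assuming GemmParams default-constructs its multiplier fields to zero/null and that a float accumulator selects the kFloatingPoint flavor by default (neither is shown in this section).

#include <cker/Types.h> // header path assumed

void ValidateGemmParamsExample()
{
  nnfw::cker::GemmParams<float, float> params; // default flavor assumed to be kFloatingPoint
  nnfw::cker::ValidateGemmParams(params);      // passes: no quantized multiplier fields are set
}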

◆ VectorBatchVectorAdd()

void nnfw::cker::VectorBatchVectorAdd ( const float *  vector,
int  v_size,
int  n_batch,
float *  batch_vector 
)
inline

Definition at line 39 of file TensorUtils.h.

40{
41 PortableVectorBatchVectorAdd(vector, v_size, n_batch, batch_vector);
42}
void PortableVectorBatchVectorAdd(const float *vector, int v_size, int n_batch, float *batch_vector)

References PortableVectorBatchVectorAdd().

Referenced by CalculateLstmGateFloat().
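
A minimal sketch: adding the same length-3 vector to each of two batch rows in place (row-wise add semantics per the portable implementation; header path assumed).

#include <cker/TensorUtils.h> // header path assumed

void VectorBatchVectorAddExample()
{
  const float bias[3] = {1.f, 2.f, 3.f};
  float batch[6] = {0.f, 0.f, 0.f, 10.f, 10.f, 10.f}; // 2 rows of 3
  nnfw::cker::VectorBatchVectorAdd(bias, /*v_size=*/3, /*n_batch=*/2, batch);
  // batch == {1, 2, 3, 11, 12, 13}
}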

◆ VectorBatchVectorAssign()

void nnfw::cker::VectorBatchVectorAssign ( const float *  vector,
int  v_size,
int  n_batch,
float *  batch_vector 
)
inline

Definition at line 44 of file TensorUtils.h.

46{
47 PortableVectorBatchVectorAssign(vector, v_size, n_batch, batch_vector);
48}
void PortableVectorBatchVectorAssign(const float *vector, int v_size, int n_batch, float *batch_vector)

References PortableVectorBatchVectorAssign().

Referenced by CalculateLstmGateFloat(), CalculateLstmOutputFloat(), FullyConnected(), FullyConnectedHybrid(), FullyConnectedSparseWeight16x1(), and FullyConnectedSparseWeightRandom().
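
A minimal sketch: broadcasting a length-3 vector into each of two batch rows (copy semantics per the portable implementation; header path assumed).

#include <cker/TensorUtils.h> // header path assumed

void VectorBatchVectorAssignExample()
{
  const float init[3] = {1.f, 2.f, 3.f};
  float batch[6]; // 2 rows of 3, overwritten
  nnfw::cker::VectorBatchVectorAssign(init, /*v_size=*/3, /*n_batch=*/2, batch);
  // batch == {1, 2, 3, 1, 2, 3}
}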

◆ VectorBatchVectorCwiseProduct()

template<typename T >
void nnfw::cker::VectorBatchVectorCwiseProduct ( const T *  vector,
int  v_size,
const T *  batch_vector,
int  n_batch,
T *  result 
)
inline

Definition at line 76 of file TensorUtils.h.

78{
79 for (int b = 0; b < n_batch; b++)
80 {
81 VectorVectorCwiseProduct(vector, batch_vector, v_size, result);
82 // Update the pointers.
83 result += v_size;
84 batch_vector += v_size;
85 }
86}

References VectorVectorCwiseProduct().

Referenced by CalculateLstmGateFloat().
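
A minimal sketch following the loop above: the same vector scales every batch row element-wise (header path assumed).

#include <cker/TensorUtils.h> // header path assumed

void VectorBatchVectorCwiseProductExample()
{
  const float scale[2] = {2.f, 3.f};
  const float batch[4] = {1.f, 1.f, 4.f, 5.f}; // 2 rows of 2
  float result[4];
  nnfw::cker::VectorBatchVectorCwiseProduct(scale, /*v_size=*/2, batch, /*n_batch=*/2, result);
  // result == {2, 3, 8, 15}
}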

◆ VectorBatchVectorCwiseProductAccumulate()

template<typename T >
void nnfw::cker::VectorBatchVectorCwiseProductAccumulate ( const T *  vector,
int  v_size,
const T *  batch_vector,
int  n_batch,
T *  result 
)
inline

Definition at line 92 of file TensorUtils.h.

94{
95 for (int b = 0; b < n_batch; b++)
96 {
97 VectorVectorCwiseProductAccumulate(vector, batch_vector, v_size, result);
98 // Update the pointers.
99 result += v_size;
100 batch_vector += v_size;
101 }
102}

References VectorVectorCwiseProductAccumulate().

Referenced by CalculateLstmGateFloat().
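
Same layout as the non-accumulating variant above, but the products are added into result (header path assumed).

#include <cker/TensorUtils.h> // header path assumed

void VectorBatchVectorCwiseProductAccumulateExample()
{
  const float scale[2] = {2.f, 3.f};
  const float batch[4] = {1.f, 1.f, 4.f, 5.f}; // 2 rows of 2
  float result[4] = {10.f, 10.f, 10.f, 10.f};
  nnfw::cker::VectorBatchVectorCwiseProductAccumulate(scale, /*v_size=*/2, batch, /*n_batch=*/2,
                                                      result);
  // result == {12, 13, 18, 25}
}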

◆ VectorVectorCwiseProduct()

template<typename T >
void nnfw::cker::VectorVectorCwiseProduct ( const T *__restrict__  vector1,
const T *__restrict__  vector2,
int  v_size,
T *__restrict__  result 
)
inline

Definition at line 52 of file TensorUtils.h.

54{
55 for (int v = 0; v < v_size; v++)
56 {
57 *result++ = *vector1++ * *vector2++;
58 }
59}

Referenced by CalculateLstmOutputFloat(), UpdateLstmCellFloat(), and VectorBatchVectorCwiseProduct().
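
A minimal sketch of the element-wise product (header path assumed; the __restrict__ qualifiers require the output not to alias the inputs).

#include <cker/TensorUtils.h> // header path assumed

void VectorVectorCwiseProductExample()
{
  const float a[3] = {1.f, 2.f, 3.f};
  const float b[3] = {4.f, 5.f, 6.f};
  float out[3]; // must not alias a or b
  nnfw::cker::VectorVectorCwiseProduct(a, b, /*v_size=*/3, out);
  // out == {4, 10, 18}
}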

◆ VectorVectorCwiseProductAccumulate()

template<typename T >
void nnfw::cker::VectorVectorCwiseProductAccumulate ( const T *__restrict__  vector1,
const T *__restrict__  vector2,
int  v_size,
T *__restrict__  result 
)
inline

Definition at line 64 of file TensorUtils.h.

67{
68 for (int v = 0; v < v_size; v++)
69 {
70 *result++ += *vector1++ * *vector2++;
71 }
72}

Referenced by UpdateLstmCellFloat(), and VectorBatchVectorCwiseProductAccumulate().
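
A minimal sketch: out += a * b element-wise (header path assumed; out must not alias the inputs).

#include <cker/TensorUtils.h> // header path assumed

void VectorVectorCwiseProductAccumulateExample()
{
  const float a[3] = {1.f, 2.f, 3.f};
  const float b[3] = {4.f, 5.f, 6.f};
  float out[3] = {1.f, 1.f, 1.f}; // must not alias a or b
  nnfw::cker::VectorVectorCwiseProductAccumulate(a, b, /*v_size=*/3, out);
  // out == {5, 11, 19}
}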

◆ ZeroVector()

void nnfw::cker::ZeroVector ( float *  vector,
int  v_size 
)
inline

Definition at line 160 of file TensorUtils.h.

160{ PortableZeroVector(vector, v_size); }
void PortableZeroVector(float *vector, int v_size)

References PortableZeroVector().

Referenced by FullyConnected(), FullyConnectedHybrid(), FullyConnectedSparseWeight16x1(), and FullyConnectedSparseWeightRandom().
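
A minimal sketch: zeroing the first v_size floats of a buffer (header path assumed).

#include <cker/TensorUtils.h> // header path assumed

void ZeroVectorExample()
{
  float buf[4] = {1.f, 2.f, 3.f, 4.f};
  nnfw::cker::ZeroVector(buf, /*v_size=*/4);
  // buf == {0, 0, 0, 0}
}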