ONE - On-device Neural Engine
nnfw::cker Namespace Reference

Namespaces

namespace  bias_op
 
namespace  cpu_backend_threadpool
 
namespace  depthwise_conv_op
 
namespace  detail
 
namespace  eigen_support
 
namespace  functor
 
namespace  gemm_support
 
namespace  multithreaded
 
namespace  optimized
 
namespace  optimized_integer_ops
 
namespace  random
 
namespace  reference
 
namespace  reference_integer_ops
 
namespace  ruy_support
 
namespace  train
 
namespace  training_ops
 
namespace  xent_ops
 

Data Structures

class  ActivationFunctor
 
class  BatchMatMul
 
struct  BatchMatMulParams
 
class  BCast
 
class  BCastList
 
struct  BinaryArithmeticOpParam
 
struct  ComparisonParams
 
struct  ConcatenationParams
 
class  Conv
 
struct  ConvHybridTempArena
 
struct  ConvParams
 
struct  DepthwiseConvParams
 
struct  DepthwiseConvWorkerTask
 
class  Einsum
 
class  FCTempArena
 
struct  FullyConnectedParams
 
class  FusedBatchNorm
 
struct  FusedBatchNormParams
 
struct  GatherParams
 
struct  GemmParams
 
struct  InputTensor
 
struct  InstanceNormParams
 
struct  is_quant8
 
struct  L2NormParams
 
struct  LeakyReluParams
 
struct  LSTMParams
 
class  MatMulBCast
 
struct  MatrixParams
 
struct  MaximumOp
 
struct  MinimumOp
 
struct  NdArrayDesc
 
struct  PackParams
 
struct  PaddingValues
 
struct  PadParams
 
struct  PoolParams
 
class  Reduce
 
class  ReduceMean
 
struct  ResizeBilinearParams
 
struct  RmsNormParams
 
class  SequentialTensorWriter
 
class  Shape
 
struct  ShapeIterator
 
struct  SliceParams
 
struct  SoftmaxParams
 
struct  SpaceToBatchParams
 
struct  SpaceToDepthParams
 
struct  SplitParams
 
struct  SplitVParams
 
struct  StridedSliceParams
 
struct  Tensor
 
struct  TransposeConvParams
 
struct  TransposeParams
 
struct  TTypes
 
struct  UnpackParams
 
struct  UNUSED_ALL
 

Typedefs

template<typename Scalar >
using VectorMap = typename std::conditional< std::is_const< Scalar >::value, Eigen::Map< const Eigen::Matrix< typename std::remove_const< Scalar >::type, Eigen::Dynamic, 1 > >, Eigen::Map< Eigen::Matrix< Scalar, Eigen::Dynamic, 1 > > >::type
 
template<typename Scalar >
using MatrixMap = typename std::conditional< std::is_const< Scalar >::value, Eigen::Map< const Eigen::Matrix< typename std::remove_const< Scalar >::type, Eigen::Dynamic, Eigen::Dynamic > >, Eigen::Map< Eigen::Matrix< Scalar, Eigen::Dynamic, Eigen::Dynamic > > >::type
 
template<typename T >
using ComparisonFn = bool(*)(T, T)
 
using ShapeVec = std::vector< int32_t >
 
using Labels = std::vector< int32_t >
 
using OperandLabels = std::vector< Labels >
 
using LabelCounts = std::vector< int32_t >
 
using OperandLabelCounts = std::vector< LabelCounts >
 
using LabelToDimSizes = std::vector< int32_t >
 
typedef Eigen::ThreadPoolDevice CPUDevice
 
typedef TTypes< float, 1 >::Tensor32Bit::Index Index32
 

Enumerations

enum  DimensionType {
  kBroadcasting = 0 , kBatch = 1 , kFree = 2 , kContract = 3 ,
  kReduce = 4
}
 
enum class  FusedActivationFunctionType {
  kNone = 0 , kRelu6 = 1 , kRelu1 = 2 , kRelu = 3 ,
  kTanh = 4 , kSigmoid = 6
}
 
enum class  PaddingType { kNone = 0 , kSame = 1 , kValid = 2 }
 
enum class  BinaryArithmeticOpType {
  ADD = 0 , SUB = 1 , MUL = 2 , DIV = 3 ,
  POW = 4
}
 
enum class  ComparisonOpType {
  Equal , NotEqual , Greater , GreaterEqual ,
  Less , LessEqual
}
 
enum class  RoPEMode { kGptNeox = 0 , kGptJ = 1 }
 
enum class  BroadcastableOpCategory : uint8_t {
  kNone , kNonBroadcast , kFirstInputBroadcastsFast , kSecondInputBroadcastsFast ,
  kGenericBroadcast
}
 
enum  LSTMKernelType { kTfLiteLSTMFullKernel = 0 , kTfLiteLSTMBasicKernel }
 
enum class  Order { kColMajor , kRowMajor }
 
enum class  CachePolicy : std::uint8_t { kNeverCache , kCacheIfLargeSpeedup , kAlwaysCache }
 
enum class  QuantizationFlavor { kFloatingPoint , kIntegerWithUniformMultiplier , kIntegerWithPerRowMultiplier }
 

Functions

template<typename Scalar >
VectorMap< Scalar > MapAsVector (Scalar *data, const Shape &shape)
 
template<typename Scalar >
MatrixMap< Scalar > MapAsMatrixWithLastDimAsRows (Scalar *data, const Shape &shape)
 
template<typename T >
void AddN (const Shape &input_shape, const size_t num_inputs, const T **input_data, T *output_data)
 
template<typename T1 , typename T2 , typename Cmp >
void ArgMinMax (const Shape &input1_shape, const T1 *input1_data, const Shape &output_shape, T2 *output_data, int32_t axis, const Cmp &cmp)
 
template<typename T >
void AveragePool (const PoolParams &, const Shape &, const T *, const Shape &, T *)
 
template<>
void AveragePool< float > (const PoolParams &params, const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data)
 
void AveragePool16 (const PoolParams &params, const Shape &input_shape, const uint8_t *input_data, const Shape &output_shape, uint8_t *output_data)
 
void AveragePool32 (const PoolParams &params, const Shape &input_shape, const uint8_t *input_data, const Shape &output_shape, uint8_t *output_data)
 
template<>
void AveragePool< uint8_t > (const PoolParams &params, const Shape &input_shape, const uint8_t *input_data, const Shape &output_shape, uint8_t *output_data)
 
template<>
void AveragePool< int8_t > (const PoolParams &params, const Shape &input_shape, const int8_t *input_data, const Shape &output_shape, int8_t *output_data)
 
void GetIndexRange (int spatial_index_dim, int block_shape_dim, int input_dim, int output_dim, int *start_index, int *end_index)
 
template<typename T >
void BatchToSpaceND (const Shape &unextended_input1_shape, const T *input1_data, const int32_t *block_shape_data, const int32_t *crops_data, const Shape &unextended_output_shape, T *output_data)
 
bool ProcessBroadcastShapes (const Shape &shape0, const Shape &shape1, BinaryArithmeticOpParam *params)
 
template<BinaryArithmeticOpType op_type, typename T >
std::enable_if_t<!is_quant8< T >::value &&!std::is_same< T, bool >::value > BinaryArithmeticOp (const BinaryArithmeticOpParam &params, const Shape &input1_shape, const T *input1_data, const Shape &input2_shape, const T *input2_data, const Shape &output_shape, T *output_data)
 
template<BinaryArithmeticOpType op_type, typename T >
std::enable_if_t<!is_quant8< T >::value &&std::is_same< T, bool >::value > BinaryArithmeticOp (const BinaryArithmeticOpParam &params, const Shape &input1_shape, const T *input1_data, const Shape &input2_shape, const T *input2_data, const Shape &output_shape, T *output_data)
 
template<BinaryArithmeticOpType op_type, typename T >
std::enable_if_t< is_quant8< T >::value > BinaryArithmeticOp (const BinaryArithmeticOpParam &params, const Shape &input1_shape, const T *input1_data, const Shape &input2_shape, const T *input2_data, const Shape &output_shape, T *output_data)
 
template<BinaryArithmeticOpType op_type>
void BinaryArithmeticOp (const BinaryArithmeticOpParam &params, const Shape &input1_shape, const float *input1_data, const Shape &input2_shape, const float *input2_data, const Shape &output_shape, float *output_data)
 
template<BinaryArithmeticOpType op_type, typename T >
std::enable_if_t<!is_quant8< T >::value > BroadcastBinaryArithmeticOp (BinaryArithmeticOpParam &params, const Shape &input1_shape, const T *input1_data, const Shape &input2_shape, const T *input2_data, const Shape &output_shape, T *output_data)
 
template<BinaryArithmeticOpType op_type, typename T >
std::enable_if_t< is_quant8< T >::value > BroadcastBinaryArithmeticOp (BinaryArithmeticOpParam &params, const Shape &input1_shape, const T *input1_data, const Shape &input2_shape, const T *input2_data, const Shape &output_shape, T *output_data)
 
template<BinaryArithmeticOpType op_type>
void BroadcastBinaryArithmeticOp (BinaryArithmeticOpParam &params, const Shape &input1_shape, const float *input1_data, const Shape &input2_shape, const float *input2_data, const Shape &output_shape, float *output_data)
 
template<typename T >
void BroadcastTo (const Shape &input_shape, T *input_data, const Shape &output_shape, T *output_data)
 
void BiasAndClamp (float clamp_min, float clamp_max, int bias_size, const float *bias_data, int array_size, float *array_data)
 
template<typename T >
bool EqualFn (T lhs, T rhs)
 
template<typename T >
bool NotEqualFn (T lhs, T rhs)
 
template<typename T >
bool GreaterFn (T lhs, T rhs)
 
template<typename T >
bool GreaterEqualFn (T lhs, T rhs)
 
template<typename T >
bool LessFn (T lhs, T rhs)
 
template<typename T >
bool LessEqualFn (T lhs, T rhs)
 
template<typename T , ComparisonFn< T > F>
void ComparisonImpl (const Shape &input1_shape, const T *input1_data, const Shape &input2_shape, const T *input2_data, const Shape &output_shape, bool *output_data)
 
template<ComparisonFn< float > F>
void Comparison (const Shape &input1_shape, const float *input1_data, const Shape &input2_shape, const float *input2_data, const Shape &output_shape, bool *output_data)
 
template<typename T , ComparisonFn< int32_t > F>
void ComparisonWithScaling (ComparisonParams &params, const Shape &input1_shape, const T *input1_data, const Shape &input2_shape, const T *input2_data, const Shape &output_shape, bool *output_data)
 
template<typename T , ComparisonFn< T > F>
void BroadcastComparison4DSlowImpl (const Shape &unextended_input1_shape, const T *input1_data, const Shape &unextended_input2_shape, const T *input2_data, const Shape &unextended_output_shape, bool *output_data)
 
template<typename T , ComparisonFn< T > F>
void BroadcastComparison4DSlow (const Shape &input1_shape, const T *input1_data, const Shape &input2_shape, const T *input2_data, const Shape &output_shape, bool *output_data)
 
template<typename T , ComparisonFn< int32_t > F>
void BroadcastComparison4DSlowWithScaling (ComparisonParams &params, const Shape &input1_shape, const T *input1_data, const Shape &input2_shape, const T *input2_data, const Shape &output_shape, bool *output_data)
 
 TFLITE_COMPARISON_OP (Equal)
 
 TFLITE_COMPARISON_OP (NotEqual)
 
 TFLITE_COMPARISON_OP (Greater)
 
 TFLITE_COMPARISON_OP (GreaterEqual)
 
 TFLITE_COMPARISON_OP (Less)
 
 TFLITE_COMPARISON_OP (LessEqual)
 
template<typename Scalar >
void Concatenation (const ConcatenationParams &params, const Shape *const *input_shapes, const Scalar *const *input_data, const Shape &output_shape, Scalar *output_data)
 
void ConcatenationWithScaling (const ConcatenationParams &params, const Shape *const *input_shapes, const uint8_t *const *input_data, const Shape &output_shape, uint8_t *output_data)
 
template<typename T >
void DepthToSpace (const Shape &unextended_input_shape, const T *input_data, const Shape &unextended_output_shape, T *output_data, int32_t block_size)
 
int HowManyConvThreads (const Shape &output_shape, const Shape &filter_shape)
 
bool MultithreadAlongBatches (int thread_count, int batches)
 
template<typename T , typename TS >
void DepthwiseConv (const DepthwiseConvParams &params, const Shape &input_shape, const T *input_data, const Shape &filter_shape, const T *filter_data, const Shape &bias_shape, const TS *bias_data, const Shape &output_shape, T *output_data, ruy::Context *ruy_context)
 
void DepthwiseConvOp (const DepthwiseConvParams &params, const Shape &input_shape, const float *input_data, const Shape &filter_shape, const float *filter_data, const Shape &bias_shape, const float *bias_data, float *padded_filter_data, bool pad_filter, float *filter_buffers_data, const Shape &output_shape, float *output_data)
 
void Dequantize (const Shape &input_shape, const uint8_t *input_data, const Shape &output_shape, float *output_data, const float scale, const int32_t zero_point)
 
void Dequantize (const Shape &input_shape, const int8_t *input_data, const Shape &output_shape, float *output_data, const float scale, const int32_t zero_point)
 
void Dequantize (const Shape &input_shape, const int16_t *input_data, const Shape &output_shape, float *output_data, const float scale, const int32_t zero_point)
 
void Sin (const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data)
 
void Cos (const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data)
 
void Abs (const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data)
 
void Rsqrt (const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data)
 
template<typename T >
void Neg (const Shape &input_shape, const T *input_data, const Shape &output_shape, T *output_data)
 
void Log (const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data)
 
void Floor (const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data)
 
void Sqrt (const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data)
 
void Square (const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data)
 
void ELU (const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data)
 
void Erf (const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data)
 
void Exp (const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data)
 
template<typename T >
void Fill (const T *value_data, const Shape &output_shape, T *output_data)
 
template<typename T >
void FloorDivBroadcast (const Shape &unextended_input1_shape, const T *input1_data, const Shape &unextended_input2_shape, const T *input2_data, const Shape &unextended_output_shape, T *output_data)
 
template<typename T >
void FloorDivElementwise (const Shape &shape, const T *input1_data, const T *input2_data, T *output_data)
 
template<typename T >
void FloorModBroadcast (const Shape &unextended_input1_shape, const T *input1_data, const Shape &unextended_input2_shape, const T *input2_data, const Shape &unextended_output_shape, T *output_data)
 
template<typename T >
void FloorModElementwise (const Shape &shape, const T *input1_data, const T *input2_data, T *output_data)
 
void FullyConnected (const FullyConnectedParams &params, const Shape &input_shape, const float *input_data, const Shape &weights_shape, const float *weights_data, const Shape &, const float *bias_data, const Shape &, float *output_data)
 
void FullyConnected (const FullyConnectedParams &params, const Shape &input_shape, const uint8_t *input_data, const Shape &filter_shape, const uint8_t *filter_data, const Shape &bias_shape, const int32_t *bias_data, const Shape &output_shape, uint8_t *output_data)
 
void FullyConnectedHybrid (const FullyConnectedParams &params, const Shape &input_shape, const float *input_data, const Shape &filter_shape, const int8_t *filter_data, const Shape &, const float *bias_data, const Shape &output_shape, float *output_data, FCTempArena &temp_arena, ruy::Context *ruy_context)
 
void FullyConnectedSparseWeightRandom (const FullyConnectedParams &params, const Shape &input_shape, const float *input_data, const Shape &weights_shape, const float *weights_data, const Shape &bias_shape, const float *bias_data, const Shape &output_shape, float *output_data, const uint16_t *w1_segments, const uint16_t *w1_indices)
 
void FullyConnectedSparseWeight16x1 (const FullyConnectedParams &params, const Shape &input_shape, const float *input_data, const Shape &weights_shape, const float *weights_data, const Shape &bias_shape, const float *bias_data, const Shape &output_shape, float *output_data, const uint16_t *w1_segments, const uint16_t *w1_indices)
 
template<typename T , typename CoordsT = int32_t>
void Gather (const GatherParams &op_params, const Shape &input_shape, const T *input_data, const Shape &coords_shape, const CoordsT *coords_data, const Shape &, T *output_data)
 
void ComputeBatchIndices (const int32_t output_batch_size, const std::vector< int32_t > &reshape, const std::vector< int32_t > &bcast, std::vector< int32_t > *out_indices)
 
template<typename DSizes >
Eigen::DSizes< Index32, DSizes::count > To32BitDims (const DSizes &in)
 
template<typename TensorType >
TTypes< typename TensorType::Scalar, TensorType::NumIndices >::Tensor32Bit To32Bit (TensorType in)
 
void InstanceNorm (const InstanceNormParams &params, const Shape &input_shape, const float *input_data, const Shape &gamma_shape, const float *gamma_data, const Shape &beta_shape, const float *beta_data, const Shape &output_shape, float *output_data)
 
void L2NormalizeFloat32 (const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data)
 
void L2NormalizeQuant8 (L2NormParams &params, const Shape &input_shape, const uint8_t *input_data, const Shape &output_shape, uint8_t *output_data)
 
void LeakyReLU (const LeakyReluParams &params, const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data)
 
template<typename T >
void LogicalAndBroadcast (const Shape &unextended_input1_shape, const T *input1_data, const Shape &unextended_input2_shape, const T *input2_data, const Shape &unextended_output_shape, T *output_data)
 
template<typename T >
void LogicalAndElementwise (const Shape &shape, const T *input1_data, const T *input2_data, T *output_data)
 
void LogicalNot (const Shape &input_shape, const bool *input_data, const Shape &output_shape, bool *output_data)
 
template<typename T >
void LogicalOrBroadcast (const Shape &unextended_input1_shape, const T *input1_data, const Shape &unextended_input2_shape, const T *input2_data, const Shape &unextended_output_shape, T *output_data)
 
template<typename T >
void LogicalOrElementwise (const Shape &shape, const T *input1_data, const T *input2_data, T *output_data)
 
void Logistic (const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data)
 
void LogSoftmax (const SoftmaxParams &params, const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data)
 
void LogSoftmax (const SoftmaxParams &params, float input_scale, const Shape &input_shape, const uint8_t *input_data, const Shape &output_shape, uint8_t *output_data)
 
void CalculateLstmGateFloat (const float *input, const float *input_to_gate_weights, const float *aux_input, const float *aux_input_to_gate_weights, const float *output_state, const float *recurrent_to_gate_weights, const float *cell_state, const float *cell_to_gate_weights, const float *layer_norm_coefficients, const float *gate_bias, const int n_batch, const int n_input, const int n_aux_input, const int n_output, const int n_cell, const FusedActivationFunctionType activation, float *gate, const bool is_input_all_zeros, const bool is_aux_input_all_zeros)
 
void UpdateLstmCellFloat (int n_batch, int n_cell, float *cell_state, const float *input_gate, float *forget_gate, const float *cell_gate, bool use_cifg, float clip)
 
void CalculateLstmOutputFloat (int n_batch, int n_cell, int n_output, const float *cell_state, const float *output_gate, FusedActivationFunctionType activation, const float *projection_weights, const float *projection_bias, const float proj_clip, float *output_state, float *scratch)
 
void LstmStepFloat (const float *input_ptr, const float *input_to_input_weights_ptr, const float *input_to_forget_weights_ptr, const float *input_to_cell_weights_ptr, const float *input_to_output_weights_ptr, const float *aux_input_ptr, const float *aux_input_to_input_weights_ptr, const float *aux_input_to_forget_weights_ptr, const float *aux_input_to_cell_weights_ptr, const float *aux_input_to_output_weights_ptr, const float *recurrent_to_input_weights_ptr, const float *recurrent_to_forget_weights_ptr, const float *recurrent_to_cell_weights_ptr, const float *recurrent_to_output_weights_ptr, const float *cell_to_input_weights_ptr, const float *cell_to_forget_weights_ptr, const float *cell_to_output_weights_ptr, const float *input_layer_norm_coefficients_ptr, const float *forget_layer_norm_coefficients_ptr, const float *cell_layer_norm_coefficients_ptr, const float *output_layer_norm_coefficients_ptr, const float *input_gate_bias_ptr, const float *forget_gate_bias_ptr, const float *cell_gate_bias_ptr, const float *output_gate_bias_ptr, const float *projection_weights_ptr, const float *projection_bias_ptr, const LSTMParams *params, int n_batch, int n_cell, int n_input, int n_aux_input, int n_output, int output_batch_leading_dim, float *output_state_ptr, float *cell_state_ptr, float *scratch0, float *scratch1, float *scratch2, float *scratch3, float *output_ptr)
 
template<typename T >
void MatrixBandPart (const T num_lower_diags, const T num_upper_diags, const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data)
 
template<typename T , typename Op >
void MaximumMinimumBroadcast4DSlow (const Shape &unextended_input1_shape, const T *input1_data, const Shape &unextended_input2_shape, const T *input2_data, const Shape &unextended_output_shape, T *output_data, Op op)
 
template<typename T >
void Max (const Shape &unextended_input1_shape, const T *input1_data, const Shape &unextended_input2_shape, const T *input2_data, const Shape &unextended_output_shape, T *output_data)
 
template<typename T >
void Min (const Shape &unextended_input1_shape, const T *input1_data, const Shape &unextended_input2_shape, const T *input2_data, const Shape &unextended_output_shape, T *output_data)
 
template<typename T >
void MaxPool (const PoolParams &, const Shape &, const T *, const Shape &, T *)
 
template<>
void MaxPool< float > (const PoolParams &params, const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data)
 
template<>
void MaxPool< uint8_t > (const PoolParams &params, const Shape &input_shape, const uint8_t *input_data, const Shape &output_shape, uint8_t *output_data)
 
template<typename T , typename TI >
void OneHot (const int32_t depth, const T on_value, const T off_value, int32_t axis, const Shape &indices_shape, const TI *indices_data, const Shape &, T *output_data)
 
template<typename Scalar >
void Pack (const PackParams &params, const Scalar *const *input_data, const Shape &output_shape, Scalar *output_data)
 
template<typename T >
void Pad (const int32_t *padding_data, int32_t pad_rank, const Shape &input_shape, const T *input_data, const Shape &output_shape, T *output_data, const T *constant_value_data)
 
template<typename T >
void powImpl (const Shape &input1_shape, const T *input1_data, const Shape &input2_shape, const T *input2_data, const Shape &output_shape, T *output_data)
 
template<typename InputT , typename OutputT >
void Quantize (const Shape &input_shape, const InputT *input_data, const Shape &output_shape, OutputT *output_data, const float output_scale, const int32_t output_offset)
 
template<>
void Quantize (const Shape &input_shape, const float *input_data, const Shape &output_shape, int8_t *output_data, const float scale, const int32_t zero_point)
 
template<>
void Quantize (const Shape &input_shape, const float *input_data, const Shape &output_shape, uint8_t *output_data, const float scale, const int32_t zero_point)
 
template<>
void Quantize (const Shape &input_shape, const float *input_data, const Shape &output_shape, int16_t *output_data, const float scale, const int32_t zero_point)
 
void Quantize (const int32_t *multiplier, const int32_t *shift, int32_t channel_size, int32_t total_size, int32_t output_zp, int32_t output_min, int32_t output_max, int32_t *scratch, int8_t *output)
 
template<typename input_type , typename output_type >
void Requantize (const input_type *input_data, int32_t size, int32_t effective_scale_multiplier, int32_t effective_scale_shift, int32_t input_zeropoint, int32_t output_zeropoint, output_type *output_data)
 
template<>
void Requantize< uint8_t, int8_t > (const uint8_t *input_data, int32_t size, int32_t effective_scale_multiplier, int32_t effective_scale_shift, int32_t input_zeropoint, int32_t output_zeropoint, int8_t *output_data)
 
template<>
void Requantize< int8_t, uint8_t > (const int8_t *input_data, int32_t size, int32_t effective_scale_multiplier, int32_t effective_scale_shift, int32_t input_zeropoint, int32_t output_zeropoint, uint8_t *output_data)
 
template<typename T >
int GetSize (T start, T limit, T delta)
 
template<typename T >
void Range (const T *start_data, const T *limit_data, const T *delta_data, T *output_data)
 
template<typename In , typename Out >
bool ReduceImpl (const In *input_data, const Shape &input_shape, const Shape &, const int *axis, const int num_axis, int *input_iter, Out reducer(const Out current, const In in), Out *output_data)
 
bool ResolveAxis (const int num_dims, const std::vector< int > &axes, int *out_axis, int *out_num_axis)
 
template<typename T >
bool InitTensorDataForReduce (const Shape &shape, const T init_value, T *data)
 
float round_nearest (float value)
 
template<typename Out , typename In >
Out mean_reducer (const Out data1, const In data2, int normalizer)
 
template<typename In >
int sum_reducer (const int data1, const In data2)
 
template<typename In , typename Out >
bool ReduceMeanImpl (const In *input_data, const Shape &input_shape, const int *axis, const int num_axis, int *input_iter, Out reducer(const Out current, const In in, int normalizer), Out *output_data)
 
template<typename In >
size_t ReduceSumQuantImpl (const In *input_data, const Shape &input_shape, const int *axis, const int num_axis, int *input_iter, int reducer(const int current, const In in), int *temp_sum)
 
template<typename In , typename Out >
void Mean (const Shape &input_shape, const In *input_data, const Shape &output_shape, Out *output_data, const std::vector< int > &axes)
 
template<typename In , typename Out >
void MeanQ8Asymm (const Shape &input_shape, const In *input_data, float input_scale, int32_t input_offset, const Shape &output_shape, Out *output_data, float output_scale, int32_t output_offset, const std::vector< int > &axes)
 
template<typename In , typename Out >
void MeanAxis1And2 (const Shape &input_shape, const In *input_data, const Shape &output_shape, Out *output_data)
 
void ReLU (const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data)
 
void ReLU6 (const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data)
 
void ResizeBilinearKernel2x2 (int32_t x0, int32_t x1, int32_t y0, int32_t y1, int32_t x, int32_t y, int32_t depth, int32_t batch, const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data)
 
void ResizeBilinear2x2 (int32_t batches, int32_t input_height, int32_t input_width, int32_t depth, int32_t output_height, int32_t output_width, const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data)
 
void ResizeBilinearKernel (const float *input_ptr, int32_t depth, float scale, float *output_ptr)
 
void ComputeInterpolationValues (const float value, const float scale, const bool half_pixel_centers, int32_t input_size, float *scaled_value, int32_t *lower_bound, int32_t *upper_bound)
 
void ResizeBilinearGeneric (int32_t batches, int32_t input_height, int32_t input_width, int32_t depth, int32_t output_height, int32_t output_width, float height_scale, float width_scale, const Shape &input_shape, const float *input_data, float *output_data, const bool half_pixel_centers)
 
template<typename T >
void ResizeBilinearGenericSmallChannel (int32_t batches, int32_t input_height, int32_t input_width, int32_t depth, int32_t output_height, int32_t output_width, float height_scale, float width_scale, const Shape &input_shape, const T *input_data, T *output_data, const bool half_pixel_centers)
 
void ResizeBilinear (ResizeBilinearParams &params, const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data)
 
void ResizeBilinear (ResizeBilinearParams &params, const Shape &input_shape, const uint8_t *input_data, const Shape &output_shape, uint8_t *output_data)
 
void ComputeInterpolationValues (const int32_t value, const int32_t scale_10, const bool half_pixel_centers, int32_t input_size, int32_t *scaled_value, int32_t *lower_bound, int32_t *upper_bound)
 
void ResizeBilinear (const ResizeBilinearParams &op_params, const Shape &unextended_input_shape, const int8_t *input_data, const Shape &unextended_output_shape, int8_t *output_data)
 
template<typename Scalar >
void Reverse (int axis, const Shape &input_shape, const Scalar *input_data, const Shape &, Scalar *output_data)
 
void RmsNorm (const RmsNormParams &params, const Shape &input_shape, const float *input_data, const Shape &gamma_shape, const float *gamma_data, const Shape &output_shape, float *output_data)
 
template<typename T >
void RoPE (const RoPEMode mode, const Shape &input_shape, const T *input_data, const Shape &sin_table_shape, const T *sin_table_data, const Shape &cos_table_shape, const T *cos_table_data, const Shape &output_shape, T *output_data)
 
float RoundToNearest (float value)
 
void Round (const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data)
 
template<typename D , typename T >
void Select (const Shape &input_condition_shape, const D *input_condition_data, const Shape &input_x_shape, const T *input_x_data, const Shape &input_y_shape, const T *input_y_data, const Shape &output_shape, T *output_data)
 
template<typename D , typename T >
void RankOneSelect (const Shape &input_condition_shape, const D *input_condition_data, const Shape &input_x_shape, const T *input_x_data, const Shape &input_y_shape, const T *input_y_data, const Shape &output_shape, T *output_data)
 
template<typename D , typename T >
void BroadcastSelect4DSlow (const Shape &input_condition_shape, const D *input_condition_data, const Shape &input_x_shape, const T *input_x_data, const Shape &input_y_shape, const T *input_y_data, const Shape &output_shape, T *output_data)
 
template<typename T >
void Slice (const SliceParams &op_params, const Shape &input_shape, SequentialTensorWriter< T > *writer)
 
template<typename T >
void Slice (const SliceParams &op_params, const Shape &input_shape, const T *input_data, T *output_data)
 
void Softmax (const float *in, const int input_size, const int batch_size, const float beta, float *out)
 
void Softmax (const SoftmaxParams &params, const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data)
 
template<typename T >
int32_t QuantizeSoftmaxOutput (float prob_rescaled, int32_t zero_point)
 
template<>
int32_t QuantizeSoftmaxOutput< uint8_t > (float prob_rescaled, int32_t)
 
void PopulateSoftmaxLookupTable (float *table, float input_scale, float beta)
 
template<typename In , typename Out >
void Softmax (const SoftmaxParams &params, const Shape &input_shape, const In *input_data, const Shape &output_shape, Out *output_data)
 
template<typename T >
void SpaceToBatchND (const SpaceToBatchParams &params, const Shape &unextended_input_shape, const T *input_data, const Shape &unextended_block_shape_shape, const int32_t *block_shape_data, const Shape &unextended_padding_shape, const int32_t *paddings_data, const Shape &unextended_output_shape, T *output_data)
 
template<typename T >
void SpaceToDepth (const SpaceToDepthParams &params, const Shape &unextended_input_shape, const T *input_data, const Shape &unextended_output_shape, T *output_data)
 
template<typename Scalar >
void Split (const SplitParams &params, const Shape &input_shape, const Scalar *input_data, const Shape &output_shape, Scalar *const *output_data)
 
template<typename Scalar >
void SplitV (const SplitVParams &params, const Shape &input_shape, const Scalar *input_data, std::vector< nnfw::cker::Shape > &output_shapes, Scalar *const *output_data)
 
template<typename T , int N>
void SqDiffImpl (const Shape &input1_shape, const T *input1_data, const Shape &input2_shape, const T *input2_data, const Shape &output_shape, T *output_data, NdArrayDesc< N > *desc1_in, NdArrayDesc< N > *desc2_in, NdArrayDesc< N > *desc_out)
 
template<typename T >
void SqDiff (const Shape &input1_shape, const T *input1_data, const Shape &input2_shape, const T *input2_data, const Shape &output_shape, T *output_data)
 
void GenerateKey (Tensor seed, random::PhiloxRandom::Key *out_key, random::PhiloxRandom::ResultType *out_counter)
 
template<typename Device , class Distribution >
void Fill (random::PhiloxRandom random, Tensor *output)
 
void StatelessRandomUniform (const Shape &shape_shape, const int32_t *shape_data, const Shape &seed_shape, const int32_t *seed_data, const Shape &output_shape, float *output_data)
 
int Clamp (const int v, const int lo, const int hi)
 
void StridedSlicePadIndices (StridedSliceParams *p, int dim_count)
 
int StartForAxis (const StridedSliceParams &params, const Shape &input_shape, int axis)
 
int StopForAxis (const StridedSliceParams &params, const Shape &input_shape, int axis, int start_for_axis)
 
bool LoopCondition (int index, int stop, int stride)
 
template<typename T >
StridedSliceParams buildStridedSliceParams (const T *begin, const T *end, const T *strides, const uint32_t begin_mask, const uint32_t end_mask, const uint32_t shrink_axis_mask, const uint8_t rank)
 
void checkOutputSize (const StridedSliceParams &op_params, const Shape &input_shape, const Shape &output_shape, uint32_t rank)
 
template<typename T >
void StridedSlice (const StridedSliceParams &op_params, const Shape &unextended_input_shape, const T *input_data, const Shape &unextended_output_shape, T *output_data)
 
void Tanh (const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data)
 
template<typename T , typename M >
void CopyMultipleTimes (const T *in_data, int32_t in_size, M multiplier, T *out_data)
 
template<typename T , typename M >
std::pair< int, int > TileOneDimension (const Shape &in_dimensions, const T *in_data, const M *multipliers, T *out_data, int dimension)
 
template<typename T >
void Transpose2D (const Shape &input_shape, const T *input_data, const Shape &output_shape, T *output_data)
 
template<typename T >
void Transpose3D (const TransposeParams &params, const Shape &input_shape, const T *input_data, const Shape &, T *output_data)
 
template<typename T >
void TransposeImpl (const TransposeParams &params, const Shape &input_shape, const T *input_data, const Shape &output_shape, T *output_data)
 
template<typename T >
void Transpose (const TransposeParams &unshrunk_params, const Shape &unshrunk_input_shape, const T *input_data, const Shape &unshrunk_output_shape, T *output_data)
 
void TransposeConv (const TransposeConvParams &params, const Shape &input_shape, const float *input_data, const Shape &filter_shape, const float *filter_data, const Shape &output_shape, float *output_data)
 
template<typename Scalar >
void Unpack (const UnpackParams &params, const Shape &input_shape, const Scalar *input_data, const Shape &output_shape, Scalar *const *output_datas)
 
template<typename T >
void PortableCwiseClipping (T *vector, const int v_size, const T clipping_value)
 
void PortableVectorBatchVectorAssign (const float *vector, int v_size, int n_batch, float *batch_vector)
 
void PortableVectorBatchVectorAdd (const float *vector, int v_size, int n_batch, float *batch_vector)
 
bool PortableIsZeroVector (const float *vector, int v_size)
 
void PortableApplyActivationToVector (const float *vector, int v_size, FusedActivationFunctionType activation, float *result)
 
void PortableSub1Vector (const float *vector, int v_size, float *result)
 
void PortableSymmetricQuantizeFloats (const float *values, const int size, int8_t *quantized_values, float *min_value, float *max_value, float *scaling_factor)
 
void PortableAsymmetricQuantizeFloats (const float *values, const int size, int8_t *quantized_values, float *scaling_factor, int32_t *offset)
 
void PortableMatrixBatchVectorMultiplyAccumulate (const int8_t *__restrict__ matrix, const int m_rows, const int m_cols, const int8_t *__restrict__ vectors, const float *scaling_factors, int n_batch, float *__restrict__ result, int result_stride)
 
void PortableMatrixBatchVectorMultiplyAccumulate (const int8_t *__restrict__ matrix, const int m_rows, const int m_cols, const int8_t *__restrict__ vector, const float *scaling_factors, int n_batch, int32_t *, float *__restrict__ result, int result_stride, ruy::Context *)
 
void PortableMatrixBatchVectorMultiplyAccumulate (const float *matrix, int m_rows, int m_cols, const float *vector, int n_batch, float *result, int result_stride)
 
void PortableMeanStddevNormalization (const float *input_vector, float *output_vector, int v_size, int n_batch)
 
void PortableZeroVector (float *vector, int v_size)
 
int MatchingDim (const Shape &shape1, int index1, const Shape &shape2, int index2)
 
template<typename... Args>
int MatchingDim (const Shape &shape1, int index1, const Shape &shape2, int index2, Args... args)
 
Shape GetShape (const std::vector< int32_t > &data)
 
int Offset (const Shape &shape, int i0, int i1, int i2, int i3)
 
int Offset (const Shape &shape, int *index)
 
int FlatSizeSkipDim (const Shape &shape, int skip_dim)
 
template<typename... Ts>
bool checkMatching (const Shape &shape, Ts... check_shapes)
 
template<typename... Ts>
int MatchingFlatSize (const Shape &shape, Ts... check_shapes)
 
int MatchingFlatSizeSkipDim (const Shape &shape, int skip_dim, const Shape &check_shape_0)
 
int MatchingFlatSizeSkipDim (const Shape &shape, int skip_dim, const Shape &check_shape_0, const Shape &check_shape_1)
 
int MatchingElementsSize (const Shape &shape, const Shape &check_shape_0, const Shape &check_shape_1)
 
ShapeIterator begin (const Shape &s)
 
ShapeIterator end (const Shape &s)
 
void CwiseClipping (float *vector, const int v_size, const float clipping_value)
 
void VectorBatchVectorAdd (const float *vector, int v_size, int n_batch, float *batch_vector)
 
void VectorBatchVectorAssign (const float *vector, int v_size, int n_batch, float *batch_vector)
 
template<typename T >
void VectorVectorCwiseProduct (const T *__restrict__ vector1, const T *__restrict__ vector2, int v_size, T *__restrict__ result)
 
template<typename T >
void VectorVectorCwiseProductAccumulate (const T *__restrict__ vector1, const T *__restrict__ vector2, int v_size, T *__restrict__ result)
 
template<typename T >
void VectorBatchVectorCwiseProduct (const T *vector, int v_size, const T *batch_vector, int n_batch, T *result)
 
template<typename T >
void VectorBatchVectorCwiseProductAccumulate (const T *vector, int v_size, const T *batch_vector, int n_batch, T *result)
 
bool IsZeroVector (const float *vector, int v_size)
 
void ApplyActivationToVector (const float *vector, int v_size, FusedActivationFunctionType activation, float *result)
 
void Sub1Vector (const float *vector, int v_size, float *result)
 
void SymmetricQuantizeFloats (const float *values, const int size, int8_t *quantized_values, float *min, float *max, float *scaling_factor)
 
void MatrixBatchVectorMultiplyAccumulate (const int8_t *matrix, const int m_rows, const int m_cols, const int8_t *vector, const float *scaling_factors, int n_batch, float *result, int result_stride)
 
void MatrixBatchVectorMultiplyAccumulate (const float *matrix, int m_rows, int m_cols, const float *vector, int n_batch, float *result, int result_stride)
 
void MatrixBatchVectorMultiplyAccumulate (const int8_t *matrix, const int m_rows, const int m_cols, const int8_t *vectors, const float *scaling_factors, int n_batch, int32_t *scratch, float *result, int result_stride, ruy::Context *ruy_context)
 
void MeanStddevNormalization (const float *input_vector, float *output_vector, int v_size, int n_batch)
 
void ZeroVector (float *vector, int v_size)
 
template<typename AccumScalar , typename DstScalar , QuantizationFlavor quantization_flavor>
void ValidateGemmParams (const GemmParams< AccumScalar, DstScalar, quantization_flavor > &params)
 
template<typename T >
T ActivationFunctionWithMinMax (T x, T output_activation_min, T output_activation_max)
 
void QuantizeMultiplier (double double_multiplier, int32_t *quantized_multiplier, int *shift)
 
void QuantizeMultiplierSmallerThanOneExp (double double_multiplier, int32_t *quantized_multiplier, int *left_shift)
 
int32_t MultiplyByQuantizedMultiplier (int32_t x, int32_t quantized_multiplier, int shift)
 
int32_t MultiplyByQuantizedMultiplierGreaterThanOne (int32_t x, int32_t quantized_multiplier, int left_shift)
 
int32_t MultiplyByQuantizedMultiplierSmallerThanOneExp (int32_t x, int32_t quantized_multiplier, int left_shift)
 
int NodeOffset (int b, int h, int w, int height, int width)
 
int CountLeadingZeros (uint32_t integer_input)
 
void GetInvSqrtQuantizedMultiplierExp (int32_t input, int reverse_shift, int32_t *output_inv_sqrt, int *output_shift)
 
int SubscriptToIndex (const NdArrayDesc< 4 > &desc, int i0, int i1, int i2, int i3)
 
template<int N>
int SubscriptToIndexGeneric (const NdArrayDesc< N > *desc, int *iter)
 
template<int N>
void CopyDimsToDesc (const Shape &input_shape, NdArrayDesc< N > *desc_out)
 
template<int N>
void NdArrayDescsForElementwiseBroadcast (const Shape &input0_shape, const Shape &input1_shape, NdArrayDesc< N > *desc0_out, NdArrayDesc< N > *desc1_out)
 
template<int N>
void NdArrayDescsForElementwiseBroadcast (const Shape &input0_shape, const Shape &input1_shape, const Shape &input2_shape, NdArrayDesc< N > *desc0_out, NdArrayDesc< N > *desc1_out, NdArrayDesc< N > *desc2_out)
 
bool NextIndex (const int num_dims, const int *dims, int *current)
 
size_t ReducedOutputOffset (const int num_dims, const int *dims, const int *index, const int num_axis, const int *axis)
 
template<typename T >
void optimized_ops_preload_l1_keep (const T *ptr)
 
std::ostream & operator<< (std::ostream &os, const Shape &shape)
 

Typedef Documentation

◆ ComparisonFn

template<typename T >
using nnfw::cker::ComparisonFn = typedef bool (*)(T, T)

Definition at line 37 of file Comparison.h.

◆ CPUDevice

typedef Eigen::ThreadPoolDevice nnfw::cker::CPUDevice

Definition at line 51 of file RandomOpCpu.h.

◆ Index32

typedef TTypes<float,1>::Tensor32Bit::Index nnfw::cker::Index32

Definition at line 86 of file Tensor.h.

◆ LabelCounts

using nnfw::cker::LabelCounts = typedef std::vector<int32_t>

Definition at line 109 of file Einsum.h.

◆ Labels

using nnfw::cker::Labels = typedef std::vector<int32_t>

Definition at line 107 of file Einsum.h.

◆ LabelToDimSizes

using nnfw::cker::LabelToDimSizes = typedef std::vector<int32_t>

Definition at line 111 of file Einsum.h.

◆ MatrixMap

template<typename Scalar >
using nnfw::cker::MatrixMap = typedef typename std::conditional< std::is_const<Scalar>::value, Eigen::Map< const Eigen::Matrix<typename std::remove_const<Scalar>::type, Eigen::Dynamic, Eigen::Dynamic> >, Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic> >>::type

Definition at line 53 of file Utils.h.

◆ OperandLabelCounts

using nnfw::cker::OperandLabelCounts = typedef std::vector<LabelCounts>

Definition at line 110 of file Einsum.h.

◆ OperandLabels

using nnfw::cker::OperandLabels = typedef std::vector<Labels>

Definition at line 108 of file Einsum.h.

◆ ShapeVec

using nnfw::cker::ShapeVec = typedef std::vector<int32_t>

Definition at line 106 of file Einsum.h.

◆ VectorMap

template<typename Scalar >
using nnfw::cker::VectorMap = typedef typename std::conditional< std::is_const<Scalar>::value, Eigen::Map<const Eigen::Matrix<typename std::remove_const<Scalar>::type, Eigen::Dynamic, 1> >, Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, 1> >>::type

Definition at line 38 of file Utils.h.
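
A minimal usage sketch: MapAsVector() (declared above) wraps an existing buffer in this Eigen view so element-wise work can be expressed without copying. The include paths below are assumptions, not taken from this page.

#include <cker/Shape.h>        // assumed include path
#include <cker/eigen/Utils.h>  // assumed include path

// Scales a tensor in place through a VectorMap view; no copy of `data` is made.
void ScaleInPlace(float *data, const nnfw::cker::Shape &shape, float factor)
{
  auto view = nnfw::cker::MapAsVector(data, shape); // VectorMap<float>
  view *= factor;
}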

Enumeration Type Documentation

◆ BinaryArithmeticOpType

Enumerator
ADD 
SUB 
MUL 
DIV 
POW 

Definition at line 47 of file Types.h.

48{
49 ADD = 0,
50 SUB = 1,
51 MUL = 2,
52 DIV = 3,
53 POW = 4,
54};
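
A minimal dispatch sketch using the float overload of BinaryArithmeticOp() listed above. The BinaryArithmeticOpParam field names (float_activation_min/max) and the include path are assumptions.

#include <limits>
#include <cker/operation/BinaryArithmeticOps.h> // assumed include path

// Element-wise ADD of two equally shaped float tensors (no broadcasting).
void AddTensors(const nnfw::cker::Shape &shape, const float *lhs, const float *rhs, float *out)
{
  nnfw::cker::BinaryArithmeticOpParam param{};
  param.float_activation_min = std::numeric_limits<float>::lowest(); // assumed field name
  param.float_activation_max = std::numeric_limits<float>::max();    // assumed field name
  nnfw::cker::BinaryArithmeticOp<nnfw::cker::BinaryArithmeticOpType::ADD>(
    param, shape, lhs, shape, rhs, shape, out);
}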

◆ BroadcastableOpCategory

enum class nnfw::cker::BroadcastableOpCategory : uint8_t
strong
Enumerator
kNone 
kNonBroadcast 
kFirstInputBroadcastsFast 
kSecondInputBroadcastsFast 
kGenericBroadcast 

Definition at line 78 of file Types.h.

◆ CachePolicy

enum class nnfw::cker::CachePolicy : std::uint8_t
strong
Enumerator
kNeverCache 
kCacheIfLargeSpeedup 
kAlwaysCache 

Definition at line 425 of file Types.h.

◆ ComparisonOpType

enum class nnfw::cker::ComparisonOpType
strong
Enumerator
Equal 
NotEqual 
Greater 
GreaterEqual 
Less 
LessEqual 

Definition at line 56 of file Types.h.

◆ DimensionType

Enumerator
kBroadcasting 
kBatch 
kFree 
kContract 
kReduce 

Definition at line 116 of file Einsum.h.

117{
118 // Batch dimensions are those present in two inputs as well as the output.
119 // They are part of the batch dimensions during Tensor contraction.
120 // Such dimensions may be broadcasting dimensions (those mapping to
121 // ellipsis)
122 // or explicit batch dimensions corresponding to named axis labels.
123 kBroadcasting = 0,
124 kBatch = 1,
125 // Free dimensions are present in exactly one of the inputs, and also the
126 // output. These are non-contracted axes in the Tensor contraction.
127 kFree = 2,
128 // Contract dimensions are present in two inputs, but not the output. These
129 // dimensions are contracted in Tensor contraction.
130 kContract = 3,
131 // Reduce dimensions are present in exactly one input; and not in the output
132 // and are summed over prior to Tensor contraction.
133 kReduce = 4,
134};
@ kBroadcasting
Definition Einsum.h:123
@ kContract
Definition Einsum.h:130
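
As a concrete illustration of these categories (following the comments above): in the einsum expression bij,bjk->bik, label b appears in both inputs and in the output (kBatch), i and k each appear in exactly one input and in the output (kFree), and j appears in both inputs but not in the output, so it is contracted (kContract).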

◆ FusedActivationFunctionType

Enumerator
kNone 
kRelu6 
kRelu1 
kRelu 
kTanh 
kSigmoid 

Definition at line 31 of file Types.h.

◆ LSTMKernelType

Enumerator
kTfLiteLSTMFullKernel 
kTfLiteLSTMBasicKernel 

Definition at line 284 of file Types.h.

285{
286 kTfLiteLSTMFullKernel = 0,
287 kTfLiteLSTMBasicKernel,
288};
@ kTfLiteLSTMFullKernel
Definition Types.h:286
@ kTfLiteLSTMBasicKernel
Definition Types.h:287

◆ Order

enum class nnfw::cker::Order
strong
Enumerator
kColMajor 
kRowMajor 

Definition at line 419 of file Types.h.

◆ PaddingType

enum class nnfw::cker::PaddingType
strong
Enumerator
kNone 
kSame 
kValid 

Definition at line 40 of file Types.h.

41{
42 kNone = 0,
43 kSame = 1,
44 kValid = 2,
45};

◆ QuantizationFlavor

enum class nnfw::cker::QuantizationFlavor
strong
Enumerator
kFloatingPoint 
kIntegerWithUniformMultiplier 
kIntegerWithPerRowMultiplier 

Definition at line 474 of file Types.h.

475{
476 // Floating-point Gemm: the accumulators are not multiplied by any
477 // 'multiplier'.
478 kFloatingPoint,
479 // Quantized Gemm using a single multiplier for all accumulators.
480 kIntegerWithUniformMultiplier,
481 // Quantized Gemm using a separate multipliers for accumulators of each
482 // row of the destination matrix. This is what is called 'per-channel'
483 // in GemmParams. Here we use the more specific 'per-row' terminology
484 // to allow for the possibility of 'per-column' in the future, and to
485 // allow for that to be a separate code path in some back-end such as
486 // gemmlowp.
487 kIntegerWithPerRowMultiplier,
488};

◆ RoPEMode

enum class nnfw::cker::RoPEMode
strong
Enumerator
kGptNeox 
kGptJ 

Definition at line 66 of file Types.h.

67{
68 kGptNeox = 0,
69 kGptJ = 1,
70};

Function Documentation

◆ Abs()

void nnfw::cker::Abs ( const Shape input_shape,
const float *  input_data,
const Shape output_shape,
float *  output_data 
)
inline

Definition at line 51 of file Elementwise.h.

53{
54 auto input_map = MapAsVector(input_data, input_shape);
55 auto output_map = MapAsVector(output_data, output_shape);
56 output_map.array() = input_map.array().abs();
57}
VectorMap< Scalar > MapAsVector(Scalar *data, const Dims< N > &dims)
Definition Matrix.h:40
const luci_interpreter::RuntimeShape output_shape

References MapAsVector(), and output_shape.
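
A minimal usage sketch for Abs(); the Shape initializer-list constructor and the include paths are assumptions.

#include <cker/Shape.h>                  // assumed include path
#include <cker/operation/Elementwise.h>  // assumed include path

void AbsExample()
{
  const nnfw::cker::Shape shape{1, 4};
  const float input[] = {-1.0f, 2.0f, -3.0f, 4.0f};
  float output[4];
  nnfw::cker::Abs(shape, input, shape, output); // output = {1, 2, 3, 4}
}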

◆ ActivationFunctionWithMinMax()

◆ AddN()

template<typename T >
void nnfw::cker::AddN ( const Shape input_shape,
const size_t  num_inputs,
const T **  input_data,
T *  output_data 
)

Definition at line 29 of file AddN.h.

30{
31 const size_t size = input_shape.FlatSize();
32 for (size_t i = 0; i < size; ++i)
33 {
34 T x = 0;
35 for (size_t j = 0; j < num_inputs; ++j)
36 {
37 x += input_data[j][i];
38 }
39 output_data[i] = x;
40 }
41}
int FlatSize() const
Definition Shape.h:181
int32_t size[5]
Definition Slice.cpp:35

References nnfw::cker::Shape::FlatSize(), and size.
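
A minimal usage sketch for AddN(); all inputs must share input_shape, and the include paths are assumptions.

#include <cker/Shape.h>           // assumed include path
#include <cker/operation/AddN.h>  // assumed include path

void AddNExample()
{
  const nnfw::cker::Shape shape{2, 2};
  const float a[] = {1.f, 2.f, 3.f, 4.f};
  const float b[] = {10.f, 20.f, 30.f, 40.f};
  const float c[] = {100.f, 200.f, 300.f, 400.f};
  const float *inputs[] = {a, b, c};
  float out[4];
  nnfw::cker::AddN(shape, 3, inputs, out); // out = {111, 222, 333, 444}
}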

◆ ApplyActivationToVector()

void nnfw::cker::ApplyActivationToVector ( const float *  vector,
int  v_size,
FusedActivationFunctionType  activation,
float *  result 
)
inline

Definition at line 109 of file TensorUtils.h.

111{
112 PortableApplyActivationToVector(vector, v_size, activation, result);
113}
void PortableApplyActivationToVector(const float *vector, int v_size, FusedActivationFunctionType activation, float *result)

References PortableApplyActivationToVector().

Referenced by CalculateLstmGateFloat(), CalculateLstmOutputFloat(), FullyConnected(), FullyConnectedHybrid(), FullyConnectedSparseWeight16x1(), and FullyConnectedSparseWeightRandom().
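
A minimal usage sketch that clamps a buffer with a fused ReLU6 activation; the include path is an assumption (the definition above is cited from TensorUtils.h).

#include <cker/TensorUtils.h> // assumed include path

// Writes min(max(input[i], 0), 6) into output[i] via the portable fallback.
void Relu6Example(const float *input, int size, float *output)
{
  nnfw::cker::ApplyActivationToVector(input, size,
                                      nnfw::cker::FusedActivationFunctionType::kRelu6,
                                      output);
}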

◆ ArgMinMax()

template<typename T1 , typename T2 , typename Cmp >
void nnfw::cker::ArgMinMax ( const Shape input1_shape,
const T1 *  input1_data,
const Shape output_shape,
T2 *  output_data,
int32_t  axis,
const Cmp &  cmp 
)

Definition at line 29 of file ArgMinMax.h.

32{
33 assert(input1_shape.DimensionsCount() > 0);
34 assert(input1_shape.DimensionsCount() - 1 == output_shape.DimensionsCount());
35 if (axis < 0)
36 {
37 axis += input1_shape.DimensionsCount();
38 }
39 const int axis_size = input1_shape.Dims(axis);
40
41 int outer_size = 1;
42 for (int i = 0; i < axis; ++i)
43 {
44 assert(input1_shape.Dims(i) == output_shape.Dims(i));
45 outer_size *= input1_shape.Dims(i);
46 }
47
48 int inner_size = 1;
49 const int dims_count = input1_shape.DimensionsCount();
50 for (int i = axis + 1; i < dims_count; ++i)
51 {
52 assert(input1_shape.Dims(i) == output_shape.Dims(i - 1));
53 inner_size *= input1_shape.Dims(i);
54 }
55 for (int outer = 0; outer < outer_size; ++outer)
56 {
57 for (int inner = 0; inner < inner_size; ++inner)
58 {
59 auto min_max_value = input1_data[outer * axis_size * inner_size + inner];
60 T2 min_max_index = 0;
61 for (int i = 1; i < axis_size; ++i)
62 {
63 const auto &curr_value = input1_data[(outer * axis_size + i) * inner_size + inner];
64 if (cmp(curr_value, min_max_value))
65 {
66 min_max_value = curr_value;
67 min_max_index = static_cast<T2>(i);
68 }
69 }
70 output_data[outer * inner_size + inner] = min_max_index;
71 }
72 }
73}
int32_t DimensionsCount() const
Definition Shape.h:91
int32_t Dims(int i) const
Definition Shape.h:92

References nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), and output_shape.
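
A minimal argmax sketch: std::greater keeps the running maximum, and the output rank is the input rank minus one, as asserted above. Include paths are assumptions.

#include <cstdint>
#include <functional>
#include <cker/Shape.h>                // assumed include path
#include <cker/operation/ArgMinMax.h>  // assumed include path

void ArgMaxExample()
{
  const nnfw::cker::Shape in_shape{2, 3};
  const float in[] = {0.1f, 0.7f, 0.2f,
                      0.9f, 0.3f, 0.5f};
  const nnfw::cker::Shape out_shape{2};
  int32_t out[2];
  nnfw::cker::ArgMinMax(in_shape, in, out_shape, out, /*axis=*/1, std::greater<float>());
  // out = {1, 0}: index of the largest value along the last axis of each row
}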

◆ AveragePool()

template<typename T >
void nnfw::cker::AveragePool ( const PoolParams ,
const Shape ,
const T *  ,
const Shape ,
T *   
)

Definition at line 36 of file AveragePool.h.

37{
38 static_assert(std::is_integral<T>::value || std::is_floating_point<T>::value,
39 "cker::MaxPool : This function supports only integer or floating point");
40 throw std::runtime_error("cker::AveragePool : Unsupported data type");
41}
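
A minimal float pooling sketch using the AveragePool< float > specialization listed above. The float_activation_min/max field names and the include paths are assumptions; the other PoolParams fields are referenced in the quantized definitions below.

#include <limits>
#include <cker/Shape.h>                  // assumed include path
#include <cker/operation/AveragePool.h>  // assumed include path

void AveragePoolExample()
{
  nnfw::cker::PoolParams params{};
  params.stride_height = 1;
  params.stride_width = 1;
  params.filter_height = 2;
  params.filter_width = 2;
  params.padding_values.height = 0;
  params.padding_values.width = 0;
  params.float_activation_min = std::numeric_limits<float>::lowest(); // assumed field name
  params.float_activation_max = std::numeric_limits<float>::max();    // assumed field name

  const nnfw::cker::Shape in_shape{1, 2, 2, 1}; // NHWC
  const float in[] = {1.f, 2.f, 3.f, 4.f};
  const nnfw::cker::Shape out_shape{1, 1, 1, 1};
  float out[1];
  nnfw::cker::AveragePool<float>(params, in_shape, in, out_shape, out); // out[0] = 2.5
}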

◆ AveragePool16()

void nnfw::cker::AveragePool16 ( const PoolParams params,
const Shape input_shape,
const uint8_t *  input_data,
const Shape output_shape,
uint8_t *  output_data 
)
inline

Definition at line 106 of file AveragePool.h.

109{
110 // Here, and in other pooling ops, in order to maintain locality of reference,
111 // to minimize some recalculations, and to load into NEON vector registers, we
112 // use an inner loop down the depth. Since depths can be large and hence we
113 // would need arbitrarily large temporary storage, we divide the work up into
114 // depth tranches just within the batch loop.
115 static constexpr int kPoolingAccTrancheSize = 256;
116
117 assert(params.quantized_activation_min <= params.quantized_activation_max);
118 assert(input_shape.DimensionsCount() == 4);
119 assert(output_shape.DimensionsCount() == 4);
120 const int batches = MatchingDim(input_shape, 0, output_shape, 0);
121 const int depth = MatchingDim(input_shape, 3, output_shape, 3);
122 const int input_height = input_shape.Dims(1);
123 const int input_width = input_shape.Dims(2);
124 const int output_height = output_shape.Dims(1);
125 const int output_width = output_shape.Dims(2);
126 const int stride_height = params.stride_height;
127 const int stride_width = params.stride_width;
128
129 uint16_t acc[kPoolingAccTrancheSize];
130 for (int batch = 0; batch < batches; ++batch)
131 {
132 // We proceed through the depth in tranches (see comment above). The
133 // depth_base is the depth at the beginning of the tranche. The
134 // tranche_depth is the depth dimension of the tranche.
135 for (int depth_base = 0; depth_base < depth; depth_base += kPoolingAccTrancheSize)
136 {
137 const int tranche_depth = std::min(depth - depth_base, kPoolingAccTrancheSize);
138 for (int out_y = 0; out_y < output_height; ++out_y)
139 {
140 for (int out_x = 0; out_x < output_width; ++out_x)
141 {
142 const int in_x_origin = (out_x * stride_width) - params.padding_values.width;
143 const int in_y_origin = (out_y * stride_height) - params.padding_values.height;
144 const int filter_x_start = std::max(0, -in_x_origin);
145 const int filter_x_end = std::min(params.filter_width, input_width - in_x_origin);
146 const int filter_y_start = std::max(0, -in_y_origin);
147 const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin);
148 const int filter_count =
149 (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start);
150 memset(acc, 0, tranche_depth * sizeof(acc[0]));
151 const uint8_t *input_ptr =
152 input_data + depth_base +
153 depth * (in_x_origin + input_width * (in_y_origin + input_height * batch));
154 for (int fy = filter_y_start; fy < filter_y_end; fy++)
155 {
156 const uint8_t *input_row_ptr = input_ptr + depth * (fy * input_width + filter_x_start);
157 for (int fx = filter_x_start; fx < filter_x_end; fx++)
158 {
159 const uint8_t *input_channel_ptr = input_row_ptr;
160 int channel = 0;
161#ifdef USE_NEON
162 for (; channel <= tranche_depth - 16; channel += 16)
163 {
164 uint16x8_t acc_reg[2];
165 for (int i = 0; i < 2; i++)
166 {
167 acc_reg[i] = vld1q_u16(acc + channel + 8 * i);
168 }
169 uint8x16_t input_reg = vld1q_u8(input_channel_ptr);
170 input_channel_ptr += 16;
171 acc_reg[0] = vaddw_u8(acc_reg[0], vget_low_u8(input_reg));
172 acc_reg[1] = vaddw_u8(acc_reg[1], vget_high_u8(input_reg));
173 for (int i = 0; i < 2; i++)
174 {
175 vst1q_u16(acc + channel + 8 * i, acc_reg[i]);
176 }
177 }
178 for (; channel <= tranche_depth - 8; channel += 8)
179 {
180 uint16x8_t acc_reg = vld1q_u16(acc + channel);
181 uint8x8_t input_reg = vld1_u8(input_channel_ptr);
182 input_channel_ptr += 8;
183 acc_reg = vaddw_u8(acc_reg, input_reg);
184 vst1q_u16(acc + channel, acc_reg);
185 }
186#endif
187 for (; channel < tranche_depth; ++channel)
188 {
189 acc[channel] += *input_channel_ptr++;
190 }
191 input_row_ptr += depth;
192 }
193 }
194 uint8_t *output_ptr = output_data + Offset(output_shape, batch, out_y, out_x, depth_base);
195 int channel = 0;
196#ifdef USE_NEON
197#define AVGPOOL_DIVIDING_BY(FILTER_COUNT) \
198 if (filter_count == FILTER_COUNT) \
199 { \
200 for (; channel <= tranche_depth - 8; channel += 8) \
201 { \
202 uint16_t buf[8]; \
203 for (int i = 0; i < 8; i++) \
204 { \
205 buf[i] = (acc[channel + i] + FILTER_COUNT / 2) / FILTER_COUNT; \
206 } \
207 uint8x8_t buf8 = vqmovn_u16(vld1q_u16(buf)); \
208 buf8 = vmin_u8(buf8, vdup_n_u8(params.quantized_activation_max)); \
209 buf8 = vmax_u8(buf8, vdup_n_u8(params.quantized_activation_min)); \
210 vst1_u8(output_ptr + channel, buf8); \
211 } \
212 }
213 AVGPOOL_DIVIDING_BY(9)
214 AVGPOOL_DIVIDING_BY(15)
215#undef AVGPOOL_DIVIDING_BY
216 for (; channel <= tranche_depth - 8; channel += 8)
217 {
218 uint16_t buf[8];
219 for (int i = 0; i < 8; i++)
220 {
221 buf[i] = (acc[channel + i] + filter_count / 2) / filter_count;
222 }
223 uint8x8_t buf8 = vqmovn_u16(vld1q_u16(buf));
224 buf8 = vmin_u8(buf8, vdup_n_u8(params.quantized_activation_max));
225 buf8 = vmax_u8(buf8, vdup_n_u8(params.quantized_activation_min));
226 vst1_u8(output_ptr + channel, buf8);
227 }
228#endif
229 for (; channel < tranche_depth; ++channel)
230 {
231 uint8_t a = (acc[channel] + filter_count / 2) / filter_count;
232 a = std::max<uint16_t>(a, params.quantized_activation_min);
233 a = std::min<uint16_t>(a, params.quantized_activation_max);
234 output_ptr[channel] = static_cast<uint8_t>(a);
235 }
236 }
237 }
238 }
239 }
240}
int Offset(const Dims< 4 > &dims, int i0, int i1, int i2, int i3)
Definition Dims.h:64
int MatchingDim(const Shape &shape1, int index1, const Shape &shape2, int index2)
Definition Shape.h:220
int32_t quantized_activation_min
Definition Types.h:95
int32_t quantized_activation_max
Definition Types.h:96
PaddingValues padding_values
Definition Types.h:89

References nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), nnfw::cker::PoolParams::filter_height, nnfw::cker::PoolParams::filter_width, nnfw::cker::PaddingValues::height, MatchingDim(), Offset(), output_shape, nnfw::cker::PoolParams::padding_values, nnfw::cker::PoolParams::quantized_activation_max, nnfw::cker::PoolParams::quantized_activation_min, nnfw::cker::PoolParams::stride_height, nnfw::cker::PoolParams::stride_width, and nnfw::cker::PaddingValues::width.

Referenced by AveragePool< uint8_t >().

◆ AveragePool32()

void nnfw::cker::AveragePool32 ( const PoolParams params,
const Shape input_shape,
const uint8_t *  input_data,
const Shape output_shape,
uint8_t *  output_data 
)
inline

Definition at line 242 of file AveragePool.h.

245{
246
247 // Here, and in other pooling ops, in order to maintain locality of reference,
248 // to minimize some recalculations, and to load into NEON vector registers, we
249 // use an inner loop down the depth. Since depths can be large and hence we
250 // would need arbitrarily large temporary storage, we divide the work up into
251 // depth tranches just within the batch loop.
252 static constexpr int kPoolingAccTrancheSize = 256;
253
254 assert(params.quantized_activation_min <= params.quantized_activation_max);
255 assert(input_shape.DimensionsCount() == 4);
256 assert(output_shape.DimensionsCount() == 4);
257 const int batches = MatchingDim(input_shape, 0, output_shape, 0);
258 const int depth = MatchingDim(input_shape, 3, output_shape, 3);
259 const int input_height = input_shape.Dims(1);
260 const int input_width = input_shape.Dims(2);
261 const int output_height = output_shape.Dims(1);
262 const int output_width = output_shape.Dims(2);
263 const int stride_height = params.stride_height;
264 const int stride_width = params.stride_width;
265
266 uint32_t acc[kPoolingAccTrancheSize];
267 for (int batch = 0; batch < batches; ++batch)
268 {
269 // We proceed through the depth in tranches (see comment above). The
270 // depth_base is the depth at the beginning of the tranche. The
271 // tranche_depth is the depth dimension of the tranche.
272 for (int depth_base = 0; depth_base < depth; depth_base += kPoolingAccTrancheSize)
273 {
274 const int tranche_depth = std::min(depth - depth_base, kPoolingAccTrancheSize);
275 for (int out_y = 0; out_y < output_height; ++out_y)
276 {
277 for (int out_x = 0; out_x < output_width; ++out_x)
278 {
279 const int in_x_origin = (out_x * stride_width) - params.padding_values.width;
280 const int in_y_origin = (out_y * stride_height) - params.padding_values.height;
281 const int filter_x_start = std::max(0, -in_x_origin);
282 const int filter_x_end = std::min(params.filter_width, input_width - in_x_origin);
283 const int filter_y_start = std::max(0, -in_y_origin);
284 const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin);
285 const int filter_count =
286 (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start);
287 memset(acc, 0, tranche_depth * sizeof(acc[0]));
288 const uint8_t *input_ptr =
289 input_data + depth_base +
290 depth * (in_x_origin + input_width * (in_y_origin + input_height * batch));
291 for (int fy = filter_y_start; fy < filter_y_end; fy++)
292 {
293 const uint8_t *input_row_ptr = input_ptr + depth * (fy * input_width + filter_x_start);
294 for (int fx = filter_x_start; fx < filter_x_end; fx++)
295 {
296 const uint8_t *input_channel_ptr = input_row_ptr;
297 int channel = 0;
298#ifdef USE_NEON
299 for (; channel <= tranche_depth - 16; channel += 16)
300 {
301 uint16x4_t acc_reg[4];
302 uint8x16_t input_reg = vld1q_u8(input_channel_ptr);
303 input_channel_ptr += 16;
304 acc_reg[0] = vget_low_u16(vmovl_u8(vget_low_u8(input_reg)));
305 acc_reg[1] = vget_high_u16(vmovl_u8(vget_low_u8(input_reg)));
306 acc_reg[2] = vget_low_u16(vmovl_u8(vget_high_u8(input_reg)));
307 acc_reg[3] = vget_high_u16(vmovl_u8(vget_high_u8(input_reg)));
308 for (int i = 0; i < 4; i++)
309 {
310 vst1q_u32(acc + channel + 4 * i,
311 vaddw_u16(vld1q_u32(acc + channel + 4 * i), acc_reg[i]));
312 }
313 }
314 for (; channel <= tranche_depth - 8; channel += 8)
315 {
316 uint16x4_t acc_reg[2];
317 uint16x8_t input_reg = vmovl_u8(vld1_u8(input_channel_ptr));
318 input_channel_ptr += 8;
319 acc_reg[0] = vget_low_u16(input_reg);
320 acc_reg[1] = vget_high_u16(input_reg);
321 for (int i = 0; i < 2; i++)
322 {
323 vst1q_u32(acc + channel + 4 * i,
324 vaddw_u16(vld1q_u32(acc + channel + 4 * i), acc_reg[i]));
325 }
326 }
327#endif
328 for (; channel < tranche_depth; ++channel)
329 {
330 acc[channel] += *input_channel_ptr++;
331 }
332 input_row_ptr += depth;
333 }
334 }
335 uint8_t *output_ptr = output_data + Offset(output_shape, batch, out_y, out_x, depth_base);
336 int channel = 0;
337#ifdef USE_NEON
338#define AVGPOOL_DIVIDING_BY(FILTER_COUNT) \
339 if (filter_count == FILTER_COUNT) \
340 { \
341 for (; channel <= tranche_depth - 8; channel += 8) \
342 { \
343 uint16_t buf[8]; \
344 for (int i = 0; i < 8; i++) \
345 { \
346 buf[i] = (acc[channel + i] + FILTER_COUNT / 2) / FILTER_COUNT; \
347 } \
348 uint8x8_t buf8 = vqmovn_u16(vld1q_u16(buf)); \
349 buf8 = vmin_u8(buf8, vdup_n_u8(params.quantized_activation_max)); \
350 buf8 = vmax_u8(buf8, vdup_n_u8(params.quantized_activation_min)); \
351 vst1_u8(output_ptr + channel, buf8); \
352 } \
353 }
354 AVGPOOL_DIVIDING_BY(9)
355 AVGPOOL_DIVIDING_BY(15)
356#undef AVGPOOL_DIVIDING_BY
357 for (; channel <= tranche_depth - 8; channel += 8)
358 {
359 uint16_t buf[8];
360 for (int i = 0; i < 8; i++)
361 {
362 buf[i] = (acc[channel + i] + filter_count / 2) / filter_count;
363 }
364 uint8x8_t buf8 = vqmovn_u16(vld1q_u16(buf));
365 buf8 = vmin_u8(buf8, vdup_n_u8(params.quantized_activation_max));
366 buf8 = vmax_u8(buf8, vdup_n_u8(params.quantized_activation_min));
367 vst1_u8(output_ptr + channel, buf8);
368 }
369#endif
370 for (; channel < tranche_depth; ++channel)
371 {
372 uint16_t a = (acc[channel] + filter_count / 2) / filter_count;
373 a = std::max<uint16_t>(a, params.quantized_activation_min);
374 a = std::min<uint16_t>(a, params.quantized_activation_max);
375 output_ptr[channel] = static_cast<uint8_t>(a);
376 }
377 }
378 }
379 }
380 }
381}

References nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), nnfw::cker::PoolParams::filter_height, nnfw::cker::PoolParams::filter_width, nnfw::cker::PaddingValues::height, MatchingDim(), Offset(), output_shape, nnfw::cker::PoolParams::padding_values, nnfw::cker::PoolParams::quantized_activation_max, nnfw::cker::PoolParams::quantized_activation_min, nnfw::cker::PoolParams::stride_height, nnfw::cker::PoolParams::stride_width, and nnfw::cker::PaddingValues::width.

Referenced by AveragePool< uint8_t >().

◆ AveragePool< float >()

template<>
void nnfw::cker::AveragePool< float > ( const PoolParams params,
const Shape input_shape,
const float *  input_data,
const Shape output_shape,
float *  output_data 
)

Definition at line 44 of file AveragePool.h.

46{
47 assert(input_shape.DimensionsCount() == 4);
48 assert(output_shape.DimensionsCount() == 4);
49 const int batches = MatchingDim(input_shape, 0, output_shape, 0);
50 const int input_height = input_shape.Dims(1);
51 const int input_width = input_shape.Dims(2);
52 const int output_height = output_shape.Dims(1);
53 const int output_width = output_shape.Dims(2);
54 const int stride_height = params.stride_height;
55 const int stride_width = params.stride_width;
56
57 // TODO(benoitjacob) make this a proper reference impl without Eigen!
58 const auto in_mat = MapAsMatrixWithLastDimAsRows(input_data, input_shape);
59 auto out_mat = MapAsMatrixWithLastDimAsRows(output_data, output_shape);
60 // TODO(benoitjacob) get rid of the dynamic memory allocation here!
61 Eigen::VectorXf out_count(out_mat.cols());
62 out_count.setZero();
63 // Prefill the output to 0.
64 out_mat.setZero();
65 for (int b = 0; b < batches; ++b)
66 {
67 for (int h = 0; h < input_height; ++h)
68 {
69 for (int w = 0; w < input_width; ++w)
70 {
71 // (h_start, h_end) * (w_start, w_end) is the range that the input
72 // vector projects to.
73 int hpad = h + params.padding_values.height;
74 int wpad = w + params.padding_values.width;
75 int h_start =
76 (hpad < params.filter_height) ? 0 : (hpad - params.filter_height) / stride_height + 1;
77 int h_end = std::min(hpad / stride_height + 1, output_height);
78 int w_start =
79 (wpad < params.filter_width) ? 0 : (wpad - params.filter_width) / stride_width + 1;
80 int w_end = std::min(wpad / stride_width + 1, output_width);
81 // compute elementwise sum
82 for (int ph = h_start; ph < h_end; ++ph)
83 {
84 for (int pw = w_start; pw < w_end; ++pw)
85 {
86 int out_offset = NodeOffset(b, ph, pw, output_height, output_width);
87 out_mat.col(out_offset) += in_mat.col(NodeOffset(b, h, w, input_height, input_width));
88 out_count(out_offset)++;
89 }
90 }
91 }
92 }
93 }
94 // Divide the output by the actual number of elements being averaged over
95 assert(out_count.minCoeff() > 0);
96 out_mat.array().rowwise() /= out_count.transpose().array();
97
98 const int flat_size = output_shape.FlatSize();
99 for (int i = 0; i < flat_size; ++i)
100 {
101 output_data[i] = ActivationFunctionWithMinMax(output_data[i], params.float_activation_min,
102 params.float_activation_max);
103 }
104}
int NodeOffset(int b, int h, int w, int height, int width)
Definition FeatureMap.h:21
MatrixMap< Scalar > MapAsMatrixWithLastDimAsRows(Scalar *data, const Shape &shape)
Definition Utils.h:60
T ActivationFunctionWithMinMax(T x, T output_activation_min, T output_activation_max)
Definition Utils.h:43
float float_activation_max
Definition Types.h:99
float float_activation_min
Definition Types.h:98

References ActivationFunctionWithMinMax(), nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), nnfw::cker::PoolParams::filter_height, nnfw::cker::PoolParams::filter_width, nnfw::cker::PoolParams::float_activation_max, nnfw::cker::PoolParams::float_activation_min, nnfw::cker::PaddingValues::height, MapAsMatrixWithLastDimAsRows(), MatchingDim(), NodeOffset(), output_shape, nnfw::cker::PoolParams::padding_values, nnfw::cker::PoolParams::stride_height, nnfw::cker::PoolParams::stride_width, and nnfw::cker::PaddingValues::width.
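
For reference, a minimal usage sketch of this specialization (not taken from the library sources; the include path, shapes, and values are illustrative assumptions):

#include <cker/operation/AveragePool.h> // assumed include path
#include <limits>
#include <vector>

void AveragePoolFloatSketch()
{
  nnfw::cker::PoolParams params{};
  params.filter_height = 2;
  params.filter_width = 2;
  params.stride_height = 2;
  params.stride_width = 2;
  params.padding_values.height = 0;
  params.padding_values.width = 0;
  params.float_activation_min = std::numeric_limits<float>::lowest();
  params.float_activation_max = std::numeric_limits<float>::max();

  // NHWC: pool a 1x4x4x1 input down to 1x2x2x1 with 2x2 windows.
  nnfw::cker::Shape input_shape(4, 1);
  input_shape.SetDim(1, 4);
  input_shape.SetDim(2, 4);
  nnfw::cker::Shape output_shape(4, 1);
  output_shape.SetDim(1, 2);
  output_shape.SetDim(2, 2);

  std::vector<float> input(16, 1.0f);
  std::vector<float> output(4, 0.0f);
  nnfw::cker::AveragePool<float>(params, input_shape, input.data(), output_shape, output.data());
  // Each output element is the mean of its 2x2 window (1.0f for this constant input).
}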

◆ AveragePool< int8_t >()

template<>
void nnfw::cker::AveragePool< int8_t > ( const PoolParams params,
const Shape input_shape,
const int8_t *  input_data,
const Shape output_shape,
int8_t *  output_data 
)

Definition at line 399 of file AveragePool.h.

401{
402 // Here, and in other pooling ops, in order to maintain locality of reference,
403 // to minimize some recalculations, and to load into NEON vector registers, we
404 // use an inner loop down the depth. Since depths can be large and hence we
405 // would need arbitrarily large temporary storage, we divide the work up into
406 // depth tranches just within the batch loop.
407 static constexpr int kPoolingAccTrancheSize = 256;
408
409 assert(params.quantized_activation_min <= params.quantized_activation_max);
410 assert(input_shape.DimensionsCount() == 4);
411 assert(output_shape.DimensionsCount() == 4);
412 const int batches = MatchingDim(input_shape, 0, output_shape, 0);
413 const int depth = MatchingDim(input_shape, 3, output_shape, 3);
414 const int input_height = input_shape.Dims(1);
415 const int input_width = input_shape.Dims(2);
416 const int output_height = output_shape.Dims(1);
417 const int output_width = output_shape.Dims(2);
418 const int stride_height = params.stride_height;
419 const int stride_width = params.stride_width;
420
421 int32_t acc[kPoolingAccTrancheSize];
422 for (int batch = 0; batch < batches; ++batch)
423 {
424 // We proceed through the depth in tranches (see comment above). The
425 // depth_base is the depth at the beginning of the tranche. The
426 // tranche_depth is the depth dimension of the tranche.
427 for (int depth_base = 0; depth_base < depth; depth_base += kPoolingAccTrancheSize)
428 {
429 const int tranche_depth = std::min(depth - depth_base, kPoolingAccTrancheSize);
430 for (int out_y = 0; out_y < output_height; ++out_y)
431 {
432 for (int out_x = 0; out_x < output_width; ++out_x)
433 {
434 const int in_x_origin = (out_x * stride_width) - params.padding_values.width;
435 const int in_y_origin = (out_y * stride_height) - params.padding_values.height;
436 const int filter_x_start = std::max(0, -in_x_origin);
437 const int filter_x_end = std::min(params.filter_width, input_width - in_x_origin);
438 const int filter_y_start = std::max(0, -in_y_origin);
439 const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin);
440 const int filter_count =
441 (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start);
442 memset(acc, 0, tranche_depth * sizeof(acc[0]));
443 const int8_t *input_ptr =
444 input_data + depth_base +
445 depth * (in_x_origin + input_width * (in_y_origin + input_height * batch));
446 for (int fy = filter_y_start; fy < filter_y_end; fy++)
447 {
448 const int8_t *input_row_ptr = input_ptr + depth * (fy * input_width + filter_x_start);
449 for (int fx = filter_x_start; fx < filter_x_end; fx++)
450 {
451 const int8_t *input_channel_ptr = input_row_ptr;
452 int channel = 0;
453#ifdef USE_NEON
454 for (; channel <= tranche_depth - 16; channel += 16)
455 {
456 int16x4_t acc_reg[4];
457 int8x16_t input_reg = vld1q_s8(input_channel_ptr);
458 input_channel_ptr += 16;
459 acc_reg[0] = vget_low_s16(vmovl_s8(vget_low_s8(input_reg)));
460 acc_reg[1] = vget_high_s16(vmovl_s8(vget_low_s8(input_reg)));
461 acc_reg[2] = vget_low_s16(vmovl_s8(vget_high_s8(input_reg)));
462 acc_reg[3] = vget_high_s16(vmovl_s8(vget_high_s8(input_reg)));
463 for (int i = 0; i < 4; i++)
464 {
465 vst1q_s32(acc + channel + 4 * i,
466 vaddw_s16(vld1q_s32(acc + channel + 4 * i), acc_reg[i]));
467 }
468 }
469 for (; channel <= tranche_depth - 8; channel += 8)
470 {
471 int16x4_t acc_reg[2];
472 int16x8_t input_reg = vmovl_s8(vld1_s8(input_channel_ptr));
473 input_channel_ptr += 8;
474 acc_reg[0] = vget_low_s16(input_reg);
475 acc_reg[1] = vget_high_s16(input_reg);
476 for (int i = 0; i < 2; i++)
477 {
478 vst1q_s32(acc + channel + 4 * i,
479 vaddw_s16(vld1q_s32(acc + channel + 4 * i), acc_reg[i]));
480 }
481 }
482#endif
483 for (; channel < tranche_depth; ++channel)
484 {
485 acc[channel] += *input_channel_ptr++;
486 }
487 input_row_ptr += depth;
488 }
489 }
490 int8_t *output_ptr = output_data + Offset(output_shape, batch, out_y, out_x, depth_base);
491 int channel = 0;
492#ifdef USE_NEON
493 for (; channel <= tranche_depth - 8; channel += 8)
494 {
495 int16_t buf[8];
496 for (int i = 0; i < 8; i++)
497 {
498 buf[i] = acc[channel + i] > 0 ? (acc[channel + i] + filter_count / 2) / filter_count
499 : (acc[channel + i] - filter_count / 2) / filter_count;
500 }
501 int8x8_t buf8 = vqmovn_s16(vld1q_s16(buf));
502 buf8 = vmin_s8(buf8, vdup_n_s8(params.quantized_activation_max));
503 buf8 = vmax_s8(buf8, vdup_n_s8(params.quantized_activation_min));
504 vst1_s8(output_ptr + channel, buf8);
505 }
506#endif
507 for (; channel < tranche_depth; ++channel)
508 {
509 int16_t a = acc[channel] > 0 ? (acc[channel] + filter_count / 2) / filter_count
510 : (acc[channel] - filter_count / 2) / filter_count;
511 a = std::max<int16_t>(a, params.quantized_activation_min);
512 a = std::min<int16_t>(a, params.quantized_activation_max);
513 output_ptr[channel] = static_cast<int8_t>(a);
514 }
515 }
516 }
517 }
518 }
519}

References nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), nnfw::cker::PoolParams::filter_height, nnfw::cker::PoolParams::filter_width, nnfw::cker::PaddingValues::height, MatchingDim(), Offset(), output_shape, nnfw::cker::PoolParams::padding_values, nnfw::cker::PoolParams::quantized_activation_max, nnfw::cker::PoolParams::quantized_activation_min, nnfw::cker::PoolParams::stride_height, nnfw::cker::PoolParams::stride_width, and nnfw::cker::PaddingValues::width.

◆ AveragePool< uint8_t >()

template<>
void nnfw::cker::AveragePool< uint8_t > ( const PoolParams params,
const Shape input_shape,
const uint8_t *  input_data,
const Shape output_shape,
uint8_t *  output_data 
)

Definition at line 384 of file AveragePool.h.

387{
388 if (params.filter_height * params.filter_width > 16 * 16)
389 {
390 AveragePool32(params, input_shape, input_data, output_shape, output_data);
391 }
392 else
393 {
394 AveragePool16(params, input_shape, input_data, output_shape, output_data);
395 }
396}
void AveragePool32(const PoolParams &params, const Shape &input_shape, const uint8_t *input_data, const Shape &output_shape, uint8_t *output_data)

References AveragePool16(), AveragePool32(), nnfw::cker::PoolParams::filter_height, nnfw::cker::PoolParams::filter_width, and output_shape.

◆ BatchToSpaceND()

template<typename T >
void nnfw::cker::BatchToSpaceND ( const Shape unextended_input1_shape,
const T *  input1_data,
const int32_t *  block_shape_data,
const int32_t *  crops_data,
const Shape unextended_output_shape,
T *  output_data 
)
inline

Definition at line 50 of file BatchToSpaceND.h.

53{
54 auto input_dim = unextended_input1_shape.DimensionsCount();
55 auto output_dim = unextended_output_shape.DimensionsCount();
56
57 assert(input_dim == 3 || input_dim == 4);
58 assert(input_dim == output_dim);
59
60 UNUSED(input_dim);
61 UNUSED(output_dim);
62
63 // Extends the input/output shape from 3D to 4D if needed, NHC -> NH1C.
64 auto extend_shape = [](const Shape &shape) {
65 if (shape.DimensionsCount() == 4)
66 {
67 return shape;
68 }
69 Shape new_shape(4, 1);
70 new_shape.SetDim(0, shape.Dims(0));
71 new_shape.SetDim(1, shape.Dims(1));
72 new_shape.SetDim(3, shape.Dims(2));
73 return new_shape;
74 };
75 const Shape input1_shape = extend_shape(unextended_input1_shape);
76 const Shape output_shape = extend_shape(unextended_output_shape);
77
78 const int32_t output_width = output_shape.Dims(2);
79 const int32_t output_height = output_shape.Dims(1);
80 const int32_t output_batch_size = output_shape.Dims(0);
81
82 const int32_t depth = input1_shape.Dims(3);
83 const int32_t input_width = input1_shape.Dims(2);
84 const int32_t input_height = input1_shape.Dims(1);
85 const int32_t input_batch_size = input1_shape.Dims(0);
86
87 const int32_t block_shape_height = block_shape_data[0];
88 const int32_t block_shape_width = block_shape_data[1];
89
90 const int32_t crops_top = crops_data[0];
91 const int32_t crops_left = crops_data[2];
92
93 for (int in_batch = 0; in_batch < input_batch_size; ++in_batch)
94 {
95 const int out_batch = in_batch % output_batch_size;
96 const int spatial_offset = in_batch / output_batch_size;
97
98 int in_h_start = 0;
99 int in_h_end = 0;
100 // GetIndexRange ensures start and end indices are in [0, output_height).
101 GetIndexRange(spatial_offset / block_shape_width - crops_top, block_shape_height, input_height,
102 output_height, &in_h_start, &in_h_end);
103
104 for (int in_h = in_h_start; in_h < in_h_end; ++in_h)
105 {
106 const int out_h = in_h * block_shape_height + spatial_offset / block_shape_width - crops_top;
107 assert(out_h >= 0);
108 assert(out_h < output_height);
109
110 int in_w_start = 0;
111 int in_w_end = 0;
112 // GetIndexRange ensures start and end indices are in [0, output_width).
113 GetIndexRange(spatial_offset % block_shape_width - crops_left, block_shape_width, input_width,
114 output_width, &in_w_start, &in_w_end);
115
116 for (int in_w = in_w_start; in_w < in_w_end; ++in_w)
117 {
118 const int out_w =
119 in_w * block_shape_width + spatial_offset % block_shape_width - crops_left;
120 assert(out_w >= 0);
121 assert(out_w < output_width);
122 T *out = output_data + Offset(output_shape, out_batch, out_h, out_w, 0);
123 const T *in = input1_data + Offset(input1_shape, in_batch, in_h, in_w, 0);
124 memcpy(out, in, depth * sizeof(T));
125 }
126 }
127 }
128}
#define UNUSED(x)
Definition Shape.h:28

References nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), GetIndexRange(), Offset(), output_shape, nnfw::cker::Shape::SetDim(), and UNUSED.
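
A small illustrative call (not from the sources; shapes and values are assumptions): four single-pixel batch entries are rearranged into one 2x2 spatial map using a 2x2 block shape and no cropping.

#include <cker/operation/BatchToSpaceND.h> // assumed include path

void BatchToSpaceNDSketch()
{
  const int32_t block_shape[2] = {2, 2};
  const int32_t crops[4] = {0, 0, 0, 0};

  nnfw::cker::Shape input_shape(4, 1);
  input_shape.SetDim(0, 4); // 4x1x1x1 (NHWC)
  nnfw::cker::Shape output_shape(4, 1);
  output_shape.SetDim(1, 2);
  output_shape.SetDim(2, 2); // 1x2x2x1

  const float input[4] = {1.f, 2.f, 3.f, 4.f};
  float output[4] = {0.f, 0.f, 0.f, 0.f};
  nnfw::cker::BatchToSpaceND(input_shape, input, block_shape, crops, output_shape, output);
  // The batch entries are interleaved across the 2x2 block: output == {1, 2, 3, 4}.
}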

◆ begin()

ShapeIterator nnfw::cker::begin ( const Shape s)
inline

Definition at line 88 of file ShapeIterator.h.

◆ BiasAndClamp()

void nnfw::cker::BiasAndClamp ( float  clamp_min,
float  clamp_max,
int  bias_size,
const float *  bias_data,
int  array_size,
float *  array_data 
)
inline

Definition at line 29 of file Common.h.

31{
32 // Note: see b/132215220: in May 2019 we thought it would be OK to replace
33 // this with the Eigen one-liner:
34 // return (array.colwise() + bias).cwiseMin(clamp_max).cwiseMin(clamp_max).
35 // This turned out to severely regress performance: +4ms (i.e. 8%) on
36 // MobileNet v2 / 1.0 / 224. So we keep custom NEON code for now.
37 assert((array_size % bias_size) == 0);
38#ifdef USE_NEON
39 float *array_ptr = array_data;
40 float *array_end_ptr = array_ptr + array_size;
41 const auto clamp_min_vec = vdupq_n_f32(clamp_min);
42 const auto clamp_max_vec = vdupq_n_f32(clamp_max);
43 for (; array_ptr != array_end_ptr; array_ptr += bias_size)
44 {
45 int i = 0;
46 for (; i <= bias_size - 16; i += 16)
47 {
48 auto b0 = vld1q_f32(bias_data + i);
49 auto b1 = vld1q_f32(bias_data + i + 4);
50 auto b2 = vld1q_f32(bias_data + i + 8);
51 auto b3 = vld1q_f32(bias_data + i + 12);
52 auto a0 = vld1q_f32(array_ptr + i);
53 auto a1 = vld1q_f32(array_ptr + i + 4);
54 auto a2 = vld1q_f32(array_ptr + i + 8);
55 auto a3 = vld1q_f32(array_ptr + i + 12);
56 auto x0 = vaddq_f32(a0, b0);
57 auto x1 = vaddq_f32(a1, b1);
58 auto x2 = vaddq_f32(a2, b2);
59 auto x3 = vaddq_f32(a3, b3);
60 x0 = vmaxq_f32(clamp_min_vec, x0);
61 x1 = vmaxq_f32(clamp_min_vec, x1);
62 x2 = vmaxq_f32(clamp_min_vec, x2);
63 x3 = vmaxq_f32(clamp_min_vec, x3);
64 x0 = vminq_f32(clamp_max_vec, x0);
65 x1 = vminq_f32(clamp_max_vec, x1);
66 x2 = vminq_f32(clamp_max_vec, x2);
67 x3 = vminq_f32(clamp_max_vec, x3);
68 vst1q_f32(array_ptr + i, x0);
69 vst1q_f32(array_ptr + i + 4, x1);
70 vst1q_f32(array_ptr + i + 8, x2);
71 vst1q_f32(array_ptr + i + 12, x3);
72 }
73 for (; i <= bias_size - 4; i += 4)
74 {
75 auto b = vld1q_f32(bias_data + i);
76 auto a = vld1q_f32(array_ptr + i);
77 auto x = vaddq_f32(a, b);
78 x = vmaxq_f32(clamp_min_vec, x);
79 x = vminq_f32(clamp_max_vec, x);
80 vst1q_f32(array_ptr + i, x);
81 }
82 for (; i < bias_size; i++)
83 {
84 array_ptr[i] =
85 ActivationFunctionWithMinMax(array_ptr[i] + bias_data[i], clamp_min, clamp_max);
86 }
87 }
88#else // not NEON
89 for (int array_offset = 0; array_offset < array_size; array_offset += bias_size)
90 {
91 for (int i = 0; i < bias_size; i++)
92 {
93 array_data[array_offset + i] = ActivationFunctionWithMinMax(
94 array_data[array_offset + i] + bias_data[i], clamp_min, clamp_max);
95 }
96 }
97#endif
98}

References ActivationFunctionWithMinMax().

Referenced by nnfw::cker::optimized::AddBiasAndEvalActivationFunction(), and nnfw::cker::detail::GemmImplUsingEigen::Run().
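
A minimal sketch of a direct call (values are illustrative; array_size must be a multiple of bias_size, as the assert above enforces):

#include <cker/operation/Common.h> // assumed include path

void BiasAndClampSketch()
{
  float bias[4] = {0.5f, -1.0f, 2.0f, 0.0f};
  float data[8] = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f};
  // Adds bias[i % 4] to each element, then clamps the result to [0, 6].
  nnfw::cker::BiasAndClamp(/*clamp_min=*/0.0f, /*clamp_max=*/6.0f,
                           /*bias_size=*/4, bias, /*array_size=*/8, data);
}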

◆ BinaryArithmeticOp() [1/4]

template<BinaryArithmeticOpType op_type>
void nnfw::cker::BinaryArithmeticOp ( const BinaryArithmeticOpParam params,
const Shape input1_shape,
const float *  input1_data,
const Shape input2_shape,
const float *  input2_data,
const Shape output_shape,
float *  output_data 
)
inline

Definition at line 257 of file BinaryArithmeticOps.h.

261{
262 // Supported type is only float now
263 switch (op_type)
264 {
265 case nnfw::cker::BinaryArithmeticOpType::ADD:
266 optimized::Add(params, input1_shape, input1_data, input2_shape, input2_data, output_shape,
267 output_data);
268 break;
269 case nnfw::cker::BinaryArithmeticOpType::MUL:
270 optimized::Mul(params, input1_shape, input1_data, input2_shape, input2_data, output_shape,
271 output_data);
272 break;
273 case nnfw::cker::BinaryArithmeticOpType::SUB:
274 optimized::Sub(params, input1_shape, input1_data, input2_shape, input2_data, output_shape,
275 output_data);
276 break;
277 case nnfw::cker::BinaryArithmeticOpType::DIV:
278 optimized::Div(params, input1_shape, input1_data, input2_shape, input2_data, output_shape,
279 output_data);
280 break;
281 default:
282 assert(false);
283 break;
284 }
285}

References nnfw::cker::optimized::Add(), ADD, nnfw::cker::optimized::Div(), DIV, nnfw::cker::optimized::Mul(), MUL, output_shape, nnfw::cker::optimized::Sub(), and SUB.
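
A minimal sketch of an element-wise ADD on two same-shape float tensors (illustrative; it assumes BinaryArithmeticOpParam carries float_activation_min/float_activation_max bounds, which the optimized float kernels use for clamping):

#include <cker/operation/BinaryArithmeticOps.h> // assumed include path
#include <limits>

void AddFloatSketch()
{
  nnfw::cker::BinaryArithmeticOpParam op_param{};
  op_param.float_activation_min = std::numeric_limits<float>::lowest(); // assumed field
  op_param.float_activation_max = std::numeric_limits<float>::max();    // assumed field

  nnfw::cker::Shape shape(4, 1);
  shape.SetDim(1, 2);
  shape.SetDim(2, 2); // 1x2x2x1

  const float lhs[4] = {1.f, 2.f, 3.f, 4.f};
  const float rhs[4] = {10.f, 20.f, 30.f, 40.f};
  float out[4] = {0.f, 0.f, 0.f, 0.f};
  nnfw::cker::BinaryArithmeticOp<nnfw::cker::BinaryArithmeticOpType::ADD>(
    op_param, shape, lhs, shape, rhs, shape, out); // out == {11, 22, 33, 44}
}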

◆ BinaryArithmeticOp() [2/4]

template<BinaryArithmeticOpType op_type, typename T >
std::enable_if_t<!is_quant8< T >::value &&!std::is_same< T, bool >::value > nnfw::cker::BinaryArithmeticOp ( const BinaryArithmeticOpParam params,
const Shape input1_shape,
const T *  input1_data,
const Shape input2_shape,
const T *  input2_data,
const Shape output_shape,
T *  output_data 
)
inline

Definition at line 213 of file BinaryArithmeticOps.h.

216{
217 reference::BinaryArithmeticOp(params, input1_shape, input1_data, input2_shape, input2_data,
218 output_shape, output_data, GetBinaryArtithmeticFn<op_type, T>());
219}

References nnfw::cker::reference::BinaryArithmeticOp(), and output_shape.

◆ BinaryArithmeticOp() [3/4]

template<BinaryArithmeticOpType op_type, typename T >
std::enable_if_t<!is_quant8< T >::value &&std::is_same< T, bool >::value > nnfw::cker::BinaryArithmeticOp ( const BinaryArithmeticOpParam params,
const Shape input1_shape,
const T *  input1_data,
const Shape input2_shape,
const T *  input2_data,
const Shape output_shape,
T *  output_data 
)
inline

Definition at line 223 of file BinaryArithmeticOps.h.

226{
227 reference::BinaryArithmeticOp(params, input1_shape, input1_data, input2_shape, input2_data,
228 output_shape, output_data, GetBinaryArtithmeticFn<op_type, T>());
229}

References nnfw::cker::reference::BinaryArithmeticOp(), and output_shape.

◆ BinaryArithmeticOp() [4/4]

template<BinaryArithmeticOpType op_type, typename T >
std::enable_if_t< is_quant8< T >::value > nnfw::cker::BinaryArithmeticOp ( const BinaryArithmeticOpParam params,
const Shape input1_shape,
const T *  input1_data,
const Shape input2_shape,
const T *  input2_data,
const Shape output_shape,
T *  output_data 
)
inline

Definition at line 233 of file BinaryArithmeticOps.h.

236{
237 switch (op_type)
238 {
239 case nnfw::cker::BinaryArithmeticOpType::ADD:
240 case nnfw::cker::BinaryArithmeticOpType::SUB:
241 optimized::Add(params, input1_shape, input1_data, input2_shape, input2_data, output_shape,
242 output_data);
243 break;
244 case nnfw::cker::BinaryArithmeticOpType::MUL:
245 optimized::Mul(params, input1_shape, input1_data, input2_shape, input2_data, output_shape,
246 output_data);
247 break;
248 case nnfw::cker::BinaryArithmeticOpType::DIV:
249 throw std::runtime_error{"Quant8 Asymm NYI"};
250 default:
251 assert(false);
252 break;
253 }
254}

References nnfw::cker::optimized::Add(), ADD, DIV, nnfw::cker::optimized::Mul(), MUL, output_shape, and SUB.

◆ BroadcastBinaryArithmeticOp() [1/3]

template<BinaryArithmeticOpType op_type>
void nnfw::cker::BroadcastBinaryArithmeticOp ( BinaryArithmeticOpParam params,
const Shape input1_shape,
const float *  input1_data,
const Shape input2_shape,
const float *  input2_data,
const Shape output_shape,
float *  output_data 
)
inline

Definition at line 325 of file BinaryArithmeticOps.h.

329{
330 if (output_shape.DimensionsCount() > 4)
331 throw std::runtime_error(
332 std::string("cker::BroadcastBinaryArithmeticOp: Unsupported rank size : ") +
333 std::to_string(output_shape.DimensionsCount()));
334
335 // Supported type is only float now
336 switch (op_type)
337 {
338 case nnfw::cker::BinaryArithmeticOpType::ADD:
339 optimized::BroadcastAddDispatch(params, input1_shape, input1_data, input2_shape, input2_data,
340 output_shape, output_data);
341 break;
342 case nnfw::cker::BinaryArithmeticOpType::MUL:
343 optimized::BroadcastMulDispatch(params, input1_shape, input1_data, input2_shape, input2_data,
344 output_shape, output_data);
345 break;
346 case nnfw::cker::BinaryArithmeticOpType::SUB:
347 optimized::BroadcastSubDispatch(params, input1_shape, input1_data, input2_shape, input2_data,
348 output_shape, output_data);
349 break;
350 case nnfw::cker::BinaryArithmeticOpType::DIV:
351 optimized::BroadcastDivDispatch(params, input1_shape, input1_data, input2_shape, input2_data,
352 output_shape, output_data);
353 break;
354 case nnfw::cker::BinaryArithmeticOpType::POW:
355 reference::BroadcastBinaryArithmeticOpSlow<float>(
356 params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data,
357 GetBinaryArtithmeticFn<op_type, float>());
358 break;
359 default:
360 assert(false);
361 break;
362 }
363}

References ADD, nnfw::cker::optimized::BroadcastAddDispatch(), nnfw::cker::optimized::BroadcastDivDispatch(), nnfw::cker::optimized::BroadcastMulDispatch(), nnfw::cker::optimized::BroadcastSubDispatch(), DIV, MUL, output_shape, POW, and SUB.

◆ BroadcastBinaryArithmeticOp() [2/3]

template<BinaryArithmeticOpType op_type, typename T >
std::enable_if_t<!is_quant8< T >::value > nnfw::cker::BroadcastBinaryArithmeticOp ( BinaryArithmeticOpParam params,
const Shape input1_shape,
const T *  input1_data,
const Shape input2_shape,
const T *  input2_data,
const Shape output_shape,
T *  output_data 
)
inline

Definition at line 289 of file BinaryArithmeticOps.h.

292{
293 reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape,
294 input2_data, output_shape, output_data,
295 GetBinaryArtithmeticFn<op_type, T>());
296}

References nnfw::cker::reference::BroadcastBinaryArithmeticOpSlow(), and output_shape.

◆ BroadcastBinaryArithmeticOp() [3/3]

template<BinaryArithmeticOpType op_type, typename T >
std::enable_if_t< is_quant8< T >::value > nnfw::cker::BroadcastBinaryArithmeticOp ( BinaryArithmeticOpParam params,
const Shape input1_shape,
const T *  input1_data,
const Shape input2_shape,
const T *  input2_data,
const Shape output_shape,
T *  output_data 
)
inline

Definition at line 300 of file BinaryArithmeticOps.h.

303{
304 switch (op_type)
305 {
306 case nnfw::cker::BinaryArithmeticOpType::ADD:
307 case nnfw::cker::BinaryArithmeticOpType::SUB:
308 optimized::BroadcastAddDispatch(params, input1_shape, input1_data, input2_shape, input2_data,
309 output_shape, output_data);
310 break;
311 case nnfw::cker::BinaryArithmeticOpType::MUL:
312 optimized::BroadcastMulDispatch(params, input1_shape, input1_data, input2_shape, input2_data,
313 output_shape, output_data);
314 break;
315 case nnfw::cker::BinaryArithmeticOpType::DIV:
316 case nnfw::cker::BinaryArithmeticOpType::POW:
317 throw std::runtime_error{"Quant8 Asymm NYI"};
318 default:
319 assert(false);
320 break;
321 }
322}

References ADD, nnfw::cker::optimized::BroadcastAddDispatch(), nnfw::cker::optimized::BroadcastMulDispatch(), DIV, MUL, output_shape, POW, and SUB.

◆ BroadcastComparison4DSlow()

template<typename T , ComparisonFn< T > F>
void nnfw::cker::BroadcastComparison4DSlow ( const Shape input1_shape,
const T *  input1_data,
const Shape input2_shape,
const T *  input2_data,
const Shape output_shape,
bool *  output_data 
)
inline

Definition at line 123 of file Comparison.h.

126{
127 BroadcastComparison4DSlowImpl<T, F>(input1_shape, input1_data, input2_shape, input2_data,
128 output_shape, output_data);
129}

References output_shape.

◆ BroadcastComparison4DSlowImpl()

template<typename T , ComparisonFn< T > F>
void nnfw::cker::BroadcastComparison4DSlowImpl ( const Shape unextended_input1_shape,
const T *  input1_data,
const Shape unextended_input2_shape,
const T *  input2_data,
const Shape unextended_output_shape,
bool *  output_data 
)
inline

Definition at line 91 of file Comparison.h.

94{
95 assert(unextended_input1_shape.DimensionsCount() <= 4);
96 assert(unextended_input2_shape.DimensionsCount() <= 4);
97 assert(unextended_output_shape.DimensionsCount() <= 4);
98 const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape);
99
100 NdArrayDesc<4> desc1;
101 NdArrayDesc<4> desc2;
102 NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, unextended_input2_shape, &desc1,
103 &desc2);
104
105 for (int b = 0; b < output_shape.Dims(0); ++b)
106 {
107 for (int y = 0; y < output_shape.Dims(1); ++y)
108 {
109 for (int x = 0; x < output_shape.Dims(2); ++x)
110 {
111 for (int c = 0; c < output_shape.Dims(3); ++c)
112 {
113 output_data[Offset(output_shape, b, y, x, c)] =
114 F(input1_data[SubscriptToIndex(desc1, b, y, x, c)],
115 input2_data[SubscriptToIndex(desc2, b, y, x, c)]);
116 }
117 }
118 }
119 }
120}
void NdArrayDescsForElementwiseBroadcast(const Dims< N > &input0_dims, const Dims< N > &input1_dims, NdArrayDesc< N > *desc0_out, NdArrayDesc< N > *desc1_out)
Definition NDArray.h:89
int SubscriptToIndex(const NdArrayDesc< 4 > &desc, int i0, int i1, int i2, int i3)
Definition NDArray.h:54
NdArrayDesc< 4 > desc1
NdArrayDesc< 4 > desc2

References desc1, desc2, nnfw::cker::Shape::DimensionsCount(), NdArrayDescsForElementwiseBroadcast(), Offset(), output_shape, and SubscriptToIndex().

◆ BroadcastComparison4DSlowWithScaling()

template<typename T , ComparisonFn< int32_t > F>
void nnfw::cker::BroadcastComparison4DSlowWithScaling ( ComparisonParams params,
const Shape input1_shape,
const T *  input1_data,
const Shape input2_shape,
const T *  input2_data,
const Shape output_shape,
bool *  output_data 
)
inline

Definition at line 132 of file Comparison.h.

136{
137 assert(input1_shape.DimensionsCount() <= 4);
138 assert(input2_shape.DimensionsCount() <= 4);
139 assert(output_shape.DimensionsCount() <= 4);
140
141 int left_shift = params.left_shift;
142 int32_t input1_offset = params.input1_offset;
143 int32_t input1_multiplier = params.input1_multiplier;
144 int input1_shift = params.input1_shift;
145 int32_t input2_offset = params.input2_offset;
146 int32_t input2_multiplier = params.input2_multiplier;
147 int input2_shift = params.input2_shift;
148
149 NdArrayDesc<4> desc1;
150 NdArrayDesc<4> desc2;
151 NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2);
152
153 for (int b = 0; b < output_shape.Dims(0); ++b)
154 {
155 for (int y = 0; y < output_shape.Dims(1); ++y)
156 {
157 for (int x = 0; x < output_shape.Dims(2); ++x)
158 {
159 for (int c = 0; c < output_shape.Dims(3); ++c)
160 {
161 const int32_t input1_val =
162 input1_offset + input1_data[SubscriptToIndex(desc1, b, y, x, c)];
163 const int32_t input2_val =
164 input2_offset + input2_data[SubscriptToIndex(desc2, b, y, x, c)];
165 const int32_t shifted_input1_val = input1_val * (1 << left_shift);
166 const int32_t shifted_input2_val = input2_val * (1 << left_shift);
167 const int32_t scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp(
168 shifted_input1_val, input1_multiplier, input1_shift);
169 const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp(
170 shifted_input2_val, input2_multiplier, input2_shift);
171 output_data[Offset(output_shape, b, y, x, c)] = F(scaled_input1_val, scaled_input2_val);
172 }
173 }
174 }
175 }
176}
int32_t MultiplyByQuantizedMultiplierSmallerThanOneExp(int32_t x, int32_t quantized_multiplier, int left_shift)
Definition Utils.h:111

References desc1, desc2, nnfw::cker::Shape::DimensionsCount(), nnfw::cker::ComparisonParams::input1_multiplier, nnfw::cker::ComparisonParams::input1_offset, nnfw::cker::ComparisonParams::input1_shift, nnfw::cker::ComparisonParams::input2_multiplier, nnfw::cker::ComparisonParams::input2_offset, nnfw::cker::ComparisonParams::input2_shift, nnfw::cker::ComparisonParams::left_shift, MultiplyByQuantizedMultiplierSmallerThanOneExp(), NdArrayDescsForElementwiseBroadcast(), Offset(), output_shape, and SubscriptToIndex().

◆ BroadcastSelect4DSlow()

template<typename D , typename T >
void nnfw::cker::BroadcastSelect4DSlow ( const Shape input_condition_shape,
const D *  input_condition_data,
const Shape input_x_shape,
const T *  input_x_data,
const Shape input_y_shape,
const T *  input_y_data,
const Shape output_shape,
T *  output_data 
)

Definition at line 63 of file Select.h.

67{
68 assert(input_condition_shape.DimensionsCount() <= 4);
69 assert(input_x_shape.DimensionsCount() <= 4);
70 assert(input_y_shape.DimensionsCount() <= 4);
71 assert(output_shape.DimensionsCount() <= 4);
72
73 const Shape extended_output_shape = Shape::ExtendedShape(4, output_shape);
74
75 NdArrayDesc<4> desc_condition;
76 NdArrayDesc<4> desc_x;
77 NdArrayDesc<4> desc_y;
78 NdArrayDescsForElementwiseBroadcast(input_condition_shape, input_x_shape, input_y_shape,
79 &desc_condition, &desc_x, &desc_y);
80
81 // In Tensorflow, the dimensions are canonically named (batch_number, row,
82 // col, channel), with extents (batches, height, width, depth), with the
83 // trailing dimension changing most rapidly (channels has the smallest
84 // stride, typically 1 element).
85 //
86 // In generated C code, we store arrays with the dimensions reversed. The
87 // first dimension has smallest stride.
88 //
89 // We name our variables by their Tensorflow convention, but generate C code
90 // nesting loops such that the innermost loop has the smallest stride for
91 // the best cache behavior.
92 for (int b = 0; b < extended_output_shape.Dims(0); ++b)
93 {
94 for (int y = 0; y < extended_output_shape.Dims(1); ++y)
95 {
96 for (int x = 0; x < extended_output_shape.Dims(2); ++x)
97 {
98 for (int c = 0; c < extended_output_shape.Dims(3); ++c)
99 {
100 const int condition_index = SubscriptToIndex(desc_condition, b, y, x, c);
101 const int x_index = SubscriptToIndex(desc_x, b, y, x, c);
102 const int y_index = SubscriptToIndex(desc_y, b, y, x, c);
103 output_data[Offset(extended_output_shape, b, y, x, c)] =
104 input_condition_data[condition_index] ? input_x_data[x_index] : input_y_data[y_index];
105 }
106 }
107 }
108 }
109}

References nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), NdArrayDescsForElementwiseBroadcast(), Offset(), output_shape, and SubscriptToIndex().
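
A minimal sketch (illustrative shapes and values): the condition and x operands are 1x2x2x1, while y is a single broadcast value.

#include <cker/operation/Select.h> // assumed include path

void BroadcastSelectSketch()
{
  nnfw::cker::Shape cond_shape(4, 1);
  cond_shape.SetDim(1, 2);
  cond_shape.SetDim(2, 2);         // 1x2x2x1, also used for x and the output
  nnfw::cker::Shape y_shape(4, 1); // 1x1x1x1, broadcast over the output

  const bool cond[4] = {true, false, true, false};
  const float x[4] = {1.f, 2.f, 3.f, 4.f};
  const float y[1] = {-1.f};
  float out[4] = {0.f, 0.f, 0.f, 0.f};
  nnfw::cker::BroadcastSelect4DSlow(cond_shape, cond, cond_shape, x, y_shape, y, cond_shape, out);
  // out == {1, -1, 3, -1}
}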

◆ BroadcastTo()

template<typename T >
void nnfw::cker::BroadcastTo ( const Shape input_shape,
T *  input_data,
const Shape output_shape,
T *  output_data 
)
inline

Definition at line 132 of file BroadcastTo.h.

134{
135 const int input_flatsize = input_shape.FlatSize();
136
137 if (input_shape == output_shape)
138 {
139 memcpy(output_data, input_data, input_flatsize * sizeof(T));
140 return;
141 }
142
143 // Input shape's rank must be no greater than rank of output shape.
144 assert(input_shape.DimensionsCount() <= output_shape.DimensionsCount());
145
146 // It shouldn't be 0.
147 assert(output_shape.DimensionsCount());
148
149 Tensor output_tensor;
150 Tensor input_tensor;
151
152 input_tensor.shape.ReplaceWith(input_shape.DimensionsCount(), input_shape.DimsData());
153 input_tensor.buffer = input_data;
154
155 output_tensor.shape.ReplaceWith(output_shape.DimensionsCount(), output_shape.DimsData());
156 output_tensor.buffer = output_data;
157
158 const Eigen::ThreadPoolDevice &device = *eigen_support::GetThreadPoolDevice();
159
160 // Handle broadcast from Scalar.
161 if (input_flatsize == 0)
162 {
163 functor::FillFunctor<Eigen::ThreadPoolDevice, T>()(device, output_tensor.flat<T>(),
164 input_tensor.scalar<T>());
165 }
166
167 BCast bcast(BCast::FromShape(input_shape), BCast::FromShape(output_shape),
168 /*fewer_dims_optimization=*/true);
169
170 // Predict TRUE.
171 assert(bcast.IsValid());
172 // should be same.
173 assert(BCast::ToShape(bcast.output_shape()) == output_shape);
174
175 functor::BroadcastTo<Eigen::ThreadPoolDevice, T>()(device, output_tensor, output_shape,
176 input_tensor, input_shape, bcast);
177}
int32_t * DimsData()
Definition Shape.h:112

References nnfw::cker::Tensor::buffer, nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::DimsData(), nnfw::cker::Shape::FlatSize(), nnfw::cker::BCast::FromShape(), nnfw::cker::eigen_support::GetThreadPoolDevice(), nnfw::cker::BCastList< N >::IsValid(), nnfw::cker::BCast::output_shape(), output_shape, nnfw::cker::Shape::ReplaceWith(), nnfw::cker::Tensor::scalar(), nnfw::cker::Tensor::shape, and nnfw::cker::BCast::ToShape().

Referenced by nnfw::cker::train::BinaryArithmeticGrad(), and nnfw::cker::train::MeanGrad().
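
A minimal sketch (illustrative): a 1x3 row is broadcast into a 2x3 output. Note that input_data is taken as a non-const pointer.

#include <cker/operation/BroadcastTo.h> // assumed include path

void BroadcastToSketch()
{
  nnfw::cker::Shape in_shape(2, 1);
  in_shape.SetDim(1, 3); // 1x3
  nnfw::cker::Shape out_shape(2, 1);
  out_shape.SetDim(0, 2);
  out_shape.SetDim(1, 3); // 2x3

  float in[3] = {1.f, 2.f, 3.f};
  float out[6] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f};
  nnfw::cker::BroadcastTo(in_shape, in, out_shape, out);
  // Both output rows are copies of the input row.
}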

◆ buildStridedSliceParams()

template<typename T >
StridedSliceParams nnfw::cker::buildStridedSliceParams ( const T *  begin,
const T *  end,
const T *  strides,
const uint32_t  begin_mask,
const uint32_t  end_mask,
const uint32_t  shrink_axis_mask,
const uint8_t  rank 
)
inline

Definition at line 195 of file StridedSlice.h.

198{
199 StridedSliceParams op_params;
200 op_params.start_indices_count = rank;
201 op_params.stop_indices_count = rank;
202 op_params.strides_count = rank;
203
204 for (int i = 0; i < rank; ++i)
205 {
206 op_params.start_indices[i] = begin[i];
207 op_params.stop_indices[i] = end[i];
208 op_params.strides[i] = strides[i];
209
210 assert(op_params.strides[i] != 0);
211 }
212
213 op_params.begin_mask = begin_mask;
214 op_params.ellipsis_mask = 0; // NYI
215 op_params.end_mask = end_mask;
216 op_params.new_axis_mask = 0; // NYI
217 op_params.shrink_axis_mask = shrink_axis_mask;
218
219 assert(sizeof(op_params.begin_mask) * 4 >= rank);
220
221 return op_params;
222}
ShapeIterator end(const Shape &s)
int32_t begin[5]
Definition Slice.cpp:33

References begin, nnfw::cker::StridedSliceParams::begin_mask, nnfw::cker::StridedSliceParams::ellipsis_mask, end(), nnfw::cker::StridedSliceParams::end_mask, nnfw::cker::StridedSliceParams::new_axis_mask, nnfw::cker::StridedSliceParams::shrink_axis_mask, nnfw::cker::StridedSliceParams::start_indices, nnfw::cker::StridedSliceParams::start_indices_count, nnfw::cker::StridedSliceParams::stop_indices, nnfw::cker::StridedSliceParams::stop_indices_count, nnfw::cker::StridedSliceParams::strides, and nnfw::cker::StridedSliceParams::strides_count.
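
A minimal sketch (illustrative): building parameters that slice elements [1, 3) with stride 1 from a rank-1 tensor, with no masks set.

#include <cker/operation/StridedSlice.h> // assumed include path

void BuildStridedSliceParamsSketch()
{
  const int32_t begin[1] = {1};
  const int32_t end[1] = {3};
  const int32_t strides[1] = {1};
  nnfw::cker::StridedSliceParams op_params = nnfw::cker::buildStridedSliceParams(
    begin, end, strides, /*begin_mask=*/0u, /*end_mask=*/0u, /*shrink_axis_mask=*/0u, /*rank=*/1);
  // op_params can now be handed to the StridedSlice kernel together with the input shape.
}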

◆ CalculateLstmGateFloat()

void nnfw::cker::CalculateLstmGateFloat ( const float *  input,
const float *  input_to_gate_weights,
const float *  aux_input,
const float *  aux_input_to_gate_weights,
const float *  output_state,
const float *  recurrent_to_gate_weights,
const float *  cell_state,
const float *  cell_to_gate_weights,
const float *  layer_norm_coefficients,
const float *  gate_bias,
const int  n_batch,
const int  n_input,
const int  n_aux_input,
const int  n_output,
const int  n_cell,
const FusedActivationFunctionType  activation,
float *  gate,
const bool  is_input_all_zeros,
const bool  is_aux_input_all_zeros 
)
inline

Definition at line 62 of file LSTM.h.

72{
73 const bool use_peephole = (cell_to_gate_weights != nullptr);
74 const bool use_layer_norm = (layer_norm_coefficients != nullptr);
75
76 // Initialize scratch buffers with bias for regular lstm or initialize with
77 // zero for layer norm lstm.
78 if (use_layer_norm)
79 {
80 std::fill_n(gate, n_cell * n_batch, 0.0f);
81 }
82 else
83 {
84 VectorBatchVectorAssign(gate_bias, n_cell, n_batch, gate);
85 }
86 // For each batch and cell: compute input_weight * input.
87 // Skip if input is all zeros.
88 if (!is_input_all_zeros)
89 {
90 MatrixBatchVectorMultiplyAccumulate(input_to_gate_weights, n_cell, n_input, input, n_batch,
91 gate, /*result_stride=*/1);
92 }
93 // For each batch and cell: compute aux_input_weight * aux_input.
94 // Skip if auxiliary input is not available or all zeros.
95 if (!is_aux_input_all_zeros)
96 {
97 MatrixBatchVectorMultiplyAccumulate(aux_input_to_gate_weights, n_cell, n_aux_input, aux_input,
98 n_batch, gate, /*result_stride=*/1);
99 }
100 // For each batch and cell: compute recurrent_weight * output_state.
101 MatrixBatchVectorMultiplyAccumulate(recurrent_to_gate_weights, n_cell, n_output, output_state,
102 n_batch, gate, /*result_stride=*/1);
103 // For each batch and cell: compute cell_weight .* cell_state (peephole LSTM)
104 if (use_peephole)
105 {
106 VectorBatchVectorCwiseProductAccumulate(cell_to_gate_weights, n_cell, cell_state, n_batch,
107 gate);
108 }
109 // Do layer normalization (if layer norm LSTM)
110 if (use_layer_norm)
111 {
112 MeanStddevNormalization(gate, gate, n_cell, n_batch);
113 VectorBatchVectorCwiseProduct(layer_norm_coefficients, n_cell, gate, n_batch, gate);
114 VectorBatchVectorAdd(gate_bias, n_cell, n_batch, gate);
115 }
116 // Apply activation
117 ApplyActivationToVector(gate, n_batch * n_cell, activation, gate);
118}
void MeanStddevNormalization(const float *input_vector, float *output_vector, int v_size, int n_batch)
void VectorBatchVectorCwiseProduct(const T *vector, int v_size, const T *batch_vector, int n_batch, T *result)
Definition TensorUtils.h:76
void MatrixBatchVectorMultiplyAccumulate(const int8_t *matrix, const int m_rows, const int m_cols, const int8_t *vector, const float *scaling_factors, int n_batch, float *result, int result_stride)
void ApplyActivationToVector(const float *vector, int v_size, FusedActivationFunctionType activation, float *result)
void VectorBatchVectorAdd(const float *vector, int v_size, int n_batch, float *batch_vector)
Definition TensorUtils.h:39
void VectorBatchVectorCwiseProductAccumulate(const T *vector, int v_size, const T *batch_vector, int n_batch, T *result)
Definition TensorUtils.h:92

References ApplyActivationToVector(), MatrixBatchVectorMultiplyAccumulate(), MeanStddevNormalization(), VectorBatchVectorAdd(), VectorBatchVectorAssign(), VectorBatchVectorCwiseProduct(), and VectorBatchVectorCwiseProductAccumulate().

Referenced by LstmStepFloat().

◆ CalculateLstmOutputFloat()

void nnfw::cker::CalculateLstmOutputFloat ( int  n_batch,
int  n_cell,
int  n_output,
const float *  cell_state,
const float *  output_gate,
FusedActivationFunctionType  activation,
const float *  projection_weights,
const float *  projection_bias,
const float  proj_clip,
float *  output_state,
float *  scratch 
)

Definition at line 183 of file LSTM.h.

187{
188 ApplyActivationToVector(cell_state, n_batch * n_cell, activation, scratch);
189
190 // Define variable for 4th argument to avoid warning
191 // Compiler warning: passing argument 4 to restrict-qualified parameter aliases with argument 2
192 const float *cwise_product_rhs = scratch;
193 VectorVectorCwiseProduct(output_gate, cwise_product_rhs, n_batch * n_cell, scratch);
194
195 const bool use_projection = (projection_weights != nullptr);
196 const bool use_projection_bias = (projection_bias != nullptr);
197
198 if (use_projection)
199 {
200 if (use_projection_bias)
201 {
202 VectorBatchVectorAssign(projection_bias, n_output, n_batch, output_state);
203 }
204 else
205 {
206 std::fill_n(output_state, n_batch * n_output, 0.0f);
207 }
208 MatrixBatchVectorMultiplyAccumulate(projection_weights, n_output, n_cell, scratch, n_batch,
209 output_state, /*result_stride=*/1);
210 if (proj_clip > 0.0f)
211 {
212 CwiseClipping(output_state, n_batch * n_output, proj_clip);
213 }
214 }
215 else
216 {
217 std::copy_n(scratch, n_batch * n_output, output_state);
218 }
219}
void VectorVectorCwiseProduct(const T *__restrict__ vector1, const T *__restrict__ vector2, int v_size, T *__restrict__ result)
Definition TensorUtils.h:52
void VectorBatchVectorAssign(const float *vector, int v_size, int n_batch, float *batch_vector)
Definition TensorUtils.h:44

References ApplyActivationToVector(), CwiseClipping(), MatrixBatchVectorMultiplyAccumulate(), VectorBatchVectorAssign(), and VectorVectorCwiseProduct().

Referenced by LstmStepFloat().

◆ checkMatching()

template<typename... Ts>
bool nnfw::cker::checkMatching ( const Shape shape,
Ts...  check_shapes 
)
inline

Definition at line 268 of file Shape.h.

269{
270 const Shape check_shapes_array[sizeof...(Ts)] = {std::forward<Ts>(check_shapes)...};
271 for (const auto &check_shape : check_shapes_array)
272 {
273 // Check matching of shapes except the case of that two shapes can be scalar
274 if (shape.DimensionsCount() > 1 || check_shape.DimensionsCount() > 1 || shape.FlatSize() != 1 ||
275 check_shape.FlatSize() != 1)
276 {
277 if (shape.DimensionsCount() != check_shape.DimensionsCount())
278 {
279 return false;
280 }
281 for (int i = 0; i < shape.DimensionsCount(); ++i)
282 {
283 if (shape.Dims(i) != check_shape.Dims(i))
284 {
285 return false;
286 }
287 }
288 }
289 }
290 return true;
291}

References nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), and nnfw::cker::Shape::FlatSize().

Referenced by MatchingFlatSize().
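
A minimal sketch (illustrative): verifying that two operand shapes agree before running an element-wise kernel.

#include <cker/Shape.h> // assumed include path

bool ShapesAgreeSketch()
{
  nnfw::cker::Shape a(2, 1);
  a.SetDim(0, 2);
  a.SetDim(1, 3);         // 2x3
  nnfw::cker::Shape b(a); // copy of the same shape
  return nnfw::cker::checkMatching(a, b); // true: ranks and all dimensions match
}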

◆ checkOutputSize()

void nnfw::cker::checkOutputSize ( const StridedSliceParams op_params,
const Shape input_shape,
const Shape output_shape,
uint32_t  rank 
)

Definition at line 224 of file StridedSlice.h.

226{
227 [[maybe_unused]] int32_t shape_size = 0;
228
229 for (uint32_t idx = 0; idx < rank; ++idx)
230 {
231 int32_t stride = op_params.strides[idx];
232 int32_t begin = StartForAxis(op_params, input_shape, idx);
233 int32_t end = StopForAxis(op_params, input_shape, idx, begin);
234
235 // When shrinking an axis, the end position does not matter (and can be
236 // incorrect when negative indexing is used, see Issue #19260). Always use
237 // begin + 1 to generate a length 1 slice, since begin has
238 // already been adjusted for negative indices by StartForAxis.
239 const bool shrink_axis = op_params.shrink_axis_mask & (1 << idx);
240 if (shrink_axis)
241 {
242 end = begin + 1;
243 }
244
245 int32_t dim_shape = std::ceil((end - begin) / static_cast<float>(stride));
246 dim_shape = dim_shape < 0 ? 0 : dim_shape;
247 if (!shrink_axis)
248 {
249 assert(output_shape.Dims(shape_size) == dim_shape);
250 shape_size++;
251 }
252 }
253
254 assert(output_shape.DimensionsCount() == shape_size);
255}
int StopForAxis(const StridedSliceParams &params, const Shape &input_shape, int axis, int start_for_axis)
int StartForAxis(const StridedSliceParams &params, const Shape &input_shape, int axis)

References begin, end(), output_shape, nnfw::cker::StridedSliceParams::shrink_axis_mask, StartForAxis(), StopForAxis(), and nnfw::cker::StridedSliceParams::strides.

◆ Clamp()

int nnfw::cker::Clamp ( const int  v,
const int  lo,
const int  hi 
)
inline

Definition at line 32 of file StridedSlice.h.

33{
34 assert(!(hi < lo));
35 if (hi < v)
36 return hi;
37 if (v < lo)
38 return lo;
39 return v;
40}

Referenced by StartForAxis(), and StopForAxis().

◆ Comparison()

template<ComparisonFn< float > F>
void nnfw::cker::Comparison ( const Shape input1_shape,
const float *  input1_data,
const Shape input2_shape,
const float *  input2_data,
const Shape output_shape,
bool *  output_data 
)
inline

Definition at line 53 of file Comparison.h.

56{
57 ComparisonImpl<float, F>(input1_shape, input1_data, input2_shape, input2_data, output_shape,
58 output_data);
59}

References output_shape.

◆ ComparisonImpl()

template<typename T , ComparisonFn< T > F>
void nnfw::cker::ComparisonImpl ( const Shape input1_shape,
const T *  input1_data,
const Shape input2_shape,
const T *  input2_data,
const Shape output_shape,
bool *  output_data 
)
inline

Definition at line 40 of file Comparison.h.

43{
44 const int64_t flatsize = // number of data....
45 MatchingFlatSize(input1_shape, input2_shape, output_shape);
46 for (int64_t i = 0; i < flatsize; ++i)
47 {
48 output_data[i] = F(input1_data[i], input2_data[i]);
49 }
50}
int MatchingFlatSize(const Dims< N > &dims, const Dims< N > &check_dims_0)
Definition Dims.h:108

References MatchingFlatSize(), and output_shape.
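
A minimal sketch (illustrative): the second template argument is any ComparisonFn<T>, i.e. bool(*)(T, T); here a hypothetical LessThan predicate is supplied.

#include <cker/operation/Comparison.h> // assumed include path

bool LessThan(float lhs, float rhs) { return lhs < rhs; } // hypothetical predicate

void ComparisonImplSketch()
{
  nnfw::cker::Shape shape(1, 4); // rank-1 shape with 4 elements
  const float a[4] = {1.f, 5.f, 2.f, 7.f};
  const float b[4] = {3.f, 3.f, 3.f, 3.f};
  bool out[4] = {false, false, false, false};
  nnfw::cker::ComparisonImpl<float, LessThan>(shape, a, shape, b, shape, out);
  // out == {true, false, true, false}
}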

◆ ComparisonWithScaling()

template<typename T , ComparisonFn< int32_t > F>
void nnfw::cker::ComparisonWithScaling ( ComparisonParams params,
const Shape input1_shape,
const T *  input1_data,
const Shape input2_shape,
const T *  input2_data,
const Shape output_shape,
bool *  output_data 
)
inline

Definition at line 62 of file Comparison.h.

66{
67 int left_shift = params.left_shift;
68 int32_t input1_offset = params.input1_offset;
69 int32_t input1_multiplier = params.input1_multiplier;
70 int input1_shift = params.input1_shift;
71 int32_t input2_offset = params.input2_offset;
72 int32_t input2_multiplier = params.input2_multiplier;
73 int input2_shift = params.input2_shift;
74 const int64_t flatsize = MatchingFlatSize(input1_shape, input2_shape, output_shape);
75 for (int64_t i = 0; i < flatsize; ++i)
76 {
77 const int32_t input1_val = input1_offset + input1_data[i];
78 const int32_t input2_val = input2_offset + input2_data[i];
79 const int32_t shifted_input1_val = input1_val * (1 << left_shift);
80 const int32_t shifted_input2_val = input2_val * (1 << left_shift);
81 const int32_t scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp(
82 shifted_input1_val, input1_multiplier, input1_shift);
83 const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp(
84 shifted_input2_val, input2_multiplier, input2_shift);
85 output_data[i] = F(scaled_input1_val, scaled_input2_val);
86 }
87}

References nnfw::cker::ComparisonParams::input1_multiplier, nnfw::cker::ComparisonParams::input1_offset, nnfw::cker::ComparisonParams::input1_shift, nnfw::cker::ComparisonParams::input2_multiplier, nnfw::cker::ComparisonParams::input2_offset, nnfw::cker::ComparisonParams::input2_shift, nnfw::cker::ComparisonParams::left_shift, MatchingFlatSize(), MultiplyByQuantizedMultiplierSmallerThanOneExp(), and output_shape.

◆ ComputeBatchIndices()

void nnfw::cker::ComputeBatchIndices ( const int32_t  output_batch_size,
const std::vector< int32_t > &  reshape,
const std::vector< int32_t > &  bcast,
std::vector< int32_t > *  out_indices 
)
inline

Definition at line 40 of file BCast.h.

44{
45 // Populates the mapping in out_indices. This algorithm is identical to
46 // the following steps:
47 // - Reshape {0, 1, ..., input_batch_size - 1} to the input shape.
48 // - Broadcast to the output shape.
49 // - Reshape back to a flat 1D vector.
50 out_indices->resize(output_batch_size);
51 int32_t num_output_elements = 1;
52 int32_t num_input_elements = 1;
53 for (int32_t i = reshape.size() - 1; i >= 0; --i)
54 {
55 // Replicate the already populated mapping an additional (dim - 1) times.
56 // If we are broadcasting, just copy the existing mapping.
57 // Otherwise, add another dimension from the input shape.
58 const int32_t dim = std::max(reshape[i], bcast[i]);
59 const int32_t incr = bcast[i] > 1 ? 0 : num_input_elements;
60 for (int32_t k = 0; k < (dim - 1) * num_output_elements; ++k)
61 {
62 (*out_indices)[num_output_elements + k] = (*out_indices)[k] + incr;
63 }
64 num_output_elements *= dim;
65 num_input_elements *= reshape[i];
66 }
67}

Referenced by nnfw::cker::BCastList< N >::BCastList().
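
A minimal sketch (illustrative), following the algorithm above: an input with batch shape {2, 1} broadcast against {1, 3} yields a flattened output batch of 6 positions, each mapped back to its source batch index.

#include <cker/operation/BCast.h> // assumed include path
#include <cstdint>
#include <vector>

void ComputeBatchIndicesSketch()
{
  std::vector<int32_t> reshape = {2, 1}; // input batch shape
  std::vector<int32_t> bcast = {1, 3};   // broadcast multipliers per dimension
  std::vector<int32_t> out_indices;
  nnfw::cker::ComputeBatchIndices(/*output_batch_size=*/6, reshape, bcast, &out_indices);
  // out_indices == {0, 0, 0, 1, 1, 1}: the first three output rows read input row 0,
  // the last three read input row 1.
}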

◆ ComputeInterpolationValues() [1/2]

void nnfw::cker::ComputeInterpolationValues ( const float  value,
const float  scale,
const bool  half_pixel_centers,
int32_t  input_size,
float *  scaled_value,
int32_t *  lower_bound,
int32_t *  upper_bound 
)
inline

Definition at line 100 of file ResizeBilinear.h.

104{
105 if (half_pixel_centers)
106 {
107 *scaled_value = (value + 0.5f) * scale - 0.5f;
108 }
109 else
110 {
111 *scaled_value = value * scale;
112 }
113 float scaled_value_floor = std::floor(*scaled_value);
114 *lower_bound = std::max(static_cast<int32_t>(scaled_value_floor), static_cast<int32_t>(0));
115 *upper_bound = std::min(static_cast<int32_t>(std::ceil(*scaled_value)), input_size - 1);
116}

Referenced by ResizeBilinear(), ResizeBilinearGeneric(), and ResizeBilinearGenericSmallChannel().
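
A minimal sketch (illustrative numbers): with half-pixel centers, output row 1 at scale 2.0 maps to input coordinate (1 + 0.5) * 2 - 0.5 = 2.5, so bilinear interpolation reads rows 2 and 3.

#include <cker/operation/ResizeBilinear.h> // assumed include path

void InterpolationValuesSketch()
{
  float scaled = 0.0f;
  int32_t lower = 0;
  int32_t upper = 0;
  nnfw::cker::ComputeInterpolationValues(/*value=*/1.0f, /*scale=*/2.0f,
                                         /*half_pixel_centers=*/true,
                                         /*input_size=*/8, &scaled, &lower, &upper);
  // scaled == 2.5f, lower == 2, upper == 3
}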

◆ ComputeInterpolationValues() [2/2]

void nnfw::cker::ComputeInterpolationValues ( const int32_t  value,
const int32_t  scale_10,
const bool  half_pixel_centers,
int32_t  input_size,
int32_t *  scaled_value,
int32_t *  lower_bound,
int32_t *  upper_bound 
)
inline

Definition at line 268 of file ResizeBilinear.h.

272{
273 if (half_pixel_centers)
274 {
275 *scaled_value = value * scale_10 + scale_10 / 2 - (1 << 9);
276 }
277 else
278 {
279 *scaled_value = value * scale_10;
280 }
281 *lower_bound = std::max(*scaled_value / (1 << 10), 0);
282 *upper_bound = std::min(*scaled_value / (1 << 10) + 1, input_size - 1);
283}

◆ Concatenation()

template<typename Scalar >
void nnfw::cker::Concatenation ( const ConcatenationParams params,
const Shape *const *  input_shapes,
const Scalar *const *  input_data,
const Shape output_shape,
Scalar *  output_data 
)
inline

Definition at line 33 of file Concatenation.h.

36{
37 int axis = params.axis;
38 int inputs_count = params.inputs_count;
39 const int concat_dimensions = output_shape.DimensionsCount();
40 assert(axis < concat_dimensions);
41
42 [[maybe_unused]] int64_t concat_size = 0;
43 for (int i = 0; i < inputs_count; i++)
44 {
45 assert(input_shapes[i]->DimensionsCount() == concat_dimensions);
46 for (int j = 0; j < concat_dimensions; j++)
47 {
48 if (j != axis)
49 {
50 [[maybe_unused]] auto dim_checked = MatchingDim(*input_shapes[i], j, output_shape, j);
51 }
52 }
53 concat_size += input_shapes[i]->Dims(axis);
54 }
55 assert(concat_size == output_shape.Dims(axis));
56 int64_t outer_size = 1;
57 for (int i = 0; i < axis; ++i)
58 {
59 outer_size *= output_shape.Dims(i);
60 }
61 // For all input arrays,
62 // FlatSize() = outer_size * Dims(axis) * base_inner_size;
63 int64_t base_inner_size = 1;
64 for (int i = axis + 1; i < concat_dimensions; ++i)
65 {
66 base_inner_size *= output_shape.Dims(i);
67 }
68
69 Scalar *output_ptr = output_data;
70 for (int k = 0; k < outer_size; k++)
71 {
72 for (int i = 0; i < inputs_count; ++i)
73 {
74 const int copy_size = input_shapes[i]->Dims(axis) * base_inner_size;
75 memcpy(output_ptr, input_data[i] + k * copy_size, copy_size * sizeof(Scalar));
76 output_ptr += copy_size;
77 }
78 }
79}

References nnfw::cker::ConcatenationParams::axis, nnfw::cker::Shape::Dims(), nnfw::cker::ConcatenationParams::inputs_count, MatchingDim(), and output_shape.
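
A minimal sketch (illustrative): two 1x2 float tensors concatenated along axis 1 into a 1x4 output.

#include <cker/operation/Concatenation.h> // assumed include path

void ConcatenationSketch()
{
  nnfw::cker::ConcatenationParams params{};
  params.axis = 1;
  params.inputs_count = 2;

  nnfw::cker::Shape in_shape(2, 1);
  in_shape.SetDim(1, 2); // 1x2
  nnfw::cker::Shape out_shape(2, 1);
  out_shape.SetDim(1, 4); // 1x4

  const float in0[2] = {1.f, 2.f};
  const float in1[2] = {3.f, 4.f};
  const nnfw::cker::Shape *shapes[2] = {&in_shape, &in_shape};
  const float *inputs[2] = {in0, in1};
  float out[4] = {0.f, 0.f, 0.f, 0.f};
  nnfw::cker::Concatenation(params, shapes, inputs, out_shape, out);
  // out == {1, 2, 3, 4}
}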

◆ ConcatenationWithScaling()

void nnfw::cker::ConcatenationWithScaling ( const ConcatenationParams params,
const Shape *const *  input_shapes,
const uint8_t *const *  input_data,
const Shape output_shape,
uint8_t *  output_data 
)
inline

Definition at line 83 of file Concatenation.h.

87{
88 int axis = params.axis;
89 const int32_t *input_zeropoint = params.input_zeropoint;
90 const float *input_scale = params.input_scale;
91 int inputs_count = params.inputs_count;
92 const int32_t output_zeropoint = params.output_zeropoint;
93 const float output_scale = params.output_scale;
94
95 const int concat_dimensions = output_shape.DimensionsCount();
96 assert(axis <= concat_dimensions);
97
98 [[maybe_unused]] int64_t concat_size = 0;
99 for (int i = 0; i < inputs_count; i++)
100 {
101 assert(input_shapes[i]->DimensionsCount() == concat_dimensions);
102 for (int j = 0; j < concat_dimensions; j++)
103 {
104 if (j != axis)
105 {
106 assert(input_shapes[i]->Dims(j) == output_shape.Dims(j));
107 }
108 }
109 concat_size += input_shapes[i]->Dims(axis);
110 }
111 assert(concat_size == output_shape.Dims(axis));
112 int64_t outer_size = 1;
113 for (int i = 0; i < axis; ++i)
114 {
115 outer_size *= output_shape.Dims(i);
116 }
117 // For all input arrays,
118 // FlatSize() = outer_size * Dims(axis) * base_inner_size;
119 int64_t base_inner_size = 1;
120 for (int i = axis + 1; i < concat_dimensions; ++i)
121 {
122 base_inner_size *= output_shape.Dims(i);
123 }
124
125 const float inverse_output_scale = 1.f / output_scale;
126 uint8_t *output_ptr = output_data;
127 for (int k = 0; k < outer_size; k++)
128 {
129 for (int i = 0; i < inputs_count; ++i)
130 {
131 const int copy_size = input_shapes[i]->Dims(axis) * base_inner_size;
132 const uint8_t *input_ptr = input_data[i] + k * copy_size;
133 if (input_zeropoint[i] == output_zeropoint && input_scale[i] == output_scale)
134 {
135 memcpy(output_ptr, input_ptr, copy_size);
136 }
137 else
138 {
139 const float scale = input_scale[i] * inverse_output_scale;
140 const float bias = -input_zeropoint[i] * scale;
141 for (int j = 0; j < copy_size; ++j)
142 {
143 const int32_t value =
144 static_cast<int32_t>(std::round(input_ptr[j] * scale + bias)) + output_zeropoint;
145 output_ptr[j] = static_cast<uint8_t>(std::max(std::min(255, value), 0));
146 }
147 }
148 output_ptr += copy_size;
149 }
150 }
151}

References nnfw::cker::ConcatenationParams::axis, nnfw::cker::Shape::Dims(), nnfw::cker::ConcatenationParams::input_scale, nnfw::cker::ConcatenationParams::input_zeropoint, nnfw::cker::ConcatenationParams::inputs_count, nnfw::cker::ConcatenationParams::output_scale, output_shape, and nnfw::cker::ConcatenationParams::output_zeropoint.

Referenced by onert::backend::cpu::ops::ConcatLayer::concatenationQuant8().

◆ CopyDimsToDesc()

template<int N>
void nnfw::cker::CopyDimsToDesc ( const Shape input_shape,
NdArrayDesc< N > *  desc_out 
)
inline

Definition at line 277 of file Utils.h.

278{
279 int desc_stride = 1;
280 for (int i = N - 1; i >= 0; --i)
281 {
282 desc_out->extents[i] = input_shape.Dims(i);
283 desc_out->strides[i] = desc_stride;
284 desc_stride *= input_shape.Dims(i);
285 }
286}

References nnfw::cker::Shape::Dims(), nnfw::cker::NdArrayDesc< N >::extents, and nnfw::cker::NdArrayDesc< N >::strides.

◆ CopyMultipleTimes()

template<typename T , typename M >
void nnfw::cker::CopyMultipleTimes ( const T *  in_data,
int32_t  in_size,
M  multiplier,
T *  out_data 
)

Definition at line 29 of file Tile.h.

30{
31 for (M i = 0; i < multiplier; ++i)
32 {
33 const T *in_end = in_data + in_size;
34 T *new_out_data = std::copy(in_data, in_end, out_data);
35 in_data = out_data;
36 out_data = new_out_data;
37 }
38}

Referenced by TileOneDimension().
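
A small sketch (hypothetical buffers) of this tiling helper: after the first pass, each subsequent pass copies the block it just wrote, so the source pattern is repeated multiplier times.

int32_t src[] = {1, 2, 3};
int32_t dst[9];
nnfw::cker::CopyMultipleTimes(src, 3, 3, dst);
// dst == {1, 2, 3, 1, 2, 3, 1, 2, 3}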

◆ Cos()

void nnfw::cker::Cos ( const Shape input_shape,
const float *  input_data,
const Shape output_shape,
float *  output_data 
)
inline

Definition at line 41 of file Elementwise.h.

43{
44 const int size = MatchingFlatSize(input_shape, output_shape);
45 for (int i = 0; i < size; i++)
46 {
47 output_data[i] = std::cos(input_data[i]);
48 }
49}

References MatchingFlatSize(), output_shape, and size.

◆ CountLeadingZeros()

int nnfw::cker::CountLeadingZeros ( uint32_t  integer_input)
inline

Definition at line 152 of file Utils.h.

153{
154 const uint32_t one_in_leading_positive = 1U << 31;
155 int leading_zeros = 0;
156 while (integer_input < one_in_leading_positive)
157 {
158 integer_input <<= 1;
159 ++leading_zeros;
160 }
161 return leading_zeros;
162}

Referenced by GetInvSqrtQuantizedMultiplierExp().
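
For reference, a couple of expected results (note the loop does not terminate for an input of 0, so callers pass strictly positive values):

nnfw::cker::CountLeadingZeros(1u);          // 31
nnfw::cker::CountLeadingZeros(0x80000000u); // 0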

◆ CwiseClipping()

void nnfw::cker::CwiseClipping ( float *  vector,
const int  v_size,
const float  clipping_value 
)
inline

Definition at line 34 of file TensorUtils.h.

35{
36 NEON_OR_PORTABLE(CwiseClipping, vector, v_size, clipping_value);
37}

References CwiseClipping(), and NEON_OR_PORTABLE.

Referenced by CalculateLstmOutputFloat(), CwiseClipping(), and UpdateLstmCellFloat().

◆ DepthToSpace()

template<typename T >
void nnfw::cker::DepthToSpace ( const Shape unextended_input_shape,
const T *  input_data,
const Shape unextended_output_shape,
T *  output_data,
int32_t  block_size 
)
inline

Definition at line 30 of file DepthToSpace.h.

32{
33 assert(unextended_input_shape.DimensionsCount() <= 4);
34 assert(unextended_output_shape.DimensionsCount() <= 4);
35 const Shape input_shape = Shape::ExtendedShape(4, unextended_input_shape);
36 const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape);
37
38 const int input_depth = input_shape.Dims(3);
39 const int input_width = input_shape.Dims(2);
40 const int input_height = input_shape.Dims(1);
41
42 const int output_depth = output_shape.Dims(3);
43 const int batch_size = output_shape.Dims(0);
44
45 // Number of contiguous values that we can copy in one iteration.
46 const int stride = block_size * output_depth;
47
48 for (int batch = 0; batch < batch_size; ++batch)
49 {
50 for (int in_h = 0; in_h < input_height; ++in_h)
51 {
52 const T *input_ptr = input_data + Offset(input_shape, batch, in_h, 0, 0);
53 for (int offset_h = 0; offset_h < block_size; ++offset_h)
54 {
55 const T *src = input_ptr;
56 for (int in_w = 0; in_w < input_width; ++in_w)
57 {
58 memcpy(output_data, src, stride * sizeof(T));
59 output_data += stride;
60 src += input_depth;
61 }
62 input_ptr += stride;
63 }
64 }
65 }
66}

References nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), Offset(), and output_shape.
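
A minimal sketch (hypothetical data, GetShape() from this page): a 1x1x2x4 input with block_size 2 becomes a 1x2x4x1 output, each depth-4 pixel expanding into a 2x2 spatial block.

float in[] = {1, 2, 3, 4, 5, 6, 7, 8};
float out[8];
nnfw::cker::DepthToSpace(nnfw::cker::GetShape({1, 1, 2, 4}), in,
                         nnfw::cker::GetShape({1, 2, 4, 1}), out, /*block_size=*/2);
// out == {1, 2, 5, 6, 3, 4, 7, 8}  (output rows [1 2 5 6] and [3 4 7 8])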

◆ DepthwiseConv()

template<typename T , typename TS >
void nnfw::cker::DepthwiseConv ( const DepthwiseConvParams params,
const Shape input_shape,
const T *  input_data,
const Shape filter_shape,
const T *  filter_data,
const Shape bias_shape,
const TS *  bias_data,
const Shape output_shape,
T *  output_data,
ruy::Context *  ruy_context 
)
inline

Definition at line 124 of file DepthwiseConv.h.

128{
129 assert(input_shape.DimensionsCount() == 4);
130 assert(filter_shape.DimensionsCount() == 4);
131 assert(output_shape.DimensionsCount() == 4);
132
133 int thread_count = HowManyConvThreads(output_shape, filter_shape);
134
135 // NOTE Borrow RuyContext to get max_num_threads setting
136 // TODO Define and use max_num_threads for CPU backend
137 const auto max_threads = (ruy_context == nullptr) ? 1 : ruy_context->max_num_threads();
138
139 thread_count = std::max(1, std::min(thread_count, max_threads));
140 // Cap the number of threads to 2 for float path to avoid regression in
141 // performance (b/132294857).
142 if (std::is_floating_point<T>::value)
143 {
144 thread_count = std::min(thread_count, 2);
145 }
146
147 const int output_batches = output_shape.Dims(0);
148 const int output_height = output_shape.Dims(1);
149
150 if (thread_count == 1)
151 {
152 optimized::DepthwiseConvImpl(params, input_shape, input_data, filter_shape, filter_data,
153 bias_shape, bias_data, output_shape, output_data, 0, output_height,
154 1);
155 return;
156 }
157
158 int thread_dim, thread_dim_size;
159 if (MultithreadAlongBatches(thread_count, output_batches))
160 {
161 thread_dim = 0;
162 thread_dim_size = output_batches;
163 }
164 else
165 {
166 thread_dim = 1;
167 thread_dim_size = output_height;
168 }
169
170 std::vector<DepthwiseConvWorkerTask<T, TS>> tasks;
171 // TODO(b/131746020) don't create new heap allocations every time.
172 // At least we make it a single heap allocation by using reserve().
173 tasks.reserve(thread_count);
174 int thread_start = 0;
175 for (int i = 0; i < thread_count; ++i)
176 {
177 int thread_end = thread_start + (thread_dim_size - thread_start) / (thread_count - i);
178 tasks.emplace_back(params, input_shape, input_data, filter_shape, filter_data, bias_shape,
179 bias_data, output_shape, output_data, thread_start, thread_end, thread_dim);
180 thread_start = thread_end;
181 }
182 cpu_backend_threadpool::Execute(tasks.size(), tasks.data(), ruy_context);
183}

References nnfw::cker::optimized::DepthwiseConvImpl(), nnfw::cker::Shape::DimensionsCount(), nnfw::cker::cpu_backend_threadpool::Execute(), HowManyConvThreads(), MultithreadAlongBatches(), and output_shape.

◆ DepthwiseConvOp()

void nnfw::cker::DepthwiseConvOp ( const DepthwiseConvParams params,
const Shape input_shape,
const float *  input_data,
const Shape filter_shape,
const float *  filter_data,
const Shape bias_shape,
const float *  bias_data,
float *  padded_filter_data,
bool  pad_filter,
float *  filter_buffers_data,
const Shape output_shape,
float *  output_data 
)

Definition at line 185 of file DepthwiseConv.h.

190{
191 if (params.stride_height != params.stride_width)
192 throw std::runtime_error("Not support different length strides");
193
194 if (params.dilation_height_factor != 1 || params.dilation_width_factor != 1)
195 throw std::runtime_error{"Not support dilation other than 1."};
196
197 const int batch = MatchingDim(input_shape, 0, output_shape, 0);
198 const int input_depth = input_shape.Dims(3);
199 const int output_depth = output_shape.Dims(3);
200 const int input_height = input_shape.Dims(1);
201 const int input_width = input_shape.Dims(2);
202 const int filter_height = filter_shape.Dims(1);
203 const int filter_width = filter_shape.Dims(2);
204 const int output_height = output_shape.Dims(1);
205 const int output_width = output_shape.Dims(2);
206 const int stride = params.stride_height;
207 const int depth_multiplier = params.depth_multiplier;
208 const int pad_height = params.padding_values.height;
209 const int pad_width = params.padding_values.width;
210 const float activation_min = params.float_activation_min;
211 const float activation_max = params.float_activation_max;
212
213 depthwise_conv_op::LaunchDepthwiseConvOp<Eigen::ThreadPoolDevice, float>()(
214 batch, input_height, input_width, input_depth, filter_height, filter_width, depth_multiplier,
215 stride, pad_height, pad_width, output_height, output_width, output_depth, input_data,
216 filter_data, padded_filter_data, pad_filter, filter_buffers_data, output_data);
217
218 if (bias_data != nullptr)
219 {
220 bias_op::biasHelper<float>(bias_shape, bias_data, output_shape, output_data, activation_min,
221 activation_max);
222 }
223}

References nnfw::cker::DepthwiseConvParams::depth_multiplier, nnfw::cker::DepthwiseConvParams::dilation_height_factor, nnfw::cker::DepthwiseConvParams::dilation_width_factor, nnfw::cker::Shape::Dims(), nnfw::cker::DepthwiseConvParams::float_activation_max, nnfw::cker::DepthwiseConvParams::float_activation_min, nnfw::cker::PaddingValues::height, MatchingDim(), output_shape, nnfw::cker::DepthwiseConvParams::padding_values, nnfw::cker::DepthwiseConvParams::stride_height, nnfw::cker::DepthwiseConvParams::stride_width, and nnfw::cker::PaddingValues::width.

Referenced by onert::backend::cpu::ops::DepthwiseConvolutionLayer::convFloat32().

◆ Dequantize() [1/3]

void nnfw::cker::Dequantize ( const Shape input_shape,
const int16_t *  input_data,
const Shape output_shape,
float *  output_data,
const float  scale,
const int32_t  zero_point 
)
inline

Definition at line 115 of file Dequantize.h.

118{
119 const int flat_size = MatchingFlatSize(input_shape, output_shape);
120
121 int i = 0;
122#ifdef USE_NEON
123 const float32x4_t scale_dup = vdupq_n_f32(static_cast<float>(scale));
124 const float32x4_t zero_times_scale_dup = vdupq_n_f32(static_cast<float>(-zero_point * scale));
125 for (; i <= flat_size - 8; i += 8)
126 {
127 const int16x4_t input_s16_low = vld1_s16(input_data + i);
128 const int16x4_t input_s16_high = vld1_s16(input_data + i + 4);
129 const int32x4_t val_low = vmovl_s16(input_s16_low);
130 const int32x4_t val_high = vmovl_s16(input_s16_high);
131
132 float32x4_t result_low, result_high;
133 ScaleWithNewZeroPoint(val_low, scale_dup, zero_times_scale_dup, &result_low);
134 ScaleWithNewZeroPoint(val_high, scale_dup, zero_times_scale_dup, &result_high);
135
136 vst1q_f32(output_data + i, result_low);
137 vst1q_f32(output_data + i + 4, result_high);
138 }
139#endif // NEON
140 for (; i < flat_size; ++i)
141 {
142 const int32_t val = input_data[i];
143 const float result = static_cast<float>(scale * (val - zero_point));
144 output_data[i] = result;
145 }
146}

References MatchingFlatSize(), and output_shape.

◆ Dequantize() [2/3]

void nnfw::cker::Dequantize ( const Shape input_shape,
const int8_t *  input_data,
const Shape output_shape,
float *  output_data,
const float  scale,
const int32_t  zero_point 
)
inline

Definition at line 80 of file Dequantize.h.

83{
84 const int flat_size = MatchingFlatSize(input_shape, output_shape);
85
86 int i = 0;
87#ifdef USE_NEON
88 const float32x4_t scale_dup = vdupq_n_f32(static_cast<float>(scale));
89 const float32x4_t zero_times_scale_dup = vdupq_n_f32(static_cast<float>(-zero_point * scale));
90 for (; i <= flat_size - 8; i += 8)
91 {
92 const int8x8_t input_s8 = vld1_s8(input_data + i);
93 const int16x8_t input_s16 = vmovl_s8(input_s8);
94 const int16x4_t input_s16_low = vget_low_s16(input_s16);
95 const int16x4_t input_s16_high = vget_high_s16(input_s16);
96 const int32x4_t val_low = vmovl_s16(input_s16_low);
97 const int32x4_t val_high = vmovl_s16(input_s16_high);
98
99 float32x4_t result_low, result_high;
100 ScaleWithNewZeroPoint(val_low, scale_dup, zero_times_scale_dup, &result_low);
101 ScaleWithNewZeroPoint(val_high, scale_dup, zero_times_scale_dup, &result_high);
102
103 vst1q_f32(output_data + i, result_low);
104 vst1q_f32(output_data + i + 4, result_high);
105 }
106#endif // NEON
107 for (; i < flat_size; ++i)
108 {
109 const int32_t val = input_data[i];
110 const float result = static_cast<float>(scale * (val - zero_point));
111 output_data[i] = result;
112 }
113}

References MatchingFlatSize(), and output_shape.

◆ Dequantize() [3/3]

void nnfw::cker::Dequantize ( const Shape input_shape,
const uint8_t *  input_data,
const Shape output_shape,
float *  output_data,
const float  scale,
const int32_t  zero_point 
)
inline

Definition at line 44 of file Dequantize.h.

47{
48 const int flat_size = MatchingFlatSize(input_shape, output_shape);
49
50 int i = 0;
51#ifdef USE_NEON
52 const float32x4_t scale_dup = vdupq_n_f32(static_cast<float>(scale));
53 const float32x4_t zero_times_scale_dup = vdupq_n_f32(static_cast<float>(-zero_point * scale));
54 for (; i <= flat_size - 8; i += 8)
55 {
56 const uint8x8_t input_u8 = vld1_u8(input_data + i);
57 const uint16x8_t input_u16 = vmovl_u8(input_u8);
58 const int16x8_t input_s16 = vreinterpretq_s16_u16(input_u16);
59 const int16x4_t input_s16_low = vget_low_s16(input_s16);
60 const int16x4_t input_s16_high = vget_high_s16(input_s16);
61 const int32x4_t val_low = vmovl_s16(input_s16_low);
62 const int32x4_t val_high = vmovl_s16(input_s16_high);
63
64 float32x4_t result_low, result_high;
65 ScaleWithNewZeroPoint(val_low, scale_dup, zero_times_scale_dup, &result_low);
66 ScaleWithNewZeroPoint(val_high, scale_dup, zero_times_scale_dup, &result_high);
67
68 vst1q_f32(output_data + i, result_low);
69 vst1q_f32(output_data + i + 4, result_high);
70 }
71#endif // NEON
72 for (; i < flat_size; ++i)
73 {
74 const int32_t val = input_data[i];
75 const float result = static_cast<float>(scale * (val - zero_point));
76 output_data[i] = result;
77 }
78}

References MatchingFlatSize(), and output_shape.
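
A minimal sketch of the uint8 overload (hypothetical quantization parameters): each output is scale * (value - zero_point); the NEON and scalar paths produce the same result.

uint8_t q[] = {128, 130, 126, 255};
float f[4];
nnfw::cker::Dequantize(nnfw::cker::GetShape({4}), q, nnfw::cker::GetShape({4}), f,
                       /*scale=*/0.5f, /*zero_point=*/128);
// f == {0.0, 1.0, -1.0, 63.5}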

◆ ELU()

void nnfw::cker::ELU ( const Shape input_shape,
const float *  input_data,
const Shape output_shape,
float *  output_data 
)
inline

Definition at line 30 of file ELU.h.

32{
33 const int flat_size = MatchingFlatSize(input_shape, output_shape);
34 for (int i = 0; i < flat_size; ++i)
35 {
36 const float val = input_data[i];
37 output_data[i] = val < 0.0 ? std::exp(val) - 1 : val;
38 }
39}

References MatchingFlatSize(), and output_shape.

Referenced by onert::backend::cpu::ops::ElementwiseActivationLayer::configure().

◆ end()

◆ EqualFn()

template<typename T >
bool nnfw::cker::EqualFn ( T  lhs,
T  rhs 
)
inline

Definition at line 30 of file Comparison.h.

30{ return lhs == rhs; }

◆ Erf()

void nnfw::cker::Erf ( const Shape input_shape,
const float *  input_data,
const Shape output_shape,
float *  output_data 
)
inline

Definition at line 29 of file Erf.h.

31{
32 const int size = MatchingFlatSize(input_shape, output_shape);
33 for (int i = 0; i < size; i++)
34 {
35 output_data[i] = std::erf(input_data[i]);
36 }
37}

References MatchingFlatSize(), output_shape, and size.

◆ Exp()

void nnfw::cker::Exp ( const Shape input_shape,
const float *  input_data,
const Shape output_shape,
float *  output_data 
)
inline

Definition at line 30 of file Exp.h.

32{
33 const int size = MatchingFlatSize(input_shape, output_shape);
34 for (int i = 0; i < size; i++)
35 {
36 output_data[i] = std::exp(input_data[i]);
37 }
38}

References MatchingFlatSize(), output_shape, and size.

◆ Fill() [1/2]

template<typename T >
void nnfw::cker::Fill ( const T *  value_data,
const Shape output_shape,
T *  output_data 
)
inline

Definition at line 28 of file Fill.h.

29{
30 int output_size = output_shape.FlatSize();
31 for (int i = 0; i < output_size; i++)
32 {
33 output_data[i] = *value_data;
34 }
35}

References output_shape.

◆ Fill() [2/2]

template<typename Device , class Distribution >
void nnfw::cker::Fill ( random::PhiloxRandom  random,
Tensor *  output 
)

Definition at line 64 of file StatelessRandomUniform.h.

65{
66 // Build distribution
67 typedef typename Distribution::ResultElementType T;
68
69 auto flat = output->flat<T>();
70 // Reuse the compute kernels from the stateful random ops
71 functor::FillPhiloxRandom<Device, Distribution>()(random, flat.data(), flat.size(),
72 Distribution());
73}

◆ FlatSizeSkipDim()

int nnfw::cker::FlatSizeSkipDim ( const Shape shape,
int  skip_dim 
)
inline

Definition at line 253 of file Shape.h.

254{
255 const int dims_count = shape.DimensionsCount();
256 assert(skip_dim >= 0 && skip_dim < dims_count);
257 const auto *dims_data = shape.DimsData();
258 int flat_size = 1;
259 for (int i = 0; i < dims_count; ++i)
260 {
261 flat_size *= (i == skip_dim) ? 1 : dims_data[i];
262 }
263 return flat_size;
264}

References nnfw::cker::Shape::DimensionsCount(), and nnfw::cker::Shape::DimsData().

Referenced by nnfw::cker::optimized_integer_ops::depthwise_conv::DepthwiseConvGeneral(), nnfw::cker::optimized::depthwise_conv::DepthwiseConvGeneral(), nnfw::cker::optimized::DepthwiseConvImpl(), FullyConnected(), FullyConnectedSparseWeight16x1(), FullyConnectedSparseWeightRandom(), nnfw::cker::optimized_integer_ops::HowManyConvThreads(), MapAsMatrixWithLastDimAsRows(), MatchingFlatSizeSkipDim(), nnfw::cker::train::MSE(), and nnfw::cker::train::MSEGrad().
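
For example (hypothetical shape), skipping the middle dimension of a 2x3x4 shape:

nnfw::cker::Shape s = nnfw::cker::GetShape({2, 3, 4});
int n = nnfw::cker::FlatSizeSkipDim(s, 1); // 2 * 4 == 8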

◆ Floor()

void nnfw::cker::Floor ( const Shape input_shape,
const float *  input_data,
const Shape output_shape,
float *  output_data 
)
inline

Definition at line 90 of file Elementwise.h.

92{
93 const int flat_size = MatchingFlatSize(input_shape, output_shape);
94
95 for (int i = 0; i < flat_size; i++)
96 {
97 output_data[i] = std::floor(input_data[i]);
98 }
99}

References MatchingFlatSize(), and output_shape.

◆ FloorDivBroadcast()

template<typename T >
void nnfw::cker::FloorDivBroadcast ( const Shape unextended_input1_shape,
const T *  input1_data,
const Shape unextended_input2_shape,
const T *  input2_data,
const Shape unextended_output_shape,
T *  output_data 
)
inline

Definition at line 30 of file FloorDiv.h.

33{
34 assert(unextended_input1_shape.DimensionsCount() <= 4);
35 assert(unextended_input2_shape.DimensionsCount() <= 4);
36 assert(unextended_output_shape.DimensionsCount() <= 4);
37 const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape);
38
39 NdArrayDesc<4> desc1;
40 NdArrayDesc<4> desc2;
41 NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, unextended_input2_shape, &desc1,
42 &desc2);
43
44 for (int b = 0; b < output_shape.Dims(0); ++b)
45 {
46 for (int y = 0; y < output_shape.Dims(1); ++y)
47 {
48 for (int x = 0; x < output_shape.Dims(2); ++x)
49 {
50 for (int c = 0; c < output_shape.Dims(3); ++c)
51 {
52 auto out_idx = Offset(output_shape, b, y, x, c);
53 auto in1_idx = SubscriptToIndex(desc1, b, y, x, c);
54 auto in2_idx = SubscriptToIndex(desc2, b, y, x, c);
55 auto in1_val = input1_data[in1_idx];
56 auto in2_val = input2_data[in2_idx];
57 output_data[out_idx] = std::floor(
58 std::divides<double>()(static_cast<double>(in1_val), static_cast<double>(in2_val)));
59 }
60 }
61 }
62 }
63}

References desc1, desc2, nnfw::cker::Shape::DimensionsCount(), NdArrayDescsForElementwiseBroadcast(), Offset(), output_shape, and SubscriptToIndex().

◆ FloorDivElementwise()

template<typename T >
void nnfw::cker::FloorDivElementwise ( const Shape shape,
const T *  input1_data,
const T *  input2_data,
T *  output_data 
)
inline

Definition at line 66 of file FloorDiv.h.

68{
69
70 int num_elements = shape.FlatSize();
71
72 for (int t = 0; t < num_elements; t++)
73 {
74 output_data[t] = std::floor(std::divides<double>()(static_cast<double>(input1_data[t]),
75 static_cast<double>(input2_data[t])));
76 }
77}

References nnfw::cker::Shape::FlatSize().

◆ FloorModBroadcast()

template<typename T >
void nnfw::cker::FloorModBroadcast ( const Shape unextended_input1_shape,
const T *  input1_data,
const Shape unextended_input2_shape,
const T *  input2_data,
const Shape unextended_output_shape,
T *  output_data 
)
inline

Definition at line 34 of file FloorMod.h.

37{
38 struct FloatMod
39 {
40 float operator()(const float lhs, const float rhs) const { return std::fmod(lhs, rhs); }
41 };
42
43 using ModFunc =
44 typename std::conditional<std::is_integral<T>::value, std::modulus<T>, FloatMod>::type;
45
46 if (unextended_output_shape.DimensionsCount() > 4)
47 throw std::runtime_error(std::string("cker::FloorModBroadcast: Unsupported rank size : ") +
48 std::to_string(unextended_output_shape.DimensionsCount()));
49 const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape);
50
51 NdArrayDesc<4> desc1;
52 NdArrayDesc<4> desc2;
53 NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, unextended_input2_shape, &desc1,
54 &desc2);
55
56 for (int b = 0; b < output_shape.Dims(0); ++b)
57 {
58 for (int y = 0; y < output_shape.Dims(1); ++y)
59 {
60 for (int x = 0; x < output_shape.Dims(2); ++x)
61 {
62 for (int c = 0; c < output_shape.Dims(3); ++c)
63 {
64 auto out_idx = Offset(output_shape, b, y, x, c);
65 auto in1_idx = SubscriptToIndex(desc1, b, y, x, c);
66 auto in2_idx = SubscriptToIndex(desc2, b, y, x, c);
67 auto in1_val = input1_data[in1_idx];
68 auto in2_val = input2_data[in2_idx];
69
70 ModFunc mod_func;
71 T trunc_mod = mod_func(in1_val, in2_val);
72 output_data[out_idx] = (trunc_mod != 0) && ((in2_val < 0) != (trunc_mod < 0))
73 ? (trunc_mod + in2_val)
74 : trunc_mod;
75 }
76 }
77 }
78 }
79}

References desc1, desc2, nnfw::cker::Shape::DimensionsCount(), NdArrayDescsForElementwiseBroadcast(), Offset(), output_shape, and SubscriptToIndex().

◆ FloorModElementwise()

template<typename T >
void nnfw::cker::FloorModElementwise ( const Shape shape,
const T *  input1_data,
const T *  input2_data,
T *  output_data 
)
inline

Definition at line 82 of file FloorMod.h.

84{
85 struct FloatMod
86 {
87 float operator()(const float lhs, const float rhs) const { return std::fmod(lhs, rhs); }
88 };
89
90 using ModFunc =
91 typename std::conditional<std::is_integral<T>::value, std::modulus<T>, FloatMod>::type;
92
93 int num_elements = shape.FlatSize();
94 for (int t = 0; t < num_elements; t++)
95 {
96 ModFunc mod_func;
97 auto in1_val = input1_data[t];
98 auto in2_val = input2_data[t];
99 T trunc_mod = mod_func(in1_val, in2_val);
100 output_data[t] =
101 (trunc_mod != 0) && ((in2_val < 0) != (trunc_mod < 0)) ? (trunc_mod + in2_val) : trunc_mod;
102 }
103}

References nnfw::cker::Shape::FlatSize().
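
A minimal sketch (hypothetical data) of the floor-mod adjustment: unlike the truncated % operator, the result takes the sign of the divisor.

int32_t a[] = {7, -7, 7, -7};
int32_t b[] = {3, 3, -3, -3};
int32_t out[4];
nnfw::cker::FloorModElementwise(nnfw::cker::GetShape({4}), a, b, out);
// out == {1, 2, -2, -1}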

◆ FullyConnected() [1/2]

void nnfw::cker::FullyConnected ( const FullyConnectedParams params,
const Shape input_shape,
const float *  input_data,
const Shape weights_shape,
const float *  weights_data,
const Shape ,
const float *  bias_data,
const Shape ,
float *  output_data 
)
inline

Definition at line 98 of file FullyConnected.h.

102{
103 int total_input_size = input_shape.FlatSize();
104 int input_size = weights_shape.Dims(1);
105 const int batch_size = total_input_size / input_size;
106 const int num_units = weights_shape.Dims(0);
107
108 // Output = bias if bias tensor exists.
109 if (bias_data)
110 {
111 VectorBatchVectorAssign(bias_data, num_units, batch_size, output_data);
112 }
113 else
114 {
115 ZeroVector(output_data, batch_size * num_units);
116 }
117
118 // Compute output += weight * input
119 MatrixBatchVectorMultiplyAccumulate(weights_data, num_units, input_size, input_data, batch_size,
120 output_data, /*result_stride=*/1);
121
122 if (params.activation != FusedActivationFunctionType::kNone)
123 {
124 // Apply activation function
125 ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data);
126 }
127}

References nnfw::cker::FullyConnectedParams::activation, ApplyActivationToVector(), nnfw::cker::Shape::Dims(), nnfw::cker::Shape::FlatSize(), kNone, MatrixBatchVectorMultiplyAccumulate(), VectorBatchVectorAssign(), and ZeroVector().

Referenced by onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedFloat32(), and onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedQuant8().
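
A minimal sketch of the float overload (hypothetical data; the two unnamed Shape parameters are unused here, so default-constructed shapes are passed): weights are laid out as [num_units, input_size] and the output is W * x + b per batch.

using namespace nnfw::cker;

float x[] = {1, 2};             // 1 batch, 2 inputs
float w[] = {1, 0, 0, 1, 1, 1}; // 3 units x 2 inputs
float b[] = {0.5f, 0.5f, 0.5f};
float y[3];

FullyConnectedParams params{};
params.activation = FusedActivationFunctionType::kNone;

FullyConnected(params, GetShape({1, 2}), x, GetShape({3, 2}), w, Shape{}, b, Shape{}, y);
// y == {1.5, 2.5, 3.5}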

◆ FullyConnected() [2/2]

void nnfw::cker::FullyConnected ( const FullyConnectedParams params,
const Shape input_shape,
const uint8_t *  input_data,
const Shape filter_shape,
const uint8_t *  filter_data,
const Shape bias_shape,
const int32_t *  bias_data,
const Shape output_shape,
uint8_t *  output_data 
)
inline

Definition at line 131 of file FullyConnected.h.

136{
137 const int32_t input_offset = params.input_offset;
138 const int32_t filter_offset = params.weights_offset;
139 const int32_t output_offset = params.output_offset;
140 const int32_t output_multiplier = params.output_multiplier;
141 const int output_shift = params.output_shift;
142 const int32_t output_activation_min = params.quantized_activation_min;
143 const int32_t output_activation_max = params.quantized_activation_max;
144 assert(filter_shape.DimensionsCount() >= 2);
145 assert(output_shape.DimensionsCount() >= 1);
146
147 assert(output_activation_min <= output_activation_max);
148 // TODO(benoitjacob): This really should be:
149 // const int batches = ArraySize(output_dims, 1);
150 // but the current --variable_batch hack consists in overwriting the 3rd
151 // dimension with the runtime batch size, as we don't keep track for each
152 // array of which dimension is the batch dimension in it.
153 const int output_dim_count = output_shape.DimensionsCount();
154 const int filter_dim_count = filter_shape.DimensionsCount();
155 const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
156 const int output_depth =
157 MatchingDim(filter_shape, filter_dim_count - 2, output_shape, output_dim_count - 1);
158 const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
159 for (int b = 0; b < batches; ++b)
160 {
161 for (int out_c = 0; out_c < output_depth; ++out_c)
162 {
163 int32_t acc = 0;
164 for (int d = 0; d < accum_depth; ++d)
165 {
166 int32_t input_val = input_data[b * accum_depth + d];
167 int32_t filter_val = filter_data[out_c * accum_depth + d];
168 acc += (filter_val + filter_offset) * (input_val + input_offset);
169 }
170 if (bias_data)
171 {
172 acc += bias_data[out_c];
173 }
174 acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
175 acc += output_offset;
176 acc = std::max(acc, output_activation_min);
177 acc = std::min(acc, output_activation_max);
178 output_data[out_c + output_depth * b] = static_cast<uint8_t>(acc);
179 }
180 }
181}

References nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), FlatSizeSkipDim(), nnfw::cker::FullyConnectedParams::input_offset, MatchingDim(), MultiplyByQuantizedMultiplier(), nnfw::cker::FullyConnectedParams::output_multiplier, nnfw::cker::FullyConnectedParams::output_offset, output_shape, nnfw::cker::FullyConnectedParams::output_shift, nnfw::cker::FullyConnectedParams::quantized_activation_max, nnfw::cker::FullyConnectedParams::quantized_activation_min, and nnfw::cker::FullyConnectedParams::weights_offset.

◆ FullyConnectedHybrid()

void nnfw::cker::FullyConnectedHybrid ( const FullyConnectedParams params,
const Shape input_shape,
const float *  input_data,
const Shape filter_shape,
const int8_t *  filter_data,
const Shape ,
const float *  bias_data,
const Shape output_shape,
float *  output_data,
FCTempArena temp_arena,
ruy::Context *  ruy_context 
)
inline

Definition at line 183 of file FullyConnected.h.

189{
190 int total_input_size = input_shape.FlatSize();
191 const int input_size = filter_shape.Dims(1);
192 const int batch_size = total_input_size / input_size;
193 const int num_units = filter_shape.Dims(0);
194
195 // Output = bias if bias tensor exists.
196 if (bias_data)
197 {
198 VectorBatchVectorAssign(bias_data, num_units, batch_size, output_data);
199 }
200 else
201 {
202 ZeroVector(output_data, batch_size * num_units);
203 }
204
205 // Save matrix multiplication computation for all zero input.
206 if (IsZeroVector(input_data, total_input_size))
207 {
208 ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data);
209 return;
210 }
211
212 // Quantize input from float to uint8 + quantization params (scaling factor).
213 float unused_min, unused_max;
214 float *scaling_factors_ptr = temp_arena.scaling_factors.data();
215 int8_t *quant_data = temp_arena.input_quantized.data();
216
217 // Quantize each batch independently.
218 for (int b = 0; b < batch_size; ++b)
219 {
220 const int offset = b * input_size;
221 SymmetricQuantizeFloats(input_data + offset, input_size, quant_data + offset, &unused_min,
222 &unused_max, &scaling_factors_ptr[b]);
223 // Incorporate scaling of the filter.
224 scaling_factors_ptr[b] *= params.weights_scale;
225 }
226
227// Compute output += weight * quantized_input
228#ifdef USE_RUY_GEMV
229 auto output_size = output_shape.FlatSize();
230 temp_arena.accum_scratch.resize(output_size);
231 int32_t *scratch = temp_arena.accum_scratch.data();
232 MatrixBatchVectorMultiplyAccumulate(filter_data, num_units, input_size, quant_data,
233 scaling_factors_ptr, batch_size, scratch, output_data,
234 /*result_stride=*/1, ruy_context);
235#else
236 MatrixBatchVectorMultiplyAccumulate(filter_data, num_units, input_size, quant_data,
237 scaling_factors_ptr, batch_size, output_data,
238 /*result_stride=*/1);
239#endif
240
241 // Apply activation function to floats.
242 if (params.activation != FusedActivationFunctionType::kNone)
243 {
244 // Apply activation function
245 ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data);
246 }
247 return;
248}

References nnfw::cker::FCTempArena::accum_scratch, nnfw::cker::FullyConnectedParams::activation, ApplyActivationToVector(), nnfw::cker::Shape::Dims(), nnfw::cker::Shape::FlatSize(), nnfw::cker::FCTempArena::input_quantized, IsZeroVector(), kNone, MatrixBatchVectorMultiplyAccumulate(), offset(), output_shape, nnfw::cker::FCTempArena::scaling_factors, SymmetricQuantizeFloats(), VectorBatchVectorAssign(), nnfw::cker::FullyConnectedParams::weights_scale, and ZeroVector().

Referenced by onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedHybrid().

◆ FullyConnectedSparseWeight16x1()

void nnfw::cker::FullyConnectedSparseWeight16x1 ( const FullyConnectedParams params,
const Shape input_shape,
const float *  input_data,
const Shape weights_shape,
const float *  weights_data,
const Shape bias_shape,
const float *  bias_data,
const Shape output_shape,
float *  output_data,
const uint16_t *  w1_segments,
const uint16_t *  w1_indices 
)
inline

Definition at line 57 of file FullyConnectedSparse16x1.h.

62{
63 assert(weights_shape.DimensionsCount() == 2);
64 assert(output_shape.DimensionsCount() == 2);
65
66 const int output_dims_count = output_shape.DimensionsCount();
67 const int weights_dims_count = weights_shape.DimensionsCount();
68 const int batches = FlatSizeSkipDim(output_shape, output_dims_count - 1);
69 const int output_depth =
70 MatchingDim(weights_shape, weights_dims_count - 2, output_shape, output_dims_count - 1);
71 const int accum_depth = weights_shape.Dims(weights_dims_count - 1);
72
73 if (bias_data)
74 {
75 VectorBatchVectorAssign(bias_data, output_depth, batches, output_data);
76 }
77 else
78 {
79 ZeroVector(output_data, batches * output_depth);
80 }
81 for (int b = 0; b < batches; ++b)
82 {
83 int depth_size = output_depth / 16;
84 for (int idx_0 = 0; idx_0 < depth_size; ++idx_0)
85#ifdef USE_NEON
86 {
87 float *__restrict y;
88 y = &output_data[b * output_depth + idx_0 * 16];
89 /* keep y[0..15] in registers for duration of inner loop */
90 float32x4_t y0_3 = vld1q_f32(&y[0]);
91 float32x4_t y4_7 = vld1q_f32(&y[4]);
92 float32x4_t y8_11 = vld1q_f32(&y[8]);
93 float32x4_t y12_15 = vld1q_f32(&y[12]);
94 for (auto pw1 = w1_segments[idx_0]; pw1 < w1_segments[idx_0 + 1]; ++pw1)
95 {
96 auto idx_1 = w1_indices[pw1];
97 float32x4_t xj = vld1q_dup_f32(&input_data[b * accum_depth + idx_1]);
98 float32x4_t wvec;
99
100 wvec = vld1q_f32(&weights_data[0]);
101 y0_3 = vmlaq_f32(y0_3, wvec, xj);
102 wvec = vld1q_f32(&weights_data[4]);
103 y4_7 = vmlaq_f32(y4_7, wvec, xj);
104 wvec = vld1q_f32(&weights_data[8]);
105 y8_11 = vmlaq_f32(y8_11, wvec, xj);
106 wvec = vld1q_f32(&weights_data[12]);
107 y12_15 = vmlaq_f32(y12_15, wvec, xj);
108
109 weights_data += 16;
110 }
111 /* save y[0..15] back to memory */
112 vst1q_f32(&y[0], y0_3);
113 vst1q_f32(&y[4], y4_7);
114 vst1q_f32(&y[8], y8_11);
115 vst1q_f32(&y[12], y12_15);
116 }
117#else
118 {
119 for (auto pw1 = w1_segments[idx_0]; pw1 < w1_segments[idx_0 + 1]; ++pw1)
120 {
121 float *__restrict y;
122 float xj;
123 auto idx_1 = w1_indices[pw1];
124 xj = input_data[b * accum_depth + idx_1];
125 y = &output_data[b * output_depth + idx_0 * 16];
126 y[0] += weights_data[0] * xj;
127 y[1] += weights_data[1] * xj;
128 y[2] += weights_data[2] * xj;
129 y[3] += weights_data[3] * xj;
130 y[4] += weights_data[4] * xj;
131 y[5] += weights_data[5] * xj;
132 y[6] += weights_data[6] * xj;
133 y[7] += weights_data[7] * xj;
134 y[8] += weights_data[8] * xj;
135 y[9] += weights_data[9] * xj;
136 y[10] += weights_data[10] * xj;
137 y[11] += weights_data[11] * xj;
138 y[12] += weights_data[12] * xj;
139 y[13] += weights_data[13] * xj;
140 y[14] += weights_data[14] * xj;
141 y[15] += weights_data[15] * xj;
142 weights_data += 16;
143 }
144 }
145#endif
146 }
147 if (params.activation != FusedActivationFunctionType::kNone)
148 {
149 // Apply activation function
150 ApplyActivationToVector(output_data, batches * output_depth, params.activation, output_data);
151 }
152}

References nnfw::cker::FullyConnectedParams::activation, ApplyActivationToVector(), nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), FlatSizeSkipDim(), kNone, MatchingDim(), output_shape, VectorBatchVectorAssign(), and ZeroVector().

Referenced by onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedSparseWeight().

◆ FullyConnectedSparseWeightRandom()

void nnfw::cker::FullyConnectedSparseWeightRandom ( const FullyConnectedParams params,
const Shape input_shape,
const float *  input_data,
const Shape weights_shape,
const float *  weights_data,
const Shape bias_shape,
const float *  bias_data,
const Shape output_shape,
float *  output_data,
const uint16_t *  w1_segments,
const uint16_t *  w1_indices 
)
inline

Definition at line 250 of file FullyConnected.h.

255{
256
257 assert(weights_shape.DimensionsCount() == 2);
258 assert(output_shape.DimensionsCount() == 2);
259
260 const int output_dims_count = output_shape.DimensionsCount();
261 const int weights_dims_count = weights_shape.DimensionsCount();
262 const int batches = FlatSizeSkipDim(output_shape, output_dims_count - 1);
263 const int output_depth =
264 MatchingDim(weights_shape, weights_dims_count - 2, output_shape, output_dims_count - 1);
265 const int accum_depth = weights_shape.Dims(weights_dims_count - 1);
266
267 if (bias_data)
268 {
269 VectorBatchVectorAssign(bias_data, output_depth, batches, output_data);
270 }
271 else
272 {
273 ZeroVector(output_data, batches * output_depth);
274 }
275 for (int b = 0; b < batches; ++b)
276 {
277 for (int idx_0 = 0; idx_0 < output_depth; ++idx_0)
278 {
279 for (int pw1 = w1_segments[idx_0]; pw1 < w1_segments[idx_0 + 1]; ++pw1)
280 {
281 int idx_1 = w1_indices[pw1];
282 output_data[b * output_depth + idx_0] +=
283 weights_data[pw1] * input_data[b * accum_depth + idx_1];
284 }
285 }
286 }
287 if (params.activation != FusedActivationFunctionType::kNone)
288 {
289 // Apply activation function
290 ApplyActivationToVector(output_data, batches * output_depth, params.activation, output_data);
291 }
292}

References nnfw::cker::FullyConnectedParams::activation, ApplyActivationToVector(), nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), FlatSizeSkipDim(), kNone, MatchingDim(), output_shape, VectorBatchVectorAssign(), and ZeroVector().

Referenced by onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedSparseWeight().

◆ Gather()

template<typename T , typename CoordsT = int32_t>
void nnfw::cker::Gather ( const GatherParams op_params,
const Shape input_shape,
const T *  input_data,
const Shape coords_shape,
const CoordsT *  coords_data,
const Shape ,
T *  output_data 
)
inline

Definition at line 31 of file Gather.h.

34{
35 int axis = op_params.axis;
36 if (axis < 0)
37 {
38 axis += input_shape.DimensionsCount();
39 }
40 assert(axis >= 0);
41 assert(axis < input_shape.DimensionsCount());
42 const int axis_size = input_shape.Dims(axis);
43 const int coords_count = coords_shape.FlatSize();
44
45 int outer_size = 1;
46 for (int i = 0; i < axis; ++i)
47 {
48 outer_size *= input_shape.Dims(i);
49 }
50
51 int inner_size = 1;
52 for (int i = axis + 1; i < input_shape.DimensionsCount(); ++i)
53 {
54 inner_size *= input_shape.Dims(i);
55 }
56
57 for (int outer = 0; outer < outer_size; ++outer)
58 {
59 for (int i = 0; i < coords_count; ++i)
60 {
61 assert(coords_data[i] >= 0);
62 assert(coords_data[i] < axis_size);
63 std::memcpy(output_data + (outer * coords_count + i) * inner_size,
64 input_data + (outer * axis_size + coords_data[i]) * inner_size,
65 sizeof(T) * inner_size);
66 }
67 }
68}

References nnfw::cker::GatherParams::axis, nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), and nnfw::cker::Shape::FlatSize().
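
A minimal sketch (hypothetical data): gathering rows 2 and 0 of a {3, 2} tensor along axis 0; the unused Shape parameter is passed as a default-constructed shape.

using namespace nnfw::cker;

float data[] = {10, 20, 30, 40, 50, 60}; // shape {3, 2}
int32_t coords[] = {2, 0};
float out[4];

GatherParams op{};
op.axis = 0;

Gather(op, GetShape({3, 2}), data, GetShape({2}), coords, Shape{}, out);
// out == {50, 60, 10, 20}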

◆ GenerateKey()

void nnfw::cker::GenerateKey ( Tensor  seed,
random::PhiloxRandom::Key *  out_key,
random::PhiloxRandom::ResultType *  out_counter 
)

Definition at line 37 of file StatelessRandomUniform.h.

39{
40 // Grab the two seeds
41 uint32_t seed0;
42 uint32_t seed1;
43
44 const auto seed_vals = seed.flat<int32_t>();
45
46 seed0 = seed_vals(0);
47 seed1 = seed_vals(1);
48 // Scramble the seeds so that the user doesn't need to worry about which
49 // part of the seed needs to be strong.
50 (*out_key)[0] = 0x3ec8f720;
51 (*out_key)[1] = 0x02461e29;
52 (*out_counter)[0] = static_cast<uint32_t>(seed0);
53 (*out_counter)[1] = (*out_counter)[3] = 0;
54 (*out_counter)[2] = static_cast<uint32_t>(seed1);
55 const auto mix = random::PhiloxRandom(*out_counter, *out_key)();
56 (*out_key)[0] = mix[0];
57 (*out_key)[1] = mix[1];
58 (*out_counter)[0] = (*out_counter)[1] = 0;
59 (*out_counter)[2] = mix[2];
60 (*out_counter)[3] = mix[3];
61}

References nnfw::cker::Tensor::flat().

Referenced by StatelessRandomUniform().

◆ GetIndexRange()

void nnfw::cker::GetIndexRange ( int  spatial_index_dim,
int  block_shape_dim,
int  input_dim,
int  output_dim,
int *  start_index,
int *  end_index 
)
inline

Definition at line 37 of file BatchToSpaceND.h.

39{
40 // (*start_index) * block_shape_dim is effectively rounded up to the next
41 // multiple of block_shape_dim by the integer division.
42 *start_index = std::max(0, (-spatial_index_dim + block_shape_dim - 1) / block_shape_dim);
43 // Similarly, (*end_index) * block_shape_dim is rounded up too (note that
44 // end_index is exclusive).
45 *end_index =
46 std::min(input_dim, (output_dim - spatial_index_dim + block_shape_dim - 1) / block_shape_dim);
47}

Referenced by BatchToSpaceND().

◆ GetInvSqrtQuantizedMultiplierExp()

void nnfw::cker::GetInvSqrtQuantizedMultiplierExp ( int32_t  input,
int  reverse_shift,
int32_t *  output_inv_sqrt,
int *  output_shift 
)
inline

Definition at line 164 of file Utils.h.

166{
167 assert(input >= 0);
168 if (input <= 1)
169 {
170 // Handle the input value 1 separately to avoid overflow in that case
171 // in the general computation below (b/143972021). Also handle 0 as if it
172 // were a 1. 0 is an invalid input here (divide by zero) and 1 is a valid
173 // but rare/unrealistic input value. We can expect both to occur in some
174 // incompletely trained models, but probably not in fully trained models.
175 *output_inv_sqrt = std::numeric_limits<std::int32_t>::max();
176 *output_shift = 0;
177 return;
178 }
179 assert(input > 1);
180 *output_shift = 11;
181 while (input >= (1 << 29))
182 {
183 input /= 4;
184 ++*output_shift;
185 }
186 const unsigned max_left_shift_bits = CountLeadingZeros(static_cast<uint32_t>(input)) - 1;
187 const unsigned max_left_shift_bit_pairs = max_left_shift_bits / 2;
188 const unsigned left_shift_bit_pairs = max_left_shift_bit_pairs - 1;
189 *output_shift -= left_shift_bit_pairs;
190 input <<= 2 * left_shift_bit_pairs;
191 assert(input >= (1 << 27));
192 assert(input < (1 << 29));
193 using gemmlowp::FixedPoint;
194 using gemmlowp::Rescale;
195 using gemmlowp::SaturatingRoundingMultiplyByPOT;
196 // Using 3 integer bits gives us enough room for the internal arithmetic in
197 // this Newton-Raphson iteration.
198 using F3 = FixedPoint<int32_t, 3>;
199 using F0 = FixedPoint<int32_t, 0>;
200 const F3 fixedpoint_input = F3::FromRaw(input >> 1);
201 const F3 fixedpoint_half_input = SaturatingRoundingMultiplyByPOT<-1>(fixedpoint_input);
202 const F3 fixedpoint_half_three =
203 GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F3, (1 << 28) + (1 << 27), 1.5);
204 // Newton-Raphson iteration
205 // Naive unoptimized starting guess: x = 1
206 F3 x = F3::One();
207 // Naive unoptimized number of iterations: 5
208 for (int i = 0; i < 5; i++)
209 {
210 const F3 x3 = Rescale<3>(x * x * x);
211 x = Rescale<3>(fixedpoint_half_three * x - fixedpoint_half_input * x3);
212 }
213 const F0 fixedpoint_half_sqrt_2 =
214 GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F0, 1518500250, std::sqrt(2.) / 2.);
215 x = x * fixedpoint_half_sqrt_2;
216 *output_inv_sqrt = x.raw();
217 if (*output_shift < 0)
218 {
219 *output_inv_sqrt <<= -*output_shift;
220 *output_shift = 0;
221 }
222 // Convert right shift (right is positive) to left shift.
223 *output_shift *= reverse_shift;
224}

References CountLeadingZeros().

Referenced by L2NormalizeQuant8().

◆ GetShape()

Shape nnfw::cker::GetShape ( const std::vector< int32_t > &  data)
inline

Definition at line 235 of file Shape.h.

235{ return Shape(data.size(), data.data()); }

◆ GetSize()

template<typename T >
int nnfw::cker::GetSize ( T  start,
T  limit,
T  delta 
)
inline

Definition at line 30 of file Range.h.

31{
32 if (!((start > limit && delta < 0) || (start < limit && delta > 0)))
33 {
34 throw std::runtime_error("Range: invalid input values");
35 }
36
37 int size = (std::is_integral<T>::value
38 ? ((std::abs(limit - start) + std::abs(delta) - 1) / std::abs(delta))
39 : std::ceil(std::abs((limit - start) / delta)));
40 return size;
41}

References size.
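
Two worked examples (hypothetical ranges), one integral and one floating-point:

nnfw::cker::GetSize(0, 10, 3);          // 4 elements: {0, 3, 6, 9}
nnfw::cker::GetSize(5.0f, 1.0f, -2.0f); // 2 elements: {5.0, 3.0}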

◆ GreaterEqualFn()

template<typename T >
bool nnfw::cker::GreaterEqualFn ( T  lhs,
T  rhs 
)
inline

Definition at line 33 of file Comparison.h.

33{ return lhs >= rhs; }

◆ GreaterFn()

template<typename T >
bool nnfw::cker::GreaterFn ( T  lhs,
T  rhs 
)
inline

Definition at line 32 of file Comparison.h.

32{ return lhs > rhs; }

◆ HowManyConvThreads()

int nnfw::cker::HowManyConvThreads ( const Shape output_shape,
const Shape filter_shape 
)
inline

Definition at line 81 of file DepthwiseConv.h.

82{
83 // How many scalar multiplications are needed to make it worth using one
84 // more thread
85 static constexpr int kMinMulPerThread = 1 << 13; // 8k
86 const int filter_height = filter_shape.Dims(1);
87 const int filter_width = filter_shape.Dims(2);
88 const int num_muls = output_shape.FlatSize() * filter_height * filter_width;
89 // Try to avoid real runtime divisions if possible by dividing by a
90 // compile-time constant.
91 int thread_count = std::max(1, num_muls / kMinMulPerThread);
92 return thread_count;
93}

References nnfw::cker::Shape::Dims(), and output_shape.

Referenced by DepthwiseConv().

◆ InitTensorDataForReduce()

template<typename T >
bool nnfw::cker::InitTensorDataForReduce ( const Shape shape,
const T  init_value,
T *  data 
)
inline

Definition at line 208 of file Reduce.h.

209{
210 const auto dims = shape.DimsData();
211 const auto num_dims = shape.DimensionsCount();
212 size_t num_elements = 1;
213 for (int idx = 0; idx < num_dims; ++idx)
214 {
215 size_t current = static_cast<size_t>(dims[idx]);
216 // Overflow prevention.
217 if (num_elements > std::numeric_limits<size_t>::max() / current)
218 {
219 return false;
220 }
221 num_elements *= current;
222 }
223 for (size_t idx = 0; idx < num_elements; ++idx)
224 {
225 data[idx] = init_value;
226 }
227 return true;
228}

References nnfw::cker::Shape::DimensionsCount(), and nnfw::cker::Shape::DimsData().

Referenced by nnfw::cker::ReduceMean::PrepareforReduce(), and nnfw::cker::Reduce::ReduceGeneric().

◆ InstanceNorm()

void nnfw::cker::InstanceNorm ( const InstanceNormParams params,
const Shape input_shape,
const float *  input_data,
const Shape gamma_shape,
const float *  gamma_data,
const Shape beta_shape,
const float *  beta_data,
const Shape output_shape,
float *  output_data 
)
inline

Definition at line 31 of file InstanceNorm.h.

35{
36 const int32_t batches = MatchingDim(input_shape, 0, output_shape, 0);
37 const int32_t heights = MatchingDim(input_shape, 1, output_shape, 1);
38 const int32_t widths = MatchingDim(input_shape, 2, output_shape, 2);
39 const int32_t channels = MatchingDim(input_shape, 3, output_shape, 3);
40 const float output_activation_min = params.float_activation_min;
41 const float output_activation_max = params.float_activation_max;
42
43 assert(output_activation_min <= output_activation_max);
44
45 for (int32_t batch = 0; batch < batches; batch++)
46 {
47 for (int32_t channel = 0; channel < channels; channel++)
48 {
49 double sum = 0.0f;
50 double square_sum = 0.0f;
51 int32_t size = heights * widths;
52
53 for (int32_t height = 0; height < heights; height++)
54 {
55 for (int32_t width = 0; width < widths; width++)
56 {
57 double input_val = input_data[Offset(input_shape, batch, height, width, channel)];
58 sum += input_val;
59 square_sum += (input_val * input_val);
60 }
61 }
62
63 double mean = sum / size;
64 double var = square_sum / size - mean * mean;
65
66 double gamma = gamma_data[channel];
67 double beta = beta_data[channel];
68
69 double a = gamma / (std::sqrt(var + params.epsilon));
70 double b = -mean * a + beta;
71
72 for (int32_t height = 0; height < heights; height++)
73 {
74 for (int32_t width = 0; width < widths; width++)
75 {
76 double input_value = input_data[Offset(output_shape, batch, height, width, channel)];
77 double output_value = input_value * a + b;
78 output_data[Offset(output_shape, batch, height, width, channel)] =
79 ActivationFunctionWithMinMax((float)output_value, output_activation_min,
80 output_activation_max);
81 }
82 }
83 }
84 }
85}

References ActivationFunctionWithMinMax(), nnfw::cker::InstanceNormParams::epsilon, nnfw::cker::InstanceNormParams::float_activation_max, nnfw::cker::InstanceNormParams::float_activation_min, MatchingDim(), Offset(), output_shape, and size.

◆ IsZeroVector()

bool nnfw::cker::IsZeroVector ( const float *  vector,
int  v_size 
)
inline

◆ L2NormalizeFloat32()

void nnfw::cker::L2NormalizeFloat32 ( const Shape input_shape,
const float *  input_data,
const Shape output_shape,
float *  output_data 
)

Definition at line 30 of file L2Normalize.h.

32{
33 float epsilon = 1e-6;
34 const int trailing_dim = input_shape.DimensionsCount() - 1;
35 const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
36 const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
37 for (int i = 0; i < outer_size; ++i)
38 {
39 float squared_l2_norm = 0;
40 for (int c = 0; c < depth; ++c)
41 {
42 const float val = input_data[c];
43 squared_l2_norm += val * val;
44 }
45 float l2_norm = std::sqrt(squared_l2_norm);
46 l2_norm = std::max(l2_norm, epsilon);
47 for (int c = 0; c < depth; ++c)
48 {
49 *output_data = *input_data / l2_norm;
50 ++output_data;
51 ++input_data;
52 }
53 }
54}

References nnfw::cker::Shape::DimensionsCount(), MatchingDim(), MatchingFlatSizeSkipDim(), and output_shape.

Referenced by onert::backend::cpu::ops::L2NormLayer::run().
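
A minimal sketch (hypothetical data): each slice along the trailing dimension is divided by its L2 norm, clamped below by 1e-6.

float in[] = {3, 4};
float out[2];
nnfw::cker::L2NormalizeFloat32(nnfw::cker::GetShape({1, 2}), in,
                               nnfw::cker::GetShape({1, 2}), out);
// out == {0.6, 0.8}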

◆ L2NormalizeQuant8()

void nnfw::cker::L2NormalizeQuant8 ( L2NormParams params,
const Shape input_shape,
const uint8_t *  input_data,
const Shape output_shape,
uint8_t *  output_data 
)

Definition at line 56 of file L2Normalize.h.

58{
59 const int trailing_dim = input_shape.DimensionsCount() - 1;
60 const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
61 const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
62 const int32_t input_zero_point = params.input_zero_point;
63
64 for (int i = 0; i < outer_size; ++i)
65 {
66 int32_t square_l2_norm = 0;
67 for (int c = 0; c < depth; c++)
68 {
69 // Note that input_data advances by depth in the second pass below.
70 int32_t diff = input_data[c] - input_zero_point;
71 square_l2_norm += diff * diff;
72 }
73 int32_t inv_l2norm_multiplier;
74 int inv_l2norm_shift;
75 GetInvSqrtQuantizedMultiplierExp(square_l2_norm, -1, &inv_l2norm_multiplier, &inv_l2norm_shift);
76 for (int c = 0; c < depth; c++)
77 {
78 int32_t diff = *input_data - input_zero_point;
79 int32_t rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOneExp(
80 128 * diff, inv_l2norm_multiplier, inv_l2norm_shift);
81 int32_t unclamped_output_val = 128 + rescaled_diff;
82 int32_t output_val = std::min(static_cast<int32_t>(255),
83 std::max(static_cast<int32_t>(0), unclamped_output_val));
84 *output_data = static_cast<uint8_t>(output_val);
85 ++input_data;
86 ++output_data;
87 }
88 }
89}

References nnfw::cker::Shape::DimensionsCount(), GetInvSqrtQuantizedMultiplierExp(), nnfw::cker::L2NormParams::input_zero_point, MatchingDim(), MatchingFlatSizeSkipDim(), MultiplyByQuantizedMultiplierSmallerThanOneExp(), and output_shape.

Referenced by onert::backend::cpu::ops::L2NormLayer::run().

◆ LeakyReLU()

void nnfw::cker::LeakyReLU ( const LeakyReluParams params,
const Shape input_shape,
const float *  input_data,
const Shape output_shape,
float *  output_data 
)
inline

Definition at line 31 of file LeakyReLU.h.

33{
34 const int flat_size = MatchingFlatSize(input_shape, output_shape);
35
36 for (int i = 0; i < flat_size; i++)
37 {
38 const float val = input_data[i];
39 // Note that alpha might be > 1 or < 0, so we don't use std::max here.
40 output_data[i] = val > 0 ? val : val * params.alpha;
41 }
42}

References nnfw::cker::LeakyReluParams::alpha, MatchingFlatSize(), and output_shape.

Referenced by onert::backend::cpu::ops::ElementwiseActivationLayer::configure().
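
A minimal sketch (hypothetical alpha): negative inputs are scaled by params.alpha, positive inputs pass through unchanged.

nnfw::cker::LeakyReluParams p{};
p.alpha = 0.1f;
float in[] = {-2.0f, 3.0f};
float out[2];
nnfw::cker::LeakyReLU(p, nnfw::cker::GetShape({2}), in, nnfw::cker::GetShape({2}), out);
// out == {-0.2, 3.0}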

◆ LessEqualFn()

template<typename T >
bool nnfw::cker::LessEqualFn ( T  lhs,
T  rhs 
)
inline

Definition at line 35 of file Comparison.h.

35{ return lhs <= rhs; }

◆ LessFn()

template<typename T >
bool nnfw::cker::LessFn ( T  lhs,
T  rhs 
)
inline

Definition at line 34 of file Comparison.h.

34{ return lhs < rhs; }

◆ Log()

void nnfw::cker::Log ( const Shape input_shape,
const float *  input_data,
const Shape output_shape,
float *  output_data 
)
inline

Definition at line 80 of file Elementwise.h.

82{
83 const int size = MatchingFlatSize(input_shape, output_shape);
84 for (int i = 0; i < size; i++)
85 {
86 output_data[i] = std::log(input_data[i]);
87 }
88}

References MatchingFlatSize(), output_shape, and size.

◆ LogicalAndBroadcast()

template<typename T >
void nnfw::cker::LogicalAndBroadcast ( const Shape unextended_input1_shape,
const T *  input1_data,
const Shape unextended_input2_shape,
const T *  input2_data,
const Shape unextended_output_shape,
T *  output_data 
)
inline

Definition at line 30 of file LogicalAnd.h.

33{
34 assert(unextended_input1_shape.DimensionsCount() <= 4);
35 assert(unextended_input2_shape.DimensionsCount() <= 4);
36 assert(unextended_output_shape.DimensionsCount() <= 4);
37 const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape);
38
39 NdArrayDesc<4> desc1;
40 NdArrayDesc<4> desc2;
41 NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, unextended_input2_shape, &desc1,
42 &desc2);
43
44 for (int b = 0; b < output_shape.Dims(0); ++b)
45 {
46 for (int y = 0; y < output_shape.Dims(1); ++y)
47 {
48 for (int x = 0; x < output_shape.Dims(2); ++x)
49 {
50 for (int c = 0; c < output_shape.Dims(3); ++c)
51 {
52 auto out_idx = Offset(output_shape, b, y, x, c);
53 auto in1_idx = SubscriptToIndex(desc1, b, y, x, c);
54 auto in2_idx = SubscriptToIndex(desc2, b, y, x, c);
55 auto in1_val = input1_data[in1_idx];
56 auto in2_val = input2_data[in2_idx];
57 output_data[out_idx] = in1_val && in2_val;
58 }
59 }
60 }
61 }
62}

References desc1, desc2, nnfw::cker::Shape::DimensionsCount(), NdArrayDescsForElementwiseBroadcast(), Offset(), output_shape, and SubscriptToIndex().

◆ LogicalAndElementwise()

template<typename T >
void nnfw::cker::LogicalAndElementwise ( const Shape shape,
const T *  input1_data,
const T *  input2_data,
T *  output_data 
)
inline

Definition at line 65 of file LogicalAnd.h.

67{
68
69 int num_elements = shape.FlatSize();
70
71 for (int t = 0; t < num_elements; t++)
72 {
73 output_data[t] = input1_data[t] && input2_data[t];
74 }
75}

References nnfw::cker::Shape::FlatSize().

◆ LogicalNot()

void nnfw::cker::LogicalNot ( const Shape input_shape,
const bool *  input_data,
const Shape output_shape,
bool *  output_data 
)
inline

Definition at line 28 of file LogicalNot.h.

30{
31 const int size = MatchingFlatSize(input_shape, output_shape);
32 for (int i = 0; i < size; i++)
33 {
34 output_data[i] = !input_data[i];
35 }
36}

References MatchingFlatSize(), output_shape, and size.

◆ LogicalOrBroadcast()

template<typename T >
void nnfw::cker::LogicalOrBroadcast ( const Shape unextended_input1_shape,
const T *  input1_data,
const Shape unextended_input2_shape,
const T *  input2_data,
const Shape unextended_output_shape,
T *  output_data 
)
inline

Definition at line 30 of file LogicalOr.h.

33{
34 assert(unextended_input1_shape.DimensionsCount() <= 4);
35 assert(unextended_input2_shape.DimensionsCount() <= 4);
36 assert(unextended_output_shape.DimensionsCount() <= 4);
37 const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape);
38
39 NdArrayDesc<4> desc1;
40 NdArrayDesc<4> desc2;
41 NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, unextended_input2_shape, &desc1,
42 &desc2);
43
44 for (int b = 0; b < output_shape.Dims(0); ++b)
45 {
46 for (int y = 0; y < output_shape.Dims(1); ++y)
47 {
48 for (int x = 0; x < output_shape.Dims(2); ++x)
49 {
50 for (int c = 0; c < output_shape.Dims(3); ++c)
51 {
52 auto out_idx = Offset(output_shape, b, y, x, c);
53 auto in1_idx = SubscriptToIndex(desc1, b, y, x, c);
54 auto in2_idx = SubscriptToIndex(desc2, b, y, x, c);
55 auto in1_val = input1_data[in1_idx];
56 auto in2_val = input2_data[in2_idx];
57 output_data[out_idx] = in1_val || in2_val;
58 }
59 }
60 }
61 }
62}

References desc1, desc2, nnfw::cker::Shape::DimensionsCount(), NdArrayDescsForElementwiseBroadcast(), Offset(), output_shape, and SubscriptToIndex().

◆ LogicalOrElementwise()

template<typename T >
void nnfw::cker::LogicalOrElementwise ( const Shape shape,
const T *  input1_data,
const T *  input2_data,
T *  output_data 
)
inline

Definition at line 65 of file LogicalOr.h.

67{
68
69 int num_elements = shape.FlatSize();
70
71 for (int t = 0; t < num_elements; t++)
72 {
73 output_data[t] = input1_data[t] || input2_data[t];
74 }
75}

References nnfw::cker::Shape::FlatSize().

◆ Logistic()

void nnfw::cker::Logistic ( const Shape input_shape,
const float *  input_data,
const Shape output_shape,
float *  output_data 
)
inline

Definition at line 32 of file Logistic.h.

34{
35 auto input_map = MapAsVector(input_data, input_shape);
36 auto output_map = MapAsVector(output_data, output_shape);
37
38 output_map.array() = input_map.array().unaryExpr(Eigen::internal::scalar_logistic_op<float>());
39}

References MapAsVector(), and output_shape.

Referenced by onert::backend::cpu::ops::ElementwiseActivationLayer::configure().

◆ LogSoftmax() [1/2]

void nnfw::cker::LogSoftmax ( const SoftmaxParams params,
const Shape input_shape,
const float *  input_data,
const Shape output_shape,
float *  output_data 
)
inline

Definition at line 34 of file LogSoftMax.h.

36{
37 const int rank = input_shape.DimensionsCount();
38 const int axis = (params.axis < 0) ? params.axis + rank : params.axis;
39 const double beta = params.beta;
40 const int depth = MatchingDim(input_shape, axis, output_shape, axis);
41
42 int outer_size = 1;
43 for (int i = 0; i < axis; ++i)
44 {
45 outer_size *= input_shape.Dims(i);
46 }
47
48 int inner_size = 1;
49 for (int i = axis + 1; i < rank; ++i)
50 {
51 inner_size *= input_shape.Dims(i);
52 }
53
54 for (int i = 0; i < outer_size; ++i)
55 {
56 for (int j = 0; j < inner_size; ++j)
57 {
58 float max = std::numeric_limits<float>::lowest();
59 for (int c = 0; c < depth; ++c)
60 {
61 max = std::max(max, input_data[(i * depth + c) * inner_size]);
62 }
63
64 float sum = 0.f;
65 for (int c = 0; c < depth; ++c)
66 {
67 sum += std::exp((input_data[(i * depth + c) * inner_size + j] - max) * beta);
68 }
69
70 const float log_sum = std::log(sum);
71 for (int c = 0; c < depth; ++c)
72 {
73 output_data[(i * depth + c) * inner_size + j] =
74 (input_data[(i * depth + c) * inner_size + j] - max) * beta - log_sum;
75 }
76 }
77 }
78}

References nnfw::cker::SoftmaxParams::axis, nnfw::cker::SoftmaxParams::beta, nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), MatchingDim(), and output_shape.

Referenced by onert::backend::cpu::ops::LogSoftMaxLayer::logsoftmaxFloat32(), and onert::backend::cpu::ops::LogSoftMaxLayer::logsoftmaxQuant8().
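
For each slice along axis the loop computes output_c = beta * (x_c - max) - log(sum_k exp(beta * (x_k - max))). A standalone sketch of that reduction for one row with beta = 1 (plain C++, not a call into cker):

#include <algorithm>
#include <cmath>
#include <cstdio>

// Log-softmax of one row, mirroring the inner loops above with beta = 1.
int main()
{
  const float x[3] = {1.f, 2.f, 3.f};
  const float max = *std::max_element(x, x + 3);

  float sum = 0.f;
  for (float v : x)
    sum += std::exp(v - max);
  const float log_sum = std::log(sum);

  for (float v : x)
    std::printf("%f ", (v - max) - log_sum); // ~ -2.4076 -1.4076 -0.4076
  return 0;
}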

◆ LogSoftmax() [2/2]

void nnfw::cker::LogSoftmax ( const SoftmaxParams params,
float  input_scale,
const Shape input_shape,
const uint8_t *  input_data,
const Shape output_shape,
uint8_t *  output_data 
)
inline

Definition at line 80 of file LogSoftMax.h.

82{
83 const int rank = input_shape.DimensionsCount();
84 const int axis = (params.axis < 0) ? params.axis + rank : params.axis;
85 const double beta = params.beta;
86 const int depth = MatchingDim(input_shape, axis, output_shape, axis);
87
88 const int32_t clamp_max = std::numeric_limits<uint8_t>::max();
89 const int32_t clamp_min = std::numeric_limits<uint8_t>::min();
90
91 int outer_size = 1;
92 for (int i = 0; i < axis; ++i)
93 {
94 outer_size *= input_shape.Dims(i);
95 }
96
97 int inner_size = 1;
98 for (int i = axis + 1; i < rank; ++i)
99 {
100 inner_size *= input_shape.Dims(i);
101 }
102
103 for (int i = 0; i < outer_size; ++i)
104 {
105 for (int j = 0; j < inner_size; ++j)
106 {
107 uint8_t max_val = std::numeric_limits<uint8_t>::min();
108 for (int c = 0; c < depth; ++c)
109 {
110 max_val = std::max(max_val, input_data[(i * depth + c) * inner_size]);
111 }
112
113 float sum_exp = 0.0f;
114 const int32_t max_uint8 = std::numeric_limits<uint8_t>::max();
115 const float *table_offset = &params.table[max_uint8 - max_val];
116 for (int c = 0; c < depth; ++c)
117 {
118 sum_exp += table_offset[input_data[(i * depth + c) * inner_size]];
119 }
120 const float log_sum_exp = std::log(sum_exp);
121
122 const float scale = input_scale / params.scale;
123 const float precomputed = (input_scale * max_val * beta + log_sum_exp) / params.scale;
124 for (int c = 0; c < depth; ++c)
125 {
126 const float log_prob =
127 scale * input_data[(i * depth + c) * inner_size] * beta - precomputed;
128 const int32_t prob_quantized = std::rint(log_prob) + params.zero_point;
129 output_data[(i * depth + c) * inner_size] =
130 static_cast<uint8_t>(std::max(std::min(clamp_max, prob_quantized), clamp_min));
131 }
132 }
133 }
134}

References nnfw::cker::SoftmaxParams::axis, nnfw::cker::SoftmaxParams::beta, nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), MatchingDim(), output_shape, nnfw::cker::SoftmaxParams::scale, nnfw::cker::SoftmaxParams::table, and nnfw::cker::SoftmaxParams::zero_point.

◆ LoopCondition()

bool nnfw::cker::LoopCondition ( int  index,
int  stop,
int  stride 
)
inline

Definition at line 187 of file StridedSlice.h.

188{
189 // True when we have reached the end of an axis and should loop.
190 return stride > 0 ? index >= stop : index <= stop;
191}

Referenced by StridedSlice().

◆ LstmStepFloat()

void nnfw::cker::LstmStepFloat ( const float *  input_ptr,
const float *  input_to_input_weights_ptr,
const float *  input_to_forget_weights_ptr,
const float *  input_to_cell_weights_ptr,
const float *  input_to_output_weights_ptr,
const float *  aux_input_ptr,
const float *  aux_input_to_input_weights_ptr,
const float *  aux_input_to_forget_weights_ptr,
const float *  aux_input_to_cell_weights_ptr,
const float *  aux_input_to_output_weights_ptr,
const float *  recurrent_to_input_weights_ptr,
const float *  recurrent_to_forget_weights_ptr,
const float *  recurrent_to_cell_weights_ptr,
const float *  recurrent_to_output_weights_ptr,
const float *  cell_to_input_weights_ptr,
const float *  cell_to_forget_weights_ptr,
const float *  cell_to_output_weights_ptr,
const float *  input_layer_norm_coefficients_ptr,
const float *  forget_layer_norm_coefficients_ptr,
const float *  cell_layer_norm_coefficients_ptr,
const float *  output_layer_norm_coefficients_ptr,
const float *  input_gate_bias_ptr,
const float *  forget_gate_bias_ptr,
const float *  cell_gate_bias_ptr,
const float *  output_gate_bias_ptr,
const float *  projection_weights_ptr,
const float *  projection_bias_ptr,
const LSTMParams params,
int  n_batch,
int  n_cell,
int  n_input,
int  n_aux_input,
int  n_output,
int  output_batch_leading_dim,
float *  output_state_ptr,
float *  cell_state_ptr,
float *  scratch0,
float *  scratch1,
float *  scratch2,
float *  scratch3,
float *  output_ptr 
)
inline

Definition at line 285 of file LSTM.h.

303{
304 // Since we have already checked that weights are all there or none, we can
305 // check the existence of only one to get the condition.
306 const bool use_cifg = (input_to_input_weights_ptr == nullptr);
307
308 // Make named scratch buffers.
309 float *input_gate_scratch = scratch0;
310 float *forget_gate_scratch = scratch1;
311 float *cell_gate_scratch = scratch2;
312 float *output_gate_scratch = scratch3;
313
314 // Check if inputs are all zeros so we can skip some computations.
315 const bool is_input_all_zeros = IsZeroVector(input_ptr, n_batch * n_input);
316 const bool is_aux_input_all_zeros =
317 (aux_input_ptr == nullptr || IsZeroVector(aux_input_ptr, n_batch * n_aux_input));
318 if (!use_cifg)
319 {
320 // Calculate the input gate. (If not CIFG.)
321 CalculateLstmGateFloat(input_ptr, input_to_input_weights_ptr, aux_input_ptr,
322 aux_input_to_input_weights_ptr, output_state_ptr,
323 recurrent_to_input_weights_ptr, cell_state_ptr,
324 cell_to_input_weights_ptr, input_layer_norm_coefficients_ptr,
325 input_gate_bias_ptr, n_batch, n_input, n_aux_input, n_output, n_cell,
326 /*activation=kTfLiteActSigmoid*/ FusedActivationFunctionType::kSigmoid,
327 input_gate_scratch, is_input_all_zeros, is_aux_input_all_zeros);
328 }
329 // Calculate the forget gate.
330 CalculateLstmGateFloat(input_ptr, input_to_forget_weights_ptr, aux_input_ptr,
331 aux_input_to_forget_weights_ptr, output_state_ptr,
332 recurrent_to_forget_weights_ptr, cell_state_ptr,
333 cell_to_forget_weights_ptr, forget_layer_norm_coefficients_ptr,
334 forget_gate_bias_ptr, n_batch, n_input, n_aux_input, n_output, n_cell,
335 /*activation=kTfLiteActSigmoid*/ FusedActivationFunctionType::kSigmoid,
336 forget_gate_scratch, is_input_all_zeros, is_aux_input_all_zeros);
337 // Calculate the cell update gate.
338 CalculateLstmGateFloat(
339 input_ptr, input_to_cell_weights_ptr, aux_input_ptr, aux_input_to_cell_weights_ptr,
340 output_state_ptr, recurrent_to_cell_weights_ptr, /*cell_state=*/nullptr,
341 /*cell_to_gate_weights=*/nullptr, cell_layer_norm_coefficients_ptr, cell_gate_bias_ptr, n_batch,
342 n_input, n_aux_input, n_output, n_cell, params->activation, cell_gate_scratch,
343 is_input_all_zeros, is_aux_input_all_zeros);
344 // Update the cell state.
345 UpdateLstmCellFloat(n_batch, n_cell, cell_state_ptr, input_gate_scratch, forget_gate_scratch,
346 cell_gate_scratch, use_cifg, params->cell_clip);
347 // Calculate output gate.
348 CalculateLstmGateFloat(input_ptr, input_to_output_weights_ptr, aux_input_ptr,
349 aux_input_to_output_weights_ptr, output_state_ptr,
350 recurrent_to_output_weights_ptr, cell_state_ptr,
351 cell_to_output_weights_ptr, output_layer_norm_coefficients_ptr,
352 output_gate_bias_ptr, n_batch, n_input, n_aux_input, n_output, n_cell,
353 /*activation=kTfLiteActSigmoid*/ FusedActivationFunctionType::kSigmoid,
354 output_gate_scratch, is_input_all_zeros, is_aux_input_all_zeros);
355 // Update the output state.
356 CalculateLstmOutputFloat(n_batch, n_cell, n_output, cell_state_ptr, output_gate_scratch,
357 params->activation, projection_weights_ptr, projection_bias_ptr,
358 params->proj_clip, output_state_ptr, scratch2);
359 // Copy output state to the output. Note that the output's rows may not be
360 // contiguous (output_batch_leading_dim != n_output).
361 for (int b = 0; b < n_batch; b++)
362 {
363 std::copy_n(output_state_ptr + b * n_output, n_output,
364 output_ptr + b * output_batch_leading_dim);
365 }
366}

References nnfw::cker::LSTMParams::activation, CalculateLstmGateFloat(), CalculateLstmOutputFloat(), nnfw::cker::LSTMParams::cell_clip, IsZeroVector(), kSigmoid, nnfw::cker::LSTMParams::proj_clip, and UpdateLstmCellFloat().

◆ MapAsMatrixWithLastDimAsRows()

template<typename Scalar >
MatrixMap< Scalar > nnfw::cker::MapAsMatrixWithLastDimAsRows ( Scalar *  data,
const Shape shape 
)

Definition at line 60 of file Utils.h.

61{
62 const int dims_count = shape.DimensionsCount();
63 const int rows = shape.Dims(dims_count - 1);
64 const int cols = FlatSizeSkipDim(shape, dims_count - 1);
65 return MatrixMap<Scalar>(data, rows, cols);
66}

References nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), and FlatSizeSkipDim().

Referenced by nnfw::cker::train::AveragePool2DGrad(), AveragePool< float >(), nnfw::cker::train::CategoricalCrossEntropy(), nnfw::cker::train::CategoricalCrossEntropyGrad(), nnfw::cker::train::FullyConnectedBiasGrad(), nnfw::cker::train::MaxPool2D(), nnfw::cker::train::MaxPool2DGrad(), MaxPool< float >(), nnfw::cker::train::MeanGrad(), and Softmax().

◆ MapAsVector()

template<typename Scalar >
VectorMap< Scalar > nnfw::cker::MapAsVector ( Scalar *  data,
const Shape shape 
)

Definition at line 43 of file Utils.h.

44{
45 const int size = shape.FlatSize();
46 return VectorMap<Scalar>(data, size, 1);
47}

References nnfw::cker::Shape::FlatSize(), and size.

Referenced by Abs(), nnfw::cker::train::BinaryArithmeticGrad(), nnfw::cker::train::CategoricalCrossEntropy(), Logistic(), ReLU(), ReLU6(), nnfw::cker::train::ReLU6Grad(), nnfw::cker::train::ReLUGrad(), and Tanh().

◆ MatchingDim() [1/2]

int nnfw::cker::MatchingDim ( const Shape shape1,
int  index1,
const Shape shape2,
int  index2 
)
inline

Definition at line 220 of file Shape.h.

222{
223 assert(shape1.Dims(index1) == shape2.Dims(index2));
224 return shape1.Dims(index1);
225}

References nnfw::cker::Shape::Dims().

Referenced by AveragePool16(), nnfw::cker::train::AveragePool2DGrad(), AveragePool32(), AveragePool< float >(), AveragePool< int8_t >(), nnfw::cker::train::backpropFilter(), nnfw::cker::train::backpropInput(), Concatenation(), nnfw::cker::reference::Conv(), nnfw::cker::multithreaded::Conv(), nnfw::cker::reference::Conv(), nnfw::cker::reference::Conv(), nnfw::cker::train::ConvFilterGrad(), nnfw::cker::train::ConvInputGrad(), nnfw::cker::optimized_integer_ops::depthwise_conv::DepthwiseConvGeneral(), nnfw::cker::optimized::depthwise_conv::DepthwiseConvGeneral(), nnfw::cker::reference_integer_ops::DepthwiseConvHybridPerChannel(), nnfw::cker::optimized::DepthwiseConvImpl(), DepthwiseConvOp(), nnfw::cker::reference_integer_ops::DepthwiseConvPerChannel(), nnfw::cker::optimized_integer_ops::DepthwiseConvWithRounding(), nnfw::cker::optimized::DepthwiseConvWithRounding(), nnfw::cker::optimized::DilatedIm2col(), FullyConnected(), FullyConnectedSparseWeight16x1(), FullyConnectedSparseWeightRandom(), nnfw::cker::reference::HybridConvPerChannel(), nnfw::cker::optimized::Im2col(), InstanceNorm(), L2NormalizeFloat32(), L2NormalizeQuant8(), LogSoftmax(), LogSoftmax(), MatchingDim(), nnfw::cker::train::MaxPool2D(), nnfw::cker::train::MaxPool2DGrad(), MaxPool< float >(), MaxPool< uint8_t >(), RankOneSelect(), ResizeBilinear(), ResizeBilinear(), ResizeBilinear(), RmsNorm(), RoPE(), nnfw::cker::reference::Softmax(), Softmax(), SplitV(), TransposeConv(), and nnfw::cker::reference::TransposeImpl().

◆ MatchingDim() [2/2]

template<typename... Args>
int nnfw::cker::MatchingDim ( const Shape shape1,
int  index1,
const Shape shape2,
int  index2,
Args...  args 
)

Definition at line 228 of file Shape.h.

230{
231 assert(shape1.Dims(index1) == shape2.Dims(index2));
232 return MatchingDim(shape1, index1, args...);
233}

References nnfw::cker::Shape::Dims(), and MatchingDim().

◆ MatchingElementsSize()

int nnfw::cker::MatchingElementsSize ( const Shape shape,
const Shape check_shape_0,
const Shape check_shape_1 
)
inline

Definition at line 333 of file Shape.h.

335{
336 const int size_1 = shape.FlatSize();
337 [[maybe_unused]] const int size_2 = check_shape_0.FlatSize();
338 [[maybe_unused]] const int size_3 = check_shape_1.FlatSize();
339 assert(size_1 == size_2);
340 assert(size_2 == size_3);
341 return size_1;
342}

References nnfw::cker::Shape::FlatSize().

Referenced by nnfw::cker::optimized::Add(), nnfw::cker::optimized::Add(), nnfw::cker::reference::BinaryArithmeticOp(), nnfw::cker::reference::BinaryArithmeticOp(), nnfw::cker::reference::BinaryArithmeticOp(), nnfw::cker::reference::BinaryArithmeticOp(), nnfw::cker::optimized::Div(), nnfw::cker::optimized::Mul(), nnfw::cker::optimized::Mul(), and nnfw::cker::optimized::Sub().

◆ MatchingFlatSize()

template<typename... Ts>
int nnfw::cker::MatchingFlatSize ( const Shape shape,
Ts...  check_shapes 
)
inline

Definition at line 297 of file Shape.h.

298{
299 UNUSED_ALL{check_shapes...};
300 assert(checkMatching(shape, std::forward<Ts>(check_shapes)...));
301 return shape.FlatSize();
302}

References checkMatching(), and nnfw::cker::Shape::FlatSize().

Referenced by ComparisonImpl(), ComparisonWithScaling(), Cos(), Dequantize(), Dequantize(), Dequantize(), ELU(), Erf(), Exp(), Floor(), LeakyReLU(), Log(), LogicalNot(), Neg(), powImpl(), Quantize(), Quantize(), Quantize(), Quantize(), Round(), Rsqrt(), Select(), Sin(), Softmax(), nnfw::cker::train::SoftMaxGrad(), Sqrt(), and Square().

◆ MatchingFlatSizeSkipDim() [1/2]

int nnfw::cker::MatchingFlatSizeSkipDim ( const Shape shape,
int  skip_dim,
const Shape check_shape_0 
)
inline

Definition at line 304 of file Shape.h.

306{
307 const int dims_count = shape.DimensionsCount();
308 for (int i = 0; i < dims_count; ++i)
309 {
310 if (i != skip_dim)
311 {
312 assert(shape.Dims(i) == check_shape_0.Dims(i));
313 }
314 }
315 return FlatSizeSkipDim(shape, skip_dim);
316}

References nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), and FlatSizeSkipDim().

Referenced by L2NormalizeFloat32(), L2NormalizeQuant8(), MatchingFlatSizeSkipDim(), RankOneSelect(), nnfw::cker::reference::Softmax(), and Softmax().

◆ MatchingFlatSizeSkipDim() [2/2]

int nnfw::cker::MatchingFlatSizeSkipDim ( const Shape shape,
int  skip_dim,
const Shape check_shape_0,
const Shape check_shape_1 
)
inline

Definition at line 318 of file Shape.h.

321{
322 const int dims_count = shape.DimensionsCount();
323 for (int i = 0; i < dims_count; ++i)
324 {
325 if (i != skip_dim)
326 {
327 assert(shape.Dims(i) == check_shape_0.Dims(i));
328 }
329 }
330 return MatchingFlatSizeSkipDim(shape, skip_dim, check_shape_1);
331}

References nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), and MatchingFlatSizeSkipDim().

◆ MatrixBandPart()

template<typename T >
void nnfw::cker::MatrixBandPart ( const T  num_lower_diags,
const T  num_upper_diags,
const Shape input_shape,
const float *  input_data,
const Shape output_shape,
float *  output_data 
)

Definition at line 30 of file MatrixBandPart.h.

32{
33 auto last_dim = input_shape.DimensionsCount() - 1;
34
35 T batch_num = 1;
36 for (int dim = 0; dim < input_shape.DimensionsCount() - 2; dim++)
37 {
38 batch_num *= input_shape.Dims(dim);
39 }
40
41 const T row_num = input_shape.Dims(last_dim - 1);
42 const T col_num = input_shape.Dims(last_dim);
43
44 if (!(num_lower_diags <= row_num))
45 throw std::runtime_error(
46 "MatrixBandPart : num_lower must be negative or less or equal to number of rows");
47
48 if (!(num_upper_diags <= col_num))
49 throw std::runtime_error(
50 "MatrixBandPart : num_upper must be negative or less or equal to number of columns");
51
52 std::fill(output_data, output_data + output_shape.FlatSize(), 0); // output matrix init
53
54 // reference code, without multithreading
55 for (T batch = 0; batch < batch_num; ++batch)
56 {
57 for (T row = 0; row < row_num; ++row)
58 {
59 auto output = output_data + (batch * row_num * col_num + row * col_num);
60 auto input = input_data + (batch * row_num * col_num + row * col_num);
61
62 const T band_start =
63 num_lower_diags < 0 ? 0 : std::min(col_num, std::max(T{0}, row - num_lower_diags));
64 const T band_end = num_upper_diags < 0
65 ? col_num
66 : std::min(static_cast<T>(col_num), row + num_upper_diags + 1);
67
68 for (T band_idx = band_start; band_idx < band_end; band_idx++)
69 {
70 output[band_idx] = input[band_idx];
71 }
72 }
73 }
74}

References nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), and output_shape.
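
A hedged usage sketch: num_lower_diags and num_upper_diags bound how many sub- and super-diagonals are kept (a negative value means unbounded), and everything outside the band is zeroed. The include path and the initializer-list Shape constructor are assumptions.

#include <cker/operation/MatrixBandPart.h> // assumed include path

#include <cstdio>

int main()
{
  using nnfw::cker::Shape;

  const Shape shape{3, 3}; // one 3x3 matrix; initializer-list ctor assumed
  const float in[9] = {1, 2, 3,
                       4, 5, 6,
                       7, 8, 9};
  float out[9] = {};

  // Keep one sub-diagonal and no super-diagonals.
  nnfw::cker::MatrixBandPart<int>(/*num_lower_diags=*/1, /*num_upper_diags=*/0, shape, in, shape,
                                  out);

  for (int r = 0; r < 3; ++r)
    std::printf("%g %g %g\n", out[r * 3], out[r * 3 + 1], out[r * 3 + 2]);
  // 1 0 0
  // 4 5 0
  // 0 8 9
  return 0;
}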

◆ MatrixBatchVectorMultiplyAccumulate() [1/3]

void nnfw::cker::MatrixBatchVectorMultiplyAccumulate ( const float *  matrix,
int  m_rows,
int  m_cols,
const float *  vector,
int  n_batch,
float *  result,
int  result_stride 
)
inline

Definition at line 136 of file TensorUtils.h.

139{
140 NEON_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols, vector, n_batch,
141 result, result_stride);
142}

References MatrixBatchVectorMultiplyAccumulate(), and NEON_OR_PORTABLE.
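
A sketch of the accumulate semantics (row-major matrix, one result slot per row and batch, updated in place); the include path is an assumption, and the result buffer must already contain the values to accumulate into.

#include <cker/TensorUtils.h> // assumed include path

#include <cstdio>

int main()
{
  // 2x3 matrix (row-major), one batch of a 3-vector; result is accumulated in place.
  const float matrix[6] = {1, 2, 3,
                           4, 5, 6};
  const float vector[3] = {1, 1, 1};
  float result[2] = {10, 20}; // existing contents are kept and added to

  nnfw::cker::MatrixBatchVectorMultiplyAccumulate(matrix, /*m_rows=*/2, /*m_cols=*/3, vector,
                                                  /*n_batch=*/1, result, /*result_stride=*/1);

  std::printf("%g %g\n", result[0], result[1]); // 16 35
  return 0;
}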

◆ MatrixBatchVectorMultiplyAccumulate() [2/3]

void nnfw::cker::MatrixBatchVectorMultiplyAccumulate ( const int8_t *  matrix,
const int  m_rows,
const int  m_cols,
const int8_t *  vector,
const float *  scaling_factors,
int  n_batch,
float *  result,
int  result_stride 
)
inline

Definition at line 127 of file TensorUtils.h.

131{
132 NEON_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols, vector,
133 scaling_factors, n_batch, result, result_stride);
134}

References MatrixBatchVectorMultiplyAccumulate(), and NEON_OR_PORTABLE.

Referenced by CalculateLstmGateFloat(), CalculateLstmOutputFloat(), FullyConnected(), FullyConnectedHybrid(), MatrixBatchVectorMultiplyAccumulate(), MatrixBatchVectorMultiplyAccumulate(), and MatrixBatchVectorMultiplyAccumulate().

◆ MatrixBatchVectorMultiplyAccumulate() [3/3]

void nnfw::cker::MatrixBatchVectorMultiplyAccumulate ( const int8_t *  matrix,
const int  m_rows,
const int  m_cols,
const int8_t *  vectors,
const float *  scaling_factors,
int  n_batch,
int32_t *  scratch,
float *  result,
int  result_stride,
ruy::Context *  ruy_context 
)
inline

Definition at line 144 of file TensorUtils.h.

149{
150 NEON_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols, vectors,
151 scaling_factors, n_batch, scratch, result, result_stride, ruy_context);
152}

References MatrixBatchVectorMultiplyAccumulate(), and NEON_OR_PORTABLE.

◆ Max()

template<typename T >
void nnfw::cker::Max ( const Shape unextended_input1_shape,
const T *  input1_data,
const Shape unextended_input2_shape,
const T *  input2_data,
const Shape unextended_output_shape,
T *  output_data 
)
inline

Definition at line 82 of file MaxMin.h.

85{
86 MaximumMinimumBroadcast4DSlow<T>(unextended_input1_shape, input1_data, unextended_input2_shape,
87 input2_data, unextended_output_shape, output_data,
88 MaximumOp::template op<T>);
89}

◆ MaximumMinimumBroadcast4DSlow()

template<typename T , typename Op >
void nnfw::cker::MaximumMinimumBroadcast4DSlow ( const Shape unextended_input1_shape,
const T *  input1_data,
const Shape unextended_input2_shape,
const T *  input2_data,
const Shape unextended_output_shape,
T *  output_data,
Op  op 
)
inline

Definition at line 47 of file MaxMin.h.

50{
51 assert(unextended_input1_shape.DimensionsCount() <= 4);
52 assert(unextended_input2_shape.DimensionsCount() <= 4);
53 assert(unextended_output_shape.DimensionsCount() <= 4);
54 const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape);
55
56 NdArrayDesc<4> desc1;
57 NdArrayDesc<4> desc2;
58 NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, unextended_input2_shape, &desc1,
59 &desc2);
60
61 for (int b = 0; b < output_shape.Dims(0); ++b)
62 {
63 for (int y = 0; y < output_shape.Dims(1); ++y)
64 {
65 for (int x = 0; x < output_shape.Dims(2); ++x)
66 {
67 for (int c = 0; c < output_shape.Dims(3); ++c)
68 {
69 auto out_idx = Offset(output_shape, b, y, x, c);
70 auto in1_idx = SubscriptToIndex(desc1, b, y, x, c);
71 auto in2_idx = SubscriptToIndex(desc2, b, y, x, c);
72 auto in1_val = input1_data[in1_idx];
73 auto in2_val = input2_data[in2_idx];
74 output_data[out_idx] = op(in1_val, in2_val);
75 }
76 }
77 }
78 }
79}

References desc1, desc2, nnfw::cker::Shape::DimensionsCount(), NdArrayDescsForElementwiseBroadcast(), Offset(), output_shape, and SubscriptToIndex().

◆ MaxPool()

template<typename T >
void nnfw::cker::MaxPool ( const PoolParams ,
const Shape ,
const T *  ,
const Shape ,
T *   
)

Definition at line 34 of file MaxPool.h.

35{
36 static_assert(std::is_integral<T>::value || std::is_floating_point<T>::value,
37 "cker::MaxPool : This function supports only integer or floating point");
38 throw std::runtime_error("cker::MaxPool : Unsupported data type");
39}

◆ MaxPool< float >()

template<>
void nnfw::cker::MaxPool< float > ( const PoolParams params,
const Shape input_shape,
const float *  input_data,
const Shape output_shape,
float *  output_data 
)

Definition at line 42 of file MaxPool.h.

44{
45 assert(input_shape.DimensionsCount() == 4);
46 assert(output_shape.DimensionsCount() == 4);
47 const int batches = MatchingDim(input_shape, 0, output_shape, 0);
48 const int input_height = input_shape.Dims(1);
49 const int input_width = input_shape.Dims(2);
50 const int output_height = output_shape.Dims(1);
51 const int output_width = output_shape.Dims(2);
52 const int stride_height = params.stride_height;
53 const int stride_width = params.stride_width;
54
55 const auto in_mat = MapAsMatrixWithLastDimAsRows(input_data, input_shape);
56 auto out_mat = MapAsMatrixWithLastDimAsRows(output_data, output_shape);
57 // Prefill the output to minimum representable float value
58 out_mat.setConstant(std::numeric_limits<float>::lowest());
59 for (int b = 0; b < batches; ++b)
60 {
61 for (int h = 0; h < input_height; ++h)
62 {
63 for (int w = 0; w < input_width; ++w)
64 {
65 // (h_start, h_end) * (w_start, w_end) is the range that the input
66 // vector projects to.
67 int hpad = h + params.padding_values.height;
68 int wpad = w + params.padding_values.width;
69 int h_start =
70 (hpad < params.filter_height) ? 0 : (hpad - params.filter_height) / stride_height + 1;
71 int h_end = std::min(hpad / stride_height + 1, output_height);
72 int w_start =
73 (wpad < params.filter_width) ? 0 : (wpad - params.filter_width) / stride_width + 1;
74 int w_end = std::min(wpad / stride_width + 1, output_width);
75 // take the elementwise max
76 for (int ph = h_start; ph < h_end; ++ph)
77 {
78 for (int pw = w_start; pw < w_end; ++pw)
79 {
80 int out_offset = NodeOffset(b, ph, pw, output_height, output_width);
81 out_mat.col(out_offset) =
82 out_mat.col(out_offset)
83 .cwiseMax(in_mat.col(NodeOffset(b, h, w, input_height, input_width)));
84 }
85 }
86 }
87 }
88 }
89 const int flat_size = output_shape.FlatSize();
90 for (int i = 0; i < flat_size; ++i)
91 {
92 output_data[i] = ActivationFunctionWithMinMax(output_data[i], params.float_activation_min,
93 params.float_activation_max);
94 }
95}

References ActivationFunctionWithMinMax(), nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), nnfw::cker::PoolParams::filter_height, nnfw::cker::PoolParams::filter_width, nnfw::cker::PoolParams::float_activation_max, nnfw::cker::PoolParams::float_activation_min, nnfw::cker::PaddingValues::height, MapAsMatrixWithLastDimAsRows(), MatchingDim(), NodeOffset(), output_shape, nnfw::cker::PoolParams::padding_values, nnfw::cker::PoolParams::stride_height, nnfw::cker::PoolParams::stride_width, and nnfw::cker::PaddingValues::width.
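
A hedged end-to-end sketch: a 2x2 window with stride 1 and no padding over a 1x2x2x1 NHWC tensor. The field names come from the References list above; the include path, the initializer-list Shape constructor, and zero-initializing PoolParams are assumptions.

#include <cker/operation/MaxPool.h> // assumed include path

#include <cstdio>
#include <limits>

int main()
{
  using namespace nnfw::cker;

  PoolParams params{}; // zero-initialized; only the fields below are set
  params.stride_height = 1;
  params.stride_width = 1;
  params.filter_height = 2;
  params.filter_width = 2;
  params.padding_values.height = 0;
  params.padding_values.width = 0;
  params.float_activation_min = std::numeric_limits<float>::lowest(); // no clamping
  params.float_activation_max = std::numeric_limits<float>::max();

  const Shape in_shape{1, 2, 2, 1}; // NHWC; initializer-list ctor assumed
  const Shape out_shape{1, 1, 1, 1};
  const float in[4] = {1.f, 4.f, 3.f, 2.f};
  float out[1] = {};

  MaxPool<float>(params, in_shape, in, out_shape, out);
  std::printf("%g\n", out[0]); // 4
  return 0;
}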

◆ MaxPool< uint8_t >()

template<>
void nnfw::cker::MaxPool< uint8_t > ( const PoolParams params,
const Shape input_shape,
const uint8_t *  input_data,
const Shape output_shape,
uint8_t *  output_data 
)

Definition at line 98 of file MaxPool.h.

100{
101
102 // Here, and in other pooling ops, in order to maintain locality of reference,
103 // to minimize some recalculations, and to load into NEON vector registers, we
104 // use an inner loop down the depth. Since depths can be large and hence we
105 // would need arbitrarily large temporary storage, we divide the work up into
106 // depth tranches just within the batch loop.
107 static constexpr int kPoolingAccTrancheSize = 256;
108
109 assert(params.quantized_activation_min <= params.quantized_activation_max);
110 assert(input_shape.DimensionsCount() == 4);
111 assert(output_shape.DimensionsCount() == 4);
112 const int batches = MatchingDim(input_shape, 0, output_shape, 0);
113 const int depth = MatchingDim(input_shape, 3, output_shape, 3);
114 const int input_height = input_shape.Dims(1);
115 const int input_width = input_shape.Dims(2);
116 const int output_height = output_shape.Dims(1);
117 const int output_width = output_shape.Dims(2);
118 const int stride_height = params.stride_height;
119 const int stride_width = params.stride_width;
120
121 uint8_t acc[kPoolingAccTrancheSize];
122 for (int batch = 0; batch < batches; ++batch)
123 {
124 // We proceed through the depth in tranches (see comment above). The
125 // depth_base is the depth at the beginning of the tranche. The
126 // tranche_depth is the depth dimension of the tranche.
127 for (int depth_base = 0; depth_base < depth; depth_base += kPoolingAccTrancheSize)
128 {
129 const int tranche_depth = std::min(depth - depth_base, kPoolingAccTrancheSize);
130 for (int out_y = 0; out_y < output_height; ++out_y)
131 {
132 for (int out_x = 0; out_x < output_width; ++out_x)
133 {
134 const int in_x_origin = (out_x * stride_width) - params.padding_values.width;
135 const int in_y_origin = (out_y * stride_height) - params.padding_values.height;
136 const int filter_x_start = std::max(0, -in_x_origin);
137 const int filter_x_end = std::min(params.filter_width, input_width - in_x_origin);
138 const int filter_y_start = std::max(0, -in_y_origin);
139 const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin);
140 memset(acc, 0, tranche_depth * sizeof(acc[0]));
141 const uint8_t *input_ptr =
142 input_data + depth_base +
143 depth * (in_x_origin + input_width * (in_y_origin + input_height * batch));
144 for (int fy = filter_y_start; fy < filter_y_end; fy++)
145 {
146 const uint8_t *input_row_ptr = input_ptr + depth * (fy * input_width + filter_x_start);
147 for (int fx = filter_x_start; fx < filter_x_end; fx++)
148 {
149 const uint8_t *input_channel_ptr = input_row_ptr;
150 int channel = 0;
151#ifdef USE_NEON
152 for (; channel <= tranche_depth - 16; channel += 16)
153 {
154 uint8x16_t acc_reg = vld1q_u8(acc + channel);
155 uint8x16_t input_reg = vld1q_u8(input_channel_ptr);
156 input_channel_ptr += 16;
157 acc_reg = vmaxq_u8(acc_reg, input_reg);
158 vst1q_u8(acc + channel, acc_reg);
159 }
160
161 for (; channel <= tranche_depth - 8; channel += 8)
162 {
163 uint8x8_t acc_reg = vld1_u8(acc + channel);
164 uint8x8_t input_reg = vld1_u8(input_channel_ptr);
165 input_channel_ptr += 8;
166 acc_reg = vmax_u8(acc_reg, input_reg);
167 vst1_u8(acc + channel, acc_reg);
168 }
169#endif
170 for (; channel < tranche_depth; ++channel)
171 {
172 acc[channel] = std::max(acc[channel], *input_channel_ptr++);
173 }
174 input_row_ptr += depth;
175 }
176 }
177 uint8_t *output_ptr = output_data + Offset(output_shape, batch, out_y, out_x, depth_base);
178 int channel = 0;
179#ifdef USE_NEON
180 for (; channel <= tranche_depth - 16; channel += 16)
181 {
182 uint8x16_t a = vld1q_u8(acc + channel);
183 a = vminq_u8(a, vdupq_n_u8(params.quantized_activation_max));
184 a = vmaxq_u8(a, vdupq_n_u8(params.quantized_activation_min));
185 vst1q_u8(output_ptr + channel, a);
186 }
187 for (; channel <= tranche_depth - 8; channel += 8)
188 {
189 uint8x8_t a = vld1_u8(acc + channel);
190 a = vmin_u8(a, vdup_n_u8(params.quantized_activation_max));
191 a = vmax_u8(a, vdup_n_u8(params.quantized_activation_min));
192 vst1_u8(output_ptr + channel, a);
193 }
194#endif
195 for (; channel < tranche_depth; ++channel)
196 {
197 uint8_t a = acc[channel];
198 a = std::max<uint8_t>(a, params.quantized_activation_min);
199 a = std::min<uint8_t>(a, params.quantized_activation_max);
200 output_ptr[channel] = static_cast<uint8_t>(a);
201 }
202 }
203 }
204 }
205 }
206}

References nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), nnfw::cker::PoolParams::filter_height, nnfw::cker::PoolParams::filter_width, nnfw::cker::PaddingValues::height, MatchingDim(), Offset(), output_shape, nnfw::cker::PoolParams::padding_values, nnfw::cker::PoolParams::quantized_activation_max, nnfw::cker::PoolParams::quantized_activation_min, nnfw::cker::PoolParams::stride_height, nnfw::cker::PoolParams::stride_width, and nnfw::cker::PaddingValues::width.

◆ Mean()

template<typename In , typename Out >
void nnfw::cker::Mean ( const Shape input_shape,
const In *  input_data,
const Shape output_shape,
Out *  output_data,
const std::vector< int > &  axes 
)

Definition at line 211 of file ReduceMean.h.

213{
214 assert(input_shape.DimensionsCount() > 0);
215 ReduceMean m_obj;
216 m_obj.ReduceOp<In, Out>(input_shape, input_data, output_shape, output_data, axes, true, (Out)0,
217 mean_reducer);
218}

References nnfw::cker::Shape::DimensionsCount(), mean_reducer(), output_shape, and nnfw::cker::ReduceMean::ReduceOp().

Referenced by onert::backend::cpu::ops::MeanLayer::MeanFloat32().
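
A small sketch reducing over the height and width axes; the keep-dims output shape, the include path, and the initializer-list Shape constructor are assumptions, and the default-constructed ReduceMean used internally is assumed to need no extra preparation.

#include <cker/operation/ReduceMean.h> // assumed include path

#include <cstdio>
#include <vector>

int main()
{
  using nnfw::cker::Shape;

  const Shape in_shape{1, 2, 2, 1};  // initializer-list ctor assumed
  const Shape out_shape{1, 1, 1, 1}; // axes 1 and 2 reduced, dims kept
  const float in[4] = {1.f, 2.f, 3.f, 4.f};
  float out[1] = {};

  nnfw::cker::Mean<float, float>(in_shape, in, out_shape, out, std::vector<int>{1, 2});
  std::printf("%g\n", out[0]); // 2.5
  return 0;
}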

◆ mean_reducer()

template<typename Out , typename In >
Out nnfw::cker::mean_reducer ( const Out  data1,
const In  data2,
int  normalizer 
)

Definition at line 41 of file ReduceMean.h.

42{
43 return data1 + static_cast<Out>(data2) / normalizer;
44}

Referenced by Mean().

◆ MeanAxis1And2()

template<typename In , typename Out >
void nnfw::cker::MeanAxis1And2 ( const Shape input_shape,
const In *  input_data,
const Shape output_shape,
Out *  output_data 
)

Definition at line 233 of file ReduceMean.h.

235{
236 assert(input_shape.DimensionsCount() == 4);
237 assert(output_shape.DimensionsCount() == 4);
238
239 const int output_batch = output_shape.Dims(0);
240 const int output_depth = output_shape.Dims(3);
241
242 const int input_height = input_shape.Dims(1);
243 const int input_width = input_shape.Dims(2);
244
245 for (int out_b = 0; out_b < output_batch; ++out_b)
246 {
247 for (int out_d = 0; out_d < output_depth; ++out_d)
248 {
249 float value = 0;
250 for (int in_h = 0; in_h < input_height; ++in_h)
251 {
252 for (int in_w = 0; in_w < input_width; ++in_w)
253 {
254 value += input_data[Offset(input_shape, out_b, in_h, in_w, out_d)];
255 }
256 }
257 output_data[Offset(output_shape, out_b, 0, 0, out_d)] = value / (input_width * input_height);
258 }
259 }
260}

References nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), Offset(), and output_shape.

Referenced by onert::backend::cpu::ops::MeanLayer::MeanFloat32().

◆ MeanQ8Asymm()

template<typename In , typename Out >
void nnfw::cker::MeanQ8Asymm ( const Shape input_shape,
const In *  input_data,
float  input_scale,
int32_t  input_offset,
const Shape output_shape,
Out *  output_data,
float  output_scale,
int32_t  output_offset,
const std::vector< int > &  axes 
)

Definition at line 221 of file ReduceMean.h.

224{
225 assert(input_shape.DimensionsCount() > 0);
226 ReduceMean m_obj;
227 m_obj.ReduceOp<In, Out>(input_shape, input_data, input_scale, input_offset, output_shape,
228 output_data, output_scale, output_offset, axes, true, (Out)0,
229 sum_reducer);
230}

References nnfw::cker::Shape::DimensionsCount(), output_shape, nnfw::cker::ReduceMean::ReduceOp(), and sum_reducer().

Referenced by onert::backend::cpu::ops::MeanLayer::MeanQuant8().

◆ MeanStddevNormalization()

void nnfw::cker::MeanStddevNormalization ( const float *  input_vector,
float *  output_vector,
int  v_size,
int  n_batch 
)
inline

Definition at line 154 of file TensorUtils.h.

156{
157 PortableMeanStddevNormalization(input_vector, output_vector, v_size, n_batch);
158}

References PortableMeanStddevNormalization().

Referenced by CalculateLstmGateFloat().
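
Each batch row of length v_size is shifted to zero mean and scaled to unit standard deviation (the portable implementation's population variance is assumed). A sketch, with the include path also assumed:

#include <cker/TensorUtils.h> // assumed include path

#include <cstdio>

int main()
{
  // One batch of four values; the output row has zero mean and unit standard deviation.
  const float in[4] = {1.f, 2.f, 3.f, 4.f};
  float out[4] = {};

  nnfw::cker::MeanStddevNormalization(in, out, /*v_size=*/4, /*n_batch=*/1);

  // mean = 2.5, stddev = sqrt(1.25) ~= 1.118, so out ~= {-1.342, -0.447, 0.447, 1.342}
  for (float v : out)
    std::printf("%f ", v);
  return 0;
}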

◆ Min()

template<typename T >
void nnfw::cker::Min ( const Shape unextended_input1_shape,
const T *  input1_data,
const Shape unextended_input2_shape,
const T *  input2_data,
const Shape unextended_output_shape,
T *  output_data 
)
inline

Definition at line 92 of file MaxMin.h.

95{
96 MaximumMinimumBroadcast4DSlow<T>(unextended_input1_shape, input1_data, unextended_input2_shape,
97 input2_data, unextended_output_shape, output_data,
98 MinimumOp::template op<T>);
99}

◆ MultiplyByQuantizedMultiplier()

int32_t nnfw::cker::MultiplyByQuantizedMultiplier ( int32_t  x,
int32_t  quantized_multiplier,
int  shift 
)
inline

Definition at line 96 of file Utils.h.

97{
98 int left_shift = shift > 0 ? shift : 0;
99 int right_shift = shift > 0 ? 0 : -shift;
100 return gemmlowp::RoundingDivideByPOT(
101 gemmlowp::SaturatingRoundingDoublingHighMul(x * (1 << left_shift), quantized_multiplier),
102 right_shift);
103}

Referenced by nnfw::cker::reference::Conv(), nnfw::cker::reference::Conv(), nnfw::cker::optimized::depthwise_conv::DepthwiseConvGeneral(), nnfw::cker::reference_integer_ops::DepthwiseConvPerChannel(), FullyConnected(), nnfw::cker::optimized::MulElementwise(), nnfw::cker::optimized::MulElementwise(), nnfw::cker::optimized::MulSimpleBroadcast(), nnfw::cker::optimized::quant8_mul(), Quantize(), Requantize< int8_t, uint8_t >(), and Requantize< uint8_t, int8_t >().
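
This follows the usual gemmlowp convention: a real scale M is stored as a Q0.31 multiplier in [2^30, 2^31) plus a power-of-two shift, so M = (quantized_multiplier / 2^31) * 2^shift. A worked sketch (the include path is an assumption):

#include <cker/Utils.h> // assumed include path

#include <cstdint>
#include <cstdio>

int main()
{
  // For M = 0.375: 0.375 = 0.75 * 2^-1, so quantized_multiplier = round(0.75 * 2^31), shift = -1.
  const std::int32_t quantized_multiplier = 1610612736; // 0.75 * 2^31
  const int shift = -1;

  const std::int32_t y =
    nnfw::cker::MultiplyByQuantizedMultiplier(100, quantized_multiplier, shift);
  std::printf("%d\n", y); // 38 (100 * 0.375 = 37.5, rounded)
  return 0;
}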

◆ MultiplyByQuantizedMultiplierGreaterThanOne()

int32_t nnfw::cker::MultiplyByQuantizedMultiplierGreaterThanOne ( int32_t  x,
int32_t  quantized_multiplier,
int  left_shift 
)
inline

Definition at line 105 of file Utils.h.

107{
108 return gemmlowp::SaturatingRoundingDoublingHighMul(x * (1 << left_shift), quantized_multiplier);
109}

◆ MultiplyByQuantizedMultiplierSmallerThanOneExp()

int32_t nnfw::cker::MultiplyByQuantizedMultiplierSmallerThanOneExp ( int32_t  x,
int32_t  quantized_multiplier,
int  left_shift 
)
inline

Definition at line 111 of file Utils.h.

114{
115 return gemmlowp::RoundingDivideByPOT(
116 gemmlowp::SaturatingRoundingDoublingHighMul(x, quantized_multiplier), -left_shift);
117}

Referenced by nnfw::cker::optimized::AddElementwise(), nnfw::cker::optimized::AddElementwise(), nnfw::cker::optimized::AddScalarBroadcast(), BroadcastComparison4DSlowWithScaling(), ComparisonWithScaling(), L2NormalizeQuant8(), and nnfw::cker::optimized::quant8_sum().

◆ MultithreadAlongBatches()

bool nnfw::cker::MultithreadAlongBatches ( int  thread_count,
int  batches 
)
inline

Definition at line 95 of file DepthwiseConv.h.

96{
97 assert(thread_count >= 2);
98 // If there are fewer batch entries than the number of threads we want to use,
99 // then better do intra-batch-entry multithreading.
100 if (batches < thread_count)
101 {
102 return false;
103 }
104 // If there are at least 2 batch entries to be handed to each thread, then
105 // it's safe to proceed with batch-wise multithreading: each thread will have
106 // approximately equal number of batch entries to handle, so the load
107 // balancing will be reasonable, and the amount to which the load is not
108 // perfectly balanced will be offset by the inherent advantages of
109 // batch-wise multithreading (each thread is more efficient thanks to working
110 // on larger buffers with less boundary-handling overhead).
111 if (batches >= 2 * thread_count)
112 {
113 return true;
114 }
115 // In the limit case were there are at least 1 but not much more than 1
116 // batch entries per thread, it may be a good idea to do per-batch
117 // multithreading if the number of batch entries is a multiple of the number
118 // of threads, so that each thread will have the same number of batch entries
119 // to process.
120 return ((batches % thread_count) == 0);
121}

Referenced by DepthwiseConv().

◆ NdArrayDescsForElementwiseBroadcast() [1/2]

template<int N>
void nnfw::cker::NdArrayDescsForElementwiseBroadcast ( const Shape input0_shape,
const Shape input1_shape,
const Shape input2_shape,
NdArrayDesc< N > *  desc0_out,
NdArrayDesc< N > *  desc1_out,
NdArrayDesc< N > *  desc2_out 
)
inline

Definition at line 329 of file Utils.h.

332{
333 assert(desc0_out != nullptr);
334 assert(desc1_out != nullptr);
335 assert(desc2_out != nullptr);
336
337 auto extended_input0_shape = Shape::ExtendedShape(N, input0_shape);
338 auto extended_input1_shape = Shape::ExtendedShape(N, input1_shape);
339 auto extended_input2_shape = Shape::ExtendedShape(N, input2_shape);
340
341 // Copy dims to desc, calculating strides.
342 CopyDimsToDesc<N>(extended_input0_shape, desc0_out);
343 CopyDimsToDesc<N>(extended_input1_shape, desc1_out);
344 CopyDimsToDesc<N>(extended_input2_shape, desc2_out);
345
346 // Walk over each dimension. If the extents are equal do nothing.
347 // Otherwise, set the desc with extent 1 to have extent equal to the other and
348 // stride 0.
349 for (int i = 0; i < N; ++i)
350 {
351 const int extent0 = extended_input0_shape.Dims(i);
352 const int extent1 = extended_input1_shape.Dims(i);
353 const int extent2 = extended_input2_shape.Dims(i);
354
355 int extent = extent0;
356 if (extent1 != 1)
357 extent = extent1;
358 if (extent2 != 1)
359 extent = extent2;
360
361 assert(extent0 == 1 || extent0 == extent);
362 assert(extent1 == 1 || extent1 == extent);
363 assert(extent2 == 1 || extent2 == extent);
364
365 if (!(extent0 == extent1 && extent1 == extent2))
366 {
367 if (extent0 == 1)
368 {
369 desc0_out->strides[i] = 0;
370 desc0_out->extents[i] = extent;
371 }
372 if (extent1 == 1)
373 {
374 desc1_out->strides[i] = 0;
375 desc1_out->extents[i] = extent;
376 }
377 if (extent2 == 1)
378 {
379 desc2_out->strides[i] = 0;
380 desc2_out->extents[i] = extent;
381 }
382 }
383 }
384}

References nnfw::cker::NdArrayDesc< N >::extents, and nnfw::cker::NdArrayDesc< N >::strides.

◆ NdArrayDescsForElementwiseBroadcast() [2/2]

template<int N>
void nnfw::cker::NdArrayDescsForElementwiseBroadcast ( const Shape input0_shape,
const Shape input1_shape,
NdArrayDesc< N > *  desc0_out,
NdArrayDesc< N > *  desc1_out 
)
inline

Definition at line 290 of file Utils.h.

292{
293 assert(desc0_out != nullptr);
294 assert(desc1_out != nullptr);
295
296 auto extended_input0_shape = Shape::ExtendedShape(N, input0_shape);
297 auto extended_input1_shape = Shape::ExtendedShape(N, input1_shape);
298
299 // Copy dims to desc, calculating strides.
300 CopyDimsToDesc<N>(extended_input0_shape, desc0_out);
301 CopyDimsToDesc<N>(extended_input1_shape, desc1_out);
302
303 // Walk over each dimension. If the extents are equal do nothing.
304 // Otherwise, set the desc with extent 1 to have extent equal to the other and
305 // stride 0.
306 for (int i = 0; i < N; ++i)
307 {
308 const int extent0 = extended_input0_shape.Dims(i);
309 const int extent1 = extended_input1_shape.Dims(i);
310 if (extent0 != extent1)
311 {
312 if (extent0 == 1)
313 {
314 desc0_out->strides[i] = 0;
315 desc0_out->extents[i] = extent1;
316 }
317 else
318 {
319 assert(extent1 == 1);
320 desc1_out->strides[i] = 0;
321 desc1_out->extents[i] = extent0;
322 }
323 }
324 }
325}

References nnfw::cker::NdArrayDesc< N >::extents, and nnfw::cker::NdArrayDesc< N >::strides.

Referenced by nnfw::cker::reference::BroadcastBinaryArithmeticOpSlow(), nnfw::cker::reference::BroadcastBinaryArithmeticOpSlow(), nnfw::cker::reference::BroadcastBinaryArithmeticOpSlow(), nnfw::cker::reference::BroadcastBinaryArithmeticOpSlow(), nnfw::cker::reference::BroadcastBinaryArithmeticOpSlow(), BroadcastComparison4DSlowImpl(), BroadcastComparison4DSlowWithScaling(), BroadcastSelect4DSlow(), FloorDivBroadcast(), FloorModBroadcast(), LogicalAndBroadcast(), LogicalOrBroadcast(), and MaximumMinimumBroadcast4DSlow().
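
A sketch of how the descriptors drive broadcasting: a dimension of extent 1 gets stride 0, so SubscriptToIndex() keeps returning the same element along that axis. The include path and the initializer-list Shape constructor are assumptions.

#include <cker/Utils.h> // assumed include path

#include <cstdio>

int main()
{
  using namespace nnfw::cker;

  // Broadcast a [2,1] operand against a [1,3] operand (both written as 4-D shapes here).
  const Shape a_shape{1, 1, 2, 1}; // initializer-list ctor assumed
  const Shape b_shape{1, 1, 1, 3};

  NdArrayDesc<4> desc_a;
  NdArrayDesc<4> desc_b;
  NdArrayDescsForElementwiseBroadcast(a_shape, b_shape, &desc_a, &desc_b);

  // desc_a now has stride 0 on the last axis and desc_b has stride 0 on axis 2,
  // so a's index depends only on y and b's index only on x.
  for (int y = 0; y < 2; ++y)
    for (int x = 0; x < 3; ++x)
      std::printf("a[%d] b[%d]\n", SubscriptToIndex(desc_a, 0, 0, y, x),
                  SubscriptToIndex(desc_b, 0, 0, y, x));
  return 0;
}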

◆ Neg()

template<typename T >
void nnfw::cker::Neg ( const Shape input_shape,
const T *  input_data,
const Shape output_shape,
T *  output_data 
)
inline

Definition at line 70 of file Elementwise.h.

72{
73 const int size = MatchingFlatSize(input_shape, output_shape);
74 for (int i = 0; i < size; i++)
75 {
76 output_data[i] = -input_data[i];
77 }
78}

References MatchingFlatSize(), output_shape, and size.

◆ NextIndex()

bool nnfw::cker::NextIndex ( const int  num_dims,
const int *  dims,
int *  current 
)
inline

Definition at line 387 of file Utils.h.

388{
389 if (num_dims == 0)
390 {
391 return false;
392 }
393 assert(dims != nullptr);
394 assert(current != nullptr);
395 int carry = 1;
396 for (int idx = num_dims - 1; idx >= 0; --idx)
397 {
398 int current_val = current[idx] + carry;
399 assert(dims[idx] >= current_val);
400 if (dims[idx] == current_val)
401 {
402 current[idx] = 0;
403 }
404 else
405 {
406 current[idx] = current_val;
407 carry = 0;
408 break;
409 }
410 }
411 return (carry == 0);
412}

Referenced by ReduceImpl(), ReduceMeanImpl(), ReduceSumQuantImpl(), and SqDiffImpl().
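
A sketch of the odometer-style iteration this helper provides (the include path is an assumption): current is advanced in row-major order and false is returned once the index space wraps around.

#include <cker/Utils.h> // assumed include path

#include <cstdio>

int main()
{
  // Iterate over a [2, 3] index space, starting from {0, 0}.
  const int dims[2] = {2, 3};
  int current[2] = {0, 0};

  do
  {
    std::printf("(%d, %d)\n", current[0], current[1]);
  } while (nnfw::cker::NextIndex(2, dims, current));
  // Visits (0,0) (0,1) (0,2) (1,0) (1,1) (1,2), then returns false.
  return 0;
}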

◆ NodeOffset()

int nnfw::cker::NodeOffset ( int  b,
int  h,
int  w,
int  height,
int  width 
)
inline

Definition at line 147 of file Utils.h.

148{
149 return (b * height + h) * width + w;
150}

Referenced by nnfw::cker::train::AveragePool2DGrad(), AveragePool< float >(), nnfw::cker::train::MaxPool2D(), and MaxPool< float >().

◆ NotEqualFn()

template<typename T >
bool nnfw::cker::NotEqualFn ( T  lhs,
T  rhs 
)
inline

Definition at line 31 of file Comparison.h.

31{ return lhs != rhs; }

◆ Offset() [1/2]

int nnfw::cker::Offset ( const Shape shape,
int *  index 
)
inline

Definition at line 248 of file Shape.h.

249{
250 return Offset(shape, index[0], index[1], index[2], index[3]);
251}

References Offset().

◆ Offset() [2/2]

int nnfw::cker::Offset ( const Shape shape,
int  i0,
int  i1,
int  i2,
int  i3 
)
inline

Definition at line 237 of file Shape.h.

238{
239 assert(shape.DimensionsCount() == 4);
240 const int *dims_data = shape.DimsDataUpTo4D();
241 assert(i0 >= 0 && i0 < dims_data[0]);
242 assert(i1 >= 0 && i1 < dims_data[1]);
243 assert(i2 >= 0 && i2 < dims_data[2]);
244 assert(i3 >= 0 && i3 < dims_data[3]);
245 return ((i0 * dims_data[1] + i1) * dims_data[2] + i2) * dims_data[3] + i3;
246}

References nnfw::cker::Shape::DimensionsCount(), and nnfw::cker::Shape::DimsDataUpTo4D().

Referenced by AveragePool16(), AveragePool32(), AveragePool< int8_t >(), BatchToSpaceND(), nnfw::cker::reference::BroadcastBinaryArithmeticOpSlow(), nnfw::cker::reference::BroadcastBinaryArithmeticOpSlow(), nnfw::cker::reference::BroadcastBinaryArithmeticOpSlow(), nnfw::cker::reference::BroadcastBinaryArithmeticOpSlow(), nnfw::cker::reference::BroadcastBinaryArithmeticOpSlow(), BroadcastComparison4DSlowImpl(), BroadcastComparison4DSlowWithScaling(), BroadcastSelect4DSlow(), nnfw::cker::reference::Conv(), nnfw::cker::reference::Conv(), nnfw::cker::reference::Conv(), DepthToSpace(), nnfw::cker::reference_integer_ops::DepthwiseConvHybridPerChannel(), nnfw::cker::reference_integer_ops::DepthwiseConvPerChannel(), nnfw::cker::optimized::DilatedIm2col(), nnfw::cker::optimized::ExtractPatchIntoBufferColumn(), FloorDivBroadcast(), FloorModBroadcast(), nnfw::cker::reference::HybridConvPerChannel(), InstanceNorm(), LogicalAndBroadcast(), LogicalOrBroadcast(), MaximumMinimumBroadcast4DSlow(), MaxPool< uint8_t >(), MeanAxis1And2(), Offset(), ResizeBilinear(), ResizeBilinearGeneric(), ResizeBilinearGenericSmallChannel(), ResizeBilinearKernel2x2(), RmsNorm(), RoPE(), Slice(), SpaceToBatchND(), SpaceToDepth(), StridedSlice(), TransposeConv(), and nnfw::cker::reference::TransposeImpl().
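
Offset() is plain row-major flattening of a 4-D index. A short sketch (include path and Shape construction assumed):

#include <cker/Shape.h> // assumed include path

#include <cstdio>

int main()
{
  using nnfw::cker::Shape;

  // Row-major NHWC flattening: ((i0*D1 + i1)*D2 + i2)*D3 + i3.
  const Shape shape{2, 3, 4, 5}; // initializer-list ctor assumed
  std::printf("%d\n", nnfw::cker::Offset(shape, 1, 2, 3, 4)); // ((1*3+2)*4+3)*5+4 = 119
  return 0;
}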

◆ OneHot()

template<typename T , typename TI >
void nnfw::cker::OneHot ( const int32_t  depth,
const T  on_value,
const T  off_value,
int32_t  axis,
const Shape indices_shape,
const TI *  indices_data,
const Shape ,
T *  output_data 
)

Definition at line 29 of file OneHot.h.

31{
32 if (axis == -1)
33 axis = indices_shape.DimensionsCount();
34
35 // prefix_dim_size == # of elements before the axis
36 // depth == # of elements per axis
37 // suffix_dim_size == # of elements after the axis
38 int prefix_dim_size = 1;
39 for (int i = 0; i < axis; ++i)
40 {
41 prefix_dim_size *= indices_shape.Dims(i);
42 }
43 const int suffix_dim_size = indices_shape.FlatSize() / prefix_dim_size;
44
45 // View the indices as a matrix of size:
46 // prefix_dim_size x suffix_dim_size
47 // View the output as a matrix of size:
48 // prefix_dim_size x depth x suffix_dim_size
49 // Then the output is:
50 // output(i, j, k) == (indices(i, k) == j) ? on : off
51 for (int i = 0; i < prefix_dim_size; ++i)
52 {
53 for (int j = 0; j < depth; ++j)
54 {
55 for (int k = 0; k < suffix_dim_size; ++k, ++output_data)
56 {
57 *output_data =
58 static_cast<int>(indices_data[i * suffix_dim_size + k]) == j ? on_value : off_value;
59 }
60 }
61 }
62}

References nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), and nnfw::cker::Shape::FlatSize().
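
A sketch with a rank-1 index tensor and axis = -1, so the one-hot dimension is appended last; the include path and the initializer-list Shape constructor are assumptions.

#include <cker/operation/OneHot.h> // assumed include path

#include <cstdint>
#include <cstdio>

int main()
{
  using nnfw::cker::Shape;

  const Shape indices_shape{3}; // initializer-list ctor assumed
  const Shape output_shape{3, 3};
  const std::int32_t indices[3] = {0, 2, 1};
  float out[9] = {};

  nnfw::cker::OneHot<float, std::int32_t>(/*depth=*/3, /*on_value=*/1.f, /*off_value=*/0.f,
                                          /*axis=*/-1, indices_shape, indices, output_shape, out);

  for (int r = 0; r < 3; ++r)
    std::printf("%g %g %g\n", out[r * 3], out[r * 3 + 1], out[r * 3 + 2]);
  // 1 0 0
  // 0 0 1
  // 0 1 0
  return 0;
}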

◆ operator<<()

std::ostream & nnfw::cker::operator<< ( std::ostream &  os,
const Shape shape 
)
inline

Definition at line 486 of file Utils.h.

487{
488 using std::begin;
489 using std::end;
490
491 std::string formatted =
492 std::accumulate(begin(shape), end(shape), std::string{"["},
493 [](std::string joined, ShapeIterator::value_type dim) {
494 return std::move(joined).append(std::to_string(dim)).append(",");
495 });
496
497 if (formatted.back() == '[')
498 {
499 formatted.push_back(']');
500 }
501 else
502 {
503 formatted.back() = ']';
504 }
505
506 os << formatted;
507 return os;
508}

References begin, and end().

◆ optimized_ops_preload_l1_keep()

template<typename T >
void nnfw::cker::optimized_ops_preload_l1_keep ( const T *  ptr)

Definition at line 455 of file Utils.h.

456{
457#ifdef __GNUC__
458 // builtin offered by GCC-compatible compilers including clang
459 __builtin_prefetch(ptr, /* 0 means read */ 0, /* 3 means high locality */ 3);
460#else
461 (void)ptr;
462#endif
463}

Referenced by Transpose2D().

◆ Pack()

template<typename Scalar >
void nnfw::cker::Pack ( const PackParams params,
const Scalar *const *  input_data,
const Shape output_shape,
Scalar *  output_data 
)
inline

Definition at line 30 of file Pack.h.

32{
33 const int dimensions = output_shape.DimensionsCount();
34 int axis = params.axis;
35 int inputs_count = params.inputs_count;
36
37 int outer_size = 1;
38 for (int i = 0; i < axis; i++)
39 {
40 outer_size *= output_shape.Dims(i);
41 }
42 int copy_size = 1;
43 for (int i = params.axis + 1; i < dimensions; i++)
44 {
45 copy_size *= output_shape.Dims(i);
46 }
47
48 for (int i = 0; i < inputs_count; ++i)
49 {
50 for (int k = 0; k < outer_size; k++)
51 {
52 const Scalar *input_ptr = input_data[i] + copy_size * k;
53 int loc = k * inputs_count * copy_size + i * copy_size;
54 memcpy(output_data + loc, input_ptr, copy_size * sizeof(Scalar));
55 }
56 }
57}

References nnfw::cker::PackParams::axis, nnfw::cker::PackParams::inputs_count, and output_shape.

◆ Pad()

template<typename T >
void nnfw::cker::Pad ( const int32_t *  padding_data,
int32_t  pad_rank,
const Shape input_shape,
const T *  input_data,
const Shape output_shape,
T *  output_data,
const T *  constant_value_data 
)
inline

List of padding information

Definition at line 30 of file Pad.h.

33{
34 // Note, this is pad with mode=`CONSTANT`: it doesn't support `REFLECT` and `SYMMETRIC`
35 // TODO: come up with more subtle solution that uses subtensors like arm compute
36 // TODO: Check if it works for all layouts
37
38 using PaddingInfo = std::pair<int32_t, int32_t>;
40 using PaddingList = std::vector<PaddingInfo>;
41
42 const T constant_value = constant_value_data ? *constant_value_data : 0;
43 assert(output_shape.DimensionsCount() == input_shape.DimensionsCount());
44
45 PaddingList padding_list(pad_rank);
46 for (int32_t n = 0; n < pad_rank; ++n)
47 {
48 const int32_t *from = padding_data + (n * 2);
49 padding_list[n] = {from[0], from[1]};
50 }
51 for (int32_t i = 0; i < pad_rank; ++i)
52 {
53 assert(output_shape.Dims(i) ==
54 input_shape.Dims(i) + padding_list[i].first + padding_list[i].second);
55 }
56 /* Use pad_rank since given input/output shapes are expanded to 4d before calling all cker
57 functions:
58 1. to prevent access violation in padding_list;
59 2. handling as 4d is slower than as 2d/3d.
60 */
61 switch (pad_rank)
62 {
63 case 0:
64 case 1:
65 {
66 const int32_t in_row_len = input_shape.Dims(0);
67 std::fill_n(output_data, padding_list[0].first, constant_value);
68 std::memcpy(output_data + padding_list[0].first, input_data, in_row_len * sizeof(T));
69 std::fill_n(output_data + padding_list[0].first + in_row_len, padding_list[0].second,
70 constant_value);
71 break;
72 }
73 case 2: // HW
74 {
75 const int32_t in_row_len = input_shape.Dims(1);
76 const int32_t out_row_size = output_shape.Dims(1);
77
78 // prepend padding rows
79 std::fill_n(output_data, padding_list[0].first * out_row_size, constant_value);
80
81 const auto r_h_inp_lim = input_shape.Dims(0) + padding_list[0].first;
82 for (auto i = padding_list[0].first, j = 0; i < r_h_inp_lim; ++i, ++j)
83 {
84 auto out_offset = i * out_row_size;
85 const auto in_offset = j * in_row_len;
86
87 // prepend padding values
88 std::fill_n(output_data + out_offset, padding_list[1].first, constant_value);
89
90 out_offset += padding_list[1].first;
91
92 // copy a row of input data
93 memcpy(output_data + out_offset, input_data + in_offset, in_row_len * sizeof(T));
94
95 out_offset += in_row_len;
96
97 // append padding values
98 std::fill_n(output_data + out_offset, padding_list[1].second, constant_value);
99 }
100
101 // append padding rows
102 std::fill_n(output_data + r_h_inp_lim * out_row_size, padding_list[0].second * out_row_size,
103 constant_value);
104 break;
105 }
106 case 3: // HWC
107 {
108 const int32_t in_row_len = input_shape.Dims(2);
109 const int32_t out_row_size = output_shape.Dims(2);
110 const auto plain_size = out_row_size * output_shape.Dims(1);
111
112 // prepend padding plains
113 std::fill_n(output_data, padding_list[0].first * plain_size, constant_value);
114
115 const auto r_h_inp_lim = input_shape.Dims(0) + padding_list[0].first;
116 for (auto i = padding_list[0].first, i_inp = 0; i < r_h_inp_lim; ++i, ++i_inp)
117 {
118 const auto out_w_offset = (i * output_shape.Dims(1) + 0) * output_shape.Dims(2);
119
120 // prepend padding rows
121 std::fill_n(output_data + out_w_offset, padding_list[1].first * out_row_size,
122 constant_value);
123
124 const auto r_w_inp_lim = input_shape.Dims(1) + padding_list[1].first;
125 for (auto j = padding_list[1].first, j_inp = 0; j < r_w_inp_lim; ++j, ++j_inp)
126 {
127 auto out_offset = (i * output_shape.Dims(1) + j) * output_shape.Dims(2);
128 const auto in_offset = (i_inp * input_shape.Dims(1) + j_inp) * input_shape.Dims(2);
129
130 // prepend padding values
131 std::fill_n(output_data + out_offset, padding_list[2].first, constant_value);
132
133 out_offset += padding_list[2].first;
134
135 // copy a row of input data
136 memcpy(output_data + out_offset, input_data + in_offset, in_row_len * sizeof(T));
137
138 out_offset += in_row_len;
139
140 // append padding values
141 std::fill_n(output_data + out_offset, padding_list[2].second, constant_value);
142 }
143
144 // append padding rows
145 std::fill_n(output_data + out_w_offset + r_w_inp_lim * out_row_size,
146 padding_list[1].second * out_row_size, constant_value);
147 }
148
149 // append padding plains
150 std::fill_n(output_data + r_h_inp_lim * plain_size, padding_list[0].second * plain_size,
151 constant_value);
152 break;
153 }
154 case 4:
155 {
156 auto get_offset = [](const Shape &shape, int32_t n, int32_t h, int32_t w) -> int32_t {
157 return ((n * shape.Dims(1) + h) * shape.Dims(2) + w) * shape.Dims(3);
158 };
159 const int32_t in_row_len = input_shape.Dims(3);
160 const int32_t out_row_size = output_shape.Dims(3);
161 const auto plain_size = out_row_size * output_shape.Dims(2);
162 const auto parallelepiped_size = plain_size * output_shape.Dims(1);
163
164 // prepend padding parallelepipeds
165 std::fill_n(output_data, padding_list[0].first * parallelepiped_size, constant_value);
166
167 const auto r_b_inp_lim = input_shape.Dims(0) + padding_list[0].first;
168 for (auto i = padding_list[0].first, i_inp = 0; i < r_b_inp_lim; ++i, ++i_inp)
169 {
170 const auto out_h_offset = get_offset(output_shape, i, 0, 0);
171 // prepend padding plains
172 std::fill_n(output_data + out_h_offset, padding_list[1].first * plain_size, constant_value);
173
174 const auto r_h_inp_lim = input_shape.Dims(1) + padding_list[1].first;
175 for (auto j = padding_list[1].first, j_inp = 0; j < r_h_inp_lim; ++j, ++j_inp)
176 {
177 const auto out_w_offset = get_offset(output_shape, i, j, 0);
178
179 // prepend padding rows
180 std::fill_n(output_data + out_w_offset, padding_list[2].first * out_row_size,
181 constant_value);
182
183 const auto r_w_inp_lim = input_shape.Dims(2) + padding_list[2].first;
184 for (auto k = padding_list[2].first, k_inp = 0; k < r_w_inp_lim; ++k, ++k_inp)
185 {
186 auto out_c_offset = get_offset(output_shape, i, j, k);
187 const auto in_offset = get_offset(input_shape, i_inp, j_inp, k_inp);
188
189 // prepend padding values
190 std::fill_n(output_data + out_c_offset, padding_list[3].first, constant_value);
191
192 out_c_offset += padding_list[3].first;
193
194 // copy a row of input data
195 memcpy(output_data + out_c_offset, input_data + in_offset, in_row_len * sizeof(T));
196
197 out_c_offset += in_row_len;
198
199 // append padding values
200 std::fill_n(output_data + out_c_offset, padding_list[3].second, constant_value);
201 }
202
203 // append padding rows
204 std::fill_n(output_data + out_w_offset + r_w_inp_lim * out_row_size,
205 padding_list[2].second * out_row_size, constant_value);
206 }
207
208 // append padding plains
209 std::fill_n(output_data + out_h_offset + r_h_inp_lim * plain_size,
210 padding_list[1].second * plain_size, constant_value);
211 }
212 // append padding parallelepipeds
213 std::fill_n(output_data + r_b_inp_lim * parallelepiped_size,
214 padding_list[0].second * parallelepiped_size, constant_value);
215 break;
216 }
217 default:
218 throw std::runtime_error("Padding for rank > 4 NYI");
219 break;
220 }
221}

References nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), and output_shape.
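
A rank-2 (HW) sketch that surrounds a 2x2 matrix with one row and one column of the constant value on every side; passing nullptr for constant_value_data pads with zero. The include path and the initializer-list Shape constructor are assumptions.

#include <cker/operation/Pad.h> // assumed include path

#include <cstdint>
#include <cstdio>

int main()
{
  using nnfw::cker::Shape;

  const std::int32_t padding[4] = {1, 1,  // before/after dim 0
                                   1, 1}; // before/after dim 1
  const Shape in_shape{2, 2}; // initializer-list ctor assumed
  const Shape out_shape{4, 4};
  const float in[4] = {1.f, 2.f, 3.f, 4.f};
  float out[16] = {};

  nnfw::cker::Pad<float>(padding, /*pad_rank=*/2, in_shape, in, out_shape, out,
                         /*constant_value_data=*/nullptr); // nullptr -> pad with 0

  for (int r = 0; r < 4; ++r)
    std::printf("%g %g %g %g\n", out[r * 4], out[r * 4 + 1], out[r * 4 + 2], out[r * 4 + 3]);
  // 0 0 0 0
  // 0 1 2 0
  // 0 3 4 0
  // 0 0 0 0
  return 0;
}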

◆ PopulateSoftmaxLookupTable()

void nnfw::cker::PopulateSoftmaxLookupTable ( float *  table,
float  input_scale,
float  beta 
)
inline

Definition at line 148 of file SoftMax.h.

149{
150 const float scale = -input_scale * beta;
151 const int32_t max_uint8 = std::numeric_limits<uint8_t>::max();
152 for (int32_t val = 0; val <= max_uint8; ++val)
153 {
154 table[max_uint8 - val] = expf(scale * val);
155 }
156}

Referenced by onert::backend::cpu::ops::SoftMaxLayer::configure().
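
A minimal usage sketch (not part of the library sources; the include path is an assumption, since this page only names the file SoftMax.h). The table holds one entry per possible uint8 input, with table[255 - v] = exp(-input_scale * beta * v), and can be reused across invocations that share the same scale and beta:

#include <cker/operation/SoftMax.h> // assumed include path

void PrepareSoftmaxTable(float input_scale, float beta)
{
  static float table[256]; // one entry per possible uint8 input value
  nnfw::cker::PopulateSoftmaxLookupTable(table, input_scale, beta);
  // The filled table can now be passed to the quantized softmax routines that consume it.
}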

◆ PortableApplyActivationToVector()

void nnfw::cker::PortableApplyActivationToVector ( const float *  vector,
int  v_size,
FusedActivationFunctionType  activation,
float *  result 
)
inline

Definition at line 103 of file PortableTensorUtils.h.

105{
106 auto activation_func = ActivationFunctor(activation);
107 for (int v = 0; v < v_size; v++)
108 {
109 *result++ = (activation_func)(*vector++);
110 }
111}

Referenced by ApplyActivationToVector().

◆ PortableAsymmetricQuantizeFloats()

void nnfw::cker::PortableAsymmetricQuantizeFloats ( const float *  values,
const int  size,
int8_t *  quantized_values,
float *  scaling_factor,
int32_t *  offset 
)
inline

Definition at line 147 of file PortableTensorUtils.h.

150{
151 /* Copied from TensorFlow PortableAsymmetricQuantizeFloats */
152 const int32_t kMinScale = -128;
153 const int32_t kMaxScale = 127;
154 const double qmin_double = kMinScale;
155 const double qmax_double = kMaxScale;
156 const auto minmax = std::minmax_element(values, values + size);
157 const double rmin = static_cast<double>(std::min(0.0f, *minmax.first));
158 const double rmax = static_cast<double>(std::max(0.0f, *minmax.second));
159 if (rmin == rmax)
160 {
161 memset(quantized_values, 0, size * sizeof(int8_t));
162 *scaling_factor = 1;
163 *offset = 0;
164 return;
165 }
166 else
167 {
168 double scale = (rmax - rmin) / (qmax_double - qmin_double);
169 const double zero_point_from_min = qmin_double - rmin / scale;
170 const double zero_point_from_max = qmax_double - rmax / scale;
171 const double zero_point_from_min_error = std::abs(qmin_double) + std::abs(rmin / scale);
172 const double zero_point_from_max_error = std::abs(qmax_double) + std::abs(rmax / scale);
173 const double zero_point_double = zero_point_from_min_error < zero_point_from_max_error
174 ? zero_point_from_min
175 : zero_point_from_max;
176 int8_t nudged_zero_point = 0;
177 if (zero_point_double <= qmin_double)
178 {
179 nudged_zero_point = kMinScale;
180 }
181 else if (zero_point_double >= qmax_double)
182 {
183 nudged_zero_point = kMaxScale;
184 }
185 else
186 {
187 nudged_zero_point = static_cast<int8_t>(round(zero_point_double));
188 }
189 *scaling_factor = scale;
190 *offset = nudged_zero_point;
191 }
192 const float scaling_factor_inv = 1.0f / *scaling_factor;
193 for (int i = 0; i < size; ++i)
194 {
195 const int32_t quantized_value =
196 static_cast<int32_t>(std::round(*offset + values[i] * scaling_factor_inv));
197 quantized_values[i] = std::min(kMaxScale, std::max(kMinScale, quantized_value));
198 }
199}

References offset(), and size.

Referenced by onert::backend::cpu::ops::DepthwiseConvolutionLayer::convQ8iHybridPerChannel().
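
A minimal usage sketch (the include path is an assumption; this page only names PortableTensorUtils.h). The routine maps the range [min(0, rmin), max(0, rmax)] onto the int8 range [-128, 127] and returns the chosen scale and nudged zero point:

#include <cker/PortableTensorUtils.h> // assumed include path
#include <cstdint>
#include <cstdio>

int main()
{
  const float values[4] = {-1.0f, 0.0f, 0.5f, 2.0f};
  int8_t quantized[4];
  float scaling_factor = 0.0f;
  int32_t offset = 0;
  nnfw::cker::PortableAsymmetricQuantizeFloats(values, 4, quantized, &scaling_factor, &offset);
  // For this input the range [-1, 2] gives scale = 3/255 and a zero point of -43.
  std::printf("scale=%f zero_point=%d\n", scaling_factor, offset);
}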

◆ PortableCwiseClipping()

template<typename T >
void nnfw::cker::PortableCwiseClipping ( T *  vector,
const int  v_size,
const T  clipping_value 
)

Definition at line 63 of file PortableTensorUtils.h.

64{
65 for (int i = 0; i < v_size; i++)
66 {
67 vector[i] = std::max(std::min(clipping_value, vector[i]), static_cast<T>(-clipping_value));
68 }
69}

◆ PortableIsZeroVector()

bool nnfw::cker::PortableIsZeroVector ( const float *  vector,
int  v_size 
)
inline

Definition at line 93 of file PortableTensorUtils.h.

94{
95 for (int i = 0; i < v_size; ++i)
96 {
97 if (*vector++ != 0.0f)
98 return false;
99 }
100 return true;
101}

◆ PortableMatrixBatchVectorMultiplyAccumulate() [1/3]

void nnfw::cker::PortableMatrixBatchVectorMultiplyAccumulate ( const float *  matrix,
int  m_rows,
int  m_cols,
const float *  vector,
int  n_batch,
float *  result,
int  result_stride 
)
inline

Definition at line 242 of file PortableTensorUtils.h.

245{
246 float *result_in_batch = result;
247 for (int b = 0; b < n_batch; b++)
248 {
249 const float *matrix_ptr = matrix;
250 for (int r = 0; r < m_rows; r++)
251 {
252 float dot_prod = 0.0f;
253 const float *vector_in_batch = vector + b * m_cols;
254 for (int c = 0; c < m_cols; c++)
255 {
256 dot_prod += *matrix_ptr++ * *vector_in_batch++;
257 }
258 *result_in_batch += dot_prod;
259 result_in_batch += result_stride;
260 }
261 }
262}
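
A minimal usage sketch (the include path is an assumption). Note that result is accumulated into, so it must be initialized before the call; starting from zero yields a plain matrix-vector product:

#include <cker/PortableTensorUtils.h> // assumed include path

int main()
{
  const float matrix[6] = {1, 2, 3,   // row 0
                           4, 5, 6};  // row 1
  const float vector[3] = {1, 0, -1};
  float result[2] = {0.0f, 0.0f};     // accumulator, pre-initialized to zero
  nnfw::cker::PortableMatrixBatchVectorMultiplyAccumulate(matrix, /*m_rows=*/2, /*m_cols=*/3,
                                                          vector, /*n_batch=*/1, result,
                                                          /*result_stride=*/1);
  // result == {-2.0f, -2.0f}
}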

◆ PortableMatrixBatchVectorMultiplyAccumulate() [2/3]

void nnfw::cker::PortableMatrixBatchVectorMultiplyAccumulate ( const int8_t *__restrict__  matrix,
const int  m_rows,
const int  m_cols,
const int8_t *__restrict__  vector,
const float *  scaling_factors,
int  n_batch,
int32_t *  ,
float *__restrict__  result,
int  result_stride,
ruy::Context *   
)
inline

Definition at line 231 of file PortableTensorUtils.h.

237{
238 PortableMatrixBatchVectorMultiplyAccumulate(matrix, m_rows, m_cols, vector, scaling_factors,
239 n_batch, result, result_stride);
240}

References PortableMatrixBatchVectorMultiplyAccumulate().

◆ PortableMatrixBatchVectorMultiplyAccumulate() [3/3]

void nnfw::cker::PortableMatrixBatchVectorMultiplyAccumulate ( const int8_t *__restrict__  matrix,
const int  m_rows,
const int  m_cols,
const int8_t *__restrict__  vectors,
const float *  scaling_factors,
int  n_batch,
float *__restrict__  result,
int  result_stride 
)
inline

Definition at line 201 of file PortableTensorUtils.h.

207{
208 int batch, row, col;
209 for (batch = 0; batch < n_batch; ++batch, vectors += m_cols)
210 {
211 const float batch_scaling_factor = scaling_factors[batch];
212 // Get the address of the first row.
213 const int8_t *row_ptr = matrix;
214 for (row = 0; row < m_rows; ++row, result += result_stride)
215 {
216 // Initialize the dot product sum for the row to 0.
217 int32_t dotprod = 0;
218#if defined(__GNUC__)
219 // Prefetch the row to cache.
220 __builtin_prefetch(row_ptr, 0 /* prefetch for read */, 3 /* temporal locality */);
221#endif
222 for (col = 0; col < m_cols; ++col, ++row_ptr)
223 {
224 dotprod += (*row_ptr) * (vectors[col]);
225 } // for col
226 *result += (dotprod * batch_scaling_factor);
227 } // for row
228 } // for batch
229}

Referenced by PortableMatrixBatchVectorMultiplyAccumulate().

◆ PortableMeanStddevNormalization()

void nnfw::cker::PortableMeanStddevNormalization ( const float *  input_vector,
float *  output_vector,
int  v_size,
int  n_batch 
)
inline

Definition at line 264 of file PortableTensorUtils.h.

266{
267 for (int batch = 0; batch < n_batch; ++batch)
268 {
269 float sum = 0.0f;
270 for (int i = 0; i < v_size; ++i)
271 {
272 sum += input_vector[i];
273 }
274 const float mean = sum / v_size;
275 float sum_diff_sq = 0.0f;
276 for (int i = 0; i < v_size; ++i)
277 {
278 const float diff = input_vector[i] - mean;
279 sum_diff_sq += diff * diff;
280 }
281 const float variance = sum_diff_sq / v_size;
282 constexpr float kNormalizationConstant = 1e-8f;
283 const float stddev_inv = 1.0f / std::sqrt(variance + kNormalizationConstant);
284 for (int i = 0; i < v_size; ++i)
285 {
286 output_vector[i] = (input_vector[i] - mean) * stddev_inv;
287 }
288 input_vector += v_size;
289 output_vector += v_size;
290 }
291}

Referenced by MeanStddevNormalization().
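
A minimal usage sketch (the include path is an assumption). Each batch row of length v_size is independently shifted to zero mean and scaled to unit variance, with the 1e-8 stabilizer shown in the listing above:

#include <cker/PortableTensorUtils.h> // assumed include path

int main()
{
  const float input[4] = {1.0f, 3.0f,    // batch 0: mean 2, stddev 1
                          -2.0f, 2.0f};  // batch 1: mean 0, stddev 2
  float output[4];
  nnfw::cker::PortableMeanStddevNormalization(input, output, /*v_size=*/2, /*n_batch=*/2);
  // output is approximately {-1, 1, -1, 1}
}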

◆ PortableSub1Vector()

void nnfw::cker::PortableSub1Vector ( const float *  vector,
int  v_size,
float *  result 
)
inline

Definition at line 113 of file PortableTensorUtils.h.

114{
115 for (int v = 0; v < v_size; v++)
116 {
117 *result++ = 1.0f - *vector++;
118 }
119}

◆ PortableSymmetricQuantizeFloats()

void nnfw::cker::PortableSymmetricQuantizeFloats ( const float *  values,
const int  size,
int8_t *  quantized_values,
float *  min_value,
float *  max_value,
float *  scaling_factor 
)
inline

Definition at line 121 of file PortableTensorUtils.h.

124{
125 auto minmax = std::minmax_element(values, values + size);
126 *min_value = *minmax.first;
127 *max_value = *minmax.second;
128 const int kScale = 127;
129 const float range = std::max(std::abs(*min_value), std::abs(*max_value));
130 if (range == 0)
131 {
132 memset(quantized_values, 0, size * sizeof(int8_t));
133 *scaling_factor = 1;
134 return;
135 }
136 *scaling_factor = range / kScale;
137 const float scaling_factor_inv = kScale / range;
138 for (int i = 0; i < size; ++i)
139 {
140 const int32_t quantized_value =
141 static_cast<int32_t>(std::round(values[i] * scaling_factor_inv));
142 // Clamp: just in case some odd numeric offset.
143 quantized_values[i] = std::min(kScale, std::max(-kScale, quantized_value));
144 }
145}

References size.
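
A minimal usage sketch (the include path is an assumption). The scale is max(|min|, |max|) / 127 and no zero point is produced, so 0.0f always quantizes to 0:

#include <cker/PortableTensorUtils.h> // assumed include path
#include <cstdint>

int main()
{
  const float values[4] = {-2.54f, 0.0f, 1.0f, 2.54f};
  int8_t quantized[4];
  float min_value, max_value, scaling_factor;
  nnfw::cker::PortableSymmetricQuantizeFloats(values, 4, quantized, &min_value, &max_value,
                                              &scaling_factor);
  // scaling_factor == 2.54f / 127 == 0.02f, so quantized is approximately {-127, 0, 50, 127}
}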

◆ PortableVectorBatchVectorAdd()

void nnfw::cker::PortableVectorBatchVectorAdd ( const float *  vector,
int  v_size,
int  n_batch,
float *  batch_vector 
)
inline

Definition at line 80 of file PortableTensorUtils.h.

82{
83 for (int b = 0; b < n_batch; b++)
84 {
85 for (int i = 0; i < v_size; ++i)
86 {
87 batch_vector[i] += vector[i];
88 }
89 batch_vector += v_size;
90 }
91}

Referenced by VectorBatchVectorAdd().

◆ PortableVectorBatchVectorAssign()

void nnfw::cker::PortableVectorBatchVectorAssign ( const float *  vector,
int  v_size,
int  n_batch,
float *  batch_vector 
)
inline

Definition at line 71 of file PortableTensorUtils.h.

73{
74 for (int b = 0; b < n_batch; b++)
75 {
76 memcpy(batch_vector + b * v_size, vector, v_size * sizeof(float));
77 }
78}

Referenced by VectorBatchVectorAssign().

◆ PortableZeroVector()

void nnfw::cker::PortableZeroVector ( float *  vector,
int  v_size 
)
inline

Definition at line 293 of file PortableTensorUtils.h.

293{ std::fill_n(vector, v_size, 0); }

Referenced by ZeroVector().

◆ powImpl()

template<typename T >
void nnfw::cker::powImpl ( const Shape input1_shape,
const T *  input1_data,
const Shape input2_shape,
const T *  input2_data,
const Shape output_shape,
T *  output_data 
)
inline

Definition at line 31 of file Pow.h.

33{
34 const int flat_size = MatchingFlatSize(input1_shape, input2_shape, output_shape);
35 for (int i = 0; i < flat_size; ++i)
36 {
37 output_data[i] = std::pow(input1_data[i], input2_data[i]);
38 }
39}

References MatchingFlatSize(), and output_shape.

Referenced by onert::backend::cpu::ops::PowLayer::powFloat32().

◆ ProcessBroadcastShapes()

bool nnfw::cker::ProcessBroadcastShapes ( const Shape shape0,
const Shape shape1,
BinaryArithmeticOpParam params 
)
inline

Definition at line 109 of file BinaryArithmeticOps.h.

111{
112 const int dims_count = std::max(shape0.DimensionsCount(), shape1.DimensionsCount());
113
114 params->broadcast_category = BroadcastableOpCategory::kGenericBroadcast;
115 Shape scalar_shape(dims_count, 1);
116
117 auto extended_shape0 = Shape::ExtendedShape(dims_count, shape0);
118 auto extended_shape1 = Shape::ExtendedShape(dims_count, shape1);
119
120 // Check for "exact" match, implicitly accepting any scalar shapes.
121 if (extended_shape0 == extended_shape1)
122 {
123 params->broadcast_category = BroadcastableOpCategory::kNonBroadcast;
124 return false;
125 }
126
127 for (int i = dims_count - 1; i >= 0; --i)
128 {
129 if (extended_shape0.Dims(i) == extended_shape1.Dims(i))
130 {
131 continue;
132 }
133 else if (extended_shape0.Dims(i) == 1)
134 {
135 params->broadcast_category = BroadcastableOpCategory::kFirstInputBroadcastsFast;
136 break;
137 }
138 else if (extended_shape1.Dims(i) == 1)
139 {
140 params->broadcast_category = BroadcastableOpCategory::kSecondInputBroadcastsFast;
141 break;
142 }
143 else
144 {
145 // This case is erroneous: there is a dimension that does not match and
146 // is not a broadcast from one shape to the other.
147 params->broadcast_category = BroadcastableOpCategory::kGenericBroadcast;
148 return true;
149 }
150 }
151
152 if (params->broadcast_category != BroadcastableOpCategory::kFirstInputBroadcastsFast &&
153 params->broadcast_category != BroadcastableOpCategory::kSecondInputBroadcastsFast)
154 {
155 return false;
156 }
157
158 // From this point it is assumed contractually that corresponding dimensions
159 // in shape0 and shape1 are either (a) equal or (b) one or other equals 1.
160 const bool swap_inputs =
161 params->broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast;
162 const Shape *shape_a = swap_inputs ? &extended_shape1 : &extended_shape0;
163 const Shape *shape_b = swap_inputs ? &extended_shape0 : &extended_shape1;
164
165 int i = dims_count - 1;
166 params->broadcast_shape[0] = 1;
167 params->broadcast_shape[1] = 1;
168 params->broadcast_shape[2] = 1;
169 params->broadcast_shape[3] = 1;
170 params->broadcast_shape[4] = 1;
171 // y_0 is greedy: include dims if both or neither equal 1: in other words,
172 // test for equality rather than (shape_a->Dims(i) != 1).
173 while (i >= 0 && shape_a->Dims(i) == shape_b->Dims(i))
174 {
175 params->broadcast_shape[4] *= shape_b->Dims(i);
176 --i;
177 }
178 // Here either input_a or input_b has dim of 1 (if i >= 0). If it is input_b
179 // that has the unit dimension, the next two loops are not entered.
180 while (i >= 0 && shape_a->Dims(i) == 1)
181 {
182 params->broadcast_shape[3] *= shape_b->Dims(i);
183 --i;
184 }
185 while (i >= 0 && shape_a->Dims(i) == shape_b->Dims(i))
186 {
187 params->broadcast_shape[2] *= shape_a->Dims(i);
188 --i;
189 }
190 // Here either input_a or input_b has dim of 1 (if i >= 0).
191 while (i >= 0 && shape_b->Dims(i) == 1)
192 {
193 params->broadcast_shape[1] *= shape_a->Dims(i);
194 --i;
195 }
196 while (i >= 0 && shape_a->Dims(i) == shape_b->Dims(i))
197 {
198 params->broadcast_shape[0] *= shape_b->Dims(i);
199 --i;
200 }
201
202 // Rarer case is when the broadcast dimensions cannot be handled by a fivefold
203 // loop.
204 if (i >= 0)
205 {
206 params->broadcast_category = BroadcastableOpCategory::kGenericBroadcast;
207 }
208 return true;
209}

References nnfw::cker::BinaryArithmeticOpParam::broadcast_category, nnfw::cker::BinaryArithmeticOpParam::broadcast_shape, nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), kFirstInputBroadcastsFast, kGenericBroadcast, kNonBroadcast, and kSecondInputBroadcastsFast.
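
To illustrate the fivefold decomposition, the following standalone sketch (not a library call) reproduces the same greedy scan for two already right-aligned shapes in the "first input broadcasts" case, where the dims of a are 1 wherever the two shapes differ:

#include <array>
#include <cstdio>
#include <vector>

std::array<int, 5> FivefoldScan(const std::vector<int> &a, const std::vector<int> &b)
{
  std::array<int, 5> bs{1, 1, 1, 1, 1};
  int i = static_cast<int>(a.size()) - 1;
  while (i >= 0 && a[i] == b[i]) { bs[4] *= b[i]; --i; } // trailing equal run
  while (i >= 0 && a[i] == 1)    { bs[3] *= b[i]; --i; } // run broadcast from a
  while (i >= 0 && a[i] == b[i]) { bs[2] *= a[i]; --i; } // middle equal run
  while (i >= 0 && b[i] == 1)    { bs[1] *= a[i]; --i; } // run broadcast from b
  while (i >= 0 && a[i] == b[i]) { bs[0] *= b[i]; --i; } // leading equal run
  return bs;
}

int main()
{
  // a = [1, 1, 3, 4] broadcast against b = [2, 5, 3, 4]:
  // the trailing equal run 3*4 = 12 goes to bs[4], the broadcast run 2*5 = 10 to bs[3].
  const auto bs = FivefoldScan({1, 1, 3, 4}, {2, 5, 3, 4});
  std::printf("%d %d %d %d %d\n", bs[0], bs[1], bs[2], bs[3], bs[4]); // 1 1 1 10 12
}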

◆ Quantize() [1/5]

void nnfw::cker::Quantize ( const int32_t *  multiplier,
const int32_t *  shift,
int32_t  channel_size,
int32_t  total_size,
int32_t  output_zp,
int32_t  output_min,
int32_t  output_max,
int32_t *  scratch,
int8_t *  output 
)
inline

Definition at line 207 of file Quantize.h.

210{
211 // Here we're trying to quantize the raw accumulators:
212 // output_channels
213 // data data data data data
214 // rows data data data data data
215 // data data data data data
216 // ....
217 //
218 // In order to minimize the reload of the multipliers & shifts, once we load
219 // the multipliers & shifts, we load & quantize the raw accumulators for every
220 // row.
221#ifdef USE_NEON
222 const int32x4_t output_offset_vec = vdupq_n_s32(output_zp);
223 const int32x4_t output_activation_min_vec = vdupq_n_s32(output_min);
224 const int32x4_t output_activation_max_vec = vdupq_n_s32(output_max);
225 const int32x4_t zeros = vdupq_n_s32(0);
226#endif
227
228 assert(total_size % channel_size == 0);
229 const int32_t rows = total_size / channel_size;
230
231 int c = 0;
232
233#ifdef USE_NEON
234 using gemmlowp::RoundingDivideByPOT;
235 for (; c <= channel_size - 8; c += 8)
236 {
237 int32x4_t out_shift_1 = vld1q_s32(shift + c);
238 int32x4_t out_shift_2 = vld1q_s32(shift + c + 4);
239 int32x4_t left_shift_1 = vmaxq_s32(out_shift_1, zeros);
240 int32x4_t left_shift_2 = vmaxq_s32(out_shift_2, zeros);
241
242 // Right shift will be performed as left shift with negative values.
243 int32x4_t right_shift_1 = vminq_s32(out_shift_1, zeros);
244 int32x4_t right_shift_2 = vminq_s32(out_shift_2, zeros);
245
246 int32x4_t out_mul_1 = vld1q_s32(multiplier + c);
247 int32x4_t out_mul_2 = vld1q_s32(multiplier + c + 4);
248 for (int n = 0; n < rows; ++n)
249 {
250 int loc = n * channel_size + c;
251 int32x4_t acc_1 = vld1q_s32(scratch + loc);
252 int32x4_t acc_2 = vld1q_s32(scratch + loc + 4);
253
254 // Saturating Rounding Doubling High Mul.
255 acc_1 = vshlq_s32(acc_1, left_shift_1);
256 acc_1 = vqrdmulhq_s32(acc_1, out_mul_1);
257 acc_2 = vshlq_s32(acc_2, left_shift_2);
258 acc_2 = vqrdmulhq_s32(acc_2, out_mul_2);
259
260 // Rounding Dividing By POT.
261 acc_1 = vrshlq_s32(acc_1, right_shift_1);
262 acc_2 = vrshlq_s32(acc_2, right_shift_2);
263
264 // Add the output offset.
265 acc_1 = vaddq_s32(acc_1, output_offset_vec);
266 acc_2 = vaddq_s32(acc_2, output_offset_vec);
267
268 // Apply the activation function.
269 acc_1 = vmaxq_s32(acc_1, output_activation_min_vec);
270 acc_1 = vminq_s32(acc_1, output_activation_max_vec);
271 acc_2 = vmaxq_s32(acc_2, output_activation_min_vec);
272 acc_2 = vminq_s32(acc_2, output_activation_max_vec);
273
274 // Saturating cast to int8 and store to destination.
275 const int16x4_t acc_s16_1 = vqmovn_s32(acc_1);
276 const int16x4_t acc_s16_2 = vqmovn_s32(acc_2);
277 const int16x8_t res_s16 = vcombine_s16(acc_s16_1, acc_s16_2);
278 const int8x8_t res_s8 = vqmovn_s16(res_s16);
279 vst1_s8(output + loc, res_s8);
280 }
281 }
282
283#endif // USE_NEON
284 // Handle leftover values, one by one. This is very slow.
285 for (; c < channel_size; c++)
286 {
287 for (int n = 0; n < rows; ++n)
288 {
289 int loc = n * channel_size + c;
290 int32_t acc = scratch[loc];
291 acc = MultiplyByQuantizedMultiplier(acc, multiplier[c], shift[c]);
292 acc += output_zp;
293 acc = std::max(acc, output_min);
294 acc = std::min(acc, output_max);
295 output[loc] = static_cast<int8_t>(acc);
296 }
297 }
298}

References MultiplyByQuantizedMultiplier().

◆ Quantize() [2/5]

template<>
void nnfw::cker::Quantize ( const Shape input_shape,
const float *  input_data,
const Shape output_shape,
int16_t *  output_data,
const float  scale,
const int32_t  zero_point 
)
inline

Definition at line 156 of file Quantize.h.

158{
159 const int flat_size = MatchingFlatSize(input_shape, output_shape);
160 static constexpr int32_t min_val = std::numeric_limits<int16_t>::min();
161 static constexpr int32_t max_val = std::numeric_limits<int16_t>::max();
162
163 int i = 0;
164#ifdef USE_NEON
165 const float32x4_t reverse_scale_dup = vdupq_n_f32(1.0f / scale);
166 const int32x4_t zero_point_dup = vdupq_n_s32(zero_point);
167 const int32x4_t min_val_dup = vdupq_n_s32(min_val);
168 const int32x4_t max_val_dup = vdupq_n_s32(max_val);
169
170 for (; i <= flat_size - 8; i += 8)
171 {
172 const float *src_data_ptr = input_data + i;
173 float32x4_t input_val_0 = vld1q_f32(src_data_ptr);
174 float32x4_t input_val_1 = vld1q_f32(src_data_ptr + 4);
175
176 input_val_0 = vmulq_f32(input_val_0, reverse_scale_dup);
177 input_val_1 = vmulq_f32(input_val_1, reverse_scale_dup);
178
179 int32x4_t casted_val_0 = RoundToNearest(input_val_0);
180 int32x4_t casted_val_1 = RoundToNearest(input_val_1);
181
182 casted_val_0 = vaddq_s32(casted_val_0, zero_point_dup);
183 casted_val_1 = vaddq_s32(casted_val_1, zero_point_dup);
184
185 // Clamp the values to fit the target type's range.
186 casted_val_0 = vmaxq_s32(casted_val_0, min_val_dup);
187 casted_val_1 = vmaxq_s32(casted_val_1, min_val_dup);
188 casted_val_0 = vminq_s32(casted_val_0, max_val_dup);
189 casted_val_1 = vminq_s32(casted_val_1, max_val_dup);
190
191 const int16x4_t narrowed_val_0 = vmovn_s32(casted_val_0);
192 const int16x4_t narrowed_val_1 = vmovn_s32(casted_val_1);
193 vst1_s16(output_data + i, narrowed_val_0);
194 vst1_s16(output_data + i + 4, narrowed_val_1);
195 }
196#endif // NEON
197
198 for (; i < flat_size; ++i)
199 {
200 const float val = input_data[i];
201 const int32_t unclamped = static_cast<int32_t>(round(val / scale)) + zero_point;
202 const int32_t clamped = std::min(std::max(unclamped, min_val), max_val);
203 output_data[i] = clamped;
204 }
205}

References MatchingFlatSize(), output_shape, and RoundToNearest().

◆ Quantize() [3/5]

template<>
void nnfw::cker::Quantize ( const Shape input_shape,
const float *  input_data,
const Shape output_shape,
int8_t *  output_data,
const float  scale,
const int32_t  zero_point 
)
inline

Definition at line 50 of file Quantize.h.

52{
53 const int flat_size = MatchingFlatSize(input_shape, output_shape);
54 static constexpr int32_t min_val = std::numeric_limits<int8_t>::min();
55 static constexpr int32_t max_val = std::numeric_limits<int8_t>::max();
56
57 int i = 0;
58#ifdef USE_NEON
59 const float32x4_t reverse_scale_dup = vdupq_n_f32(1.0f / scale);
60 const int32x4_t zero_point_dup = vdupq_n_s32(zero_point);
61 const int32x4_t min_val_dup = vdupq_n_s32(min_val);
62 const int32x4_t max_val_dup = vdupq_n_s32(max_val);
63
64 for (; i <= flat_size - 8; i += 8)
65 {
66 const float *src_data_ptr = input_data + i;
67 float32x4_t input_val_0 = vld1q_f32(src_data_ptr);
68 float32x4_t input_val_1 = vld1q_f32(src_data_ptr + 4);
69
70 input_val_0 = vmulq_f32(input_val_0, reverse_scale_dup);
71 input_val_1 = vmulq_f32(input_val_1, reverse_scale_dup);
72
73 int32x4_t casted_val_0 = RoundToNearest(input_val_0);
74 int32x4_t casted_val_1 = RoundToNearest(input_val_1);
75
76 casted_val_0 = vaddq_s32(casted_val_0, zero_point_dup);
77 casted_val_1 = vaddq_s32(casted_val_1, zero_point_dup);
78
79 // Clamp the values to fit the target type's range.
80 casted_val_0 = vmaxq_s32(casted_val_0, min_val_dup);
81 casted_val_1 = vmaxq_s32(casted_val_1, min_val_dup);
82 casted_val_0 = vminq_s32(casted_val_0, max_val_dup);
83 casted_val_1 = vminq_s32(casted_val_1, max_val_dup);
84
85 const int16x4_t narrowed_val_0 = vmovn_s32(casted_val_0);
86 const int16x4_t narrowed_val_1 = vmovn_s32(casted_val_1);
87 const int16x8_t combined_val = vcombine_s16(narrowed_val_0, narrowed_val_1);
88 const int8x8_t combined_val_narrowed = vmovn_s16(combined_val);
89 vst1_s8(output_data + i, combined_val_narrowed);
90 }
91#endif // NEON
92
93 for (; i < flat_size; ++i)
94 {
95 const float val = input_data[i];
96 const int32_t unclamped = static_cast<int32_t>(round(val / scale)) + zero_point;
97 const int32_t clamped = std::min(std::max(unclamped, min_val), max_val);
98 output_data[i] = clamped;
99 }
100}

References MatchingFlatSize(), output_shape, and RoundToNearest().

◆ Quantize() [4/5]

template<>
void nnfw::cker::Quantize ( const Shape input_shape,
const float *  input_data,
const Shape output_shape,
uint8_t *  output_data,
const float  scale,
const int32_t  zero_point 
)
inline

Definition at line 103 of file Quantize.h.

105{
106 const int flat_size = MatchingFlatSize(input_shape, output_shape);
107 static constexpr int32_t min_val = std::numeric_limits<uint8_t>::min();
108 static constexpr int32_t max_val = std::numeric_limits<uint8_t>::max();
109
110 int i = 0;
111#ifdef USE_NEON
112 const float32x4_t reverse_scale_dup = vdupq_n_f32(1.0f / scale);
113 const int32x4_t zero_point_dup = vdupq_n_s32(zero_point);
114 const int32x4_t min_val_dup = vdupq_n_s32(min_val);
115 const int32x4_t max_val_dup = vdupq_n_s32(max_val);
116
117 for (; i <= flat_size - 8; i += 8)
118 {
119 const float *src_data_ptr = input_data + i;
120 float32x4_t input_val_0 = vld1q_f32(src_data_ptr);
121 float32x4_t input_val_1 = vld1q_f32(src_data_ptr + 4);
122
123 input_val_0 = vmulq_f32(input_val_0, reverse_scale_dup);
124 input_val_1 = vmulq_f32(input_val_1, reverse_scale_dup);
125
126 int32x4_t casted_val_0 = RoundToNearest(input_val_0);
127 int32x4_t casted_val_1 = RoundToNearest(input_val_1);
128
129 casted_val_0 = vaddq_s32(casted_val_0, zero_point_dup);
130 casted_val_1 = vaddq_s32(casted_val_1, zero_point_dup);
131
132 // Clamp the values to fit the target type's range.
133 casted_val_0 = vmaxq_s32(casted_val_0, min_val_dup);
134 casted_val_1 = vmaxq_s32(casted_val_1, min_val_dup);
135 casted_val_0 = vminq_s32(casted_val_0, max_val_dup);
136 casted_val_1 = vminq_s32(casted_val_1, max_val_dup);
137
138 const uint16x4_t narrowed_val_0 = vqmovun_s32(casted_val_0);
139 const uint16x4_t narrowed_val_1 = vqmovun_s32(casted_val_1);
140 const uint16x8_t combined_val = vcombine_u16(narrowed_val_0, narrowed_val_1);
141 const uint8x8_t combined_val_narrowed = vmovn_u16(combined_val);
142 vst1_u8(output_data + i, combined_val_narrowed);
143 }
144#endif // NEON
145
146 for (; i < flat_size; ++i)
147 {
148 const float val = input_data[i];
149 const int32_t unclamped = static_cast<int32_t>(round(val / scale)) + zero_point;
150 const int32_t clamped = std::min(std::max(unclamped, min_val), max_val);
151 output_data[i] = clamped;
152 }
153}

References MatchingFlatSize(), output_shape, and RoundToNearest().

◆ Quantize() [5/5]

template<typename InputT , typename OutputT >
void nnfw::cker::Quantize ( const Shape input_shape,
const InputT *  input_data,
const Shape output_shape,
OutputT *  output_data,
const float  output_scale,
const int32_t  output_offset 
)
inline

Definition at line 34 of file Quantize.h.

36{
37 const int flat_size = MatchingFlatSize(input_shape, output_shape);
38 int min_val = std::numeric_limits<OutputT>::min();
39 int max_val = std::numeric_limits<OutputT>::max();
40
41 for (int i = 0; i < flat_size; i++)
42 {
43 int32_t unclamped = static_cast<int32_t>(round(input_data[i] / output_scale)) + output_offset;
44 int32_t clamped = std::min(std::max(unclamped, min_val), max_val);
45 output_data[i] = clamped;
46 }
47}

References MatchingFlatSize(), and output_shape.

Referenced by onert::backend::cpu::ops::affineQuantize(), and nnfw::cker::optimized_integer_ops::depthwise_conv::DepthwiseConvGeneral().
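
The per-element arithmetic of the generic overload can be summarized by the following standalone sketch (not a library call): q = clamp(round(x / output_scale) + output_offset, type_min, type_max):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <limits>

int8_t QuantizeOne(float x, float output_scale, int32_t output_offset)
{
  const int32_t min_val = std::numeric_limits<int8_t>::min();
  const int32_t max_val = std::numeric_limits<int8_t>::max();
  const int32_t unclamped = static_cast<int32_t>(std::round(x / output_scale)) + output_offset;
  return static_cast<int8_t>(std::min(std::max(unclamped, min_val), max_val));
}

int main()
{
  std::printf("%d\n", QuantizeOne(0.5f, 0.02f, -3)); // round(25) - 3 == 22
}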

◆ QuantizeMultiplier()

void nnfw::cker::QuantizeMultiplier ( double  double_multiplier,
int32_t *  quantized_multiplier,
int *  shift 
)
inline

Definition at line 48 of file Utils.h.

49{
50 if (double_multiplier == 0.)
51 {
52 *quantized_multiplier = 0;
53 *shift = 0;
54 return;
55 }
56
57 const double q = std::frexp(double_multiplier, shift);
58 auto q_fixed = static_cast<int64_t>(round(q * (1ll << 31)));
59
60 assert(q_fixed <= (1ll << 31));
61 if (q_fixed == (1ll << 31))
62 {
63 q_fixed /= 2;
64 ++*shift;
65 }
66 assert(q_fixed <= std::numeric_limits<int32_t>::max());
67 // A shift amount smaller than -31 would cause all bits to be shifted out
68 // and thus all results would be zero. We implement that instead with
69 // q_fixed==0, so as to avoid hitting issues with right-shift
70 // operations with shift amounts greater than 31. Note that this happens
71 // roughly when abs(double_multiplier) < 2^-31 and the present handling means
72 // that we're effectively flushing tiny double_multiplier's to zero.
73 // We could conceivably handle values in the range (roughly) [32, 63]
74 // as 'denormals' i.e. (shift==0, q_fixed < 2^30). In that point of view
75 // the present handling is just doing 'flush denormals to zero'. We could
76 // reconsider and actually generate nonzero denormals if a need arises.
77 if (*shift < -31)
78 {
79 *shift = 0;
80 q_fixed = 0;
81 }
82 *quantized_multiplier = static_cast<int32_t>(q_fixed);
83}

Referenced by QuantizeMultiplierSmallerThanOneExp().
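
A worked usage sketch (the include path is an assumption; this page names Utils.h). Since 0.2 == 0.8 * 2^-2, the routine returns the Q31 representation of 0.8 together with shift = -2:

#include <cker/Utils.h> // assumed include path
#include <cstdint>

int main()
{
  int32_t quantized_multiplier = 0;
  int shift = 0;
  nnfw::cker::QuantizeMultiplier(0.2, &quantized_multiplier, &shift);
  // quantized_multiplier == round(0.8 * 2^31) == 1717986918, shift == -2
}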

◆ QuantizeMultiplierSmallerThanOneExp()

void nnfw::cker::QuantizeMultiplierSmallerThanOneExp ( double  double_multiplier,
int32_t *  quantized_multiplier,
int *  left_shift 
)
inline

Definition at line 85 of file Utils.h.

87{
88 assert(double_multiplier < 1.0);
89 assert(double_multiplier > 0.0);
90 int shift;
91 QuantizeMultiplier(double_multiplier, quantized_multiplier, &shift);
92 assert(shift <= 0);
93 *left_shift = shift;
94}

References QuantizeMultiplier().

◆ QuantizeSoftmaxOutput()

template<typename T >
int32_t nnfw::cker::QuantizeSoftmaxOutput ( float  prob_rescaled,
int32_t  zero_point 
)
inline

Definition at line 134 of file SoftMax.h.

135{
136 const int32_t prob_rnd = static_cast<int32_t>(std::round(prob_rescaled));
137 return prob_rnd + zero_point;
138}

◆ QuantizeSoftmaxOutput< uint8_t >()

template<>
int32_t nnfw::cker::QuantizeSoftmaxOutput< uint8_t > ( float  prob_rescaled,
int32_t   
)
inline

Definition at line 142 of file SoftMax.h.

143{
144 return static_cast<int32_t>(prob_rescaled + 0.5f);
145}

◆ Range()

template<typename T >
void nnfw::cker::Range ( const T *  start_data,
const T *  limit_data,
const T *  delta_data,
T *  output_data 
)
inline

Definition at line 44 of file Range.h.

45{
46 const T start_value = *start_data;
47 const T delta_value = *delta_data;
48 const T limit_value = *limit_data;
49
50 const int num_elements = GetSize<T>(start_value, limit_value, delta_value);
51 T value = start_value;
52
53 for (int i = 0; i < num_elements; ++i)
54 {
55 output_data[i] = value;
56 value += delta_value;
57 }
58}

◆ RankOneSelect()

template<typename D , typename T >
void nnfw::cker::RankOneSelect ( const Shape input_condition_shape,
const D *  input_condition_data,
const Shape input_x_shape,
const T *  input_x_data,
const Shape input_y_shape,
const T *  input_y_data,
const Shape output_shape,
T *  output_data 
)

Definition at line 45 of file Select.h.

48{
49 const int64_t outer_size = input_condition_shape.FlatSize();
50 assert(MatchingDim(input_x_shape, 0, input_y_shape, 0, output_shape, 0) == outer_size);
51 const int64_t inner_size = MatchingFlatSizeSkipDim(input_x_shape, 0, input_y_shape, output_shape);
52
53 int64_t offset = 0;
54 for (int64_t i = 0; i < outer_size; i++)
55 {
56 const T *input_data = (input_condition_data[i] != 0) ? input_x_data : input_y_data;
57 memcpy(output_data + offset, input_data + offset, inner_size * sizeof(T));
58 offset += inner_size;
59 }
60}

References nnfw::cker::Shape::FlatSize(), MatchingDim(), MatchingFlatSizeSkipDim(), offset(), and output_shape.
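
A minimal usage sketch (the include path and the Shape constructor taking a dims array are assumptions based on this page). The condition holds one entry per outer row, and whole inner rows are copied from either x or y:

#include <cker/operation/Select.h> // assumed include path
#include <cstdint>

int main()
{
  const int32_t cond_dims[1] = {2};
  const int32_t data_dims[2] = {2, 3};
  const nnfw::cker::Shape cond_shape(1, cond_dims); // assumed (count, dims) constructor
  const nnfw::cker::Shape data_shape(2, data_dims);
  const bool condition[2] = {true, false};
  const float x[6] = {1, 1, 1, 2, 2, 2};
  const float y[6] = {9, 9, 9, 8, 8, 8};
  float out[6];
  nnfw::cker::RankOneSelect(cond_shape, condition, data_shape, x, data_shape, y, data_shape, out);
  // out == {1, 1, 1, 8, 8, 8}: row 0 taken from x, row 1 from y
}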

◆ ReducedOutputOffset()

size_t nnfw::cker::ReducedOutputOffset ( const int  num_dims,
const int *  dims,
const int *  index,
const int  num_axis,
const int *  axis 
)
inline

Definition at line 420 of file Utils.h.

422{
423 if (num_dims == 0)
424 {
425 return 0;
426 }
427
428 assert(dims != nullptr);
429 assert(index != nullptr);
430
431 size_t offset = 0;
432 for (int idx = 0; idx < num_dims; ++idx)
433 {
434 // if we need to skip this axis
435 bool is_axis = false;
436 if (axis != nullptr)
437 {
438 for (int axis_idx = 0; axis_idx < num_axis; ++axis_idx)
439 {
440 if (idx == axis[axis_idx])
441 {
442 is_axis = true;
443 break;
444 }
445 }
446 }
447 if (!is_axis)
448 {
449 offset = offset * static_cast<size_t>(dims[idx]) + static_cast<size_t>(index[idx]);
450 }
451 }
452 return offset;
453}

References offset().

Referenced by ReduceImpl(), ReduceMeanImpl(), and ReduceSumQuantImpl().
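
A worked sketch (the include path is an assumption; this page names Utils.h). For a 3-D input with dims {2, 3, 4}, index {1, 2, 3} and a reduction over axis 1, the middle coordinate is skipped, so the output offset is computed over dims {2, 4} only:

#include <cker/Utils.h> // assumed include path
#include <cstddef>

int main()
{
  const int dims[3] = {2, 3, 4};
  const int index[3] = {1, 2, 3};
  const int axis[1] = {1};
  const std::size_t out_offset = nnfw::cker::ReducedOutputOffset(3, dims, index, 1, axis);   // 1 * 4 + 3 == 7
  const std::size_t in_offset = nnfw::cker::ReducedOutputOffset(3, dims, index, 0, nullptr); // (1 * 3 + 2) * 4 + 3 == 23
  (void)out_offset;
  (void)in_offset;
}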

◆ ReduceImpl()

template<typename In , typename Out >
bool nnfw::cker::ReduceImpl ( const In *  input_data,
const Shape input_shape,
const Shape ,
const int *  axis,
const int  num_axis,
int *  input_iter,
Out   reducer(const Out current, const In in),
Out *  output_data 
)
inline

Definition at line 118 of file Reduce.h.

121{
122 const auto input_dims = input_shape.DimsData();
123 const auto input_num_dims = input_shape.DimensionsCount();
124
125 // Reset input iterator.
126 if (num_axis == 1 && axis[0] == input_num_dims - 1)
127 {
128 int input_size = 1;
129 int reduce_size = 0;
130 for (int idx = 0; idx < input_num_dims - 1; idx++)
131 {
132 input_size *= input_dims[idx];
133 }
134 reduce_size = input_dims[input_num_dims - 1];
135 for (int idx = 0; idx < input_size; idx++)
136 {
137 for (int r_idx = 0; r_idx < reduce_size; r_idx++)
138 {
139 if (r_idx == 0)
140 {
141 output_data[idx] = input_data[idx * reduce_size];
142 }
143 else
144 {
145 output_data[idx] = reducer(output_data[idx], input_data[idx * reduce_size + r_idx]);
146 }
147 }
148 }
149 return true;
150 }
151
152 for (int idx = 0; idx < input_num_dims; ++idx)
153 {
154 input_iter[idx] = 0;
155 }
156 // Iterate through input_data.
157 do
158 {
159 size_t input_offset = ReducedOutputOffset(input_num_dims, input_dims, input_iter, 0, nullptr);
160 size_t output_offset =
161 ReducedOutputOffset(input_num_dims, input_dims, input_iter, num_axis, axis);
162 output_data[output_offset] = reducer(output_data[output_offset], input_data[input_offset]);
163 } while (NextIndex(input_num_dims, input_dims, input_iter));
164 return true;
165}

References nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::DimsData(), NextIndex(), and ReducedOutputOffset().
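
A usage sketch of the fast path (a single reduction over the innermost axis). The include path, the Shape constructor taking a dims array, and the explicit template arguments are assumptions based on this page:

#include <cker/operation/Reduce.h> // assumed include path
#include <cstdint>

static float SumReducer(const float current, const float in) { return current + in; }

int main()
{
  const int32_t dims[2] = {2, 3};
  const nnfw::cker::Shape input_shape(2, dims); // assumed (count, dims) constructor
  const float input[6] = {1, 2, 3, 4, 5, 6};
  int input_iter[2] = {0, 0};
  const int axis[1] = {1}; // reduce the innermost axis
  float output[2];         // the fast path seeds each row with its first element
  nnfw::cker::ReduceImpl<float, float>(input, input_shape, input_shape, axis, 1, input_iter,
                                       SumReducer, output);
  // output == {6, 15}
}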

◆ ReduceMeanImpl()

template<typename In , typename Out >
bool nnfw::cker::ReduceMeanImpl ( const In *  input_data,
const Shape input_shape,
const int *  axis,
const int  num_axis,
int *  input_iter,
Out   reducer(const Out current, const In in, int normalizer),
Out *  output_data 
)
inline

Definition at line 52 of file ReduceMean.h.

56{
57 const auto input_dims = input_shape.DimsData();
58 const auto input_num_dims = input_shape.DimensionsCount();
59 int normalizer = 1;
60 // Reset input iterator.
61 for (int idx = 0; idx < input_num_dims; ++idx)
62 {
63 input_iter[idx] = 0;
64 }
65 // Compute number of output elements
66 for (int idx = 0; idx < num_axis; ++idx)
67 {
68 normalizer *= input_dims[axis[idx]];
69 }
70 // Iterate through input_data.
71 do
72 {
73 size_t input_offset = ReducedOutputOffset(input_num_dims, input_dims, input_iter, 0, nullptr);
74 size_t output_offset =
75 ReducedOutputOffset(input_num_dims, input_dims, input_iter, num_axis, axis);
76 output_data[output_offset] =
77 reducer(output_data[output_offset], input_data[input_offset], normalizer);
78 } while (NextIndex(input_num_dims, input_dims, input_iter));
79 return true;
80}

References nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::DimsData(), NextIndex(), and ReducedOutputOffset().

◆ ReduceSumQuantImpl()

template<typename In >
size_t nnfw::cker::ReduceSumQuantImpl ( const In *  input_data,
const Shape input_shape,
const int *  axis,
const int  num_axis,
int *  input_iter,
int   reducer(const int current, const In in),
int *  temp_sum 
)
inline

Definition at line 83 of file ReduceMean.h.

86{
87 const auto input_dims = input_shape.DimsData();
88 const auto input_num_dims = input_shape.DimensionsCount();
89 size_t normalizer = 1;
90 // Reset input iterator.
91 for (int idx = 0; idx < input_num_dims; ++idx)
92 {
93 input_iter[idx] = 0;
94 }
95 // Compute number of output elements
96 for (int idx = 0; idx < num_axis; ++idx)
97 {
98 normalizer *= input_dims[axis[idx]];
99 }
100 // Iterate through input_data.
101 do
102 {
103 size_t input_offset = ReducedOutputOffset(input_num_dims, input_dims, input_iter, 0, nullptr);
104 size_t output_offset =
105 ReducedOutputOffset(input_num_dims, input_dims, input_iter, num_axis, axis);
106 temp_sum[output_offset] = reducer(temp_sum[output_offset], input_data[input_offset]);
107 } while (NextIndex(input_num_dims, input_dims, input_iter));
108 return normalizer;
109}

References nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::DimsData(), NextIndex(), and ReducedOutputOffset().

◆ ReLU()

void nnfw::cker::ReLU ( const Shape input_shape,
const float *  input_data,
const Shape output_shape,
float *  output_data 
)
inline

Definition at line 32 of file ReLU.h.

34{
35 const auto input_map = MapAsVector(input_data, input_shape);
36 auto output_map = MapAsVector(output_data, output_shape);
37 output_map = input_map.cwiseMax(0.0f);
38}

References MapAsVector(), and output_shape.

Referenced by onert::backend::cpu::ops::ElementwiseActivationLayer::configure().

◆ ReLU6()

void nnfw::cker::ReLU6 ( const Shape input_shape,
const float *  input_data,
const Shape output_shape,
float *  output_data 
)
inline

Definition at line 32 of file ReLU6.h.

34{
35 const auto input_map = MapAsVector(input_data, input_shape);
36 auto output_map = MapAsVector(output_data, output_shape);
37
38 if (output_shape != input_shape)
39 throw std::runtime_error{"cker::ReLU6: input and output shapes do not match."};
40
41 output_map = input_map.cwiseMax(0.0f).cwiseMin(6.0f);
42}

References MapAsVector(), and output_shape.

Referenced by onert::backend::cpu::ops::ElementwiseActivationLayer::configure().

◆ Requantize()

template<typename input_type , typename output_type >
void nnfw::cker::Requantize ( const input_type *  input_data,
int32_t  size,
int32_t  effective_scale_multiplier,
int32_t  effective_scale_shift,
int32_t  input_zeropoint,
int32_t  output_zeropoint,
output_type *  output_data 
)
inline

Definition at line 301 of file Quantize.h.

304{
305 assert(!"Requantize: not supported type. It shouldn't reach here.");
306 UNUSED_ALL(input_data, size, effective_scale_multiplier, effective_scale_shift, input_zeropoint,
307 output_zeropoint, output_data);
308}

References size.

◆ Requantize< int8_t, uint8_t >()

template<>
void nnfw::cker::Requantize< int8_t, uint8_t > ( const int8_t *  input_data,
int32_t  size,
int32_t  effective_scale_multiplier,
int32_t  effective_scale_shift,
int32_t  input_zeropoint,
int32_t  output_zeropoint,
uint8_t *  output_data 
)
inline

Definition at line 379 of file Quantize.h.

383{
384 static constexpr int32_t kMinOutput = std::numeric_limits<uint8_t>::min();
385 static constexpr int32_t kMaxOutput = std::numeric_limits<uint8_t>::max();
386
387 int i = 0;
388#ifdef USE_NEON
389 // Constants.
390 const int32x4_t input_zero_point_dup = vdupq_n_s32(-input_zeropoint);
391 const int32x4_t output_zero_point_dup = vdupq_n_s32(output_zeropoint);
392 const int32x4_t min_val_dup = vdupq_n_s32(kMinOutput);
393 const int32x4_t max_val_dup = vdupq_n_s32(kMaxOutput);
394
395 for (; i <= size - 16; i += 16)
396 {
397 const int8x16_t input_vec = vld1q_s8(input_data + i);
398 const int16x8_t first_half = vmovl_s8(vget_low_s8(input_vec));
399 const int16x8_t second_half = vmovl_s8(vget_high_s8(input_vec));
400 int32x4x4_t input;
401 input.val[0] = vmovl_s16(vget_low_s16(first_half));
402 input.val[1] = vmovl_s16(vget_high_s16(first_half));
403 input.val[2] = vmovl_s16(vget_low_s16(second_half));
404 input.val[3] = vmovl_s16(vget_high_s16(second_half));
405 input.val[0] = vaddq_s32(input.val[0], input_zero_point_dup);
406 input.val[1] = vaddq_s32(input.val[1], input_zero_point_dup);
407 input.val[2] = vaddq_s32(input.val[2], input_zero_point_dup);
408 input.val[3] = vaddq_s32(input.val[3], input_zero_point_dup);
409
410 int32x4x4_t result =
411 MultiplyByQuantizedMultiplier4Rows(input, effective_scale_multiplier, effective_scale_shift);
412
413 result.val[0] = vaddq_s32(result.val[0], output_zero_point_dup);
414 result.val[1] = vaddq_s32(result.val[1], output_zero_point_dup);
415 result.val[2] = vaddq_s32(result.val[2], output_zero_point_dup);
416 result.val[3] = vaddq_s32(result.val[3], output_zero_point_dup);
417 result.val[0] = vmaxq_s32(vminq_s32(result.val[0], max_val_dup), min_val_dup);
418 result.val[1] = vmaxq_s32(vminq_s32(result.val[1], max_val_dup), min_val_dup);
419 result.val[2] = vmaxq_s32(vminq_s32(result.val[2], max_val_dup), min_val_dup);
420 result.val[3] = vmaxq_s32(vminq_s32(result.val[3], max_val_dup), min_val_dup);
421
422 const uint32x4_t result_val_1_unsigned = vreinterpretq_u32_s32(result.val[0]);
423 const uint32x4_t result_val_2_unsigned = vreinterpretq_u32_s32(result.val[1]);
424 const uint32x4_t result_val_3_unsigned = vreinterpretq_u32_s32(result.val[2]);
425 const uint32x4_t result_val_4_unsigned = vreinterpretq_u32_s32(result.val[3]);
426
427 const uint16x4_t narrowed_val_1 = vqmovn_u32(result_val_1_unsigned);
428 const uint16x4_t narrowed_val_2 = vqmovn_u32(result_val_2_unsigned);
429 const uint16x4_t narrowed_val_3 = vqmovn_u32(result_val_3_unsigned);
430 const uint16x4_t narrowed_val_4 = vqmovn_u32(result_val_4_unsigned);
431 const uint16x8_t output_first_half = vcombine_u16(narrowed_val_1, narrowed_val_2);
432 const uint16x8_t output_second_half = vcombine_u16(narrowed_val_3, narrowed_val_4);
433 const uint8x8_t narrowed_first_half = vqmovn_u16(output_first_half);
434 const uint8x8_t narrowed_second_half = vqmovn_u16(output_second_half);
435 const uint8x16_t narrowed_result = vcombine_u8(narrowed_first_half, narrowed_second_half);
436 vst1q_u8(output_data + i, narrowed_result);
437 }
438
439#endif
440 for (; i < size; ++i)
441 {
442 const int32_t input = input_data[i] - input_zeropoint;
443 const int32_t output =
444 MultiplyByQuantizedMultiplier(input, effective_scale_multiplier, effective_scale_shift) +
445 output_zeropoint;
446 const int32_t clamped_output = std::max(std::min(output, kMaxOutput), kMinOutput);
447 output_data[i] = static_cast<uint8_t>(clamped_output);
448 }
449}

References MultiplyByQuantizedMultiplier(), and size.

Referenced by onert::backend::cpu::ops::QuantizeLayer::run().
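
A usage sketch for re-quantizing int8 data to uint8 when both tensors share the same scale and only the zero point shifts by 128 (include paths are assumptions based on this page). The effective multiplier and shift are produced by QuantizeMultiplier from the ratio input_scale / output_scale:

#include <cker/Utils.h>              // assumed include path (QuantizeMultiplier)
#include <cker/operation/Quantize.h> // assumed include path (Requantize)
#include <cstdint>

int main()
{
  const int8_t input[4] = {-128, -1, 0, 127};
  uint8_t output[4];
  int32_t effective_multiplier = 0;
  int effective_shift = 0;
  // effective scale = input_scale / output_scale; here both scales are equal, so it is 1.0
  nnfw::cker::QuantizeMultiplier(1.0, &effective_multiplier, &effective_shift);
  nnfw::cker::Requantize<int8_t, uint8_t>(input, 4, effective_multiplier, effective_shift,
                                          /*input_zeropoint=*/0, /*output_zeropoint=*/128, output);
  // output == {0, 127, 128, 255}
}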

◆ Requantize< uint8_t, int8_t >()

template<>
void nnfw::cker::Requantize< uint8_t, int8_t > ( const uint8_t *  input_data,
int32_t  size,
int32_t  effective_scale_multiplier,
int32_t  effective_scale_shift,
int32_t  input_zeropoint,
int32_t  output_zeropoint,
int8_t *  output_data 
)
inline

Definition at line 311 of file Quantize.h.

315{
316 static constexpr int32_t kMinOutput = std::numeric_limits<int8_t>::min();
317 static constexpr int32_t kMaxOutput = std::numeric_limits<int8_t>::max();
318
319 int i = 0;
320#ifdef USE_NEON
321 // Constants.
322 const int32x4_t input_zero_point_dup = vdupq_n_s32(-input_zeropoint);
323 const int32x4_t output_zero_point_dup = vdupq_n_s32(output_zeropoint);
324 const int32x4_t min_val_dup = vdupq_n_s32(kMinOutput);
325 const int32x4_t max_val_dup = vdupq_n_s32(kMaxOutput);
326
327 for (; i <= size - 16; i += 16)
328 {
329 const uint8x16_t input_vec = vld1q_u8(input_data + i);
330 const uint16x8_t first_half = vmovl_u8(vget_low_u8(input_vec));
331 const uint16x8_t second_half = vmovl_u8(vget_high_u8(input_vec));
332 int32x4x4_t input;
333 input.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(first_half)));
334 input.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(first_half)));
335 input.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(second_half)));
336 input.val[3] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(second_half)));
337 input.val[0] = vaddq_s32(input.val[0], input_zero_point_dup);
338 input.val[1] = vaddq_s32(input.val[1], input_zero_point_dup);
339 input.val[2] = vaddq_s32(input.val[2], input_zero_point_dup);
340 input.val[3] = vaddq_s32(input.val[3], input_zero_point_dup);
341
342 int32x4x4_t result =
343 MultiplyByQuantizedMultiplier4Rows(input, effective_scale_multiplier, effective_scale_shift);
344
345 result.val[0] = vaddq_s32(result.val[0], output_zero_point_dup);
346 result.val[1] = vaddq_s32(result.val[1], output_zero_point_dup);
347 result.val[2] = vaddq_s32(result.val[2], output_zero_point_dup);
348 result.val[3] = vaddq_s32(result.val[3], output_zero_point_dup);
349 result.val[0] = vmaxq_s32(vminq_s32(result.val[0], max_val_dup), min_val_dup);
350 result.val[1] = vmaxq_s32(vminq_s32(result.val[1], max_val_dup), min_val_dup);
351 result.val[2] = vmaxq_s32(vminq_s32(result.val[2], max_val_dup), min_val_dup);
352 result.val[3] = vmaxq_s32(vminq_s32(result.val[3], max_val_dup), min_val_dup);
353
354 const int16x4_t narrowed_val_1 = vqmovn_s32(result.val[0]);
355 const int16x4_t narrowed_val_2 = vqmovn_s32(result.val[1]);
356 const int16x4_t narrowed_val_3 = vqmovn_s32(result.val[2]);
357 const int16x4_t narrowed_val_4 = vqmovn_s32(result.val[3]);
358 const int16x8_t output_first_half = vcombine_s16(narrowed_val_1, narrowed_val_2);
359 const int16x8_t output_second_half = vcombine_s16(narrowed_val_3, narrowed_val_4);
360 const int8x8_t narrowed_first_half = vqmovn_s16(output_first_half);
361 const int8x8_t narrowed_second_half = vqmovn_s16(output_second_half);
362 const int8x16_t narrowed_result = vcombine_s8(narrowed_first_half, narrowed_second_half);
363 vst1q_s8(output_data + i, narrowed_result);
364 }
365
366#endif
367 for (; i < size; ++i)
368 {
369 const int32_t input = input_data[i] - input_zeropoint;
370 const int32_t output =
371 MultiplyByQuantizedMultiplier(input, effective_scale_multiplier, effective_scale_shift) +
372 output_zeropoint;
373 const int32_t clamped_output = std::max(std::min(output, kMaxOutput), kMinOutput);
374 output_data[i] = static_cast<int8_t>(clamped_output);
375 }
376}

References MultiplyByQuantizedMultiplier(), and size.

Referenced by onert::backend::cpu::ops::QuantizeLayer::run().

◆ ResizeBilinear() [1/3]

void nnfw::cker::ResizeBilinear ( const ResizeBilinearParams op_params,
const Shape unextended_input_shape,
const int8_t *  input_data,
const Shape unextended_output_shape,
int8_t *  output_data 
)
inline

Definition at line 285 of file ResizeBilinear.h.

288{
289 // If half_pixel_centers is True, align_corners must be False.
290 assert(!op_params.half_pixel_centers || !op_params.align_corners);
291 assert(unextended_input_shape.DimensionsCount() <= 4);
292 assert(unextended_output_shape.DimensionsCount() <= 4);
293 const Shape input_shape = Shape::ExtendedShape(4, unextended_input_shape);
294 const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape);
295
296 const int32_t batches = MatchingDim(input_shape, 0, output_shape, 0);
297 const int32_t input_height = input_shape.Dims(1);
298 const int32_t input_width = input_shape.Dims(2);
299 const int32_t depth = MatchingDim(input_shape, 3, output_shape, 3);
300
301 const int32_t output_height = op_params.output_height;
302 const int32_t output_width = op_params.output_width;
303
304 int32_t height_scale_10 = ((1 << 10) * input_height + output_height / 2) / output_height;
305 int32_t width_scale_10 = ((1 << 10) * input_width + output_width / 2) / output_width;
306 if (op_params.align_corners && output_height > 1)
307 {
308 height_scale_10 =
309 ((1 << 10) * (input_height - 1) + (output_height - 1) / 2) / (output_height - 1);
310 }
311 if (op_params.align_corners && output_width > 1)
312 {
313 width_scale_10 = ((1 << 10) * (input_width - 1) + (output_width - 1) / 2) / (output_width - 1);
314 }
315
316 for (int b = 0; b < batches; ++b)
317 {
318 for (int y = 0; y < output_height; ++y)
319 {
320 int32_t input_y, y0, y1;
321 ComputeInterpolationValues(y, height_scale_10, op_params.half_pixel_centers, input_height,
322 &input_y, &y0, &y1);
323 for (int x = 0; x < output_width; ++x)
324 {
325 int32_t input_x, x0, x1;
326 ComputeInterpolationValues(x, width_scale_10, op_params.half_pixel_centers, input_width,
327 &input_x, &x0, &x1);
328 for (int c = 0; c < depth; ++c)
329 {
330 const int64_t output_20_ll =
331 static_cast<int64_t>(input_data[Offset(input_shape, b, y0, x0, c)]) *
332 ((1 << 10) - (input_y - (1 << 10) * y0)) * ((1 << 10) - (input_x - (1 << 10) * x0));
333 const int64_t output_20_lu =
334 static_cast<int64_t>(input_data[Offset(input_shape, b, y1, x0, c)]) *
335 (input_y - (1 << 10) * y0) * ((1 << 10) - (input_x - (1 << 10) * x0));
336 const int64_t output_20_rl =
337 static_cast<int64_t>(input_data[Offset(input_shape, b, y0, x1, c)]) *
338 ((1 << 10) - (input_y - (1 << 10) * y0)) * (input_x - (1 << 10) * x0);
339 const int64_t output_20_ru =
340 static_cast<int64_t>(input_data[Offset(input_shape, b, y1, x1, c)]) *
341 (input_y - (1 << 10) * y0) * (input_x - (1 << 10) * x0);
342 const int64_t output_20 = output_20_ll + output_20_lu + output_20_rl + output_20_ru;
343 const int64_t round = (output_20 > 0) ? (1 << 19) : -(1 << 19);
344 const int8_t interpolation = static_cast<int8_t>((output_20 + round) / (1 << 20));
345 output_data[Offset(output_shape, b, y, x, c)] = interpolation;
346 }
347 }
348 }
349 }
350}

References nnfw::cker::ResizeBilinearParams::align_corners, ComputeInterpolationValues(), nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), nnfw::cker::ResizeBilinearParams::half_pixel_centers, MatchingDim(), Offset(), nnfw::cker::ResizeBilinearParams::output_height, output_shape, and nnfw::cker::ResizeBilinearParams::output_width.

◆ ResizeBilinear() [2/3]

void nnfw::cker::ResizeBilinear ( ResizeBilinearParams params,
const Shape input_shape,
const float *  input_data,
const Shape output_shape,
float *  output_data 
)

Definition at line 213 of file ResizeBilinear.h.

215{
216 int32_t batches = static_cast<int32_t>(MatchingDim(input_shape, 0, output_shape, 0));
217 int32_t input_height = input_shape.Dims(1);
218 int32_t input_width = input_shape.Dims(2);
219 int32_t depth = static_cast<int32_t>(MatchingDim(input_shape, 3, output_shape, 3));
220
221 // Specialize for 2x2 upsample.
222 if (!params.align_corners && !params.half_pixel_centers &&
223 params.output_height == 2 * input_height && params.output_width == 2 * input_width)
224 {
225 ResizeBilinear2x2(batches, input_height, input_width, depth, params.output_height,
226 params.output_width, input_shape, input_data, output_shape, output_data);
227 }
228 else
229 {
230 float height_scale = static_cast<float>(input_height) / params.output_height;
231 float width_scale = static_cast<float>(input_width) / params.output_width;
232 if (params.align_corners && params.output_height > 1)
233 {
234 height_scale = static_cast<float>(input_height - 1) / (params.output_height - 1);
235 }
236 if (params.align_corners && params.output_width > 1)
237 {
238 width_scale = static_cast<float>(input_width - 1) / (params.output_width - 1);
239 }
240
241 ResizeBilinearGeneric(batches, input_height, input_width, depth, params.output_height,
242 params.output_width, height_scale, width_scale, input_shape, input_data,
243 output_data, params.half_pixel_centers);
244 }
245}

References nnfw::cker::ResizeBilinearParams::align_corners, nnfw::cker::Shape::Dims(), nnfw::cker::ResizeBilinearParams::half_pixel_centers, MatchingDim(), nnfw::cker::ResizeBilinearParams::output_height, output_shape, nnfw::cker::ResizeBilinearParams::output_width, ResizeBilinear2x2(), and ResizeBilinearGeneric().

Referenced by onert::backend::cpu::ops::ResizeBilinearLayer::run().
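
A usage sketch of the exact-2x upsampling case, which takes the specialized ResizeBilinear2x2 path because align_corners and half_pixel_centers are both false and the output is twice the input in each spatial dimension. The include path and the Shape constructor taking a dims array are assumptions based on this page:

#include <cker/operation/ResizeBilinear.h> // assumed include path
#include <cstdint>

int main()
{
  const int32_t in_dims[4] = {1, 2, 2, 1}; // NHWC
  const int32_t out_dims[4] = {1, 4, 4, 1};
  const nnfw::cker::Shape input_shape(4, in_dims); // assumed (count, dims) constructor
  const nnfw::cker::Shape output_shape(4, out_dims);
  const float input[4] = {0.0f, 1.0f, 2.0f, 3.0f};
  float output[16];

  nnfw::cker::ResizeBilinearParams params;
  params.output_height = 4;
  params.output_width = 4;
  params.align_corners = false;
  params.half_pixel_centers = false;
  nnfw::cker::ResizeBilinear(params, input_shape, input, output_shape, output);
  // Each input pixel lands in the top-left corner of its 2x2 output block;
  // the remaining entries are averages of neighbouring pixels.
}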

◆ ResizeBilinear() [3/3]

void nnfw::cker::ResizeBilinear ( ResizeBilinearParams params,
const Shape input_shape,
const uint8_t *  input_data,
const Shape output_shape,
uint8_t *  output_data 
)

Definition at line 247 of file ResizeBilinear.h.

249{
250 int32_t batches = MatchingDim(input_shape, 0, output_shape, 0);
251 int32_t input_height = input_shape.Dims(1);
252 int32_t input_width = input_shape.Dims(2);
253 int32_t depth = MatchingDim(input_shape, 3, output_shape, 3);
254
255 float height_scale = (params.align_corners && params.output_height > 1)
256 ? (static_cast<float>(input_height - 1) / (params.output_height - 1))
257 : (static_cast<float>(input_height) / params.output_height);
258
259 float width_scale = (params.align_corners && params.output_width > 1)
260 ? (static_cast<float>(input_width - 1) / (params.output_width - 1))
261 : (static_cast<float>(input_width) / params.output_width);
262
263 ResizeBilinearGenericSmallChannel<uint8_t>(
264 batches, input_height, input_width, depth, params.output_height, params.output_width,
265 height_scale, width_scale, input_shape, input_data, output_data, params.half_pixel_centers);
266}

References nnfw::cker::ResizeBilinearParams::align_corners, nnfw::cker::Shape::Dims(), nnfw::cker::ResizeBilinearParams::half_pixel_centers, MatchingDim(), nnfw::cker::ResizeBilinearParams::output_height, output_shape, and nnfw::cker::ResizeBilinearParams::output_width.

◆ ResizeBilinear2x2()

void nnfw::cker::ResizeBilinear2x2 ( int32_t  batches,
int32_t  input_height,
int32_t  input_width,
int32_t  depth,
int32_t  output_height,
int32_t  output_width,
const Shape input_shape,
const float *  input_data,
const Shape output_shape,
float *  output_data 
)
inline

Definition at line 69 of file ResizeBilinear.h.

73{
74 for (int b = 0; b < batches; b++)
75 {
76 for (int y0 = 0, y = 0; y <= output_height - 2; y += 2, y0++)
77 {
78 for (int x0 = 0, x = 0; x <= output_width - 2; x += 2, x0++)
79 {
80 int32_t x1 = std::min(x0 + 1, input_width - 1);
81 int32_t y1 = std::min(y0 + 1, input_height - 1);
82 ResizeBilinearKernel2x2(x0, x1, y0, y1, x, y, depth, b, input_shape, input_data,
83 output_shape, output_data);
84 }
85 }
86 }
87}

References output_shape, and ResizeBilinearKernel2x2().

Referenced by ResizeBilinear().

◆ ResizeBilinearGeneric()

void nnfw::cker::ResizeBilinearGeneric ( int32_t  batches,
int32_t  input_height,
int32_t  input_width,
int32_t  depth,
int32_t  output_height,
int32_t  output_width,
float  height_scale,
float  width_scale,
const Shape input_shape,
const float *  input_data,
float *  output_data,
const bool  half_pixel_centers 
)
inline

Definition at line 118 of file ResizeBilinear.h.

123{
124 memset(output_data, 0, batches * output_height * output_width * depth * sizeof(float));
125
126 int32_t output_offset = 0;
127 for (int b = 0; b < batches; ++b)
128 {
129 for (int y = 0; y < output_height; ++y)
130 {
131 float input_y;
132 int32_t y0, y1;
133 ComputeInterpolationValues(y, height_scale, half_pixel_centers, input_height, &input_y, &y0,
134 &y1);
135 for (int x = 0; x < output_width; ++x)
136 {
137 float input_x;
138 int32_t x0, x1;
139 ComputeInterpolationValues(x, width_scale, half_pixel_centers, input_width, &input_x, &x0,
140 &x1);
141 float *output_ptr = &output_data[output_offset];
142
143 // Run kernel on the 4 corners of the bilinear resize algorithm.
144 int32_t input_offset = Offset(input_shape, b, y0, x0, 0);
145 float scale = (1 - (input_y - y0)) * (1 - (input_x - x0));
146 const float *input_ptr = &input_data[input_offset];
147 ResizeBilinearKernel(input_ptr, depth, scale, output_ptr);
148
149 input_offset = Offset(input_shape, b, y0, x1, 0);
150 scale = (1 - (input_y - y0)) * (input_x - x0);
151 input_ptr = &input_data[input_offset];
152 ResizeBilinearKernel(input_ptr, depth, scale, output_ptr);
153
154 input_offset = Offset(input_shape, b, y1, x0, 0);
155 scale = (input_y - y0) * (1 - (input_x - x0));
156 input_ptr = &input_data[input_offset];
157 ResizeBilinearKernel(input_ptr, depth, scale, output_ptr);
158
159 input_offset = Offset(input_shape, b, y1, x1, 0);
160 scale = (input_y - y0) * (input_x - x0);
161 input_ptr = &input_data[input_offset];
162 ResizeBilinearKernel(input_ptr, depth, scale, output_ptr);
163
164 output_offset += depth;
165 }
166 }
167 }
168}

References ComputeInterpolationValues(), Offset(), and ResizeBilinearKernel().

Referenced by ResizeBilinear().

◆ ResizeBilinearGenericSmallChannel()

template<typename T >
void nnfw::cker::ResizeBilinearGenericSmallChannel ( int32_t  batches,
int32_t  input_height,
int32_t  input_width,
int32_t  depth,
int32_t  output_height,
int32_t  output_width,
float  height_scale,
float  width_scale,
const Shape input_shape,
const T *  input_data,
T *  output_data,
const bool  half_pixel_centers 
)
inline

Definition at line 171 of file ResizeBilinear.h.

177{
178 T *output_ptr = &output_data[0];
179 for (int b = 0; b < batches; ++b)
180 {
181 for (int y = 0; y < output_height; ++y)
182 {
183 float input_y;
184 int32_t y0, y1;
185 ComputeInterpolationValues(y, height_scale, half_pixel_centers, input_height, &input_y, &y0,
186 &y1);
187 for (int x = 0; x < output_width; ++x)
188 {
189 float input_x;
190 int32_t x0, x1;
191 ComputeInterpolationValues(x, width_scale, half_pixel_centers, input_width, &input_x, &x0,
192 &x1);
193
194 int32_t input_offset[4] = {
195 Offset(input_shape, b, y0, x0, 0), Offset(input_shape, b, y0, x1, 0),
196 Offset(input_shape, b, y1, x0, 0), Offset(input_shape, b, y1, x1, 0)};
197 float scale[4] = {(1 - (input_y - y0)) * (1 - (input_x - x0)),
198 (1 - (input_y - y0)) * (input_x - x0),
199 (input_y - y0) * (1 - (input_x - x0)), (input_y - y0) * (input_x - x0)};
200
201 for (int d = 0; d < depth; d++)
202 {
203 const T *input_ptr = &input_data[d];
204 *output_ptr++ = static_cast<T>(
205 input_ptr[input_offset[0]] * scale[0] + input_ptr[input_offset[1]] * scale[1] +
206 input_ptr[input_offset[2]] * scale[2] + input_ptr[input_offset[3]] * scale[3]);
207 }
208 }
209 }
210 }
211}

References ComputeInterpolationValues(), and Offset().

◆ ResizeBilinearKernel()

void nnfw::cker::ResizeBilinearKernel ( const float *  input_ptr,
int32_t  depth,
float  scale,
float *  output_ptr 
)
inline

Definition at line 89 of file ResizeBilinear.h.

91{
92 for (int32_t i = 0; i < depth; i++)
93 {
94 *output_ptr += *input_ptr * scale;
95 output_ptr++;
96 input_ptr++;
97 }
98}

Referenced by ResizeBilinearGeneric().

◆ ResizeBilinearKernel2x2()

void nnfw::cker::ResizeBilinearKernel2x2 ( int32_t  x0,
int32_t  x1,
int32_t  y0,
int32_t  y1,
int32_t  x,
int32_t  y,
int32_t  depth,
int32_t  batch,
const Shape input_shape,
const float *  input_data,
const Shape output_shape,
float *  output_data 
)
inline

Definition at line 30 of file ResizeBilinear.h.

34{
35 const int32_t input_width = input_shape.Dims(2);
36 const int32_t output_width = output_shape.Dims(2);
37
38 const int32_t input_x_offset = (x1 - x0) * depth;
39 const int32_t input_y_offset = (y1 - y0) * depth * input_width;
40 const int32_t output_x_offset = depth;
41 const int32_t output_y_offset = depth * output_width;
42
43 for (int ch = 0; ch < depth; ch++)
44 {
45 const int32_t input_offset = Offset(input_shape, batch, y0, x0, ch);
46
47 float x0y0 = input_data[input_offset];
48 float x1y0 = input_data[input_offset + input_x_offset];
49 float x0y1 = input_data[input_offset + input_y_offset];
50 float x1y1 = input_data[input_offset + input_x_offset + input_y_offset];
51
52 // Top left corner.
53 const int32_t output_offset = Offset(output_shape, batch, y, x, ch);
54 output_data[output_offset] = x0y0;
55
56 // Top right corner.
57 output_data[output_offset + output_x_offset] = (x0y0 + x1y0) / 2;
58
59 // Bottom left corner.
60 float output = (x0y0 + x0y1) / 2;
61 output_data[output_offset + output_y_offset] = output;
62
63 // Bottom right corner.
64 output_data[output_offset + output_x_offset + output_y_offset] =
65 (output + ((x1y0 + x1y1) / 2)) / 2;
66 }
67}

References nnfw::cker::Shape::Dims(), Offset(), and output_shape.

Referenced by ResizeBilinear2x2().

◆ ResolveAxis()

bool nnfw::cker::ResolveAxis ( const int  num_dims,
const std::vector< int > &  axes,
int *  out_axis,
int *  out_num_axis 
)
inline

Definition at line 169 of file Reduce.h.

171{
172 auto num_axis = axes.size();
173 auto axis = axes.data();
174
175 *out_num_axis = 0; // Just in case.
176 // Short-circuit axis resolution for scalars; the axis will go unused.
177 if (num_dims == 0)
178 {
179 return true;
180 }
181 // O(n^2) is fine since out_num_axis should be really small, mostly <= 4
182 for (size_t idx = 0; idx < num_axis; ++idx)
183 {
184 // Handle negative index. A positive index 'p_idx' can be represented as a
185 // negative index 'n_idx' as: n_idx = p_idx-num_dims
186 // e.g., for num_dims=3, [0, 1, 2] is the same as [-3, -2, -1]
187 int current = axis[idx] < 0 ? (axis[idx] + num_dims) : axis[idx];
188 assert(current >= 0 && current < num_dims);
189 bool is_dup = false;
190 for (int j = 0; j < *out_num_axis; ++j)
191 {
192 if (out_axis[j] == current)
193 {
194 is_dup = true;
195 break;
196 }
197 }
198 if (!is_dup)
199 {
200 out_axis[*out_num_axis] = current;
201 *out_num_axis += 1;
202 }
203 }
204 return true;
205}

Referenced by nnfw::cker::ReduceMean::PrepareforReduce(), nnfw::cker::Reduce::QuantizedMeanOrSum(), and nnfw::cker::Reduce::ReduceGeneric().
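
A minimal usage sketch (editor's illustration; the <cker/operation/Reduce.h> include path and build setup are assumptions, not taken from this reference). It shows how a negative axis is normalized and how duplicates are dropped:

#include <cker/operation/Reduce.h> // assumed header location of ResolveAxis
#include <cassert>

int main()
{
  // For a rank-3 tensor, axes {-1, 2} both name the last dimension,
  // so only one resolved axis is produced.
  int out_axis[4];
  int out_num_axis = 0;
  const bool ok = nnfw::cker::ResolveAxis(3, {-1, 2}, out_axis, &out_num_axis);
  assert(ok);
  assert(out_num_axis == 1 && out_axis[0] == 2);
  return 0;
}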

◆ Reverse()

template<typename Scalar >
void nnfw::cker::Reverse ( int  axis,
const Shape input_shape,
const Scalar *  input_data,
const Shape ,
Scalar *  output_data 
)

Definition at line 31 of file Reverse.h.

33{
34 int outer_size = 1;
35 for (int i = 0; i < axis; ++i)
36 {
37 outer_size *= input_shape.Dims(i);
38 }
39
40 int copy_size = 1;
41 for (int i = axis + 1; i < input_shape.DimensionsCount(); ++i)
42 {
43 copy_size *= input_shape.Dims(i);
44 }
45
46 const int dims_at_axis = input_shape.Dims(axis);
47 for (int i = 0; i < outer_size; ++i)
48 {
49 for (int j = 0; j < dims_at_axis; ++j)
50 {
51 const int start_pos = (i * dims_at_axis + j) * copy_size;
52 Scalar *output_ptr = output_data + start_pos;
53 int loc = (i * dims_at_axis + dims_at_axis - j - 1) * copy_size;
54 memcpy(output_ptr, input_data + loc, copy_size * sizeof(Scalar));
55 }
56 }
57}

References nnfw::cker::Shape::DimensionsCount(), and nnfw::cker::Shape::Dims().

Referenced by nnfw::cker::BCastList< N >::BCastList().

◆ RmsNorm()

void nnfw::cker::RmsNorm ( const RmsNormParams params,
const Shape input_shape,
const float *  input_data,
const Shape gamma_shape,
const float *  gamma_data,
const Shape output_shape,
float *  output_data 
)
inline

Definition at line 32 of file RmsNorm.h.

35{
36 bool single_gamma = gamma_shape.DimensionsCount() == 1 && gamma_shape.Dims(0) == 1;
37
38 if (input_shape.DimensionsCount() == 4)
39 {
40 const int32_t batches = MatchingDim(input_shape, 0, output_shape, 0);
41 const int32_t heights = MatchingDim(input_shape, 1, output_shape, 1);
42 const int32_t widths = MatchingDim(input_shape, 2, output_shape, 2);
43 const int32_t channels = MatchingDim(input_shape, 3, output_shape, 3);
44
45 for (int32_t batch = 0; batch < batches; batch++)
46 {
47 for (int32_t height = 0; height < heights; height++)
48 {
49 for (int32_t width = 0; width < widths; width++)
50 {
51 // normalize over last-axis
52 double square_sum = 0.0f;
53 for (int32_t channel = 0; channel < channels; channel++)
54 {
55 double input_val = input_data[Offset(input_shape, batch, height, width, channel)];
56 square_sum += (input_val * input_val);
57 }
58 double rms = std::sqrt((square_sum / channels) + params.epsilon);
59 for (int32_t channel = 0; channel < channels; channel++)
60 {
61 double gamma = (single_gamma ? gamma_data[0] : gamma_data[channel]);
62 output_data[Offset(output_shape, batch, height, width, channel)] =
63 gamma * (input_data[Offset(input_shape, batch, height, width, channel)] / rms);
64 }
65 }
66 }
67 }
68 }
69 else if (input_shape.DimensionsCount() == 3)
70 {
71 const int32_t heights = MatchingDim(input_shape, 0, output_shape, 0);
72 const int32_t widths = MatchingDim(input_shape, 1, output_shape, 1);
73 const int32_t channels = MatchingDim(input_shape, 2, output_shape, 2);
74
75 for (int32_t height = 0; height < heights; height++)
76 {
77 for (int32_t width = 0; width < widths; width++)
78 {
79 // normalize over last-axis
80 double square_sum = 0.0f;
81 for (int32_t channel = 0; channel < channels; channel++)
82 {
83 double input_val = input_data[(height * widths + width) * channels + channel];
84 square_sum += (input_val * input_val);
85 }
86 double rms = std::sqrt((square_sum / channels) + params.epsilon);
87 for (int32_t channel = 0; channel < channels; channel++)
88 {
89 double gamma = (single_gamma ? gamma_data[0] : gamma_data[channel]);
90 output_data[(height * widths + width) * channels + channel] =
91 gamma * (input_data[(height * widths + width) * channels + channel] / rms);
92 }
93 }
94 }
95 }
96 else
97 {
98 throw std::runtime_error("cker::RmsNorm: Unsupported input shape");
99 }
100}

References nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), nnfw::cker::RmsNormParams::epsilon, MatchingDim(), Offset(), and output_shape.

Referenced by onert::backend::cpu::ops::RmsNormLayer::run().
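
For every spatial position the kernel applies RMS normalization over the channel (last) axis: rms = sqrt(mean(x^2) + epsilon) and out = gamma * x / rms. A standalone sketch of that per-position formula (editor's illustration with made-up values, independent of the cker API):

#include <cmath>
#include <cstdio>
#include <vector>

int main()
{
  // One position with 4 channels, mirroring the inner loops of RmsNorm().
  const std::vector<double> x = {1.0, -2.0, 3.0, -4.0};
  const std::vector<double> gamma = {0.5, 0.5, 0.5, 0.5};
  const double epsilon = 1e-6;

  double square_sum = 0.0;
  for (double v : x)
    square_sum += v * v;
  const double rms = std::sqrt(square_sum / x.size() + epsilon);

  for (size_t c = 0; c < x.size(); ++c)
    std::printf("out[%zu] = %f\n", c, gamma[c] * (x[c] / rms));
  return 0;
}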

◆ RoPE()

template<typename T >
void nnfw::cker::RoPE ( const RoPEMode  mode,
const Shape input_shape,
const T *  input_data,
const Shape sin_table_shape,
const T *  sin_table_data,
const Shape cos_table_shape,
const T *  cos_table_data,
const Shape output_shape,
T *  output_data 
)
inline

Definition at line 32 of file RoPE.h.

36{
37 if (input_shape.Dims(3) != sin_table_shape.Dims(3))
38 throw std::runtime_error("the dimension(3) of input and sin_table do not match");
39
40 if (input_shape.Dims(3) != cos_table_shape.Dims(3))
41 throw std::runtime_error("the dimension(3) of input and cos_table do not match");
42
43 const int32_t i0_n = MatchingDim(input_shape, 0, output_shape, 0);
44 const int32_t i1_n = MatchingDim(input_shape, 1, output_shape, 1);
45 const int32_t i2_n = MatchingDim(input_shape, 2, output_shape, 2);
46 const int32_t i3_n = MatchingDim(input_shape, 3, output_shape, 3);
47
48 if (i3_n % 2 != 0)
49 throw std::runtime_error("i3_n must be even number");
50
51 if (mode == RoPEMode::kGptNeox)
52 {
53 for (int32_t i0 = 0; i0 < i0_n; ++i0)
54 {
55 for (int32_t i1 = 0; i1 < i1_n; ++i1)
56 {
57 for (int32_t i2 = 0; i2 < i2_n; ++i2)
58 {
59 for (int32_t i3 = 0; i3 < i3_n / 2; ++i3)
60 {
61 const int32_t offset = Offset(input_shape, i0, i1, i2, i3);
62 const T x0 = input_data[offset];
63 const T x1 = input_data[offset + i3_n / 2];
64
65 output_data[offset] = x0 * cos_table_data[i3] - x1 * sin_table_data[i3];
66 output_data[offset + i3_n / 2] =
67 x0 * sin_table_data[i3 + i3_n / 2] + x1 * cos_table_data[i3 + i3_n / 2];
68 }
69 }
70 }
71 }
72 }
73 else
74 {
75 throw std::runtime_error("Unsupported RoPE mode");
76 }
77}

References nnfw::cker::Shape::Dims(), kGptNeox, MatchingDim(), offset(), Offset(), and output_shape.
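
In kGptNeox mode the last axis of length d is split in half and each pair (x[i], x[i + d/2]) is rotated using the sin/cos tables, exactly as in the loop above. A standalone sketch of that rotation (editor's illustration; the table angles are made up, real tables come from the model):

#include <cmath>
#include <cstdio>
#include <vector>

int main()
{
  const int d = 4; // last-axis length, must be even
  std::vector<float> x = {1.0f, 2.0f, 3.0f, 4.0f};
  std::vector<float> sin_t(d), cos_t(d), out(d);
  for (int i = 0; i < d; ++i)
  {
    const float theta = 0.1f * i; // placeholder angles
    sin_t[i] = std::sin(theta);
    cos_t[i] = std::cos(theta);
  }

  // Same indexing as the kGptNeox branch of RoPE() above.
  for (int i = 0; i < d / 2; ++i)
  {
    const float x0 = x[i];
    const float x1 = x[i + d / 2];
    out[i] = x0 * cos_t[i] - x1 * sin_t[i];
    out[i + d / 2] = x0 * sin_t[i + d / 2] + x1 * cos_t[i + d / 2];
  }

  for (int i = 0; i < d; ++i)
    std::printf("out[%d] = %f\n", i, out[i]);
  return 0;
}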

◆ Round()

void nnfw::cker::Round ( const Shape input_shape,
const float *  input_data,
const Shape output_shape,
float *  output_data 
)
inline

Definition at line 65 of file Round.h.

67{
68 const int flat_size = MatchingFlatSize(input_shape, output_shape);
69 for (int i = 0; i < flat_size; ++i)
70 {
71 // Note that this implementation matches that of TensorFlow's tf.round
72 // and corresponds to the banker's rounding method.
73 // cfenv (for fesetround) is not yet supported universally on Android, so
74 // using a work around.
75 output_data[i] = RoundToNearest(input_data[i]);
76 }
77}

References MatchingFlatSize(), output_shape, and RoundToNearest().

◆ round_nearest()

float nnfw::cker::round_nearest ( float  value)

Definition at line 29 of file ReduceMean.h.

30{
31 if (value < 0)
32 {
33 return static_cast<float>(static_cast<int>(value - 0.5f));
34 }
35 else
36 {
37 return static_cast<float>(static_cast<int>(value + 0.5f));
38 }
39}

Referenced by nnfw::cker::ReduceMean::ReduceOp().

◆ RoundToNearest()

float nnfw::cker::RoundToNearest ( float  value)
inline

Definition at line 31 of file Round.h.

32{
33 auto floor_val = std::floor(value);
34 auto diff = value - floor_val;
35 if ((diff < 0.5f) || ((diff == 0.5f) && (static_cast<int>(floor_val) % 2 == 0)))
36 {
37 return floor_val;
38 }
39 else
40 {
41 return floor_val = floor_val + 1.0f;
42 }
43}

Referenced by Quantize(), Quantize(), Quantize(), and Round().
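
A few sample inputs make the round-half-to-even ("banker's rounding") behaviour concrete: ties go to the nearest even integer, so 0.5 rounds to 0 while 1.5 and 2.5 both round to 2. Editor's sketch (the helper repeats the logic of RoundToNearest() only so the snippet is self-contained):

#include <cmath>
#include <cstdio>

static float RoundHalfToEven(float value)
{
  const float floor_val = std::floor(value);
  const float diff = value - floor_val;
  if (diff < 0.5f || (diff == 0.5f && static_cast<int>(floor_val) % 2 == 0))
    return floor_val;
  return floor_val + 1.0f;
}

int main()
{
  std::printf("%g %g %g\n", RoundHalfToEven(0.5f), // 0
              RoundHalfToEven(1.5f),               // 2
              RoundHalfToEven(2.5f));              // 2
  return 0;
}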

◆ Rsqrt()

void nnfw::cker::Rsqrt ( const Shape input_shape,
const float *  input_data,
const Shape output_shape,
float *  output_data 
)
inline

Definition at line 59 of file Elementwise.h.

61{
62 const int size = MatchingFlatSize(input_shape, output_shape);
63 for (int i = 0; i < size; i++)
64 {
65 output_data[i] = 1.f / std::sqrt(input_data[i]);
66 }
67}

References MatchingFlatSize(), output_shape, and size.

◆ Select()

template<typename D , typename T >
void nnfw::cker::Select ( const Shape input_condition_shape,
const D *  input_condition_data,
const Shape input_x_shape,
const T *  input_x_data,
const Shape input_y_shape,
const T *  input_y_data,
const Shape output_shape,
T *  output_data 
)

Definition at line 32 of file Select.h.

35{
36 const int64_t flatsize =
37 MatchingFlatSize(input_condition_shape, input_x_shape, input_y_shape, output_shape);
38 for (int64_t i = 0; i < flatsize; ++i)
39 {
40 output_data[i] = (input_condition_data[i] != 0) ? input_x_data[i] : input_y_data[i];
41 }
42}

References MatchingFlatSize(), and output_shape.

◆ Sin()

void nnfw::cker::Sin ( const Shape input_shape,
const float *  input_data,
const Shape output_shape,
float *  output_data 
)
inline

Definition at line 31 of file Elementwise.h.

33{
34 const int size = MatchingFlatSize(input_shape, output_shape);
35 for (int i = 0; i < size; i++)
36 {
37 output_data[i] = std::sin(input_data[i]);
38 }
39}

References MatchingFlatSize(), output_shape, and size.

◆ Slice() [1/2]

template<typename T >
void nnfw::cker::Slice ( const SliceParams op_params,
const Shape input_shape,
const T *  input_data,
T *  output_data 
)
inline

Definition at line 72 of file Slice.h.

74{
75 SequentialTensorWriter<T> writer(input_data, output_data);
76 return Slice(op_params, input_shape, &writer);
77}
void Slice(const SliceParams &op_params, const Shape &input_shape, SequentialTensorWriter< T > *writer)
Definition Slice.h:31

References Slice().

◆ Slice() [2/2]

template<typename T >
void nnfw::cker::Slice ( const SliceParams op_params,
const Shape input_shape,
SequentialTensorWriter< T > *  writer 
)
inline

Definition at line 31 of file Slice.h.

33{
34 // TODO(dkalenichenko): This op only supports 4D tensors or smaller.
35 assert(op_params.begin_count <= 4);
36 assert(op_params.size_count <= 4);
37
38 const int begin_count = op_params.begin_count;
39 const int size_count = op_params.size_count;
40 // We front-pad the begin and size vectors.
41 const int start_b = 4 - begin_count > 0 ? 0 : op_params.begin[0];
42 const int stop_b = (4 - size_count > 0 || op_params.size[0] == -1) ? input_shape.Dims(0)
43 : start_b + op_params.size[0];
44 const int start_h = begin_count < 3 ? 0 : op_params.begin[begin_count - 3];
45 const int stop_h = (size_count < 3 || op_params.size[size_count - 3] == -1)
46 ? input_shape.Dims(1)
47 : start_h + op_params.size[size_count - 3];
48 const int start_w = begin_count < 2 ? 0 : op_params.begin[begin_count - 2];
49 const int stop_w = (size_count < 2 || op_params.size[size_count - 2] == -1)
50 ? input_shape.Dims(2)
51 : start_w + op_params.size[size_count - 2];
52 const int start_d = begin_count < 1 ? 0 : op_params.begin[begin_count - 1];
53 const int stop_d = (size_count < 1 || op_params.size[size_count - 1] == -1)
54 ? input_shape.Dims(3)
55 : start_d + op_params.size[size_count - 1];
56
57 for (int in_b = start_b; in_b < stop_b; ++in_b)
58 {
59 for (int in_h = start_h; in_h < stop_h; ++in_h)
60 {
61 for (int in_w = start_w; in_w < stop_w; ++in_w)
62 {
63 const int len = stop_d - start_d;
64 if (len > 0)
65 writer->WriteN(Offset(input_shape, in_b, in_h, in_w, start_d), len);
66 }
67 }
68 }
69}
void WriteN(int position, int len)
Definition Utils.h:475
int8_t size_count
Definition Slice.cpp:34
int8_t begin_count
Definition Slice.cpp:32

References nnfw::cker::SliceParams::begin, nnfw::cker::SliceParams::begin_count, begin_count, nnfw::cker::Shape::Dims(), Offset(), nnfw::cker::SliceParams::size, nnfw::cker::SliceParams::size_count, size_count, and nnfw::cker::SequentialTensorWriter< T >::WriteN().

Referenced by Slice().

◆ Softmax() [1/3]

void nnfw::cker::Softmax ( const float *  in,
const int  input_size,
const int  batch_size,
const float  beta,
float *  out 
)
inline

Definition at line 79 of file SoftMax.h.

81{
82 assert(input_size > 0);
83
84 // For each batch
85 for (int b = 0; b < batch_size; b++)
86 {
87 // Find the max coeff.
88 float max_coeff = in[0];
89 for (int i = 1; i < input_size; i++)
90 {
91 if (in[i] > max_coeff)
92 max_coeff = in[i];
93 }
94
95 // Compute the normalized sum of exps.
96 float exp_sum = 0.0;
97 for (int i = 0; i < input_size; i++)
98 {
99 out[i] = std::exp((in[i] - max_coeff) * beta);
100 exp_sum += out[i];
101 }
102
103 // Divide by the sum of exps.
104 float reciprocal_sum_exp = 1.f / exp_sum;
105 for (int i = 0; i < input_size; i++)
106 {
107 out[i] *= reciprocal_sum_exp;
108 }
109
110 // Advance in and out pointers for the next batch.
111 in += input_size;
112 out += input_size;
113 }
114}

Referenced by onert::backend::cpu::ops::SoftMaxLayer::softmaxFloat32().
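
A minimal call sketch for this pointer-based overload (editor's illustration; the <cker/operation/SoftMax.h> include path is an assumption). Each batch row of `in` is normalized independently:

#include <cker/operation/SoftMax.h> // assumed header location of Softmax
#include <cstdio>

int main()
{
  const float in[2 * 3] = {1.0f, 2.0f, 3.0f,  // batch 0
                           0.0f, 0.0f, 0.0f}; // batch 1
  float out[2 * 3];
  nnfw::cker::Softmax(in, /*input_size=*/3, /*batch_size=*/2, /*beta=*/1.0f, out);

  for (int i = 0; i < 6; ++i)
    std::printf("%f ", out[i]); // each group of 3 values sums to 1
  std::printf("\n");
  return 0;
}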

◆ Softmax() [2/3]

void nnfw::cker::Softmax ( const SoftmaxParams params,
const Shape input_shape,
const float *  input_data,
const Shape output_shape,
float *  output_data 
)
inline

Definition at line 116 of file SoftMax.h.

118{
119 // Validate that the shapes of input and output are the same
120 MatchingFlatSize(input_shape, output_shape);
121
122 const auto in_mat = MapAsMatrixWithLastDimAsRows(input_data, input_shape);
123 auto out_mat = MapAsMatrixWithLastDimAsRows(output_data, output_shape);
124 // Compute the exponential first, removing the max coefficient for numerical
125 // stability.
126 out_mat = (in_mat.rowwise() - in_mat.colwise().maxCoeff()).array() * params.beta;
127 // We are separating out the exp function so that exp can be vectorized.
128 out_mat = out_mat.array().exp();
129 // Normalize to get the activations.
130 Eigen::Array<float, 1, Eigen::Dynamic> scale = out_mat.array().colwise().sum().inverse();
131 out_mat.array().rowwise() *= scale;
132}

References nnfw::cker::SoftmaxParams::beta, MapAsMatrixWithLastDimAsRows(), MatchingFlatSize(), and output_shape.

◆ Softmax() [3/3]

template<typename In , typename Out >
void nnfw::cker::Softmax ( const SoftmaxParams params,
const Shape input_shape,
const In *  input_data,
const Shape output_shape,
Out *  output_data 
)
inline

Definition at line 159 of file SoftMax.h.

161{
162 const int trailing_dim = input_shape.DimensionsCount() - 1;
163 const int excluding_last_dim = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
164 const int last_dim = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
165
166 const int32_t clamp_max = std::numeric_limits<Out>::max();
167 const int32_t clamp_min = std::numeric_limits<Out>::min();
168 for (int i = 0; i < excluding_last_dim; ++i)
169 {
170 int32_t max_val = std::numeric_limits<In>::min();
171 // Find max quantized value.
172 for (int j = 0; j < last_dim; ++j)
173 {
174 max_val = std::max(max_val, static_cast<int32_t>(input_data[j]));
175 }
176
177 float sum_exp = 0.0f;
178 const int32_t max_uint8 = std::numeric_limits<uint8_t>::max();
179 const float *table_offset = &params.table[max_uint8 - max_val];
180 // Calculate normalizer sum(exp(x)).
181 for (int j = 0; j < last_dim; ++j)
182 {
183 sum_exp += table_offset[input_data[j]];
184 }
185
186 const float inv_sum_exp = 1.0f / (sum_exp * params.scale);
187 // Normalize and quantize probabilities.
188 for (int j = 0; j < last_dim; ++j)
189 {
190 const float prob_rescaled = table_offset[input_data[j]] * inv_sum_exp;
191 const int32_t prob_quantized = QuantizeSoftmaxOutput<Out>(prob_rescaled, params.zero_point);
192 output_data[j] = static_cast<Out>(std::max(std::min(clamp_max, prob_quantized), clamp_min));
193 }
194 input_data += last_dim;
195 output_data += last_dim;
196 }
197}

References nnfw::cker::Shape::DimensionsCount(), MatchingDim(), MatchingFlatSizeSkipDim(), output_shape, nnfw::cker::SoftmaxParams::scale, nnfw::cker::SoftmaxParams::table, and nnfw::cker::SoftmaxParams::zero_point.

◆ SpaceToBatchND()

template<typename T >
void nnfw::cker::SpaceToBatchND ( const SpaceToBatchParams params,
const Shape unextended_input_shape,
const T *  input_data,
const Shape unextended_block_shape_shape,
const int32_t *  block_shape_data,
const Shape unextended_padding_shape,
const int32_t *  paddings_data,
const Shape unextended_output_shape,
T *  output_data 
)
inline

Definition at line 31 of file SpaceToBatchND.h.

36{
37 assert(unextended_input_shape.DimensionsCount() <= 4);
38 assert(unextended_output_shape.DimensionsCount() <= 4);
39 const Shape input_shape = Shape::ExtendedShape(4, unextended_input_shape);
40 const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape);
41
42 const int depth = input_shape.Dims(3);
43 const int input_width = input_shape.Dims(2);
44 const int input_height = input_shape.Dims(1);
45 const int input_batch_size = input_shape.Dims(0);
46
47 const int output_width = output_shape.Dims(2);
48 const int output_height = output_shape.Dims(1);
49 const int output_batch_size = output_shape.Dims(0);
50
51 const int block_shape_height = block_shape_data[0];
52 const int block_shape_width = block_shape_data[1];
53 const int padding_top = paddings_data[0];
54 const int padding_left = paddings_data[2];
55
56 // For uint8 quantized, the correct padding "zero value" is the output offset.
57 const int32_t pad_value = params.output_offset;
58
59 for (int out_b = 0; out_b < output_batch_size; ++out_b)
60 {
61 int input_batch = out_b % input_batch_size;
62 int shift_w = (out_b / input_batch_size) % block_shape_width;
63 int shift_h = (out_b / input_batch_size) / block_shape_width;
64 for (int out_h = 0; out_h < output_height; ++out_h)
65 {
66 for (int out_w = 0; out_w < output_width; ++out_w)
67 {
68 T *out = output_data + Offset(output_shape, out_b, out_h, out_w, 0);
69 if (out_h * block_shape_height + shift_h < padding_top ||
70 out_h * block_shape_height + shift_h >= padding_top + input_height ||
71 out_w * block_shape_width + shift_w < padding_left ||
72 out_w * block_shape_width + shift_w >= padding_left + input_width)
73 {
74 // This may not execute correctly when pad_value != 0 and T != uint8.
75 memset(out, pad_value, depth * sizeof(T));
76 }
77 else
78 {
79 const T *in =
80 input_data + Offset(input_shape, input_batch,
81 (out_h * block_shape_height + shift_h) - padding_top,
82 (out_w * block_shape_width + shift_w) - padding_left, 0);
83 memcpy(out, in, depth * sizeof(T));
84 }
85 }
86 }
87 }
88}

References nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), Offset(), nnfw::cker::SpaceToBatchParams::output_offset, and output_shape.

◆ SpaceToDepth()

template<typename T >
void nnfw::cker::SpaceToDepth ( const SpaceToDepthParams params,
const Shape unextended_input_shape,
const T *  input_data,
const Shape unextended_output_shape,
T *  output_data 
)
inline

Definition at line 30 of file SpaceToDepth.h.

32{
33 assert(unextended_input_shape.DimensionsCount() <= 4);
34 assert(unextended_output_shape.DimensionsCount() <= 4);
35 const Shape input_shape = Shape::ExtendedShape(4, unextended_input_shape);
36 const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape);
37
38 const int output_depth = output_shape.Dims(3);
39 const int output_width = output_shape.Dims(2);
40 const int output_height = output_shape.Dims(1);
41
42 const int input_depth = input_shape.Dims(3);
43 const int batch_size = input_shape.Dims(0);
44
45 // Number of contiguous values that we can copy in one iteration.
46 const int stride = params.block_size * input_depth;
47
48 for (int batch = 0; batch < batch_size; ++batch)
49 {
50 for (int out_h = 0; out_h < output_height; ++out_h)
51 {
52 T *output_ptr = output_data + Offset(output_shape, batch, out_h, 0, 0);
53 for (int offset_h = 0; offset_h < params.block_size; ++offset_h)
54 {
55 T *dst = output_ptr;
56 for (int out_w = 0; out_w < output_width; ++out_w)
57 {
58 memcpy(dst, input_data, stride * sizeof(T));
59 input_data += stride;
60 dst += output_depth;
61 }
62 output_ptr += stride;
63 }
64 }
65 }
66}

References nnfw::cker::SpaceToDepthParams::block_size, nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), Offset(), and output_shape.

◆ Split()

template<typename Scalar >
void nnfw::cker::Split ( const SplitParams params,
const Shape input_shape,
const Scalar *  input_data,
const Shape output_shape,
Scalar *const *  output_data 
)

Definition at line 30 of file Split.h.

32{
33 const int split_dimensions = input_shape.DimensionsCount();
34 int axis = params.axis < 0 ? params.axis + split_dimensions : params.axis;
35 int outputs_count = params.num_split;
36
37 int64_t outer_size = 1;
38 for (int i = 0; i < axis; ++i)
39 {
40 outer_size *= input_shape.Dims(i);
41 }
42 // For all output arrays,
43 // FlatSize() = outer_size * Dims(axis) * base_inner_size;
44 int64_t base_inner_size = 1;
45 for (int i = axis + 1; i < split_dimensions; ++i)
46 {
47 base_inner_size *= input_shape.Dims(i);
48 }
49
50 const Scalar *input_ptr = input_data;
51 for (int k = 0; k < outer_size; k++)
52 {
53 for (int i = 0; i < outputs_count; ++i)
54 {
55 const int copy_size = output_shape.Dims(axis) * base_inner_size;
56 memcpy(output_data[i] + k * copy_size, input_ptr, copy_size * sizeof(Scalar));
57 input_ptr += copy_size;
58 }
59 }
60}

References nnfw::cker::SplitParams::axis, nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), nnfw::cker::SplitParams::num_split, and output_shape.
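
A usage sketch (editor's illustration; the <cker/operation/Split.h> include path is an assumption, and Shape/SplitParams are presumed to be pulled in by it). The documented Shape::ReplaceWith() is used to build the shapes:

#include <cker/operation/Split.h> // assumed header location of Split
#include <cstdint>

int main()
{
  // Split a [2, 4] tensor into two [2, 2] tensors along axis 1.
  const int32_t in_dims[] = {2, 4};
  const int32_t out_dims[] = {2, 2};
  nnfw::cker::Shape input_shape, output_shape;
  input_shape.ReplaceWith(2, in_dims);
  output_shape.ReplaceWith(2, out_dims);

  nnfw::cker::SplitParams params{};
  params.axis = 1;
  params.num_split = 2;

  const float input[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  float out0[4], out1[4];
  float *outputs[] = {out0, out1};

  nnfw::cker::Split(params, input_shape, input, output_shape, outputs);
  // out0 = {0, 1, 4, 5}, out1 = {2, 3, 6, 7}
  return 0;
}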

◆ SplitV()

template<typename Scalar >
void nnfw::cker::SplitV ( const SplitVParams params,
const Shape input_shape,
const Scalar *  input_data,
std::vector< nnfw::cker::Shape > &  output_shapes,
Scalar *const *  output_data 
)

Definition at line 30 of file SplitV.h.

32{
33 const int split_dimensions = input_shape.DimensionsCount();
34 int axis = params.axis < 0 ? params.axis + split_dimensions : params.axis;
35 int outputs_count = params.num_split;
36
37 for (int i = 0; i < outputs_count; i++)
38 {
39 // TFLITE_DCHECK_EQ(output_shapes[i]->DimensionsCount(), split_dimensions);
40 for (int j = 0; j < split_dimensions; j++)
41 {
42 if (j != axis)
43 {
44 MatchingDim(output_shapes[i], j, input_shape, j);
45 }
46 }
47 }
48
49 int64_t outer_size = 1;
50 for (int i = 0; i < axis; ++i)
51 {
52 outer_size *= input_shape.Dims(i);
53 }
54 // For all output arrays,
55 // FlatSize() = outer_size * Dims(axis) * base_inner_size;
56 int64_t base_inner_size = 1;
57 for (int i = axis + 1; i < split_dimensions; ++i)
58 {
59 base_inner_size *= input_shape.Dims(i);
60 }
61
62 const Scalar *input_ptr = input_data;
63 int copy_size = 0;
64 for (int k = 0; k < outer_size; k++)
65 {
66 for (int i = 0; i < outputs_count; ++i)
67 {
68 copy_size = output_shapes[i].Dims(axis) * base_inner_size;
69 memcpy(output_data[i] + k * copy_size, input_ptr, copy_size * sizeof(Scalar));
70 input_ptr += copy_size;
71 }
72 }
73}

References nnfw::cker::SplitVParams::axis, nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), MatchingDim(), and nnfw::cker::SplitVParams::num_split.

◆ SqDiff()

template<typename T >
void nnfw::cker::SqDiff ( const Shape input1_shape,
const T *  input1_data,
const Shape input2_shape,
const T *  input2_data,
const Shape output_shape,
T *  output_data 
)

Definition at line 63 of file SqDiff.h.

65{
66 assert(input1_shape.DimensionsCount() > 0 && input2_shape.DimensionsCount() > 0 &&
67 output_shape.DimensionsCount() > 0);
68 int outRank = output_shape.DimensionsCount();
69
70 switch (outRank)
71 {
72 case 4:
73 SQDIFF(4);
74 break;
75
76 case 3:
77 SQDIFF(3);
78 break;
79
80 case 2:
81 SQDIFF(2);
82 break;
83
84 case 1:
85 SQDIFF(1);
86 break;
87
88 default:
89 throw std::runtime_error("Support up to 4-D tensors at present");
90 break;
91 }
92}
#define SQDIFF(N)
Definition SqDiff.h:29

References nnfw::cker::Shape::DimensionsCount(), output_shape, and SQDIFF.

Referenced by onert::backend::cpu::ops::SqDiffLayer::SqDiffFloat32().

◆ SqDiffImpl()

template<typename T , int N>
void nnfw::cker::SqDiffImpl ( const Shape input1_shape,
const T *  input1_data,
const Shape input2_shape,
const T *  input2_data,
const Shape output_shape,
T *  output_data,
NdArrayDesc< N > *  desc1_in,
NdArrayDesc< N > *  desc2_in,
NdArrayDesc< N > *  desc_out 
)

Definition at line 40 of file SqDiff.h.

43{
44 std::vector<int> input_iter;
45 input_iter.resize(N);
46 const auto output_dims = output_shape.DimsData();
47
48 // Copy dims to desc, calculating strides.
49 CopyDimsToDesc<N>(output_shape, desc_out);
50 NdArrayDescsForElementwiseBroadcast<N>(input1_shape, input2_shape, desc1_in, desc2_in);
51
52 do
53 {
54 int input1_indx = SubscriptToIndexGeneric(desc1_in, input_iter.data());
55 int input2_indx = SubscriptToIndexGeneric(desc2_in, input_iter.data());
56 int output_indx = SubscriptToIndexGeneric(desc_out, input_iter.data());
57 output_data[output_indx] = (input1_data[input1_indx] - input2_data[input2_indx]) *
58 (input1_data[input1_indx] - input2_data[input2_indx]);
59 } while (NextIndex(N, output_dims, input_iter.data()));
60}
int SubscriptToIndexGeneric(const NdArrayDesc< N > *desc, int *iter)
Definition Utils.h:264

References NextIndex(), output_shape, and SubscriptToIndexGeneric().

◆ Sqrt()

void nnfw::cker::Sqrt ( const Shape input_shape,
const float *  input_data,
const Shape output_shape,
float *  output_data 
)
inline

Definition at line 101 of file Elementwise.h.

103{
104 const int flat_size = MatchingFlatSize(input_shape, output_shape);
105
106 for (int i = 0; i < flat_size; i++)
107 {
108 output_data[i] = std::sqrt(input_data[i]);
109 }
110}

References MatchingFlatSize(), and output_shape.

◆ Square()

void nnfw::cker::Square ( const Shape input_shape,
const float *  input_data,
const Shape output_shape,
float *  output_data 
)
inline

Definition at line 112 of file Elementwise.h.

114{
115 const int flat_size = MatchingFlatSize(input_shape, output_shape);
116
117 for (int i = 0; i < flat_size; i++)
118 {
119 output_data[i] = input_data[i] * input_data[i];
120 }
121}

References MatchingFlatSize(), and output_shape.

◆ StartForAxis()

int nnfw::cker::StartForAxis ( const StridedSliceParams params,
const Shape input_shape,
int  axis 
)
inline

Definition at line 83 of file StridedSlice.h.

84{
85 const auto begin_mask = params.begin_mask;
86 const auto *start_indices = params.start_indices;
87 const auto *strides = params.strides;
88 // Begin with the specified index.
89 int start = start_indices[axis];
90
91 // begin_mask override
92 if (begin_mask & 1 << axis)
93 {
94 if (strides[axis] > 0)
95 {
96 // Forward iteration - use the first element. These values will get
97 // clamped below (Note: We could have set them to 0 and axis_size-1, but
98 // use lowest() and max() to maintain symmetry with StopForAxis())
99 start = std::numeric_limits<int>::lowest();
100 }
101 else
102 {
103 // Backward iteration - use the last element.
104 start = std::numeric_limits<int>::max();
105 }
106 }
107
108 // Handle negative indices
109 int axis_size = input_shape.Dims(axis);
110 if (start < 0)
111 {
112 start += axis_size;
113 }
114
115 // Clamping
116 start = Clamp(start, 0, axis_size - 1);
117
118 return start;
119}
int Clamp(const int v, const int lo, const int hi)

References nnfw::cker::StridedSliceParams::begin_mask, Clamp(), nnfw::cker::Shape::Dims(), nnfw::cker::StridedSliceParams::start_indices, and nnfw::cker::StridedSliceParams::strides.

Referenced by checkOutputSize(), and StridedSlice().

◆ StatelessRandomUniform()

void nnfw::cker::StatelessRandomUniform ( const Shape shape_shape,
const int32_t *  shape_data,
const Shape seed_shape,
const int32_t *  seed_data,
const Shape output_shape,
float *  output_data 
)
inline

Definition at line 75 of file StatelessRandomUniform.h.

78{
79 Tensor shape_t;
80 Tensor seed_t;
81
82 shape_t.shape.ReplaceWith(shape_shape.DimensionsCount(), shape_shape.DimsData());
83 shape_t.buffer = (void *)shape_data;
84
85 seed_t.shape.ReplaceWith(seed_shape.DimensionsCount(), seed_shape.DimsData());
86 seed_t.buffer = (void *)seed_data;
87
88 Tensor output_t;
89 output_t.shape.ReplaceWith(output_shape.DimensionsCount(), output_shape.DimsData());
90 output_t.buffer = output_data;
91
92 random::PhiloxRandom::Key key;
93 random::PhiloxRandom::ResultType counter;
94
95 GenerateKey(seed_t, &key, &counter);
96
97 Fill<Eigen::ThreadPoolDevice, random::UniformDistribution<random::PhiloxRandom, float>>(
98 random::PhiloxRandom(counter, key), &output_t);
99}
void ReplaceWith(int dimensions_count, const int32_t *dims_data)
Definition Shape.h:130
void GenerateKey(Tensor seed, random::PhiloxRandom::Key *out_key, random::PhiloxRandom::ResultType *out_counter)

References nnfw::cker::Tensor::buffer, nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::DimsData(), GenerateKey(), output_shape, nnfw::cker::Shape::ReplaceWith(), and nnfw::cker::Tensor::shape.

Referenced by onert::backend::cpu::ops::StatelessRandomUniformLayer::StatelessRandomUniformFloat32().

◆ StopForAxis()

int nnfw::cker::StopForAxis ( const StridedSliceParams params,
const Shape input_shape,
int  axis,
int  start_for_axis 
)
inline

Definition at line 126 of file StridedSlice.h.

128{
129 const auto end_mask = params.end_mask;
130 const auto shrink_axis_mask = params.shrink_axis_mask;
131 const auto *stop_indices = params.stop_indices;
132 const auto *strides = params.strides;
133
134 // Begin with the specified index
135 const bool shrink_axis = shrink_axis_mask & (1 << axis);
136 int stop = stop_indices[axis];
137
138 // When shrinking an axis, the end position does not matter (and can be
139 // incorrect when negative indexing is used, see Issue #19260). Always use
140 // start_for_axis + 1 to generate a length 1 slice, since start_for_axis has
141 // already been adjusted for negative indices.
142 if (shrink_axis)
143 {
144 stop = start_for_axis + 1;
145 }
146
147 // end_mask override
148 if (end_mask & (1 << axis))
149 {
150 if (strides[axis] > 0)
151 {
152 // Forward iteration - use the last element. These values will get
153 // clamped below
154 stop = std::numeric_limits<int>::max();
155 }
156 else
157 {
158 // Backward iteration - use the first element.
159 stop = std::numeric_limits<int>::lowest();
160 }
161 }
162
163 // Handle negative indices
164 const int axis_size = input_shape.Dims(axis);
165 if (stop < 0)
166 {
167 stop += axis_size;
168 }
169
170 // Clamping
171 // Because the end index points one past the last element, we need slightly
172 // different clamping ranges depending on the direction.
173 if (strides[axis] > 0)
174 {
175 // Forward iteration
176 stop = Clamp(stop, 0, axis_size);
177 }
178 else
179 {
180 // Backward iteration
181 stop = Clamp(stop, -1, axis_size - 1);
182 }
183
184 return stop;
185}

References Clamp(), nnfw::cker::Shape::Dims(), nnfw::cker::StridedSliceParams::end_mask, nnfw::cker::StridedSliceParams::shrink_axis_mask, nnfw::cker::StridedSliceParams::stop_indices, and nnfw::cker::StridedSliceParams::strides.

Referenced by checkOutputSize(), and StridedSlice().
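
Worked numbers for the negative-index and clamping rules used by StartForAxis() and StopForAxis(), for a forward slice (stride > 0) over an axis of size 5. Editor's sketch that reproduces just that arithmetic:

#include <algorithm>
#include <cstdio>

int main()
{
  const int axis_size = 5;

  int start = -2;                                       // user-specified begin index
  if (start < 0) start += axis_size;                    // -2 -> 3
  start = std::min(std::max(start, 0), axis_size - 1);  // clamp to [0, axis_size - 1]

  int stop = -1;                                        // user-specified end index
  if (stop < 0) stop += axis_size;                      // -1 -> 4
  stop = std::min(std::max(stop, 0), axis_size);        // clamp to [0, axis_size]

  std::printf("start=%d stop=%d\n", start, stop);       // start=3 stop=4
  return 0;
}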

◆ StridedSlice()

template<typename T >
void nnfw::cker::StridedSlice ( const StridedSliceParams op_params,
const Shape unextended_input_shape,
const T *  input_data,
const Shape unextended_output_shape,
T *  output_data 
)
inline

Definition at line 258 of file StridedSlice.h.

260{
261 assert(unextended_input_shape.DimensionsCount() <= 4);
262 assert(unextended_output_shape.DimensionsCount() <= 4);
263
264 bool optimize = true;
265 int st_count = op_params.strides_count;
266 for (int idx = 0; idx < st_count - 1; idx++)
267 {
268 const int axis_size = unextended_input_shape.Dims(idx);
269 const int start = StartForAxis(op_params, unextended_input_shape, idx);
270 const int stop = StopForAxis(op_params, unextended_input_shape, idx, start);
271 if ((axis_size != 1) && (start != 0 || stop != 0))
272 {
273 optimize = false;
274 break;
275 }
276 }
277
278 if (optimize)
279 {
280 if (op_params.strides[st_count - 1] == 1)
281 {
282 const int start = StartForAxis(op_params, unextended_input_shape, st_count - 1);
283 const int end = StopForAxis(op_params, unextended_input_shape, st_count - 1, start);
284
285 for (int idx = 0; idx < end - start; idx++)
286 {
287 output_data[idx] = input_data[idx + start];
288 }
289 return;
290 }
291 }
292
293 // Note that the output_shape is not used herein.
294 StridedSliceParams params_copy = op_params;
295
296 const Shape input_shape = Shape::ExtendedShape(4, unextended_input_shape);
297 const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape);
298
299 // Reverse and pad to 4 dimensions because that is what the runtime code
300 // requires (ie. all shapes must be 4D and are given backwards).
301 StridedSlicePadIndices(&params_copy, 4);
302
303 const int start_b = StartForAxis(params_copy, input_shape, 0);
304 const int stop_b = StopForAxis(params_copy, input_shape, 0, start_b);
305 const int start_h = StartForAxis(params_copy, input_shape, 1);
306 const int stop_h = StopForAxis(params_copy, input_shape, 1, start_h);
307 const int start_w = StartForAxis(params_copy, input_shape, 2);
308 const int stop_w = StopForAxis(params_copy, input_shape, 2, start_w);
309 const int start_d = StartForAxis(params_copy, input_shape, 3);
310 const int stop_d = StopForAxis(params_copy, input_shape, 3, start_d);
311
312 T *out_ptr = output_data;
313 for (int in_b = start_b; !LoopCondition(in_b, stop_b, params_copy.strides[0]);
314 in_b += params_copy.strides[0])
315 {
316 for (int in_h = start_h; !LoopCondition(in_h, stop_h, params_copy.strides[1]);
317 in_h += params_copy.strides[1])
318 {
319 for (int in_w = start_w; !LoopCondition(in_w, stop_w, params_copy.strides[2]);
320 in_w += params_copy.strides[2])
321 {
322 for (int in_d = start_d; !LoopCondition(in_d, stop_d, params_copy.strides[3]);
323 in_d += params_copy.strides[3])
324 {
325 *out_ptr++ = input_data[Offset(input_shape, in_b, in_h, in_w, in_d)];
326 }
327 }
328 }
329 }
330}
bool LoopCondition(int index, int stop, int stride)
void StridedSlicePadIndices(StridedSliceParams *p, int dim_count)

References nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), end(), LoopCondition(), Offset(), output_shape, StartForAxis(), StopForAxis(), StridedSlicePadIndices(), nnfw::cker::StridedSliceParams::strides, and nnfw::cker::StridedSliceParams::strides_count.

◆ StridedSlicePadIndices()

void nnfw::cker::StridedSlicePadIndices ( StridedSliceParams p,
int  dim_count 
)
inline

Definition at line 42 of file StridedSlice.h.

43{
44 // Add indices and mask bits to fully include extra dimensions
45 assert(dim_count <= 4);
46 assert(dim_count >= p->start_indices_count);
47 assert(p->start_indices_count == p->stop_indices_count);
48 assert(p->stop_indices_count == p->strides_count);
49
50 const int pad_count = dim_count - p->start_indices_count;
51
52 // Pad indices at start, so move arrays by pad_count.
53 for (int i = p->start_indices_count - 1; i >= 0; --i)
54 {
55 p->strides[i + pad_count] = p->strides[i];
56 p->start_indices[i + pad_count] = p->start_indices[i];
57 p->stop_indices[i + pad_count] = p->stop_indices[i];
58 }
59 for (int i = 0; i < pad_count; ++i)
60 {
61 p->start_indices[i] = 0;
62 p->stop_indices[i] = 1;
63 p->strides[i] = 1;
64 }
65
66 // Pad masks with 0s or 1s as required.
67 p->shrink_axis_mask <<= pad_count;
68 p->ellipsis_mask <<= pad_count;
69 p->new_axis_mask <<= pad_count;
70 p->begin_mask <<= pad_count;
71 p->end_mask <<= pad_count;
72 p->begin_mask |= (1 << pad_count) - 1;
73 p->end_mask |= (1 << pad_count) - 1;
74
75 p->start_indices_count = dim_count;
76 p->stop_indices_count = dim_count;
77 p->strides_count = dim_count;
78}

References nnfw::cker::StridedSliceParams::begin_mask, nnfw::cker::StridedSliceParams::ellipsis_mask, nnfw::cker::StridedSliceParams::end_mask, nnfw::cker::StridedSliceParams::new_axis_mask, nnfw::cker::StridedSliceParams::shrink_axis_mask, nnfw::cker::StridedSliceParams::start_indices, nnfw::cker::StridedSliceParams::start_indices_count, nnfw::cker::StridedSliceParams::stop_indices, nnfw::cker::StridedSliceParams::stop_indices_count, nnfw::cker::StridedSliceParams::strides, and nnfw::cker::StridedSliceParams::strides_count.

Referenced by StridedSlice().

◆ Sub1Vector()

void nnfw::cker::Sub1Vector ( const float *  vector,
int  v_size,
float *  result 
)
inline

Definition at line 115 of file TensorUtils.h.

116{
117 NEON_OR_PORTABLE(Sub1Vector, vector, v_size, result);
118}

References NEON_OR_PORTABLE, and Sub1Vector().

Referenced by Sub1Vector(), and UpdateLstmCellFloat().

◆ SubscriptToIndex()

int nnfw::cker::SubscriptToIndex ( const NdArrayDesc< 4 > &  desc,
int  i0,
int  i1,
int  i2,
int  i3 
)
inline

◆ SubscriptToIndexGeneric()

template<int N>
int nnfw::cker::SubscriptToIndexGeneric ( const NdArrayDesc< N > *  desc,
int *  iter 
)
inline

Definition at line 264 of file Utils.h.

265{
266 int ret_indx = 0;
267 for (size_t idx = 0; idx < static_cast<size_t>(N); idx++)
268 {
269 assert(iter[idx] >= 0 && iter[idx] < desc->extents[idx]);
270 ret_indx += iter[idx] * desc->strides[idx];
271 }
272
273 return ret_indx;
274}

References nnfw::cker::NdArrayDesc< N >::extents, and nnfw::cker::NdArrayDesc< N >::strides.

Referenced by SqDiffImpl().

◆ sum_reducer()

template<typename In >
int nnfw::cker::sum_reducer ( const int  data1,
const In  data2 
)

Definition at line 46 of file ReduceMean.h.

47{
48 return data1 + static_cast<int>(data2);
49}

Referenced by MeanQ8Asymm().

◆ SymmetricQuantizeFloats()

void nnfw::cker::SymmetricQuantizeFloats ( const float *  values,
const int  size,
int8_t *  quantized_values,
float *  min,
float *  max,
float *  scaling_factor 
)
inline

Definition at line 120 of file TensorUtils.h.

122{
123 return NEON_OR_PORTABLE(SymmetricQuantizeFloats, values, size, quantized_values, min, max,
124 scaling_factor);
125}

References NEON_OR_PORTABLE, size, and SymmetricQuantizeFloats().

Referenced by FullyConnectedHybrid(), and SymmetricQuantizeFloats().

◆ Tanh()

void nnfw::cker::Tanh ( const Shape input_shape,
const float *  input_data,
const Shape output_shape,
float *  output_data 
)
inline

Definition at line 31 of file Tanh.h.

33{
34 auto input_map = MapAsVector(input_data, input_shape);
35 auto output_map = MapAsVector(output_data, output_shape);
36 output_map.array() = input_map.array().tanh();
37}

References MapAsVector(), and output_shape.

Referenced by onert::backend::cpu::ops::ElementwiseActivationLayer::configure().

◆ TFLITE_COMPARISON_OP() [1/6]

nnfw::cker::TFLITE_COMPARISON_OP ( Equal  )

◆ TFLITE_COMPARISON_OP() [2/6]

nnfw::cker::TFLITE_COMPARISON_OP ( Greater  )

◆ TFLITE_COMPARISON_OP() [3/6]

nnfw::cker::TFLITE_COMPARISON_OP ( GreaterEqual  )

◆ TFLITE_COMPARISON_OP() [4/6]

nnfw::cker::TFLITE_COMPARISON_OP ( Less  )

◆ TFLITE_COMPARISON_OP() [5/6]

nnfw::cker::TFLITE_COMPARISON_OP ( LessEqual  )

◆ TFLITE_COMPARISON_OP() [6/6]

nnfw::cker::TFLITE_COMPARISON_OP ( NotEqual  )

◆ TileOneDimension()

template<typename T , typename M >
std::pair< int, int > nnfw::cker::TileOneDimension ( const Shape in_dimensions,
const T *  in_data,
const M *  multipliers,
T *  out_data,
int  dimension 
)

Definition at line 41 of file Tile.h.

43{
44 const int dimension_size = in_dimensions.Dims(dimension);
45 if (dimension == in_dimensions.DimensionsCount() - 1)
46 {
47 CopyMultipleTimes(in_data, dimension_size, multipliers[dimension], out_data);
48 return std::make_pair(dimension_size,
49 dimension_size * static_cast<int>(multipliers[dimension]));
50 }
51 int total_stride_size = 0, total_tiled_stride_size = 0;
52 const T *copy_from_data = in_data;
53 T *copy_to_data = out_data;
54 for (int i = 0; i < dimension_size; ++i)
55 {
56 int stride_size = 0, tiled_stride_size = 0;
57 std::tie(stride_size, tiled_stride_size) =
58 TileOneDimension(in_dimensions, copy_from_data, multipliers, copy_to_data, dimension + 1);
59 copy_from_data += stride_size;
60 copy_to_data += tiled_stride_size;
61 total_stride_size += stride_size;
62 total_tiled_stride_size += tiled_stride_size;
63 }
64 CopyMultipleTimes(out_data, total_tiled_stride_size, multipliers[dimension] - 1,
65 out_data + total_tiled_stride_size);
66 return std::make_pair(total_stride_size,
67 static_cast<int>(total_tiled_stride_size * multipliers[dimension]));
68}
void CopyMultipleTimes(const T *in_data, int32_t in_size, M multiplier, T *out_data)
Definition Tile.h:29

References CopyMultipleTimes(), nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), and TileOneDimension().

Referenced by TileOneDimension().

◆ To32Bit()

template<typename TensorType >
TTypes< typename TensorType::Scalar, TensorType::NumIndices >::Tensor32Bit nnfw::cker::To32Bit ( TensorType  in)

Definition at line 178 of file Tensor.h.

179{
180 typedef typename TTypes<typename TensorType::Scalar, TensorType::NumIndices>::Tensor32Bit RetType;
181 return RetType(in.data(), To32BitDims(in.dimensions()));
182}
Eigen::DSizes< Index32, DSizes::count > To32BitDims(const DSizes &in)
Definition Tensor.h:166
Eigen::TensorMap< Eigen::Tensor< T, NDIMS, Eigen::RowMajor, int >, Eigen::Aligned > Tensor32Bit
Definition Tensor.h:43

References To32BitDims().

Referenced by nnfw::cker::functor::BroadcastTo< Device, T >::DoBCast32Bit().

◆ To32BitDims()

template<typename DSizes >
Eigen::DSizes< Index32, DSizes::count > nnfw::cker::To32BitDims ( const DSizes &  in)

Definition at line 166 of file Tensor.h.

167{
168 Eigen::DSizes<Index32, DSizes::count> out;
169 for (int i = 0; i < DSizes::count; ++i)
170 {
171 out[i] = in[i];
172 }
173 return out;
174}

Referenced by To32Bit().

◆ Transpose()

template<typename T >
void nnfw::cker::Transpose ( const TransposeParams unshrunk_params,
const Shape unshrunk_input_shape,
const T *  input_data,
const Shape unshrunk_output_shape,
T *  output_data 
)

Definition at line 509 of file Transpose.h.

511{
512 const int output_size = unshrunk_output_shape.DimensionsCount();
513 assert(unshrunk_input_shape.DimensionsCount() <= 4);
514 assert(output_size <= 4);
515 assert(output_size == unshrunk_params.perm_count);
516
517 Shape shrunk_input_shape = Shape(unshrunk_input_shape);
518
519 Shape shrunk_output_shape = Shape(unshrunk_output_shape);
520
521 TransposeParams shrunk_params = unshrunk_params;
522
523 // Reduce any dimensions that have one size. A lower-rank transpose usually
524 // performs better since memory access patterns are improved.
525 RemoveOneSizeDimensions(&shrunk_input_shape, &shrunk_output_shape, &shrunk_params);
526
527 // Handle identity cases.
528 // TODO(b/140779653): Add an optimization pass in the conversion process to
529 // remove transpose op nodes where they do nothing like the below one.
530 bool identical = true;
531 for (int i = 0; i < shrunk_params.perm_count; ++i)
532
533 {
534 if (shrunk_params.perm[i] != i)
535
536 {
537 identical = false;
538 break;
539 }
540 }
541 if (identical)
542 {
543 memcpy(output_data, input_data, unshrunk_input_shape.FlatSize() * sizeof(T));
544 return;
545 }
546
547 // Reduce dimensions by flattening.
548 if (shrunk_params.perm[0] == 0 && output_size >= 3)
549
550 {
551 Shape non_flatten_input_shape;
552 Shape non_flatten_output_shape;
553 TransposeParams non_flatten_params;
554 const int total_size = shrunk_input_shape.FlatSize();
555
556 const int non_flatten_size =
557 Flatten(shrunk_input_shape, shrunk_output_shape, shrunk_params,
558
559 &non_flatten_input_shape, &non_flatten_output_shape, &non_flatten_params);
560 assert(non_flatten_params.perm[0] != 0);
561
562 for (int i = 0; i < total_size; i += non_flatten_size)
563 {
564 TransposeImpl(non_flatten_params, non_flatten_input_shape, input_data + i,
565 non_flatten_output_shape, output_data + i);
566 }
567 return;
568 }
569
570 // Call non-flattened case.
571 TransposeImpl(shrunk_params, shrunk_input_shape, input_data, shrunk_output_shape,
572
573 output_data);
574}
void TransposeImpl(const TransposeParams &params, const Shape &unextended_input_shape, const T *input_data, const Shape &unextended_output_shape, T *output_data)
Definition Transpose.h:33

References nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::FlatSize(), nnfw::cker::TransposeParams::perm, nnfw::cker::TransposeParams::perm_count, and TransposeImpl().

Referenced by onert::backend::cpu::ops::TransposeLayer::transpose().
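
A call sketch for a simple 2-D permutation (editor's illustration; the <cker/operation/Transpose.h> include path is an assumption). Only the documented perm/perm_count fields of TransposeParams are set:

#include <cker/operation/Transpose.h> // assumed header location of Transpose
#include <cstdint>

int main()
{
  // Transpose a [2, 3] matrix into [3, 2].
  const int32_t in_dims[] = {2, 3};
  const int32_t out_dims[] = {3, 2};
  nnfw::cker::Shape input_shape, output_shape;
  input_shape.ReplaceWith(2, in_dims);
  output_shape.ReplaceWith(2, out_dims);

  nnfw::cker::TransposeParams params{};
  params.perm_count = 2;
  params.perm[0] = 1;
  params.perm[1] = 0;

  const float input[6] = {0, 1, 2, 3, 4, 5};
  float output[6];
  nnfw::cker::Transpose(params, input_shape, input, output_shape, output);
  // output = {0, 3, 1, 4, 2, 5}
  return 0;
}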

◆ Transpose2D()

template<typename T >
void nnfw::cker::Transpose2D ( const Shape input_shape,
const T *  input_data,
const Shape output_shape,
T *  output_data 
)
inline

Definition at line 297 of file Transpose.h.

299{
300 assert(input_shape.DimensionsCount() == 2);
301 assert(output_shape.DimensionsCount() == 2);
302
303 const int d0 = input_shape.DimsData()[0];
304 const int d1 = input_shape.DimsData()[1];
305 const int kLines = 4;
306 const int kSkipSize = (kLines - 1) * d1;
307
308 const T *input = input_data;
309
310 int i = 0;
311 for (; i <= d0 - kLines; i += kLines)
312 {
313 T *output = output_data + i;
314
315 const T *input_ptr = input;
316 optimized_ops_preload_l1_keep(input_ptr);
317 input_ptr += d1;
318 optimized_ops_preload_l1_keep(input_ptr);
319 input_ptr += d1;
320 optimized_ops_preload_l1_keep(input_ptr);
321 input_ptr += d1;
322 optimized_ops_preload_l1_keep(input_ptr);
323
324 int j = 0;
325 for (; j <= d1 - kLines; j += kLines)
326 {
327 input_ptr = input;
328 const T a00 = input_ptr[0];
329 const T a01 = input_ptr[1];
330 const T a02 = input_ptr[2];
331 const T a03 = input_ptr[3];
332 input_ptr += d1;
333 const T a10 = input_ptr[0];
334 const T a11 = input_ptr[1];
335 const T a12 = input_ptr[2];
336 const T a13 = input_ptr[3];
337 input_ptr += d1;
338 const T a20 = input_ptr[0];
339 const T a21 = input_ptr[1];
340 const T a22 = input_ptr[2];
341 const T a23 = input_ptr[3];
342 input_ptr += d1;
343 const T a30 = input_ptr[0];
344 const T a31 = input_ptr[1];
345 const T a32 = input_ptr[2];
346 const T a33 = input_ptr[3];
347
348 output[0] = a00;
349 output[1] = a10;
350 output[2] = a20;
351 output[3] = a30;
352 output += d0;
353
354 output[0] = a01;
355 output[1] = a11;
356 output[2] = a21;
357 output[3] = a31;
358 output += d0;
359
360 output[0] = a02;
361 output[1] = a12;
362 output[2] = a22;
363 output[3] = a32;
364 output += d0;
365
366 output[0] = a03;
367 output[1] = a13;
368 output[2] = a23;
369 output[3] = a33;
370 output += d0;
371
372 input += kLines;
373 }
374 if (j == d1)
375 {
376 input += kSkipSize;
377 }
378 else
379 {
380 for (int p = 0; p < kLines; ++p)
381 {
382 for (int q = 0; q < d1 - j; ++q)
383 {
384 *(output + q * d0 + p) = *(input + p * d1 + q);
385 }
386 }
387 input += (d1 - j) + kSkipSize;
388 }
389 }
390 for (; i < d0; ++i)
391 {
392 T *output = output_data + i;
393 for (int j = 0; j < d1; ++j)
394 {
395 *output = *input;
396 output += d0;
397 ++input;
398 }
399 }
400}
void optimized_ops_preload_l1_keep(const T *ptr)
Definition Utils.h:455

References nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::DimsData(), optimized_ops_preload_l1_keep(), and output_shape.

Referenced by TransposeImpl().

◆ Transpose3D()

template<typename T >
void nnfw::cker::Transpose3D ( const TransposeParams params,
const Shape input_shape,
const T *  input_data,
const Shape ,
T *  output_data 
)
inline

Definition at line 405 of file Transpose.h.

407{
408 int s2, s3;
409 s2 = input_shape.Dims(1);
410 s3 = input_shape.Dims(2);
411
412 int p1 = 0;
413 int p2 = 0;
414 int p3 = 0;
415
416 if (params.perm[0] == 2)
417 {
418 p1 = 1;
419 }
420 else if (params.perm[1] == 2)
421 {
422 p2 = 1;
423 }
424 else
425 {
426 p3 = 1;
427 }
428
429 if (params.perm[0] == 1)
430 {
431 p1 = s3;
432 }
433 else if (params.perm[1] == 1)
434 {
435 p2 = s3;
436 }
437 else
438 {
439 p3 = s3;
440 }
441
442 if (params.perm[0] == 0)
443 {
444 p1 = s2 * s3;
445 }
446 else if (params.perm[1] == 0)
447 {
448 p2 = s2 * s3;
449 }
450 else
451 {
452 p3 = s2 * s3;
453 }
454
455 int o_s[3];
456 o_s[0] = input_shape.Dims(params.perm[0]);
457 o_s[1] = input_shape.Dims(params.perm[1]);
458 o_s[2] = input_shape.Dims(params.perm[2]);
459
460 for (int i1 = 0; i1 < o_s[0]; ++i1)
461 {
462 for (int i2 = 0; i2 < o_s[1]; ++i2)
463 {
464 for (int i3 = 0; i3 < o_s[2]; ++i3)
465 {
466 const int i = i1 * p1 + i2 * p2 + i3 * p3;
467 const int o = i1 * o_s[1] * o_s[2] + i2 * o_s[2] + i3;
468 output_data[o] = input_data[i];
469 }
470 }
471 }
472}

References nnfw::cker::Shape::Dims(), and nnfw::cker::TransposeParams::perm.

Referenced by TransposeImpl().

◆ TransposeConv()

void nnfw::cker::TransposeConv ( const TransposeConvParams params,
const Shape input_shape,
const float *  input_data,
const Shape filter_shape,
const float *  filter_data,
const Shape output_shape,
float *  output_data 
)
inline

Definition at line 30 of file TransposeConv.h.

33{
34
35 const int stride_width = params.stride_width;
36 const int stride_height = params.stride_height;
37 const int pad_width = params.padding_values.width;
38 const int pad_height = params.padding_values.height;
39
40 assert(input_shape.DimensionsCount() == 4);
41 assert(filter_shape.DimensionsCount() == 4);
42 assert(output_shape.DimensionsCount() == 4);
43
44 const int batches = MatchingDim(input_shape, 0, output_shape, 0);
45 const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
46 const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
47 const int input_height = input_shape.Dims(1);
48 const int input_width = input_shape.Dims(2);
49 const int filter_height = filter_shape.Dims(1);
50 const int filter_width = filter_shape.Dims(2);
51 const int output_height = output_shape.Dims(1);
52 const int output_width = output_shape.Dims(2);
53
54 // Although transpose convolution simplifies to convolution with transposed
55 // weights for strides of 1, non-unitary striding complicates matters. To
56 // keep this reference implementation as clear as possible, we use a
57 // "scatter" access pattern, where we loop through all the input elements,
58 // computing their influence on the output, rather than looping through the
59 // output elements in the typical "gather" access pattern of a conv. We
60 // therefore must initialize the output array to zero.
61 const int num_elements = output_shape.FlatSize();
62 for (int i = 0; i < num_elements; i++)
63 {
64 output_data[i] = 0.0f;
65 }
66
67 // Loop through input elements one at a time.
68 for (int batch = 0; batch < batches; ++batch)
69 {
70 for (int in_y = 0; in_y < input_height; ++in_y)
71 {
72 for (int in_x = 0; in_x < input_width; ++in_x)
73 {
74 for (int in_channel = 0; in_channel < input_depth; ++in_channel)
75 {
76 // Loop through the output elements it will influence
77 const int out_x_origin = (in_x * stride_width) - pad_width;
78 const int out_y_origin = (in_y * stride_height) - pad_height;
79 for (int filter_y = 0; filter_y < filter_height; ++filter_y)
80 {
81 for (int filter_x = 0; filter_x < filter_width; ++filter_x)
82 {
83 for (int out_channel = 0; out_channel < output_depth; ++out_channel)
84 {
85 // Compute output element location
86 const int out_x = out_x_origin + filter_x;
87 const int out_y = out_y_origin + filter_y;
88 // We cannot accumulate out of bounds
89 if ((out_x >= 0) && (out_x < output_width) && (out_y >= 0) &&
90 (out_y < output_height))
91 {
92 float input_value =
93 input_data[Offset(input_shape, batch, in_y, in_x, in_channel)];
94 float filter_value =
95 filter_data[Offset(filter_shape, out_channel, filter_y, filter_x, in_channel)];
96 output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] +=
97 input_value * filter_value;
98 }
99 }
100 }
101 }
102 }
103 }
104 }
105 }
106}
PaddingValues padding_values
Definition Types.h:333

References nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), nnfw::cker::PaddingValues::height, MatchingDim(), Offset(), output_shape, nnfw::cker::TransposeConvParams::padding_values, nnfw::cker::TransposeConvParams::stride_height, nnfw::cker::TransposeConvParams::stride_width, and nnfw::cker::PaddingValues::width.

◆ TransposeImpl()

template<typename T >
void nnfw::cker::TransposeImpl ( const TransposeParams params,
const Shape input_shape,
const T *  input_data,
const Shape output_shape,
T *  output_data 
)

Definition at line 475 of file Transpose.h.

477{
478 const int dims_cnt = input_shape.DimensionsCount();
479
480 int dim0, dim1;
481 if (IsTranspose2DApplicable(params, input_shape, &dim0, &dim1))
482 {
483 Transpose2D(Shape({dim0, dim1}), input_data, Shape({dim1, dim0}), output_data);
484 return;
485 }
486
487 // TODO(b/141217325): notably Eigen is better suited for
488 // larger inputs whereas Transpose3D is generally
489 // better for smaller ones.
490 //
491 // E.g. on Nexus 5, Eigen is better for size 96^3 and up
492 // and Transpose3D is better for 72^3 and down.
493 //
494 // 96^3 is not mobile-friendly for certain usecases
495 // (e.g. model used in beam search for seq2seq) but is in others.
496 // Consider tradeoffs.
497 if (dims_cnt == 3)
498 {
499 Transpose3D(params, input_shape, input_data, output_shape, output_data);
500 return;
501 }
502
503 // Reroute to the reference version if an optimized method for the given data
504 // is not available.
505 reference::Transpose(params, input_shape, input_data, output_shape, output_data);
506}
void Transpose3D(const TransposeParams &params, const Shape &input_shape, const T *input_data, const Shape &, T *output_data)
Definition Transpose.h:405
void Transpose2D(const Shape &input_shape, const T *input_data, const Shape &output_shape, T *output_data)
Definition Transpose.h:297

References nnfw::cker::Shape::DimensionsCount(), output_shape, nnfw::cker::reference::Transpose(), Transpose2D(), and Transpose3D().

Referenced by Transpose().

◆ Unpack()

template<typename Scalar >
void nnfw::cker::Unpack ( const UnpackParams params,
const Shape input_shape,
const Scalar *  input_data,
const Shape output_shape,
Scalar *const *  output_datas 
)

Definition at line 30 of file Unpack.h.

32{
33 const int dimensions = input_shape.DimensionsCount();
34 const int outputs_count = params.num_split;
35
36 int outer_size = 1;
37 for (int i = 0; i < params.axis; i++)
38 {
39 outer_size *= input_shape.Dims(i);
40 }
41 int copy_size = 1;
42 for (int i = params.axis + 1; i < dimensions; i++)
43 {
44 copy_size *= input_shape.Dims(i);
45 }
46 assert(output_shape.FlatSize() == copy_size * outer_size);
47
48 for (int i = 0; i < outputs_count; ++i)
49 {
50 for (int k = 0; k < outer_size; k++)
51 {
52 Scalar *output_ptr = output_datas[i] + copy_size * k;
53 int loc = k * outputs_count * copy_size + i * copy_size;
54 memcpy(output_ptr, input_data + loc, copy_size * sizeof(Scalar));
55 }
56 }
57}

References nnfw::cker::UnpackParams::axis, nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), nnfw::cker::UnpackParams::num_split, and output_shape.
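
A minimal usage sketch: splitting a [2, 3] float tensor into two [3] tensors along axis 0. Only the params.axis and params.num_split fields referenced above are used; the header path is an assumption.

#include <cker/operation/Unpack.h> // header path assumed

void UnpackExample()
{
  nnfw::cker::UnpackParams params{};
  params.axis = 0;
  params.num_split = 2;

  const float input[6] = {1, 2, 3, 4, 5, 6}; // shape [2, 3]
  float out0[3], out1[3];
  float *outputs[] = {out0, out1};

  nnfw::cker::Unpack<float>(params, nnfw::cker::Shape({2, 3}), input,
                            nnfw::cker::Shape({3}), outputs);
  // out0 == {1, 2, 3}, out1 == {4, 5, 6}
}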

◆ UpdateLstmCellFloat()

void nnfw::cker::UpdateLstmCellFloat ( int  n_batch,
int  n_cell,
float *  cell_state,
const float *  input_gate,
float *  forget_gate,
const float *  cell_gate,
bool  use_cifg,
float  clip 
)

Definition at line 135 of file LSTM.h.

137{
138 // Define variable for 4th argument to avoid warning
139 // Compiler warning: passing argument 4 to restrict-qualified parameter aliases with argument 2
140 const float *cwise_product_rhs = cell_state;
141 VectorVectorCwiseProduct(forget_gate, cwise_product_rhs, n_batch * n_cell, cell_state);
142
143 if (use_cifg)
144 {
145 // With CIFG, input_gate = 1-forget_gate. Use the forget_gate array as
146 // scratch, as input_gate array is not allocated in this case. (Be careful
147 // not to write to the scratch before reading the forget gate data.)
148 float *scratch = forget_gate;
149 Sub1Vector(forget_gate, n_batch * n_cell, scratch);
150 VectorVectorCwiseProductAccumulate(cell_gate, scratch, n_batch * n_cell, cell_state);
151 }
152 else
153 {
154 VectorVectorCwiseProductAccumulate(cell_gate, input_gate, n_batch * n_cell, cell_state);
155 }
156 if (clip > 0.0f)
157 {
158 CwiseClipping(cell_state, n_batch * n_cell, clip);
159 }
160}
void Sub1Vector(const float *vector, int v_size, float *result)
void VectorVectorCwiseProductAccumulate(const T *__restrict__ vector1, const T *__restrict__ vector2, int v_size, T *__restrict__ result)
Definition TensorUtils.h:64

References CwiseClipping(), Sub1Vector(), VectorVectorCwiseProduct(), and VectorVectorCwiseProductAccumulate().

Referenced by LstmStepFloat().
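
Element-wise, this computes cell_state = forget_gate * cell_state + input_gate * cell_gate (with input_gate replaced by 1 - forget_gate when use_cifg is set), followed by clipping when clip > 0. A minimal sketch with one batch and two cells; the header path is assumed.

#include <cker/operation/LSTM.h> // header path assumed

void UpdateLstmCellFloatExample()
{
  float cell_state[2] = {0.5f, -1.0f};
  const float input_gate[2] = {0.2f, 0.8f};
  float forget_gate[2] = {0.9f, 0.1f};
  const float cell_gate[2] = {1.0f, 1.0f};

  nnfw::cker::UpdateLstmCellFloat(/*n_batch=*/1, /*n_cell=*/2, cell_state, input_gate,
                                  forget_gate, cell_gate, /*use_cifg=*/false, /*clip=*/0.0f);
  // cell_state == {0.9 * 0.5 + 0.2 * 1.0, 0.1 * -1.0 + 0.8 * 1.0} == {0.65, 0.7}
}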

◆ ValidateGemmParams()

template<typename AccumScalar , typename DstScalar , QuantizationFlavor quantization_flavor>
void nnfw::cker::ValidateGemmParams ( const GemmParams< AccumScalar, DstScalar, quantization_flavor > &  params)

Definition at line 544 of file Types.h.

546{
547 // Guard consistency of the quantized multiplier fields.
548 if (quantization_flavor == QuantizationFlavor::kFloatingPoint)
549 {
550 assert(!params.multiplier_fixedpoint);
551 assert(!params.multiplier_exponent);
552 assert(!params.multiplier_fixedpoint_perchannel);
553 assert(!params.multiplier_exponent_perchannel);
554 }
555 else if (quantization_flavor == QuantizationFlavor::kIntegerWithUniformMultiplier &&
556 !std::is_same<DstScalar, int32_t>::value)
557 {
558 assert(params.multiplier_fixedpoint);
559 // Nothing to check about multiplier_exponent
560 assert(!params.multiplier_fixedpoint_perchannel);
561 assert(!params.multiplier_exponent_perchannel);
562 }
563 else if (quantization_flavor == QuantizationFlavor::kIntegerWithPerRowMultiplier &&
564 !std::is_same<DstScalar, int32_t>::value)
565 {
566 assert(!params.multiplier_fixedpoint);
567 assert(!params.multiplier_exponent);
 568 assert(params.multiplier_fixedpoint_perchannel);
 569 assert(params.multiplier_exponent_perchannel);
570 }
571 else
572 {
573 // For the get raw accumulator case, we should make sure none of the
574 // quantization params are set.
575 assert(!params.multiplier_fixedpoint);
576 assert(!params.multiplier_exponent);
577 assert(!params.multiplier_fixedpoint_perchannel);
578 assert(!params.multiplier_exponent_perchannel);
579 }
580}
AccumScalar multiplier_fixedpoint
Definition Types.h:513
const int * multiplier_exponent_perchannel
Definition Types.h:529
const AccumScalar * multiplier_fixedpoint_perchannel
Definition Types.h:521

References kFloatingPoint, kIntegerWithPerRowMultiplier, and kIntegerWithUniformMultiplier.
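
A minimal sketch of the floating-point case, assuming GemmParams default-constructs its multiplier fields to zero/null and that a float accumulator selects the kFloatingPoint flavor by default (neither is shown in this section).

#include <cker/Types.h> // header path assumed

void ValidateGemmParamsExample()
{
  nnfw::cker::GemmParams<float, float> params; // default flavor assumed to be kFloatingPoint
  nnfw::cker::ValidateGemmParams(params);      // passes: no quantized multiplier fields are set
}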

◆ VectorBatchVectorAdd()

void nnfw::cker::VectorBatchVectorAdd ( const float *  vector,
int  v_size,
int  n_batch,
float *  batch_vector 
)
inline

Definition at line 39 of file TensorUtils.h.

40{
41 PortableVectorBatchVectorAdd(vector, v_size, n_batch, batch_vector);
42}
void PortableVectorBatchVectorAdd(const float *vector, int v_size, int n_batch, float *batch_vector)

References PortableVectorBatchVectorAdd().

Referenced by CalculateLstmGateFloat().
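
A minimal sketch: adding the same length-3 vector to each of two batch rows in place (row-wise add semantics per the portable implementation; header path assumed).

#include <cker/TensorUtils.h> // header path assumed

void VectorBatchVectorAddExample()
{
  const float bias[3] = {1.f, 2.f, 3.f};
  float batch[6] = {0.f, 0.f, 0.f, 10.f, 10.f, 10.f}; // 2 rows of 3
  nnfw::cker::VectorBatchVectorAdd(bias, /*v_size=*/3, /*n_batch=*/2, batch);
  // batch == {1, 2, 3, 11, 12, 13}
}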

◆ VectorBatchVectorAssign()

void nnfw::cker::VectorBatchVectorAssign ( const float *  vector,
int  v_size,
int  n_batch,
float *  batch_vector 
)
inline

Definition at line 44 of file TensorUtils.h.

46{
47 PortableVectorBatchVectorAssign(vector, v_size, n_batch, batch_vector);
48}
void PortableVectorBatchVectorAssign(const float *vector, int v_size, int n_batch, float *batch_vector)

References PortableVectorBatchVectorAssign().

Referenced by CalculateLstmGateFloat(), CalculateLstmOutputFloat(), FullyConnected(), FullyConnectedHybrid(), FullyConnectedSparseWeight16x1(), and FullyConnectedSparseWeightRandom().
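
A minimal sketch: broadcasting a length-3 vector into each of two batch rows (copy semantics per the portable implementation; header path assumed).

#include <cker/TensorUtils.h> // header path assumed

void VectorBatchVectorAssignExample()
{
  const float init[3] = {1.f, 2.f, 3.f};
  float batch[6]; // 2 rows of 3, overwritten
  nnfw::cker::VectorBatchVectorAssign(init, /*v_size=*/3, /*n_batch=*/2, batch);
  // batch == {1, 2, 3, 1, 2, 3}
}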

◆ VectorBatchVectorCwiseProduct()

template<typename T >
void nnfw::cker::VectorBatchVectorCwiseProduct ( const T *  vector,
int  v_size,
const T *  batch_vector,
int  n_batch,
T *  result 
)
inline

Definition at line 76 of file TensorUtils.h.

78{
79 for (int b = 0; b < n_batch; b++)
80 {
81 VectorVectorCwiseProduct(vector, batch_vector, v_size, result);
82 // Update the pointers.
83 result += v_size;
84 batch_vector += v_size;
85 }
86}

References VectorVectorCwiseProduct().

Referenced by CalculateLstmGateFloat().
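
A minimal sketch following the loop above: the same vector scales every batch row element-wise (header path assumed).

#include <cker/TensorUtils.h> // header path assumed

void VectorBatchVectorCwiseProductExample()
{
  const float scale[2] = {2.f, 3.f};
  const float batch[4] = {1.f, 1.f, 4.f, 5.f}; // 2 rows of 2
  float result[4];
  nnfw::cker::VectorBatchVectorCwiseProduct(scale, /*v_size=*/2, batch, /*n_batch=*/2, result);
  // result == {2, 3, 8, 15}
}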

◆ VectorBatchVectorCwiseProductAccumulate()

template<typename T >
void nnfw::cker::VectorBatchVectorCwiseProductAccumulate ( const T *  vector,
int  v_size,
const T *  batch_vector,
int  n_batch,
T *  result 
)
inline

Definition at line 92 of file TensorUtils.h.

94{
95 for (int b = 0; b < n_batch; b++)
96 {
97 VectorVectorCwiseProductAccumulate(vector, batch_vector, v_size, result);
98 // Update the pointers.
99 result += v_size;
100 batch_vector += v_size;
101 }
102}

References VectorVectorCwiseProductAccumulate().

Referenced by CalculateLstmGateFloat().
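
Same layout as the non-accumulating variant above, but the products are added into result (header path assumed).

#include <cker/TensorUtils.h> // header path assumed

void VectorBatchVectorCwiseProductAccumulateExample()
{
  const float scale[2] = {2.f, 3.f};
  const float batch[4] = {1.f, 1.f, 4.f, 5.f}; // 2 rows of 2
  float result[4] = {10.f, 10.f, 10.f, 10.f};
  nnfw::cker::VectorBatchVectorCwiseProductAccumulate(scale, /*v_size=*/2, batch, /*n_batch=*/2,
                                                      result);
  // result == {12, 13, 18, 25}
}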

◆ VectorVectorCwiseProduct()

template<typename T >
void nnfw::cker::VectorVectorCwiseProduct ( const T *__restrict__  vector1,
const T *__restrict__  vector2,
int  v_size,
T *__restrict__  result 
)
inline

Definition at line 52 of file TensorUtils.h.

54{
55 for (int v = 0; v < v_size; v++)
56 {
57 *result++ = *vector1++ * *vector2++;
58 }
59}

Referenced by CalculateLstmOutputFloat(), UpdateLstmCellFloat(), and VectorBatchVectorCwiseProduct().
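
A minimal sketch of the element-wise product (header path assumed; the __restrict__ qualifiers require the output not to alias the inputs).

#include <cker/TensorUtils.h> // header path assumed

void VectorVectorCwiseProductExample()
{
  const float a[3] = {1.f, 2.f, 3.f};
  const float b[3] = {4.f, 5.f, 6.f};
  float out[3]; // must not alias a or b
  nnfw::cker::VectorVectorCwiseProduct(a, b, /*v_size=*/3, out);
  // out == {4, 10, 18}
}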

◆ VectorVectorCwiseProductAccumulate()

template<typename T >
void nnfw::cker::VectorVectorCwiseProductAccumulate ( const T *__restrict__  vector1,
const T *__restrict__  vector2,
int  v_size,
T *__restrict__  result 
)
inline

Definition at line 64 of file TensorUtils.h.

67{
68 for (int v = 0; v < v_size; v++)
69 {
70 *result++ += *vector1++ * *vector2++;
71 }
72}

Referenced by UpdateLstmCellFloat(), and VectorBatchVectorCwiseProductAccumulate().
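
A minimal sketch: out += a * b element-wise (header path assumed; out must not alias the inputs).

#include <cker/TensorUtils.h> // header path assumed

void VectorVectorCwiseProductAccumulateExample()
{
  const float a[3] = {1.f, 2.f, 3.f};
  const float b[3] = {4.f, 5.f, 6.f};
  float out[3] = {1.f, 1.f, 1.f}; // must not alias a or b
  nnfw::cker::VectorVectorCwiseProductAccumulate(a, b, /*v_size=*/3, out);
  // out == {5, 11, 19}
}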

◆ ZeroVector()

void nnfw::cker::ZeroVector ( float *  vector,
int  v_size 
)
inline

Definition at line 160 of file TensorUtils.h.

160{ PortableZeroVector(vector, v_size); }
void PortableZeroVector(float *vector, int v_size)

References PortableZeroVector().

Referenced by FullyConnected(), FullyConnectedHybrid(), FullyConnectedSparseWeight16x1(), and FullyConnectedSparseWeightRandom().
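
A minimal sketch: zeroing the first v_size floats of a buffer (header path assumed).

#include <cker/TensorUtils.h> // header path assumed

void ZeroVectorExample()
{
  float buf[4] = {1.f, 2.f, 3.f, 4.f};
  nnfw::cker::ZeroVector(buf, /*v_size=*/4);
  // buf == {0, 0, 0, 0}
}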