ONE - On-device Neural Engine
nnfw::cker::optimized Namespace Reference

Namespaces

namespace  depthwise_conv
 

Data Structures

struct  BinaryOpActivationFloatMax
 
struct  BinaryOpActivationFloatMinMax
 
struct  BinaryOpActivationFloatNone
 
struct  BinaryOpFuncAddFloat
 
struct  BinaryOpFuncDivFloat
 
struct  BinaryOpFuncMulFloat
 
struct  BinaryOpFuncSubFloat
 
struct  BinaryOpFuncSwapArgs
 
struct  FloatDepthwiseConvKernel
 
struct  GemmlowpOutputPipeline
 

Typedefs

using BinaryOpImplFloatFuncs = std::pair< void(*)(int, const BinaryArithmeticOpParam &, const float *, const float *, float *), void(*)(int, const BinaryArithmeticOpParam &, const float, const float *, float *)>
 

Functions

template<typename ElementwiseF , typename ScalarBroadcastF , typename T >
void BinaryBroadcastFiveFold (const BinaryArithmeticOpParam &params, bool switch_inputs, const Shape &, const T *unswitched_input1_data, const Shape &, const T *unswitched_input2_data, const Shape &, T *output_data, ElementwiseF elementwise_f, ScalarBroadcastF scalar_broadcast_f)
 
template<typename ElementwiseF , typename ScalarBroadcastF , typename T >
void BinaryBroadcastFiveFold (const BinaryArithmeticOpParam &unswitched_params, const Shape &, const T *unswitched_input1_data, const Shape &, const T *unswitched_input2_data, const Shape &, T *output_data, ElementwiseF elementwise_f, ScalarBroadcastF scalar_broadcast_f)
 
template<typename T >
std::enable_if_t< is_quant8< T >::value, int32_t > quant8_sum (const BinaryArithmeticOpParam &params, const T input1_data, const T input2_data)
 
void AddElementwise (int size, const BinaryArithmeticOpParam &params, const uint8_t *input1_data, const uint8_t *input2_data, uint8_t *output_data)
 
void AddElementwise (int size, const BinaryArithmeticOpParam &params, const int8_t *input1_data, const int8_t *input2_data, int8_t *output_data)
 
template<class OPERATOR , class ACTIVATION >
void BinaryOpElementwise (int size, const BinaryArithmeticOpParam &params, const float *input1_data, const float *input2_data, float *output_data)
 
template<class OPERATOR , class ACTIVATION >
void BinaryOpScalarBroadcast (int size, const BinaryArithmeticOpParam &params, const float broadcast_value, const float *input2_data, float *output_data)
 
template<class FUNC >
BinaryOpImplFloatFuncs getBinaryOpWithActivationImplFloat (const BinaryArithmeticOpParam &params)
 
template<typename T >
std::enable_if_t< is_quant8< T >::value > Add (const BinaryArithmeticOpParam &params, const Shape &input1_shape, const T *input1_data, const Shape &input2_shape, const T *input2_data, const Shape &output_shape, T *output_data)
 
void Add (const BinaryArithmeticOpParam &params, const Shape &input1_shape, const float *input1_data, const Shape &input2_shape, const float *input2_data, const Shape &output_shape, float *output_data)
 
void AddScalarBroadcast (int size, const BinaryArithmeticOpParam &params, uint8_t broadcast_value, const uint8_t *input2_data, uint8_t *output_data)
 
void AddScalarBroadcast (int size, const BinaryArithmeticOpParam &params, int8_t input1_data, const int8_t *input2_data, int8_t *output_data)
 
template<typename T >
std::enable_if_t< is_quant8< T >::value > BroadcastAddDispatch (const BinaryArithmeticOpParam &params, const Shape &input1_shape, const T *input1_data, const Shape &input2_shape, const T *input2_data, const Shape &output_shape, T *output_data)
 
void BroadcastAddDispatch (const BinaryArithmeticOpParam &params, const Shape &input1_shape, const float *input1_data, const Shape &input2_shape, const float *input2_data, const Shape &output_shape, float *output_data)
 
void Sub (const BinaryArithmeticOpParam &params, const Shape &input1_shape, const float *input1_data, const Shape &input2_shape, const float *input2_data, const Shape &output_shape, float *output_data)
 
void BroadcastSubDispatch (const BinaryArithmeticOpParam &params, const Shape &input1_shape, const float *input1_data, const Shape &input2_shape, const float *input2_data, const Shape &output_shape, float *output_data)
 
template<typename T >
std::enable_if_t< is_quant8< T >::value, int32_t > quant8_mul (const BinaryArithmeticOpParam &params, const T input1_data, const T input2_data)
 
void MulElementwise (int size, const BinaryArithmeticOpParam &params, const uint8_t *input1_data, const uint8_t *input2_data, uint8_t *output_data)
 
void MulElementwise (int size, const BinaryArithmeticOpParam &params, const int8_t *input1_data, const int8_t *input2_data, int8_t *output_data)
 
template<typename T >
std::enable_if_t< is_quant8< T >::value > Mul (const BinaryArithmeticOpParam &params, const Shape &input1_shape, const T *input1_data, const Shape &input2_shape, const T *input2_data, const Shape &output_shape, T *output_data)
 
void Mul (const BinaryArithmeticOpParam &params, const Shape &input1_shape, const float *input1_data, const Shape &input2_shape, const float *input2_data, const Shape &output_shape, float *output_data)
 
void MulSimpleBroadcast (int size, const BinaryArithmeticOpParam &params, const uint8_t broadcast_value, const uint8_t *input2_data, uint8_t *output_data)
 
void MulSimpleBroadcast (int size, const BinaryArithmeticOpParam &params, const int8_t broadcast_value, const int8_t *input2_data, int8_t *output_data)
 
template<typename T >
std::enable_if_t< is_quant8< T >::value > BroadcastMulDispatch (const BinaryArithmeticOpParam &params, const Shape &input1_shape, const T *input1_data, const Shape &input2_shape, const T *input2_data, const Shape &output_shape, T *output_data)
 
void BroadcastMulDispatch (const BinaryArithmeticOpParam &params, const Shape &input1_shape, const float *input1_data, const Shape &input2_shape, const float *input2_data, const Shape &output_shape, float *output_data)
 
void Div (const BinaryArithmeticOpParam &params, const Shape &input1_shape, const float *input1_data, const Shape &input2_shape, const float *input2_data, const Shape &output_shape, float *output_data)
 
void BroadcastDivDispatch (const BinaryArithmeticOpParam &params, const Shape &input1_shape, const float *input1_data, const Shape &input2_shape, const float *input2_data, const Shape &output_shape, float *output_data)
 
void AddBiasAndEvalActivationFunction (float output_activation_min, float output_activation_max, const Shape &bias_shape, const float *bias_data, const Shape &array_shape, float *array_data)
 
void Conv (const ConvParams &params, const Shape &input_shape, const uint8_t *input_data, const Shape &filter_shape, const uint8_t *filter_data, const Shape &bias_shape, const int32_t *bias_data, const Shape &output_shape, uint8_t *output_data, const Shape &im2col_shape, uint8_t *im2col_data)
 
template<bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
void FloatDepthwiseConvAccumRow (int stride, int dilation_factor, int input_depth, int input_width, const float *input_data, int pad_width, int depth_multiplier, int filter_width, const float *filter_data, int out_x_buffer_start, int out_x_buffer_end, int output_depth, float *acc_buffer)
 
void FloatDepthwiseConvAccumRowGeneric (int stride, int dilation_factor, int input_depth, int input_width, const float *input_data, int pad_width, int depth_multiplier, int filter_width, const float *filter_data, int out_x_buffer_start, int out_x_buffer_end, int output_depth, float *acc_buffer)
 
void DepthwiseConvInitAccBuffer (int num_output_pixels, int output_depth, const float *bias_data, float *acc_buffer)
 
void DepthwiseConvImpl (const DepthwiseConvParams &params, const Shape &input_shape, const float *input_data, const Shape &filter_shape, const float *filter_data, const Shape &bias_shape, const float *bias_data, const Shape &output_shape, float *output_data, int thread_start, int thread_end, int thread_dim)
 
void DepthwiseConvWithRounding (const DepthwiseConvParams &params, const Shape &input_shape, const uint8_t *input_data, const Shape &filter_shape, const uint8_t *filter_data, const Shape &bias_shape, const int32_t *bias_data, const Shape &output_shape, uint8_t *output_data, int thread_start, int thread_end, int thread_dim)
 
void DepthwiseConvImpl (const DepthwiseConvParams &params, const Shape &input_shape, const uint8_t *input_data, const Shape &filter_shape, const uint8_t *filter_data, const Shape &bias_shape, const int32_t *bias_data, const Shape &output_shape, uint8_t *output_data, int thread_start, int thread_end, int thread_dim)
 
template<typename T >
void ExtractPatchIntoBufferColumn (const Shape &input_shape, int w, int h, int b, int kheight, int kwidth, int stride_width, int stride_height, int pad_width, int pad_height, int in_width, int in_height, int in_depth, int single_buffer_length, int buffer_id, const T *in_data, T *conv_buffer_data, uint8_t zero_byte)
 
template<typename T >
void DilatedIm2col (const ConvParams &params, const Shape &input_shape, const T *input_data, const Shape &filter_shape, const Shape &output_shape, T *im2col_data, const int32_t *zero_bytes, const int zero_bytes_len)
 
template<typename T >
void DilatedIm2col (const ConvParams &params, uint8_t zero_byte, const Shape &input_shape, const T *input_data, const Shape &filter_shape, const Shape &output_shape, T *im2col_data)
 
template<typename T >
void Im2col (const ConvParams &params, int kheight, int kwidth, uint8_t zero_byte, const Shape &input_shape, const T *input_data, const Shape &output_shape, T *output_data)
 

Variables

std::mutex _gemmlowp_mutex
 

Typedef Documentation

◆ BinaryOpImplFloatFuncs

using nnfw::cker::optimized::BinaryOpImplFloatFuncs = std::pair<void (*)(int, const BinaryArithmeticOpParam &, const float *, const float *, float *), void (*)(int, const BinaryArithmeticOpParam &, const float, const float *, float *)>

Definition at line 670 of file BinaryArithmeticOps.h.
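In practice this pair is obtained from getBinaryOpWithActivationImplFloat(): the first member is the elementwise kernel, the second the scalar-broadcast kernel for the chosen float operator. A minimal usage sketch follows; the include path and the way params is prepared are assumptions, not taken from this page.

#include "cker/operation/optimized/BinaryArithmeticOps.h" // assumed include path

void mul_float(int flat_size, const nnfw::cker::BinaryArithmeticOpParam &params,
               const float *lhs, const float *rhs, float *out)
{
  using namespace nnfw::cker::optimized;
  // Select the elementwise / scalar-broadcast kernels for float Mul with the
  // activation clamp described by params.
  BinaryOpImplFloatFuncs funcs = getBinaryOpWithActivationImplFloat<BinaryOpFuncMulFloat>(params);
  (*funcs.first)(flat_size, params, lhs, rhs, out);        // out[i] = clamp(lhs[i] * rhs[i])
  // (*funcs.second)(flat_size, params, lhs[0], rhs, out); // out[i] = clamp(lhs[0] * rhs[i])
}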

Function Documentation

◆ Add() [1/2]

void nnfw::cker::optimized::Add (const BinaryArithmeticOpParam &params,
const Shape &input1_shape,
const float *input1_data,
const Shape &input2_shape,
const float *input2_data,
const Shape &output_shape,
float *output_data)
inline

Definition at line 699 of file BinaryArithmeticOps.h.

702{
703 const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape);
704 auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncAddFloat>(params);
705 (*implFuncs.first)(flat_size, params, input1_data, input2_data, output_data);
706}

References nnfw::cker::MatchingElementsSize(), and output_shape.

◆ Add() [2/2]

template<typename T >
std::enable_if_t< is_quant8< T >::value > nnfw::cker::optimized::Add (const BinaryArithmeticOpParam &params,
const Shape &input1_shape,
const T *input1_data,
const Shape &input2_shape,
const T *input2_data,
const Shape &output_shape,
T *output_data)
inline

Definition at line 692 of file BinaryArithmeticOps.h.

694{
695 const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape);
696 AddElementwise(flat_size, params, input1_data, input2_data, output_data);
697}

References AddElementwise(), nnfw::cker::MatchingElementsSize(), and output_shape.

Referenced by nnfw::cker::BinaryArithmeticOp(), and nnfw::cker::BinaryArithmeticOp().
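A minimal usage sketch for the float overload. It assumes nnfw::cker::Shape accepts an initializer list and that the float path only reads float_activation_min/max from the params; both are assumptions, not stated on this page.

#include <limits>

nnfw::cker::BinaryArithmeticOpParam params{};
params.float_activation_min = std::numeric_limits<float>::lowest(); // no clamping
params.float_activation_max = std::numeric_limits<float>::max();

const nnfw::cker::Shape shape{1, 2, 2, 1}; // same shape for both inputs and the output
const float in1[] = {1.f, 2.f, 3.f, 4.f};
const float in2[] = {10.f, 20.f, 30.f, 40.f};
float out[4];

nnfw::cker::optimized::Add(params, shape, in1, shape, in2, shape, out);
// out == {11, 22, 33, 44}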

◆ AddBiasAndEvalActivationFunction()

void nnfw::cker::optimized::AddBiasAndEvalActivationFunction (float output_activation_min,
float output_activation_max,
const Shape &bias_shape,
const float *bias_data,
const Shape &array_shape,
float *array_data)
inline

Definition at line 74 of file Conv.h.

78{
79 BiasAndClamp(output_activation_min, output_activation_max, bias_shape.FlatSize(), bias_data,
80 array_shape.FlatSize(), array_data);
81}

References nnfw::cker::BiasAndClamp(), and nnfw::cker::Shape::FlatSize().

Referenced by nnfw::cker::multithreaded::Conv().
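A small worked example (the shapes are illustrative and the initializer-list Shape constructor is assumed): the bias vector is added row-wise over the array, then the result is clamped to the activation range, here ReLU6.

const nnfw::cker::Shape bias_shape{3};
const nnfw::cker::Shape array_shape{2, 3};
const float bias[] = {0.5f, -1.0f, 2.0f};
float array[] = {1.f, 2.f, 3.f,
                 4.f, 5.f, 6.f};

nnfw::cker::optimized::AddBiasAndEvalActivationFunction(/*output_activation_min=*/0.0f,
                                                        /*output_activation_max=*/6.0f,
                                                        bias_shape, bias, array_shape, array);
// array == {1.5, 1.0, 5.0, 4.5, 4.0, 6.0}; the last value was clamped from 8 down to 6.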

◆ AddElementwise() [1/2]

void nnfw::cker::optimized::AddElementwise (int size,
const BinaryArithmeticOpParam &params,
const int8_t *input1_data,
const int8_t *input2_data,
int8_t *output_data)
inline

Definition at line 322 of file BinaryArithmeticOps.h.

325{
326 int i = 0;
327#ifdef USE_NEON
328 const int8x16_t output_activation_min_vector = vdupq_n_s8(params.quantized_activation_min);
329 const int8x16_t output_activation_max_vector = vdupq_n_s8(params.quantized_activation_max);
330
331 const int input1_left_shift = params.left_shift + params.input1_shift;
332 const int input2_left_shift = params.left_shift + params.input2_shift;
333 const int32x4_t input1_left_dup = vdupq_n_s32(input1_left_shift);
334 const int32x4_t input2_left_dup = vdupq_n_s32(input2_left_shift);
335
336 const int16x8_t input1_offset_dup = vdupq_n_s16(params.input1_offset);
337 const int16x8_t input2_offset_dup = vdupq_n_s16(params.input2_offset);
338
339 for (; i <= size - 16; i += 16)
340 {
341 const int8x16_t input1_val_original = vld1q_s8(input1_data + i);
342 const int8x16_t input2_val_original = vld1q_s8(input2_data + i);
343
344 const int16x8_t input1_val_s16_high = vmovl_s8(vget_high_s8(input1_val_original));
345 const int16x8_t input1_val_s16_low = vmovl_s8(vget_low_s8(input1_val_original));
346
347 const int16x8_t input2_val_s16_high = vmovl_s8(vget_high_s8(input2_val_original));
348 const int16x8_t input2_val_s16_low = vmovl_s8(vget_low_s8(input2_val_original));
349 const int16x8_t input1_val_high = vaddq_s16(input1_val_s16_high, input1_offset_dup);
350 const int16x8_t input2_val_high = vaddq_s16(input2_val_s16_high, input2_offset_dup);
351 const int16x8_t input1_val_low = vaddq_s16(input1_val_s16_low, input1_offset_dup);
352 const int16x8_t input2_val_low = vaddq_s16(input2_val_s16_low, input2_offset_dup);
353 const int16x4_t input1_val_high_high = vget_high_s16(input1_val_high);
354 const int16x4_t input1_val_high_low = vget_low_s16(input1_val_high);
355 const int16x4_t input1_val_low_high = vget_high_s16(input1_val_low);
356 const int16x4_t input1_val_low_low = vget_low_s16(input1_val_low);
357 const int16x4_t input2_val_high_high = vget_high_s16(input2_val_high);
358 const int16x4_t input2_val_high_low = vget_low_s16(input2_val_high);
359 const int16x4_t input2_val_low_high = vget_high_s16(input2_val_low);
360 const int16x4_t input2_val_low_low = vget_low_s16(input2_val_low);
361 int32x4_t x111 = vmovl_s16(input1_val_low_low);
362 int32x4_t x112 = vmovl_s16(input1_val_low_high);
363 int32x4_t x121 = vmovl_s16(input1_val_high_low);
364 int32x4_t x122 = vmovl_s16(input1_val_high_high);
365 int32x4_t x211 = vmovl_s16(input2_val_low_low);
366 int32x4_t x212 = vmovl_s16(input2_val_low_high);
367 int32x4_t x221 = vmovl_s16(input2_val_high_low);
368 int32x4_t x222 = vmovl_s16(input2_val_high_high);
369
370 x111 = vshlq_s32(x111, input1_left_dup);
371 x112 = vshlq_s32(x112, input1_left_dup);
372 x121 = vshlq_s32(x121, input1_left_dup);
373 x122 = vshlq_s32(x122, input1_left_dup);
374 x211 = vshlq_s32(x211, input2_left_dup);
375 x212 = vshlq_s32(x212, input2_left_dup);
376 x221 = vshlq_s32(x221, input2_left_dup);
377 x222 = vshlq_s32(x222, input2_left_dup);
378 x111 = vqrdmulhq_n_s32(x111, params.input1_multiplier);
379 x112 = vqrdmulhq_n_s32(x112, params.input1_multiplier);
380 x121 = vqrdmulhq_n_s32(x121, params.input1_multiplier);
381 x122 = vqrdmulhq_n_s32(x122, params.input1_multiplier);
382 x211 = vqrdmulhq_n_s32(x211, params.input2_multiplier);
383 x212 = vqrdmulhq_n_s32(x212, params.input2_multiplier);
384 x221 = vqrdmulhq_n_s32(x221, params.input2_multiplier);
385 x222 = vqrdmulhq_n_s32(x222, params.input2_multiplier);
386 int32x4_t s11 = vaddq_s32(x111, x211);
387 int32x4_t s12 = vaddq_s32(x112, x212);
388 int32x4_t s21 = vaddq_s32(x121, x221);
389 int32x4_t s22 = vaddq_s32(x122, x222);
390 s11 = vqrdmulhq_n_s32(s11, params.output_multiplier);
391 s12 = vqrdmulhq_n_s32(s12, params.output_multiplier);
392 s21 = vqrdmulhq_n_s32(s21, params.output_multiplier);
393 s22 = vqrdmulhq_n_s32(s22, params.output_multiplier);
394 using gemmlowp::RoundingDivideByPOT;
395 s11 = RoundingDivideByPOT(s11, -params.output_shift);
396 s12 = RoundingDivideByPOT(s12, -params.output_shift);
397 s21 = RoundingDivideByPOT(s21, -params.output_shift);
398 s22 = RoundingDivideByPOT(s22, -params.output_shift);
399 const int16x4_t s11_narrowed = vmovn_s32(s11);
400 const int16x4_t s12_narrowed = vmovn_s32(s12);
401 const int16x4_t s21_narrowed = vmovn_s32(s21);
402 const int16x4_t s22_narrowed = vmovn_s32(s22);
403 const int16x8_t s1 =
404 vaddq_s16(vcombine_s16(s11_narrowed, s12_narrowed), vdupq_n_s16(params.output_offset));
405 const int16x8_t s2 =
406 vaddq_s16(vcombine_s16(s21_narrowed, s22_narrowed), vdupq_n_s16(params.output_offset));
407 const int8x16_t s = vcombine_s8(vqmovn_s16(s1), vqmovn_s16(s2));
408
409 const int8x16_t clamped =
410 vmaxq_s8(output_activation_min_vector, vminq_s8(output_activation_max_vector, s));
411 vst1q_s8(output_data + i, clamped);
412 }
413#endif // NEON
414
415 for (; i < size; ++i)
416 {
417 const int32_t input1_val = params.input1_offset + input1_data[i];
418 const int32_t input2_val = params.input2_offset + input2_data[i];
419 const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
420 const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
421 const int32_t scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp(
422 shifted_input1_val, params.input1_multiplier, params.input1_shift);
423 const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp(
424 shifted_input2_val, params.input2_multiplier, params.input2_shift);
425 const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
426 const int32_t raw_output = MultiplyByQuantizedMultiplierSmallerThanOneExp(
427 raw_sum, params.output_multiplier, params.output_shift) +
428 params.output_offset;
429 const int32_t clamped_output = std::min(params.quantized_activation_max,
430 std::max(params.quantized_activation_min, raw_output));
431 output_data[i] = static_cast<int8_t>(clamped_output);
432 }
433}

References nnfw::cker::BinaryArithmeticOpParam::input1_multiplier, nnfw::cker::BinaryArithmeticOpParam::input1_offset, nnfw::cker::BinaryArithmeticOpParam::input1_shift, nnfw::cker::BinaryArithmeticOpParam::input2_multiplier, nnfw::cker::BinaryArithmeticOpParam::input2_offset, nnfw::cker::BinaryArithmeticOpParam::input2_shift, nnfw::cker::BinaryArithmeticOpParam::left_shift, nnfw::cker::MultiplyByQuantizedMultiplierSmallerThanOneExp(), nnfw::cker::BinaryArithmeticOpParam::output_multiplier, nnfw::cker::BinaryArithmeticOpParam::output_offset, nnfw::cker::BinaryArithmeticOpParam::output_shift, nnfw::cker::BinaryArithmeticOpParam::quantized_activation_max, nnfw::cker::BinaryArithmeticOpParam::quantized_activation_min, and size.

◆ AddElementwise() [2/2]

void nnfw::cker::optimized::AddElementwise (int size,
const BinaryArithmeticOpParam &params,
const uint8_t *input1_data,
const uint8_t *input2_data,
uint8_t *output_data)
inline

Definition at line 246 of file BinaryArithmeticOps.h.

249{
250 int i = 0;
251
252#ifdef USE_NEON
253 const uint8x8_t output_activation_min_vector = vdup_n_u8(params.quantized_activation_min);
254 const uint8x8_t output_activation_max_vector = vdup_n_u8(params.quantized_activation_max);
255 for (; i <= size - 8; i += 8)
256 {
257 const uint8x8_t input1_val_original = vld1_u8(input1_data + i);
258 const uint8x8_t input2_val_original = vld1_u8(input2_data + i);
259 const int16x8_t input1_val_s16 = vreinterpretq_s16_u16(vmovl_u8(input1_val_original));
260 const int16x8_t input2_val_s16 = vreinterpretq_s16_u16(vmovl_u8(input2_val_original));
261 const int16x8_t input1_val = vaddq_s16(input1_val_s16, vdupq_n_s16(params.input1_offset));
262 const int16x8_t input2_val = vaddq_s16(input2_val_s16, vdupq_n_s16(params.input2_offset));
263 const int16x4_t input1_val_high = vget_high_s16(input1_val);
264 const int16x4_t input1_val_low = vget_low_s16(input1_val);
265 const int16x4_t input2_val_high = vget_high_s16(input2_val);
266 const int16x4_t input2_val_low = vget_low_s16(input2_val);
267 int32x4_t x11 = vmovl_s16(input1_val_low);
268 int32x4_t x12 = vmovl_s16(input1_val_high);
269 int32x4_t x21 = vmovl_s16(input2_val_low);
270 int32x4_t x22 = vmovl_s16(input2_val_high);
271 const int32x4_t left_shift_dup = vdupq_n_s32(params.left_shift);
272 x11 = vshlq_s32(x11, left_shift_dup);
273 x12 = vshlq_s32(x12, left_shift_dup);
274 x21 = vshlq_s32(x21, left_shift_dup);
275 x22 = vshlq_s32(x22, left_shift_dup);
276 x11 = vqrdmulhq_n_s32(x11, params.input1_multiplier);
277 x12 = vqrdmulhq_n_s32(x12, params.input1_multiplier);
278 x21 = vqrdmulhq_n_s32(x21, params.input2_multiplier);
279 x22 = vqrdmulhq_n_s32(x22, params.input2_multiplier);
280 const int32x4_t input1_shift_dup = vdupq_n_s32(params.input1_shift);
281 const int32x4_t input2_shift_dup = vdupq_n_s32(params.input2_shift);
282 x11 = vshlq_s32(x11, input1_shift_dup);
283 x12 = vshlq_s32(x12, input1_shift_dup);
284 x21 = vshlq_s32(x21, input2_shift_dup);
285 x22 = vshlq_s32(x22, input2_shift_dup);
286 int32x4_t s1 = vaddq_s32(x11, x21);
287 int32x4_t s2 = vaddq_s32(x12, x22);
288 s1 = vqrdmulhq_n_s32(s1, params.output_multiplier);
289 s2 = vqrdmulhq_n_s32(s2, params.output_multiplier);
290 using gemmlowp::RoundingDivideByPOT;
291 s1 = RoundingDivideByPOT(s1, -params.output_shift);
292 s2 = RoundingDivideByPOT(s2, -params.output_shift);
293 const int16x4_t s1_narrowed = vmovn_s32(s1);
294 const int16x4_t s2_narrowed = vmovn_s32(s2);
295 const int16x8_t s =
296 vaddq_s16(vcombine_s16(s1_narrowed, s2_narrowed), vdupq_n_s16(params.output_offset));
297 const uint8x8_t clamped =
298 vmax_u8(output_activation_min_vector, vmin_u8(output_activation_max_vector, vqmovun_s16(s)));
299 vst1_u8(output_data + i, clamped);
300 }
301#endif // NEON
302 for (; i < size; ++i)
303 {
304 const int32_t input1_val = params.input1_offset + input1_data[i];
305 const int32_t input2_val = params.input2_offset + input2_data[i];
306 const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
307 const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
308 const int32_t scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp(
309 shifted_input1_val, params.input1_multiplier, params.input1_shift);
310 const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp(
311 shifted_input2_val, params.input2_multiplier, params.input2_shift);
312 const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
313 const int32_t raw_output = MultiplyByQuantizedMultiplierSmallerThanOneExp(
314 raw_sum, params.output_multiplier, params.output_shift) +
315 params.output_offset;
316 const int32_t clamped_output = std::min(params.quantized_activation_max,
317 std::max(params.quantized_activation_min, raw_output));
318 output_data[i] = static_cast<uint8_t>(clamped_output);
319 }
320}

References nnfw::cker::BinaryArithmeticOpParam::input1_multiplier, nnfw::cker::BinaryArithmeticOpParam::input1_offset, nnfw::cker::BinaryArithmeticOpParam::input1_shift, nnfw::cker::BinaryArithmeticOpParam::input2_multiplier, nnfw::cker::BinaryArithmeticOpParam::input2_offset, nnfw::cker::BinaryArithmeticOpParam::input2_shift, nnfw::cker::BinaryArithmeticOpParam::left_shift, nnfw::cker::MultiplyByQuantizedMultiplierSmallerThanOneExp(), nnfw::cker::BinaryArithmeticOpParam::output_multiplier, nnfw::cker::BinaryArithmeticOpParam::output_offset, nnfw::cker::BinaryArithmeticOpParam::output_shift, nnfw::cker::BinaryArithmeticOpParam::quantized_activation_max, nnfw::cker::BinaryArithmeticOpParam::quantized_activation_min, and size.

Referenced by Add(), and BroadcastAddDispatch().
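The scalar tail of both overloads is exactly the per-element requantized addition that quant8_sum() performs; writing it out as a standalone helper may make the NEON block easier to follow. This is a restatement of the loop above, not an independent specification.

#include <algorithm>
#include <cstdint>

int32_t add_one_quant8(const nnfw::cker::BinaryArithmeticOpParam &p, int32_t a, int32_t b)
{
  using nnfw::cker::MultiplyByQuantizedMultiplierSmallerThanOneExp;
  // Recenter each operand, upscale by 2^left_shift, then rescale to a common grid.
  const int32_t a_scaled = MultiplyByQuantizedMultiplierSmallerThanOneExp(
    (p.input1_offset + a) * (1 << p.left_shift), p.input1_multiplier, p.input1_shift);
  const int32_t b_scaled = MultiplyByQuantizedMultiplierSmallerThanOneExp(
    (p.input2_offset + b) * (1 << p.left_shift), p.input2_multiplier, p.input2_shift);
  // Add on the common grid, rescale to the output grid, re-add the output zero point.
  const int32_t raw = MultiplyByQuantizedMultiplierSmallerThanOneExp(
                        a_scaled + b_scaled, p.output_multiplier, p.output_shift) +
                      p.output_offset;
  // Clamp to the fused activation range before the caller narrows to (u)int8.
  return std::min(p.quantized_activation_max, std::max(p.quantized_activation_min, raw));
}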

◆ AddScalarBroadcast() [1/2]

void nnfw::cker::optimized::AddScalarBroadcast (int size,
const BinaryArithmeticOpParam &params,
int8_t input1_data,
const int8_t *input2_data,
int8_t *output_data)
inline

Definition at line 727 of file BinaryArithmeticOps.h.

729{
730 using gemmlowp::RoundingDivideByPOT;
731 int i = 0;
732#ifdef USE_NEON
733 const int32x4_t left_shift_dup = vdupq_n_s32(params.left_shift);
734 const int8x8_t output_activation_min_vector = vdup_n_s8(params.quantized_activation_min);
735 const int8x8_t output_activation_max_vector = vdup_n_s8(params.quantized_activation_max);
736
737 // Process broadcast scalar.
738 const int8x8_t input1_val_original = vdup_n_s8(input1_data);
739 const int16x8_t input1_val_s16 = vmovl_s8(input1_val_original);
740 const int16x8_t input1_val = vaddq_s16(input1_val_s16, vdupq_n_s16(params.input1_offset));
741 const int16x4_t input1_val_high = vget_high_s16(input1_val);
742 const int16x4_t input1_val_low = vget_low_s16(input1_val);
743 int32x4_t x11 = vmovl_s16(input1_val_low);
744 int32x4_t x12 = vmovl_s16(input1_val_high);
745 x11 = vshlq_s32(x11, left_shift_dup);
746 x12 = vshlq_s32(x12, left_shift_dup);
747 x11 = vqrdmulhq_n_s32(x11, params.input1_multiplier);
748 x12 = vqrdmulhq_n_s32(x12, params.input1_multiplier);
749 const int32x4_t input1_shift_dup = vdupq_n_s32(params.input1_shift);
750 x11 = vshlq_s32(x11, input1_shift_dup);
751 x12 = vshlq_s32(x12, input1_shift_dup);
752
753 for (; i <= size - 8; i += 8)
754 {
755 const int8x8_t input2_val_original = vld1_s8(input2_data + i);
756 const int16x8_t input2_val_s16 = vmovl_s8(input2_val_original);
757 const int16x8_t input2_val = vaddq_s16(input2_val_s16, vdupq_n_s16(params.input2_offset));
758 const int16x4_t input2_val_high = vget_high_s16(input2_val);
759 const int16x4_t input2_val_low = vget_low_s16(input2_val);
760 int32x4_t x21 = vmovl_s16(input2_val_low);
761 int32x4_t x22 = vmovl_s16(input2_val_high);
762 x21 = vshlq_s32(x21, left_shift_dup);
763 x22 = vshlq_s32(x22, left_shift_dup);
764 x21 = vqrdmulhq_n_s32(x21, params.input2_multiplier);
765 x22 = vqrdmulhq_n_s32(x22, params.input2_multiplier);
766 const int32x4_t input2_shift_dup = vdupq_n_s32(params.input2_shift);
767 x21 = vshlq_s32(x21, input2_shift_dup);
768 x22 = vshlq_s32(x22, input2_shift_dup);
769 int32x4_t s1 = vaddq_s32(x11, x21);
770 int32x4_t s2 = vaddq_s32(x12, x22);
771 s1 = vqrdmulhq_n_s32(s1, params.output_multiplier);
772 s2 = vqrdmulhq_n_s32(s2, params.output_multiplier);
773 s1 = RoundingDivideByPOT(s1, -params.output_shift);
774 s2 = RoundingDivideByPOT(s2, -params.output_shift);
775 const int16x4_t s1_narrowed = vmovn_s32(s1);
776 const int16x4_t s2_narrowed = vmovn_s32(s2);
777 const int16x8_t s =
778 vaddq_s16(vcombine_s16(s1_narrowed, s2_narrowed), vdupq_n_s16(params.output_offset));
779 const int8x8_t clamped =
780 vmax_s8(output_activation_min_vector, vmin_s8(output_activation_max_vector, vqmovn_s16(s)));
781 vst1_s8(output_data + i, clamped);
782 }
783#endif // NEON
784
785 if (i < size)
786 {
787 // Process broadcast scalar.
788 const int32_t input1_val = params.input1_offset + input1_data;
789 const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
790 const int32_t scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp(
791 shifted_input1_val, params.input1_multiplier, params.input1_shift);
792
793 for (; i < size; ++i)
794 {
795 const int32_t input2_val = params.input2_offset + input2_data[i];
796 const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
797 const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp(
798 shifted_input2_val, params.input2_multiplier, params.input2_shift);
799 const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
800 const int32_t raw_output = MultiplyByQuantizedMultiplierSmallerThanOneExp(
801 raw_sum, params.output_multiplier, params.output_shift) +
802 params.output_offset;
803 const int32_t clamped_output = std::min(
804 params.quantized_activation_max, std::max(params.quantized_activation_min, raw_output));
805 output_data[i] = static_cast<int8_t>(clamped_output);
806 }
807 }
808}

References nnfw::cker::BinaryArithmeticOpParam::input1_multiplier, nnfw::cker::BinaryArithmeticOpParam::input1_offset, nnfw::cker::BinaryArithmeticOpParam::input1_shift, nnfw::cker::BinaryArithmeticOpParam::input2_multiplier, nnfw::cker::BinaryArithmeticOpParam::input2_offset, nnfw::cker::BinaryArithmeticOpParam::input2_shift, nnfw::cker::BinaryArithmeticOpParam::left_shift, nnfw::cker::MultiplyByQuantizedMultiplierSmallerThanOneExp(), nnfw::cker::BinaryArithmeticOpParam::output_multiplier, nnfw::cker::BinaryArithmeticOpParam::output_offset, nnfw::cker::BinaryArithmeticOpParam::output_shift, nnfw::cker::BinaryArithmeticOpParam::quantized_activation_max, nnfw::cker::BinaryArithmeticOpParam::quantized_activation_min, and size.

◆ AddScalarBroadcast() [2/2]

void nnfw::cker::optimized::AddScalarBroadcast (int size,
const BinaryArithmeticOpParam &params,
uint8_t broadcast_value,
const uint8_t *input2_data,
uint8_t *output_data)
inline

Definition at line 711 of file BinaryArithmeticOps.h.

714{
715 int i = 0;
716 int32_t clamped_output;
717 for (; i < size; ++i)
718 {
719 clamped_output = quant8_sum(params, broadcast_value, input2_data[i]);
720 output_data[i] = static_cast<uint8_t>(clamped_output);
721 }
722}

References quant8_sum(), and size.

Referenced by BroadcastAddDispatch().

◆ BinaryBroadcastFiveFold() [1/2]

template<typename ElementwiseF , typename ScalarBroadcastF , typename T >
void nnfw::cker::optimized::BinaryBroadcastFiveFold (const BinaryArithmeticOpParam &params,
bool switch_inputs,
const Shape &,
const T *unswitched_input1_data,
const Shape &,
const T *unswitched_input2_data,
const Shape &,
T *output_data,
ElementwiseF elementwise_f,
ScalarBroadcastF scalar_broadcast_f)
inline

Definition at line 40 of file BinaryArithmeticOps.h.

47{
48 const T *input1_data = switch_inputs ? unswitched_input2_data : unswitched_input1_data;
49 const T *input2_data = switch_inputs ? unswitched_input1_data : unswitched_input2_data;
50
51 // Fivefold nested loops. The second input resets its position for each
52 // iteration of the second loop. The first input resets its position at the
53 // beginning of the fourth loop. The innermost loop is an elementwise add of
54 // sections of the arrays.
55 T *output_data_ptr = output_data;
56 const T *input1_data_ptr = input1_data;
57 const T *input2_data_reset = input2_data;
58 // In the fivefold pattern, y0, y2 and y4 are not broadcast, and so shared
59 // between input shapes. y3 for input 1 is always broadcast, and so the
60 // dimension there is 1, whereas optionally y1 might be broadcast for input 2.
61 // Put another way,
62 // input1.shape.FlatSize = y0 * y1 * y2 * y4,
63 // input2.shape.FlatSize = y0 * y2 * y3 * y4.
64 int y0 = params.broadcast_shape[0];
65 int y1 = params.broadcast_shape[1];
66 int y2 = params.broadcast_shape[2];
67 int y3 = params.broadcast_shape[3];
68 int y4 = params.broadcast_shape[4];
69 if (y4 > 1)
70 {
71 // General fivefold pattern, with y4 > 1 so there is a non-broadcast inner
72 // dimension.
73 for (int i0 = 0; i0 < y0; ++i0)
74 {
75 const T *input2_data_ptr = nullptr;
76 for (int i1 = 0; i1 < y1; ++i1)
77 {
78 input2_data_ptr = input2_data_reset;
79 for (int i2 = 0; i2 < y2; ++i2)
80 {
81 for (int i3 = 0; i3 < y3; ++i3)
82 {
83 elementwise_f(y4, params, input1_data_ptr, input2_data_ptr, output_data_ptr);
84 input2_data_ptr += y4;
85 output_data_ptr += y4;
86 }
87 // We have broadcast y4 of input1 data y3 times, and now move on.
88 input1_data_ptr += y4;
89 }
90 }
91 // We have broadcast y2*y3*y4 of input2 data y1 times, and now move on.
92 input2_data_reset = input2_data_ptr;
93 }
94 }
95 else
96 {
97 // Special case of y4 == 1, in which the innermost loop is a single element
98 // and can be combined with the next (y3) as an inner broadcast.
99 //
100 // Note that this handles the case of pure scalar broadcast when
101 // y0 == y1 == y2 == 1. With low overhead it handles cases such as scalar
102 // broadcast with batch (as y2 > 1).
103 //
104 // NOTE The process is the same as the above general case except simplified
105 // for y4 == 1 and the loop over y3 is contained within the
106 // AddScalarBroadcast function.
107 for (int i0 = 0; i0 < y0; ++i0)
108 {
109 const T *input2_data_ptr = nullptr;
110 for (int i1 = 0; i1 < y1; ++i1)
111 {
112 input2_data_ptr = input2_data_reset;
113 for (int i2 = 0; i2 < y2; ++i2)
114 {
115 scalar_broadcast_f(y3, params, *input1_data_ptr, input2_data_ptr, output_data_ptr);
116 input2_data_ptr += y3;
117 output_data_ptr += y3;
118 input1_data_ptr += 1;
119 }
120 }
121 input2_data_reset = input2_data_ptr;
122 }
123 }
124}

References nnfw::cker::BinaryArithmeticOpParam::broadcast_shape.

Referenced by BroadcastAddDispatch(), BroadcastAddDispatch(), BroadcastDivDispatch(), BroadcastMulDispatch(), BroadcastMulDispatch(), and BroadcastSubDispatch().
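A concrete decomposition may help; the numbers below are an illustrative assumption about how the caller fills params.broadcast_shape and are not taken from this page.

  input1 shape : [2, 3, 1, 4]   (broadcast along its third dimension)
  input2 shape : [2, 1, 5, 4]   (broadcast along its second dimension)
  output shape : [2, 3, 5, 4]

  broadcast_shape = {y0, y1, y2, y3, y4} = {2, 3, 1, 5, 4}

  input1.FlatSize() == y0 * y1 * y2 * y4 == 24
  input2.FlatSize() == y0 * y2 * y3 * y4 == 40
  output flat size  == y0 * y1 * y2 * y3 * y4 == 120

Since y4 == 4 > 1, elementwise_f is applied to contiguous runs of 4 values; if y4 were 1, scalar_broadcast_f would instead handle each run of y3 values with a single broadcast scalar taken from input1.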

◆ BinaryBroadcastFiveFold() [2/2]

template<typename ElementwiseF , typename ScalarBroadcastF , typename T >
void nnfw::cker::optimized::BinaryBroadcastFiveFold (const BinaryArithmeticOpParam &unswitched_params,
const Shape &,
const T *unswitched_input1_data,
const Shape &,
const T *unswitched_input2_data,
const Shape &,
T *output_data,
ElementwiseF elementwise_f,
ScalarBroadcastF scalar_broadcast_f)
inline

Definition at line 128 of file BinaryArithmeticOps.h.

135{
136 BinaryArithmeticOpParam switched_params = unswitched_params;
137 switched_params.input1_offset = unswitched_params.input2_offset;
138 switched_params.input1_multiplier = unswitched_params.input2_multiplier;
139 switched_params.input1_shift = unswitched_params.input2_shift;
140 switched_params.input2_offset = unswitched_params.input1_offset;
141 switched_params.input2_multiplier = unswitched_params.input1_multiplier;
142 switched_params.input2_shift = unswitched_params.input1_shift;
143
144 const bool use_unswitched =
145 unswitched_params.broadcast_category == BroadcastableOpCategory::kFirstInputBroadcastsFast;
146
147 const BinaryArithmeticOpParam &params = use_unswitched ? unswitched_params : switched_params;
148 const T *input1_data = use_unswitched ? unswitched_input1_data : unswitched_input2_data;
149 const T *input2_data = use_unswitched ? unswitched_input2_data : unswitched_input1_data;
150
151 // Fivefold nested loops. The second input resets its position for each
152 // iteration of the second loop. The first input resets its position at the
153 // beginning of the fourth loop. The innermost loop is an elementwise add of
154 // sections of the arrays.
155 T *output_data_ptr = output_data;
156 const T *input1_data_ptr = input1_data;
157 const T *input2_data_reset = input2_data;
158 // In the fivefold pattern, y0, y2 and y4 are not broadcast, and so shared
159 // between input shapes. y3 for input 1 is always broadcast, and so the
160 // dimension there is 1, whereas optionally y1 might be broadcast for
161 // input 2. Put another way, input1.shape.FlatSize = y0 * y1 * y2 * y4,
162 // input2.shape.FlatSize = y0 * y2 * y3 * y4.
163 int y0 = params.broadcast_shape[0];
164 int y1 = params.broadcast_shape[1];
165 int y2 = params.broadcast_shape[2];
166 int y3 = params.broadcast_shape[3];
167 int y4 = params.broadcast_shape[4];
168 if (y4 > 1)
169 {
170 // General fivefold pattern, with y4 > 1 so there is a non-broadcast inner
171 // dimension.
172 for (int i0 = 0; i0 < y0; ++i0)
173 {
174 const T *input2_data_ptr = nullptr;
175 for (int i1 = 0; i1 < y1; ++i1)
176 {
177 input2_data_ptr = input2_data_reset;
178 for (int i2 = 0; i2 < y2; ++i2)
179 {
180 for (int i3 = 0; i3 < y3; ++i3)
181 {
182 elementwise_f(y4, params, input1_data_ptr, input2_data_ptr, output_data_ptr);
183 input2_data_ptr += y4;
184 output_data_ptr += y4;
185 }
186 // We have broadcast y4 of input1 data y3 times, and now move on.
187 input1_data_ptr += y4;
188 }
189 }
190 // We have broadcast y2*y3*y4 of input2 data y1 times, and now move on.
191 input2_data_reset = input2_data_ptr;
192 }
193 }
194 else
195 {
196 // Special case of y4 == 1, in which the innermost loop is a single
197 // element and can be combined with the next (y3) as an inner broadcast.
198 //
199 // Note that this handles the case of pure scalar broadcast when
200 // y0 == y1 == y2 == 1. With low overhead it handles cases such as scalar
201 // broadcast with batch (as y2 > 1).
202 //
203 // NOTE The process is the same as the above general case except
204 // simplified for y4 == 1 and the loop over y3 is contained within the
205 // AddScalarBroadcast function.
206 for (int i0 = 0; i0 < y0; ++i0)
207 {
208 const T *input2_data_ptr = nullptr;
209 for (int i1 = 0; i1 < y1; ++i1)
210 {
211 input2_data_ptr = input2_data_reset;
212 for (int i2 = 0; i2 < y2; ++i2)
213 {
214 scalar_broadcast_f(y3, params, *input1_data_ptr, input2_data_ptr, output_data_ptr);
215 input2_data_ptr += y3;
216 output_data_ptr += y3;
217 input1_data_ptr += 1;
218 }
219 }
220 input2_data_reset = input2_data_ptr;
221 }
222 }
223}

References nnfw::cker::BinaryArithmeticOpParam::broadcast_category, nnfw::cker::BinaryArithmeticOpParam::broadcast_shape, nnfw::cker::BinaryArithmeticOpParam::input1_multiplier, nnfw::cker::BinaryArithmeticOpParam::input1_offset, nnfw::cker::BinaryArithmeticOpParam::input1_shift, nnfw::cker::BinaryArithmeticOpParam::input2_multiplier, nnfw::cker::BinaryArithmeticOpParam::input2_offset, nnfw::cker::BinaryArithmeticOpParam::input2_shift, and nnfw::cker::kFirstInputBroadcastsFast.

◆ BinaryOpElementwise()

template<class OPERATOR , class ACTIVATION >
void nnfw::cker::optimized::BinaryOpElementwise (int size,
const BinaryArithmeticOpParam &params,
const float *input1_data,
const float *input2_data,
float *output_data)
inline

Definition at line 562 of file BinaryArithmeticOps.h.

565{
566 int i = 0;
567
568#ifdef USE_NEON
569 const auto activation_min = vdupq_n_f32(params.float_activation_min);
570 const auto activation_max = vdupq_n_f32(params.float_activation_max);
571 for (; i <= size - 16; i += 16)
572 {
573 auto a10 = vld1q_f32(input1_data + i);
574 auto a11 = vld1q_f32(input1_data + i + 4);
575 auto a12 = vld1q_f32(input1_data + i + 8);
576 auto a13 = vld1q_f32(input1_data + i + 12);
577 auto a20 = vld1q_f32(input2_data + i);
578 auto a21 = vld1q_f32(input2_data + i + 4);
579 auto a22 = vld1q_f32(input2_data + i + 8);
580 auto a23 = vld1q_f32(input2_data + i + 12);
581 auto x0 = OPERATOR::calculate(a10, a20);
582 auto x1 = OPERATOR::calculate(a11, a21);
583 auto x2 = OPERATOR::calculate(a12, a22);
584 auto x3 = OPERATOR::calculate(a13, a23);
585 x0 = ACTIVATION::applyFloor(x0, activation_min);
586 x1 = ACTIVATION::applyFloor(x1, activation_min);
587 x2 = ACTIVATION::applyFloor(x2, activation_min);
588 x3 = ACTIVATION::applyFloor(x3, activation_min);
589 x0 = ACTIVATION::applyCeiling(x0, activation_max);
590 x1 = ACTIVATION::applyCeiling(x1, activation_max);
591 x2 = ACTIVATION::applyCeiling(x2, activation_max);
592 x3 = ACTIVATION::applyCeiling(x3, activation_max);
593 vst1q_f32(output_data + i, x0);
594 vst1q_f32(output_data + i + 4, x1);
595 vst1q_f32(output_data + i + 8, x2);
596 vst1q_f32(output_data + i + 12, x3);
597 }
598 for (; i <= size - 4; i += 4)
599 {
600 auto a1 = vld1q_f32(input1_data + i);
601 auto a2 = vld1q_f32(input2_data + i);
602 auto x = OPERATOR::calculate(a1, a2); // vaddq
603 auto x_clamped =
604 ACTIVATION::applyCeiling(ACTIVATION::applyFloor(x, activation_min), activation_max);
605 vst1q_f32(output_data + i, x_clamped);
606 }
607#endif // USE_NEON
608 for (; i < size; i++)
609 {
610 auto x = OPERATOR::calculate(input1_data[i], input2_data[i]);
611 output_data[i] = ACTIVATION::applyCeiling(
612 ACTIVATION::applyFloor(x, params.float_activation_min), params.float_activation_max);
613 }
614}

References nnfw::cker::BinaryArithmeticOpParam::float_activation_max, nnfw::cker::BinaryArithmeticOpParam::float_activation_min, and size.
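Callers normally reach this template through getBinaryOpWithActivationImplFloat(), which picks the ACTIVATION type from the params. A direct instantiation with the structs listed under Data Structures would look roughly like the sketch below; their exact interfaces are not shown on this page, so treat it as an assumption.

nnfw::cker::BinaryArithmeticOpParam params{};
params.float_activation_min = 0.0f; // ReLU6-style clamp
params.float_activation_max = 6.0f;

const float input1[] = {1.f, -2.f, 3.f, 10.f};
const float input2[] = {1.f, 1.f, 1.f, 1.f};
float output[4];

nnfw::cker::optimized::BinaryOpElementwise<nnfw::cker::optimized::BinaryOpFuncAddFloat,
                                           nnfw::cker::optimized::BinaryOpActivationFloatMinMax>(
  4, params, input1, input2, output);
// output == {2, 0, 4, 6} given the [0, 6] clamp above.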

◆ BinaryOpScalarBroadcast()

template<class OPERATOR , class ACTIVATION >
void nnfw::cker::optimized::BinaryOpScalarBroadcast (int size,
const BinaryArithmeticOpParam &params,
const float broadcast_value,
const float *input2_data,
float *output_data)
inline

Definition at line 620 of file BinaryArithmeticOps.h.

623{
624 int i = 0;
625
626#ifdef USE_NEON
627 const auto activation_min = vdupq_n_f32(params.float_activation_min);
628 const auto activation_max = vdupq_n_f32(params.float_activation_max);
629 const auto broadcast_value_dup = vdupq_n_f32(broadcast_value);
630 for (; i <= size - 16; i += 16)
631 {
632 auto a20 = vld1q_f32(input2_data + i);
633 auto a21 = vld1q_f32(input2_data + i + 4);
634 auto a22 = vld1q_f32(input2_data + i + 8);
635 auto a23 = vld1q_f32(input2_data + i + 12);
636 auto x0 = OPERATOR::calculate(broadcast_value_dup, a20);
637 auto x1 = OPERATOR::calculate(broadcast_value_dup, a21);
638 auto x2 = OPERATOR::calculate(broadcast_value_dup, a22);
639 auto x3 = OPERATOR::calculate(broadcast_value_dup, a23);
640 x0 = ACTIVATION::applyFloor(x0, activation_min);
641 x1 = ACTIVATION::applyFloor(x1, activation_min);
642 x2 = ACTIVATION::applyFloor(x2, activation_min);
643 x3 = ACTIVATION::applyFloor(x3, activation_min);
644 x0 = ACTIVATION::applyCeiling(x0, activation_max);
645 x1 = ACTIVATION::applyCeiling(x1, activation_max);
646 x2 = ACTIVATION::applyCeiling(x2, activation_max);
647 x3 = ACTIVATION::applyCeiling(x3, activation_max);
648 vst1q_f32(output_data + i, x0);
649 vst1q_f32(output_data + i + 4, x1);
650 vst1q_f32(output_data + i + 8, x2);
651 vst1q_f32(output_data + i + 12, x3);
652 }
653 for (; i <= size - 4; i += 4)
654 {
655 auto a2 = vld1q_f32(input2_data + i);
656 auto x = OPERATOR::calculate(broadcast_value_dup, a2);
657 auto x_clamped =
658 ACTIVATION::applyCeiling(ACTIVATION::applyFloor(x, activation_min), activation_max);
659 vst1q_f32(output_data + i, x_clamped);
660 }
661#endif // USE_NEON
662 for (; i < size; i++)
663 {
664 auto x = OPERATOR::calculate(broadcast_value, input2_data[i]);
665 output_data[i] = ACTIVATION::applyCeiling(
666 ACTIVATION::applyFloor(x, params.float_activation_min), params.float_activation_max);
667 }
668}

References nnfw::cker::BinaryArithmeticOpParam::float_activation_max, nnfw::cker::BinaryArithmeticOpParam::float_activation_min, and size.

◆ BroadcastAddDispatch() [1/2]

void nnfw::cker::optimized::BroadcastAddDispatch (const BinaryArithmeticOpParam &params,
const Shape &input1_shape,
const float *input1_data,
const Shape &input2_shape,
const float *input2_data,
const Shape &output_shape,
float *output_data)
inline

Definition at line 835 of file BinaryArithmeticOps.h.

839{
840 if (params.broadcast_category == BroadcastableOpCategory::kGenericBroadcast)
841 {
842 const std::function<float(const float &, const float &)> fn =
843 [](const float &a, const float &b) -> float { return a + b; };
844 reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape,
845 input2_data, output_shape, output_data, fn);
846 }
847 else
848 {
849 auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncAddFloat>(params);
850
851 BinaryBroadcastFiveFold(
852 params, params.broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast,
853 input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data,
854 implFuncs.first, implFuncs.second);
855 }
856}

References BinaryBroadcastFiveFold(), nnfw::cker::BinaryArithmeticOpParam::broadcast_category, nnfw::cker::reference::BroadcastBinaryArithmeticOpSlow(), nnfw::cker::kGenericBroadcast, nnfw::cker::kSecondInputBroadcastsFast, and output_shape.
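params.broadcast_category (and, on the fast path, params.broadcast_shape) must be prepared by the caller; per the Referenced-by entries elsewhere on this page, nnfw::cker::BroadcastBinaryArithmeticOp() is the usual entry point that does so. A hand-rolled call that forces the generic slow path could look like this sketch (initializer-list Shape construction is assumed):

#include <limits>

nnfw::cker::BinaryArithmeticOpParam params{};
params.float_activation_min = std::numeric_limits<float>::lowest();
params.float_activation_max = std::numeric_limits<float>::max();
params.broadcast_category = nnfw::cker::BroadcastableOpCategory::kGenericBroadcast; // force the slow path

const nnfw::cker::Shape lhs_shape{1, 2, 2, 1};
const nnfw::cker::Shape rhs_shape{1, 1, 1, 1};
const nnfw::cker::Shape out_shape{1, 2, 2, 1};
const float lhs[] = {1.f, 2.f, 3.f, 4.f};
const float rhs[] = {10.f};
float out[4];

nnfw::cker::optimized::BroadcastAddDispatch(params, lhs_shape, lhs, rhs_shape, rhs, out_shape, out);
// out == {11, 12, 13, 14}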

◆ BroadcastAddDispatch() [2/2]

template<typename T >
std::enable_if_t< is_quant8< T >::value > nnfw::cker::optimized::BroadcastAddDispatch (const BinaryArithmeticOpParam &params,
const Shape &input1_shape,
const T *input1_data,
const Shape &input2_shape,
const T *input2_data,
const Shape &output_shape,
T *output_data)
inline

Definition at line 812 of file BinaryArithmeticOps.h.

815{
816 if (params.broadcast_category == BroadcastableOpCategory::kGenericBroadcast)
817 {
818 const std::function<T(const BinaryArithmeticOpParam &, const T &, const T &)> fn =
819 [](const BinaryArithmeticOpParam &params, const T &a, const T &b) {
820 return static_cast<T>(quant8_sum(params, a, b));
821 };
822 reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape,
823 input2_data, output_shape, output_data, fn);
824 return;
825 }
826
827 BinaryBroadcastFiveFold(
828 params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data,
829 static_cast<void (*)(int, const BinaryArithmeticOpParam &, const T *, const T *, T *)>(
830 AddElementwise),
831 static_cast<void (*)(int, const BinaryArithmeticOpParam &, T, const T *, T *)>(
832 AddScalarBroadcast));
833}

References AddElementwise(), AddScalarBroadcast(), BinaryBroadcastFiveFold(), nnfw::cker::BinaryArithmeticOpParam::broadcast_category, nnfw::cker::reference::BroadcastBinaryArithmeticOpSlow(), nnfw::cker::kGenericBroadcast, output_shape, and quant8_sum().

Referenced by nnfw::cker::BroadcastBinaryArithmeticOp(), and nnfw::cker::BroadcastBinaryArithmeticOp().

◆ BroadcastDivDispatch()

void nnfw::cker::optimized::BroadcastDivDispatch (const BinaryArithmeticOpParam &params,
const Shape &input1_shape,
const float *input1_data,
const Shape &input2_shape,
const float *input2_data,
const Shape &output_shape,
float *output_data)
inline

Definition at line 1234 of file BinaryArithmeticOps.h.

1238{
1239#ifdef __aarch64__
1240 if (params.broadcast_category == BroadcastableOpCategory::kFirstInputBroadcastsFast)
1241 {
1242 auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncDivFloat>(params);
1243 BinaryBroadcastFiveFold(params, false, input1_shape, input1_data, input2_shape, input2_data,
1244 output_shape, output_data, implFuncs.first, implFuncs.second);
1245 }
1246 else if (params.broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast)
1247 {
1248 auto implFuncs =
1249 getBinaryOpWithActivationImplFloat<BinaryOpFuncSwapArgs<BinaryOpFuncDivFloat>>(params);
1250 BinaryBroadcastFiveFold(params, true, input1_shape, input1_data, input2_shape, input2_data,
1251 output_shape, output_data, implFuncs.first, implFuncs.second);
1252 }
1253 else
1254#endif // __aarch64__
1255 {
1256 const std::function<float(const float &, const float &)> fn =
1257 [](const float &a, const float &b) -> float { return a / b; };
1258 reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape,
1259 input2_data, output_shape, output_data, fn);
1260 }
1261}

References BinaryBroadcastFiveFold(), nnfw::cker::BinaryArithmeticOpParam::broadcast_category, nnfw::cker::reference::BroadcastBinaryArithmeticOpSlow(), nnfw::cker::kFirstInputBroadcastsFast, nnfw::cker::kSecondInputBroadcastsFast, and output_shape.

Referenced by nnfw::cker::BroadcastBinaryArithmeticOp().

◆ BroadcastMulDispatch() [1/2]

void nnfw::cker::optimized::BroadcastMulDispatch (const BinaryArithmeticOpParam &params,
const Shape &input1_shape,
const float *input1_data,
const Shape &input2_shape,
const float *input2_data,
const Shape &output_shape,
float *output_data)
inline

Definition at line 1199 of file BinaryArithmeticOps.h.

1203{
1204 if (params.broadcast_category == BroadcastableOpCategory::kGenericBroadcast)
1205 {
1206 // TODO: Use GetBinaryArithmeticFn
1207 const std::function<float(const float &, const float &)> fn =
1208 [](const float &a, const float &b) -> float { return a * b; };
1209 reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape,
1210 input2_data, output_shape, output_data, fn);
1211 return;
1212 }
1213 auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncMulFloat>(params);
1214 BinaryBroadcastFiveFold(params, input1_shape, input1_data, input2_shape, input2_data,
1215 output_shape, output_data, implFuncs.first, implFuncs.second);
1216}

References BinaryBroadcastFiveFold(), nnfw::cker::BinaryArithmeticOpParam::broadcast_category, nnfw::cker::reference::BroadcastBinaryArithmeticOpSlow(), nnfw::cker::kGenericBroadcast, and output_shape.

◆ BroadcastMulDispatch() [2/2]

template<typename T >
std::enable_if_t< is_quant8< T >::value > nnfw::cker::optimized::BroadcastMulDispatch (const BinaryArithmeticOpParam &params,
const Shape &input1_shape,
const T *input1_data,
const Shape &input2_shape,
const T *input2_data,
const Shape &output_shape,
T *output_data)
inline

Definition at line 1177 of file BinaryArithmeticOps.h.

1180{
1181 if (params.broadcast_category == BroadcastableOpCategory::kGenericBroadcast)
1182 {
1183 const std::function<T(const BinaryArithmeticOpParam &, const T &, const T &)> fn =
1184 [](const BinaryArithmeticOpParam &params, const T &a, const T &b) {
1185 return static_cast<T>(quant8_mul(params, a, b));
1186 };
1187 reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape,
1188 input2_data, output_shape, output_data, fn);
1189 return;
1190 }
1191 BinaryBroadcastFiveFold(
1192 params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data,
1193 static_cast<void (*)(int, const BinaryArithmeticOpParam &, const T *, const T *, T *)>(
1194 MulElementwise),
1195 static_cast<void (*)(int, const BinaryArithmeticOpParam &, T, const T *, T *)>(
1196 MulSimpleBroadcast));
1197}

References BinaryBroadcastFiveFold(), nnfw::cker::BinaryArithmeticOpParam::broadcast_category, nnfw::cker::reference::BroadcastBinaryArithmeticOpSlow(), nnfw::cker::kGenericBroadcast, MulElementwise(), MulSimpleBroadcast(), output_shape, and quant8_mul().

Referenced by nnfw::cker::BroadcastBinaryArithmeticOp(), and nnfw::cker::BroadcastBinaryArithmeticOp().

◆ BroadcastSubDispatch()

void nnfw::cker::optimized::BroadcastSubDispatch (const BinaryArithmeticOpParam &params,
const Shape &input1_shape,
const float *input1_data,
const Shape &input2_shape,
const float *input2_data,
const Shape &output_shape,
float *output_data)
inline

Definition at line 867 of file BinaryArithmeticOps.h.

871{
872 if (params.broadcast_category == BroadcastableOpCategory::kFirstInputBroadcastsFast)
873 {
874 auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncSubFloat>(params);
875 BinaryBroadcastFiveFold(params, false, input1_shape, input1_data, input2_shape, input2_data,
876 output_shape, output_data, implFuncs.first, implFuncs.second);
877 }
878 else if (params.broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast)
879 {
880 auto implFuncs =
881 getBinaryOpWithActivationImplFloat<BinaryOpFuncSwapArgs<BinaryOpFuncSubFloat>>(params);
882 BinaryBroadcastFiveFold(params, true, input1_shape, input1_data, input2_shape, input2_data,
883 output_shape, output_data, implFuncs.first, implFuncs.second);
884 }
885 else
886 {
887 const std::function<float(const float &, const float &)> fn =
888 [](const float &a, const float &b) -> float { return a - b; };
889 reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape,
890 input2_data, output_shape, output_data, fn);
891 }
892}

References BinaryBroadcastFiveFold(), nnfw::cker::BinaryArithmeticOpParam::broadcast_category, nnfw::cker::reference::BroadcastBinaryArithmeticOpSlow(), nnfw::cker::kFirstInputBroadcastsFast, nnfw::cker::kSecondInputBroadcastsFast, and output_shape.

Referenced by nnfw::cker::BroadcastBinaryArithmeticOp().
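Because subtraction is not commutative, the kSecondInputBroadcastsFast branch walks the operands in swapped order (switch_inputs == true) and compensates by instantiating the functor through BinaryOpFuncSwapArgs. Reading the listing above, and assuming the wrapper does what its name suggests:

  BinaryOpFuncSubFloat::calculate(a, b)                        ->  a - b
  BinaryOpFuncSwapArgs<BinaryOpFuncSubFloat>::calculate(a, b)  ->  b - a

so the result is still input1 - input2 even though the fivefold walk streams input2 through the position of input1. BroadcastDivDispatch() applies the same trick for division on __aarch64__.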

◆ Conv()

void nnfw::cker::optimized::Conv (const ConvParams &params,
const Shape &input_shape,
const uint8_t *input_data,
const Shape &filter_shape,
const uint8_t *filter_data,
const Shape &bias_shape,
const int32_t *bias_data,
const Shape &output_shape,
uint8_t *output_data,
const Shape &im2col_shape,
uint8_t *im2col_data)
inline

Definition at line 83 of file Conv.h.

88{
89 gemmlowp::GemmContext *gemm_context = gemm_support::GetGemmLowpContext();
90
91 const int stride_width = params.stride_width;
92 const int stride_height = params.stride_height;
93 const int dilation_width_factor = params.dilation_width_factor;
94 const int dilation_height_factor = params.dilation_height_factor;
95 const int32_t input_offset = params.input_offset;
96 const int32_t filter_offset = params.weights_offset;
97 const int32_t output_offset = params.output_offset;
98 const int32_t output_multiplier = params.output_multiplier;
99 const int output_shift = params.output_shift;
100 const int32_t output_activation_min = params.quantized_activation_min;
101 const int32_t output_activation_max = params.quantized_activation_max;
102 assert(input_shape.DimensionsCount() == 4);
103 assert(filter_shape.DimensionsCount() == 4);
104 assert(output_shape.DimensionsCount() == 4);
105
106 const uint8_t *gemm_input_data = nullptr;
107 const Shape *gemm_input_shape = nullptr;
108 const int filter_width = filter_shape.Dims(2);
109 const int filter_height = filter_shape.Dims(1);
110 const bool need_dilated_im2col = dilation_width_factor != 1 || dilation_height_factor != 1;
111 const bool need_im2col =
112 stride_width != 1 || stride_height != 1 || filter_width != 1 || filter_height != 1;
113 if (need_dilated_im2col)
114 {
115 assert(im2col_data);
116 const int input_zero_point = -input_offset;
117 assert(input_zero_point >= 0);
118 assert(input_zero_point <= 255);
119 DilatedIm2col(params, input_zero_point, input_shape, input_data, filter_shape, output_shape,
120 im2col_data);
121 gemm_input_data = im2col_data;
122 gemm_input_shape = &im2col_shape;
123 }
124 else if (need_im2col)
125 {
126 assert(im2col_data);
127 const int input_zero_point = -input_offset;
128 assert(input_zero_point >= 0);
129 assert(input_zero_point <= 255);
130 Im2col(params, filter_height, filter_width, input_zero_point, input_shape, input_data,
131 im2col_shape, im2col_data);
132 gemm_input_data = im2col_data;
133 gemm_input_shape = &im2col_shape;
134 }
135 else
136 {
137 gemm_input_data = input_data;
138 gemm_input_shape = &input_shape;
139 }
140
141 const int gemm_input_rows = gemm_input_shape->Dims(3);
142 // Using FlatSizeSkipDim causes segfault in some contexts (see b/79927784).
143 // The root cause has not yet been identified though. Same applies below for
144 // the other calls commented out. This is a partial rollback of cl/196819423.
145 // const int gemm_input_cols = FlatSizeSkipDim(*gemm_input_shape, 3);
146 const int gemm_input_cols =
147 gemm_input_shape->Dims(0) * gemm_input_shape->Dims(1) * gemm_input_shape->Dims(2);
148 const int filter_rows = filter_shape.Dims(0);
149 // See b/79927784.
150 // const int filter_cols = FlatSizeSkipDim(filter_shape, 0);
151 const int filter_cols = filter_shape.Dims(1) * filter_shape.Dims(2) * filter_shape.Dims(3);
152 const int output_rows = output_shape.Dims(3);
153 // See b/79927784.
154 // const int output_cols = FlatSizeSkipDim(output_shape, 3);
155 const int output_cols = output_shape.Dims(0) * output_shape.Dims(1) * output_shape.Dims(2);
156 assert(output_rows == filter_rows);
157 assert(output_cols == gemm_input_cols);
158 assert(filter_cols == gemm_input_rows);
159 assert(bias_shape.FlatSize() == output_rows);
160 gemmlowp::MatrixMap<const uint8_t, gemmlowp::MapOrder::RowMajor> filter_matrix(
161 filter_data, filter_rows, filter_cols);
162 gemmlowp::MatrixMap<const uint8_t, gemmlowp::MapOrder::ColMajor> input_matrix(
163 gemm_input_data, gemm_input_rows, gemm_input_cols);
164 gemmlowp::MatrixMap<uint8_t, gemmlowp::MapOrder::ColMajor> output_matrix(output_data, output_rows,
165 output_cols);
166 const auto &output_pipeline =
167 GemmlowpOutputPipeline::MakeExp(bias_data, output_rows, output_offset, output_multiplier,
168 output_shift, output_activation_min, output_activation_max);
169
170 std::lock_guard<std::mutex> lock_guard(_gemmlowp_mutex);
171 gemmlowp::GemmWithOutputPipeline<uint8_t, uint8_t, gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
172 gemm_context, filter_matrix, input_matrix, &output_matrix, filter_offset, input_offset,
173 output_pipeline);
174}

References _gemmlowp_mutex, DilatedIm2col(), nnfw::cker::ConvParams::dilation_height_factor, nnfw::cker::ConvParams::dilation_width_factor, nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), nnfw::cker::gemm_support::GetGemmLowpContext(), Im2col(), nnfw::cker::ConvParams::input_offset, nnfw::cker::optimized::GemmlowpOutputPipeline::MakeExp(), nnfw::cker::ConvParams::output_multiplier, nnfw::cker::ConvParams::output_offset, output_shape, nnfw::cker::ConvParams::output_shift, nnfw::cker::ConvParams::quantized_activation_max, nnfw::cker::ConvParams::quantized_activation_min, nnfw::cker::ConvParams::stride_height, nnfw::cker::ConvParams::stride_width, and nnfw::cker::ConvParams::weights_offset.

Referenced by nnfw::cker::Conv::operator()().
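
For orientation, the listing above maps the quantized convolution onto a single gemmlowp matrix multiply: the filter becomes a row-major output_depth x (filter_height * filter_width * input_depth) matrix, and the (possibly im2col-transformed) input becomes a column-major matrix with one column per output pixel. The following is a minimal, self-contained sketch of that dimension bookkeeping using hypothetical example shapes; it is illustrative only and does not call the library.

#include <cassert>

int main()
{
  // Hypothetical NHWC shapes: 1x16x16x8 input, 3x3 filter, 32 output channels,
  // stride 1, "same" padding, so the output is 1x16x16x32 and im2col is used.
  const int batches = 1, out_h = 16, out_w = 16;
  const int filter_h = 3, filter_w = 3, in_depth = 8, out_depth = 32;

  // filter_matrix: one row per output channel, each row a flattened 3x3x8 kernel.
  const int filter_rows = out_depth;                       // 32
  const int filter_cols = filter_h * filter_w * in_depth;  // 72

  // input_matrix (the im2col buffer): one column per output pixel.
  const int gemm_input_rows = filter_cols;                 // 72
  const int gemm_input_cols = batches * out_h * out_w;     // 256

  // output_matrix: out_depth x (B*H*W), matching the asserts in Conv() above.
  assert(filter_cols == gemm_input_rows);
  assert(filter_rows * gemm_input_cols == batches * out_h * out_w * out_depth);
  return 0;
}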

◆ DepthwiseConvImpl() [1/2]

void nnfw::cker::optimized::DepthwiseConvImpl ( const DepthwiseConvParams &  params,
const Shape &  input_shape,
const float *  input_data,
const Shape &  filter_shape,
const float *  filter_data,
const Shape &  bias_shape,
const float *  bias_data,
const Shape &  output_shape,
float *  output_data,
int  thread_start,
int  thread_end,
int  thread_dim 
)
inline

Definition at line 1033 of file DepthwiseConvFloat.h.

1038{
1039 const int stride_width = params.stride_width;
1040 const int stride_height = params.stride_height;
1041 const int pad_width = params.padding_values.width;
1042 const int pad_height = params.padding_values.height;
1043 const int depth_multiplier = params.depth_multiplier;
1044 const float output_activation_min = params.float_activation_min;
1045 const float output_activation_max = params.float_activation_max;
1046 const int dilation_width_factor = params.dilation_width_factor;
1047 const int dilation_height_factor = params.dilation_height_factor;
1048 assert(input_shape.DimensionsCount() == 4);
1049 assert(filter_shape.DimensionsCount() == 4);
1050 assert(output_shape.DimensionsCount() == 4);
1051 assert(thread_dim == 0 || thread_dim == 1);
1052
1053 const int batches = MatchingDim(input_shape, 0, output_shape, 0);
1054 const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
1055 const int input_height = input_shape.Dims(1);
1056 const int input_width = input_shape.Dims(2);
1057 const int input_depth = input_shape.Dims(3);
1058 const int filter_height = filter_shape.Dims(1);
1059 const int filter_width = filter_shape.Dims(2);
1060 const int output_height = output_shape.Dims(1);
1061 const int output_width = output_shape.Dims(2);
1062 assert(output_depth == input_depth * depth_multiplier);
1063 assert(bias_shape.FlatSize() == output_depth);
1064
1065 static const int kAccBufferMaxSize = 4832;
1066 float acc_buffer[kAccBufferMaxSize];
1067 assert(kAccBufferMaxSize >= output_depth);
1068 const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth;
1069 [[maybe_unused]] const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth;
1070 assert(kOutputPixelsInAccBuffer * output_depth <= kAccBufferActualSize);
1071 assert(kAccBufferActualSize <= kAccBufferMaxSize);
1072 assert(kOutputPixelsInAccBuffer >= 1);
1073
1074 // row_accum_func will point to the core accumulation function to be used
1075 // for this DepthwiseConv op.
1076 using row_accum_func_t = decltype(&FloatDepthwiseConvAccumRowGeneric);
1077 row_accum_func_t row_accum_func = nullptr;
1078
1079#define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER) \
1080 if (!row_accum_func && (stride_width == 1 || ALLOW_STRIDED) && \
1081 (input_depth == FIXED_INPUT_DEPTH || FIXED_INPUT_DEPTH == 0) && \
1082 depth_multiplier == FIXED_DEPTH_MULTIPLIER) \
1083 { \
1084 row_accum_func = \
1085 FloatDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER>; \
1086 }
1087
1088#ifdef USE_NEON
1089 // We go over our list of kernels by decreasing order of preference
1090 // for the cases where multiple kernels could apply.
1091
1092 // Start with the fastest kernels: AllowStrided=false, fixed input depth.
1093
1096
1097 // Next come the strided kernels: AllowStrided=true, fixed input depth.
1098 // They are a bit less efficient, but allow stride!=1.
1099
1108
1109 // Finally, the kernels allowing a variable input depth,
1110 // these are the least efficient but most general kernels.
1111
1116
1117#endif // USE_NEON
1118
1119#undef TFMINI_USE_DEPTHWISECONV_KERNEL
1120
1121 // No matching fast kernel found, use slow fallback.
1122 if (!row_accum_func)
1123 {
1124 row_accum_func = FloatDepthwiseConvAccumRowGeneric;
1125 }
1126
1127 const int input_height_stride = input_shape.Dims(3) * input_shape.Dims(2);
1128 const int input_batch_stride = input_height_stride * input_shape.Dims(1);
1129 const int filter_height_stride = filter_shape.Dims(3) * filter_shape.Dims(2);
1130
1131 // Now that we have determined row_accum_func, we can start work.
1132 int batch_start = 0;
1133 int batch_end = batches;
1134 int row_start = 0;
1135 int row_end = output_height;
1136 int output_ptr_offset = 0;
1137
1138 switch (thread_dim)
1139 {
1140 case 0:
1141 // Multithread along the batch axis
1142 assert(thread_start >= 0);
1143 assert(thread_end <= batches);
1144 batch_start = thread_start;
1145 batch_end = thread_end;
1146 output_ptr_offset = batch_start * FlatSizeSkipDim(output_shape, 0);
1147 break;
1148 case 1:
1149 // Multithread along the row axis
1150 assert(thread_start >= 0);
1151 assert(thread_end <= output_height);
1152 row_start = thread_start;
1153 row_end = thread_end;
1154 output_ptr_offset = row_start * output_width * output_depth;
1155 break;
1156 }
1157
1158 float *output_ptr = output_data + output_ptr_offset;
1159 int batch_step = (output_height + row_start - row_end) * output_width * output_depth;
1160
1161 for (int b = batch_start; b < batch_end; ++b)
1162 {
1163 for (int out_y = row_start; out_y < row_end; ++out_y)
1164 {
1165 const int in_y_origin = (out_y * stride_height) - pad_height;
1166 const int filter_y_start =
1167 std::max(0, (-in_y_origin + dilation_height_factor - 1) / dilation_height_factor);
1168 const int filter_y_end =
1169 std::min(filter_height, (input_height - in_y_origin + dilation_height_factor - 1) /
1170 dilation_height_factor);
1171 for (int out_x_buffer_start = 0; out_x_buffer_start < output_width;
1172 out_x_buffer_start += kOutputPixelsInAccBuffer)
1173 {
1174 const int out_x_buffer_end =
1175 std::min(output_width, out_x_buffer_start + kOutputPixelsInAccBuffer);
1176 // We call a 'pixel' a group of activations that share all but the
1177 // 'depth'/'channel' coordinate. num_output_pixels is the number of
1178 // output pixels that we will accumulate in this loop iteration.
1179 const int num_output_pixels = out_x_buffer_end - out_x_buffer_start;
1180 // Initialize our local accumulator with the bias values, so we don't
1181 // have to add them later.
1182 DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data, acc_buffer);
1183 // Accumulation loop. Most of the time should be spent in here.
1184 for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y)
1185 {
1186 const int in_y = in_y_origin + dilation_height_factor * filter_y;
1187 row_accum_func(stride_width, dilation_width_factor, input_depth, input_width,
1188 input_data + in_y * input_height_stride + b * input_batch_stride,
1189 pad_width, depth_multiplier, filter_width,
1190 filter_data + filter_y * filter_height_stride, out_x_buffer_start,
1191 out_x_buffer_end, output_depth, acc_buffer);
1192 }
1193 // Finished accumulating. Now store to destination.
1194 const int num_output_values = output_depth * num_output_pixels;
1195 int i = 0;
1196// TODO(benoitjacob) optimized code goes here
1197#ifdef USE_NEON
1198 // Handle 16 values at a time
1199 for (; i <= num_output_values - 16; i += 16)
1200 {
1201 float32x4_t acc[4];
1202 for (int k = 0; k < 4; k++)
1203 {
1204 acc[k] = vld1q_f32(acc_buffer + i + 4 * k);
1205 }
1206 for (int k = 0; k < 4; k++)
1207 {
1208 acc[k] = vmaxq_f32(vdupq_n_f32(output_activation_min),
1209 vminq_f32(vdupq_n_f32(output_activation_max), acc[k]));
1210 }
1211 for (int k = 0; k < 4; k++)
1212 {
1213 vst1q_f32(output_ptr + 4 * k, acc[k]);
1214 }
1215 output_ptr += 16;
1216 }
1217 // Handle 4 values at a time
1218 for (; i <= num_output_values - 4; i += 4)
1219 {
1220 float32x4_t acc = vld1q_f32(acc_buffer + i);
1221
1222 acc = vmaxq_f32(vdupq_n_f32(output_activation_min),
1223 vminq_f32(vdupq_n_f32(output_activation_max), acc));
1224
1225 vst1q_f32(output_ptr, acc);
1226 output_ptr += 4;
1227 }
1228#endif
1229 // Handle leftover values, one by one. This is very slow.
1230 for (; i < num_output_values; i++)
1231 {
1232 float acc = acc_buffer[i];
1233 acc = std::max(output_activation_min, std::min(output_activation_max, acc));
1234
1235 *output_ptr++ = acc;
1236 }
1237 }
1238 }
1239 output_ptr += batch_step;
1240 }
1241}

References nnfw::cker::DepthwiseConvParams::depth_multiplier, DepthwiseConvInitAccBuffer(), nnfw::cker::DepthwiseConvParams::dilation_height_factor, nnfw::cker::DepthwiseConvParams::dilation_width_factor, nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), nnfw::cker::FlatSizeSkipDim(), nnfw::cker::DepthwiseConvParams::float_activation_max, nnfw::cker::DepthwiseConvParams::float_activation_min, FloatDepthwiseConvAccumRowGeneric(), nnfw::cker::PaddingValues::height, nnfw::cker::MatchingDim(), output_shape, nnfw::cker::DepthwiseConvParams::padding_values, nnfw::cker::DepthwiseConvParams::stride_height, nnfw::cker::DepthwiseConvParams::stride_width, TFMINI_USE_DEPTHWISECONV_KERNEL, and nnfw::cker::PaddingValues::width.

Referenced by nnfw::cker::DepthwiseConv(), and nnfw::cker::DepthwiseConvWorkerTask< T, TS >::Run().
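
The thread_start/thread_end/thread_dim arguments let callers tile the work either over batches (thread_dim == 0) or over output rows (thread_dim == 1). Below is a minimal sketch, using a hypothetical SplitRows helper rather than the library's DepthwiseConvWorkerTask, of how the row axis could be partitioned into contiguous slices.

#include <utility>
#include <vector>

// Hypothetical helper (not part of cker): split [0, output_height) into one
// contiguous row slice per worker; each pair would be passed to
// DepthwiseConvImpl as (thread_start, thread_end) with thread_dim = 1.
std::vector<std::pair<int, int>> SplitRows(int output_height, int num_threads)
{
  std::vector<std::pair<int, int>> slices;
  for (int t = 0; t < num_threads; ++t)
  {
    const int start = output_height * t / num_threads;
    const int end = output_height * (t + 1) / num_threads;
    if (start < end)
      slices.emplace_back(start, end);
  }
  return slices;
}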

◆ DepthwiseConvImpl() [2/2]

void nnfw::cker::optimized::DepthwiseConvImpl ( const DepthwiseConvParams &  params,
const Shape &  input_shape,
const uint8_t *  input_data,
const Shape &  filter_shape,
const uint8_t *  filter_data,
const Shape &  bias_shape,
const int32_t *  bias_data,
const Shape &  output_shape,
uint8_t *  output_data,
int  thread_start,
int  thread_end,
int  thread_dim 
)
inline

Definition at line 2231 of file DepthwiseConvUint8.h.

2237{
2238 return DepthwiseConvWithRounding(params, input_shape, input_data, filter_shape, filter_data,
2239 bias_shape, bias_data, output_shape, output_data, thread_start,
2240 thread_end, thread_dim);
2241}
void DepthwiseConvWithRounding(const DepthwiseConvParams &params, const Shape &input_shape, const uint8_t *input_data, const Shape &filter_shape, const uint8_t *filter_data, const Shape &bias_shape, const int32_t *bias_data, const Shape &output_shape, uint8_t *output_data, int thread_start, int thread_end, int thread_dim)

References DepthwiseConvWithRounding(), and output_shape.

◆ DepthwiseConvInitAccBuffer()

void nnfw::cker::optimized::DepthwiseConvInitAccBuffer ( int  num_output_pixels,
int  output_depth,
const float *  bias_data,
float *  acc_buffer 
)
inline

Definition at line 1016 of file DepthwiseConvFloat.h.

1018{
1019 // TODO(benoitjacob): This might need optimized specializations
1020 // for small output_depth values, if that ever becomes an important
1021 // case (like it was for some quantized DepthwiseConv cases).
1022 for (int i = 0; i < num_output_pixels; i++)
1023 {
1024 memcpy(acc_buffer + i * output_depth, bias_data, sizeof(acc_buffer[0]) * output_depth);
1025 }
1026}

Referenced by DepthwiseConvImpl().

◆ DepthwiseConvWithRounding()

void nnfw::cker::optimized::DepthwiseConvWithRounding ( const DepthwiseConvParams &  params,
const Shape &  input_shape,
const uint8_t *  input_data,
const Shape &  filter_shape,
const uint8_t *  filter_data,
const Shape &  bias_shape,
const int32_t *  bias_data,
const Shape &  output_shape,
uint8_t *  output_data,
int  thread_start,
int  thread_end,
int  thread_dim 
)
inline

Definition at line 2153 of file DepthwiseConvUint8.h.

2159{
2160 [[maybe_unused]] const int depth_multiplier = params.depth_multiplier;
2161 [[maybe_unused]] const int32_t output_activation_min = params.quantized_activation_min;
2162 [[maybe_unused]] const int32_t output_activation_max = params.quantized_activation_max;
2163 [[maybe_unused]] const int dilation_width_factor = params.dilation_width_factor;
2164 [[maybe_unused]] const int dilation_height_factor = params.dilation_height_factor;
2165 assert(dilation_width_factor >= 1);
2166 assert(dilation_height_factor >= 1);
2167 assert(input_shape.DimensionsCount() == 4);
2168 assert(filter_shape.DimensionsCount() == 4);
2169 assert(output_shape.DimensionsCount() == 4);
2170 assert(output_activation_min <= output_activation_max);
2171 [[maybe_unused]] const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
2172 [[maybe_unused]] const int input_depth = input_shape.Dims(3);
2173 assert(output_depth == input_depth * depth_multiplier);
2174 assert(bias_shape.FlatSize() == output_depth);
2175
2176// Enable for arm64 except for the Nvidia Linux 4 Tegra (L4T) running on
2177// Jetson TX-2. This compiler does not support the offsetof() macro.
2178#if defined(__aarch64__) && !defined(GOOGLE_L4T)
2179// TODO Use below codes
2180// // Dispatch to dot-product 3x3 kernels when supported.
2181//
2182// ruy::Context *ruy_context = cpu_backend_context->ruy_context();
2183// const bool has_dot_product_instructions =
2184// ruy_context != nullptr &&
2185// (ruy_context->GetRuntimeEnabledPaths() & ruy::Path::kNeonDotprod) != ruy::Path::kNone;
2186// if (has_dot_product_instructions)
2187// {
2188// using optimized_ops::depthwise_conv::DotProduct3x3KernelType;
2189// DotProduct3x3KernelType kernel_type =
2190// optimized_ops::depthwise_conv::CategorizeDotProductKernel(
2191// input_shape, filter_shape, params);
2192// if (kernel_type != DotProduct3x3KernelType::kNone)
2193// {
2194// optimized_ops::depthwise_conv::DepthwiseConvDotProduct3x3<
2195// DepthwiseConvImplementation::kUseNeon3x3DotProduct>(params, input_shape, input_data,
2196// filter_shape, filter_data,
2197// bias_shape,
2198// bias_data, output_shape,
2199// output_data);
2200// return;
2201// }
2202// }
2203//
2204// // Dispatch to non-dot-product 3x3 kernels when supported.
2205//
2206// const int stride_width = params.stride_width;
2207// const int stride_height = params.stride_height;
2208// const int pad_width = params.padding_values.width;
2209// const int pad_height = params.padding_values.height;
2210// const int output_shift = params.output_shift;
2211//
2212// // Call kernel optimized for depthwise convolutions using 3x3 filters if
2213// // parameters are supported.
2214// if (depthwise_conv::Fast3x3FilterKernelSupported(input_shape, filter_shape, stride_width,
2215// stride_height, dilation_width_factor,
2216// dilation_height_factor, pad_width, pad_height,
2217// depth_multiplier, output_shape, output_shift))
2218// {
2219// depthwise_conv::DepthwiseConv3x3Filter<kOutputRounding>(
2220// params, input_shape, input_data, filter_shape, filter_data, bias_shape, bias_data,
2221// output_shape, output_data, thread_start, thread_end, thread_dim);
2222// return;
2223// }
2224#endif
2225
2226 depthwise_conv::DepthwiseConvGeneral(params, input_shape, input_data, filter_shape, filter_data,
2227 bias_shape, bias_data, output_shape, output_data,
2228 thread_start, thread_end, thread_dim);
2229}

References nnfw::cker::DepthwiseConvParams::depth_multiplier, nnfw::cker::optimized::depthwise_conv::DepthwiseConvGeneral(), nnfw::cker::DepthwiseConvParams::dilation_height_factor, nnfw::cker::DepthwiseConvParams::dilation_width_factor, nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), nnfw::cker::Shape::FlatSize(), nnfw::cker::MatchingDim(), output_shape, nnfw::cker::DepthwiseConvParams::quantized_activation_max, and nnfw::cker::DepthwiseConvParams::quantized_activation_min.

Referenced by DepthwiseConvImpl().

◆ DilatedIm2col() [1/2]

template<typename T >
void nnfw::cker::optimized::DilatedIm2col ( const ConvParams &  params,
const Shape &  input_shape,
const T *  input_data,
const Shape &  filter_shape,
const Shape &  output_shape,
T *  im2col_data,
const int32_t *  zero_bytes,
const int  zero_bytes_len 
)

Definition at line 121 of file OptimizedUtils.h.

124{
125 const int stride_width = params.stride_width;
126 const int stride_height = params.stride_height;
127 const int dilation_width_factor = params.dilation_width_factor;
128 const int dilation_height_factor = params.dilation_height_factor;
129 const int pad_width = params.padding_values.width;
130 const int pad_height = params.padding_values.height;
131 assert(input_shape.DimensionsCount() == 4);
132 assert(filter_shape.DimensionsCount() == 4);
133 assert(output_shape.DimensionsCount() == 4);
134
135 // For dilated convolution, the input pixels are not contiguous, so we
136 // can't use the same optimizations as Im2col(). Note that this code would
137 // also work for the non-dilated case, though likely a bit slower.
138 assert(dilation_width_factor != 1 || dilation_height_factor != 1);
139 assert(im2col_data);
140 const int batches = MatchingDim(input_shape, 0, output_shape, 0);
141 const int input_height = input_shape.Dims(1);
142 const int input_width = input_shape.Dims(2);
143 const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
144 const int filter_height = filter_shape.Dims(1);
145 const int filter_width = filter_shape.Dims(2);
146 const int output_height = output_shape.Dims(1);
147 const int output_width = output_shape.Dims(2);
148 MatchingDim(output_shape, 3, filter_shape, 0);
149
150 // Construct the MxN sized im2col matrix.
151 // The rows M, are sub-ordered B x H x W
152 const Shape row_shape({1, batches, output_height, output_width});
153 // The columns, N, are sub-ordered Kh x Kw x Din
154 const Shape col_shape({1, filter_height, filter_width, input_depth});
155 // Use dimensions M and N to construct dims for indexing directly into im2col
156 const Shape im2col_shape({1, 1, row_shape.FlatSize(), col_shape.FlatSize()});
157
158 // Loop through the output rows (B x H x W)
159 for (int batch = 0; batch < batches; ++batch)
160 {
161 const T zero_byte =
162 zero_bytes_len > 1 ? static_cast<T>(zero_bytes[batch]) : static_cast<T>(zero_bytes[0]);
163 for (int out_y = 0; out_y < output_height; ++out_y)
164 {
165 for (int out_x = 0; out_x < output_width; ++out_x)
166 {
167 // Each im2col row is an output pixel. Arrange the input data in this
168 // row in an order we can conveniently multiply with the filter data.
169 int row_offset = Offset(row_shape, 0, batch, out_y, out_x);
170 const int in_x_origin = (out_x * stride_width) - pad_width;
171 const int in_y_origin = (out_y * stride_height) - pad_height;
172 // Loop through all the pixels of the filter (Kh x Kw)
173 for (int filter_y = 0; filter_y < filter_height; ++filter_y)
174 {
175 const int in_y = in_y_origin + dilation_height_factor * filter_y;
176 if ((in_y >= 0) && (in_y < input_height))
177 {
178 // Filter row is within the input data.
179 // Loop through all the filter pixels in this row.
180 for (int filter_x = 0; filter_x < filter_width; ++filter_x)
181 {
182 const int in_x = in_x_origin + dilation_width_factor * filter_x;
183 int col_offset = Offset(col_shape, 0, filter_y, filter_x, 0);
184 T *dst = im2col_data + Offset(im2col_shape, 0, 0, row_offset, col_offset);
185 if ((in_x >= 0) && (in_x < input_width))
186 {
187 // Filter pixel is within the input, copy the input data.
188 T const *src = input_data + Offset(input_shape, batch, in_y, in_x, 0);
189 memcpy(dst, src, input_depth * sizeof(T));
190 }
191 else
192 {
193 // Filter pixel is outside the input, zero it out.
194 memset(dst, zero_byte, input_depth * sizeof(T));
195 }
196 }
197 }
198 else
199 {
200 // Filter row is outside the input, zero out the entire filter row.
201 int col_offset = Offset(col_shape, 0, filter_y, 0, 0);
202 T *dst = im2col_data + Offset(im2col_shape, 0, 0, row_offset, col_offset);
203 memset(dst, zero_byte, filter_width * input_depth * sizeof(T));
204 }
205 }
206 }
207 }
208 }
209}
int Offset(const Dims< 4 > &dims, int i0, int i1, int i2, int i3)
Definition Dims.h:64
Definition Shape.h:28
PaddingValues padding_values
Definition Types.h:143

References nnfw::cker::ConvParams::dilation_height_factor, nnfw::cker::ConvParams::dilation_width_factor, nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), nnfw::cker::PaddingValues::height, nnfw::cker::MatchingDim(), nnfw::cker::Offset(), output_shape, nnfw::cker::ConvParams::padding_values, nnfw::cker::ConvParams::stride_height, nnfw::cker::ConvParams::stride_width, and nnfw::cker::PaddingValues::width.

Referenced by Conv().
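
In other words, the routine materializes an M x N patch matrix whose rows are sub-ordered B x H x W and whose columns are sub-ordered Kh x Kw x Din. Restating the index math of the listing above in LaTeX:

M = B \cdot H_{out} \cdot W_{out}, \qquad N = K_h \cdot K_w \cdot D_{in}

and the entry at row (b, out_y, out_x) and column (k_y, k_x, c) is copied from the input at

\big(b,\; out_y \cdot s_h - p_h + d_h \cdot k_y,\; out_x \cdot s_w - p_w + d_w \cdot k_x,\; c\big)

or set to zero_byte whenever that coordinate falls outside the input image.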

◆ DilatedIm2col() [2/2]

template<typename T >
void nnfw::cker::optimized::DilatedIm2col ( const ConvParams &  params,
uint8_t  zero_byte,
const Shape &  input_shape,
const T *  input_data,
const Shape &  filter_shape,
const Shape &  output_shape,
T *  im2col_data 
)

Definition at line 212 of file OptimizedUtils.h.

215{
216 const int32_t zero_point = static_cast<int32_t>(zero_byte);
217 DilatedIm2col<T>(params, input_shape, input_data, filter_shape, output_shape, im2col_data,
218 &zero_point, 1);
219}

References output_shape.

◆ Div()

void nnfw::cker::optimized::Div ( const BinaryArithmeticOpParam &  params,
const Shape &  input1_shape,
const float *  input1_data,
const Shape &  input2_shape,
const float *  input2_data,
const Shape &  output_shape,
float *  output_data 
)
inline

Definition at line 1218 of file BinaryArithmeticOps.h.

1221{
1222#ifdef __aarch64__
1223 const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape);
1224 auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncDivFloat>(params);
1225 (*implFuncs.first)(flat_size, params, input1_data, input2_data, output_data);
1226#else
1227 const std::function<float(const float &, const float &)> fn =
1228 [](const float &a, const float &b) -> float { return a / b; };
1229 reference::BinaryArithmeticOp(params, input1_shape, input1_data, input2_shape, input2_data,
1230 output_shape, output_data, fn);
1231#endif // __aarch64__
1232}

References nnfw::cker::reference::BinaryArithmeticOp(), nnfw::cker::MatchingElementsSize(), and output_shape.

Referenced by nnfw::cker::BinaryArithmeticOp().
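
A minimal usage sketch, assuming the relevant cker headers are included, four-element tensors, and no fused activation (both bounds set to the full float range, so the "no activation" implementation is selected on aarch64 while other targets take the reference path):

#include <limits>

void DivExample()
{
  nnfw::cker::BinaryArithmeticOpParam params{}; // assumed to value-initialize cleanly
  params.float_activation_min = std::numeric_limits<float>::lowest();
  params.float_activation_max = std::numeric_limits<float>::max();

  const nnfw::cker::Shape shape({1, 1, 1, 4});
  const float lhs[4] = {2.f, 4.f, 9.f, 16.f};
  const float rhs[4] = {2.f, 2.f, 3.f, 4.f};
  float out[4];

  nnfw::cker::optimized::Div(params, shape, lhs, shape, rhs, shape, out);
  // out is now {1.f, 2.f, 3.f, 4.f}
}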

◆ ExtractPatchIntoBufferColumn()

template<typename T >
void nnfw::cker::optimized::ExtractPatchIntoBufferColumn ( const Shape &  input_shape,
int  w,
int  h,
int  b,
int  kheight,
int  kwidth,
int  stride_width,
int  stride_height,
int  pad_width,
int  pad_height,
int  in_width,
int  in_height,
int  in_depth,
int  single_buffer_length,
int  buffer_id,
const T *  in_data,
T *  conv_buffer_data,
uint8_t  zero_byte 
)
inline

Definition at line 34 of file OptimizedUtils.h.

39{
40 assert(input_shape.DimensionsCount() == 4);
41 // This chunk of code reshapes all the inputs corresponding to
42 // output (b, h, w) to a column vector in conv_buffer(:, buffer_id).
43 const int kwidth_times_indepth = kwidth * in_depth;
44 const int inwidth_times_indepth = in_width * in_depth;
45 const int ih_ungated_start = h * stride_height - pad_height;
46 const int ih_ungated_end = (ih_ungated_start + kheight);
47 const int ih_end = std::min(ih_ungated_end, in_height);
48 const int iw_ungated_start = w * stride_width - pad_width;
49 const int iw_ungated_end = (iw_ungated_start + kwidth);
50 const int iw_end = std::min(iw_ungated_end, in_width);
51 // If the patch is off the edge of the input image, skip writing those rows
52 // and columns from the patch into the output array.
53 const int h_offset = std::max(0, -ih_ungated_start);
54 const int w_offset = std::max(0, -iw_ungated_start);
55 const int ih_start = std::max(0, ih_ungated_start);
56 const int iw_start = std::max(0, iw_ungated_start);
57 const int single_row_num = std::min(kwidth - w_offset, in_width - iw_start) * in_depth;
58 const int output_row_offset = (buffer_id * single_buffer_length);
59 int out_offset = output_row_offset + (h_offset * kwidth + w_offset) * in_depth;
60 int in_offset = Offset(input_shape, b, ih_start, iw_start, 0);
61
62 // Express all of the calculations as padding around the input patch.
63 const int top_padding = h_offset;
64 const int bottom_padding = (ih_ungated_end - ih_end);
65 const int left_padding = w_offset;
66 const int right_padding = (iw_ungated_end - iw_end);
67 assert(single_row_num == ((kwidth - (left_padding + right_padding)) * in_depth));
68
69 // Write out zeroes to the elements representing the top rows of the input
70 // patch that are off the edge of the input image.
71 if (top_padding > 0)
72 {
73 const int top_row_elements = (top_padding * kwidth * in_depth);
74 memset(conv_buffer_data + output_row_offset, zero_byte, (top_row_elements * sizeof(T)));
75 }
76
77 // If the patch is on the interior of the input image horizontally, just copy
78 // over the rows sequentially, otherwise add zero padding at the start or end.
79 if ((left_padding == 0) && (right_padding == 0))
80 {
81 for (int ih = ih_start; ih < ih_end; ++ih)
82 {
83 memcpy(conv_buffer_data + out_offset, in_data + in_offset, single_row_num * sizeof(T));
84 out_offset += kwidth_times_indepth;
85 in_offset += inwidth_times_indepth;
86 }
87 }
88 else
89 {
90 for (int ih = ih_start; ih < ih_end; ++ih)
91 {
92 if (left_padding > 0)
93 {
94 const int left_start = (out_offset - (left_padding * in_depth));
95 memset(conv_buffer_data + left_start, zero_byte, (left_padding * in_depth * sizeof(T)));
96 }
97 memcpy(conv_buffer_data + out_offset, in_data + in_offset, single_row_num * sizeof(T));
98 if (right_padding > 0)
99 {
100 const int right_start = (out_offset + single_row_num);
101 memset(conv_buffer_data + right_start, zero_byte, (right_padding * in_depth * sizeof(T)));
102 }
103 out_offset += kwidth_times_indepth;
104 in_offset += inwidth_times_indepth;
105 }
106 }
107
108 // If the bottom of the patch falls off the input image, pad the values
109 // representing those input rows with zeroes.
110 if (bottom_padding > 0)
111 {
112 const int bottom_row_elements = (bottom_padding * kwidth * in_depth);
113 const int bottom_start =
114 output_row_offset + ((top_padding + (ih_end - ih_start)) * kwidth * in_depth);
115 memset(conv_buffer_data + bottom_start, zero_byte, (bottom_row_elements * sizeof(T)));
116 }
117}

References nnfw::cker::Shape::DimensionsCount(), and nnfw::cker::Offset().

Referenced by Im2col().
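
A short worked example of the padding bookkeeping above, for a 3x3 kernel, stride 1, pad_width = pad_height = 1, a 5x5 input, and output position (h, w) = (0, 0); the numbers are hypothetical and the snippet is illustrative only.

#include <algorithm>

int main()
{
  const int kheight = 3, stride = 1, pad = 1, in_size = 5;
  const int ih_ungated_start = 0 * stride - pad;                     // -1
  const int ih_end = std::min(ih_ungated_start + kheight, in_size);  //  2
  const int top_padding = std::max(0, -ih_ungated_start);            //  1
  const int bottom_padding = (ih_ungated_start + kheight) - ih_end;  //  0
  // The same holds along x, so the first patch row and first patch column are
  // memset to zero_byte and a 2x2 block of input values is copied, row by row,
  // into the lower-right part of the 3x3xDin patch.
  (void)top_padding;
  (void)bottom_padding;
  return 0;
}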

◆ FloatDepthwiseConvAccumRow()

template<bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
void nnfw::cker::optimized::FloatDepthwiseConvAccumRow ( int  stride,
int  dilation_factor,
int  input_depth,
int  input_width,
const float *  input_data,
int  pad_width,
int  depth_multiplier,
int  filter_width,
const float *  filter_data,
int  out_x_buffer_start,
int  out_x_buffer_end,
int  output_depth,
float *  acc_buffer 
)

Definition at line 908 of file DepthwiseConvFloat.h.

912{
913 // Sanity check parameters. This is important in particular to ensure
914 // that we keep the number of template instantiations minimal, so we don't
915 // increase binary size unnecessarily.
916 static_assert(kFixedDepthMultiplier || !kFixedInputDepth, "");
917 static_assert(kFixedInputDepth || kAllowStrided, "");
918 assert(stride == 1 || kAllowStrided);
919 if (kFixedInputDepth)
920 {
921 assert(input_depth == kFixedInputDepth);
922 }
923 if (kFixedDepthMultiplier)
924 {
925 assert(depth_multiplier == kFixedDepthMultiplier);
926 }
927 assert(output_depth == input_depth * depth_multiplier);
928 const int input_ptr_increment = stride * input_depth;
929 const float *filter_base_ptr = filter_data;
930 for (int filter_x = 0; filter_x < filter_width; ++filter_x)
931 {
932 // For the current (filter_x, filter_y) point in the filter,
933 // compute the boundaries of the corresponding output row segment.
934 int out_x_loop_start_unclamped = 0;
935 int out_x_loop_end_unclamped = 0;
936 if (kAllowStrided)
937 {
938 if (stride == 2)
939 {
940 out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + 1) / 2;
941 out_x_loop_end_unclamped = (pad_width + input_width - dilation_factor * filter_x + 1) / 2;
942 }
943 else if (stride == 4)
944 {
945 out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + 3) / 4;
946 out_x_loop_end_unclamped = (pad_width + input_width - dilation_factor * filter_x + 3) / 4;
947 }
948 else
949 {
950 out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + stride - 1) / stride;
951 out_x_loop_end_unclamped =
952 (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride;
953 }
954 }
955 else
956 {
957 out_x_loop_start_unclamped = pad_width - dilation_factor * filter_x;
958 out_x_loop_end_unclamped = pad_width + input_width - dilation_factor * filter_x;
959 }
960 // The kernel will have to iterate on the segment of the
961 // output row that starts at out_x_loop_start and out_x_loop_end.
962 const int out_x_loop_start = std::max(out_x_buffer_start, out_x_loop_start_unclamped);
963 const int out_x_loop_end = std::min(out_x_buffer_end, out_x_loop_end_unclamped);
964
965 float *acc_buffer_ptr = acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
966 const int in_x_origin = (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x;
967 const float *input_ptr = input_data + in_x_origin * input_depth;
968 const int num_output_pixels = out_x_loop_end - out_x_loop_start;
969 FloatDepthwiseConvKernel<kAllowStrided, kFixedInputDepth, kFixedDepthMultiplier>::Run(
970 num_output_pixels, input_depth, depth_multiplier, input_ptr, input_ptr_increment,
971 filter_base_ptr, acc_buffer_ptr);
972 filter_base_ptr += output_depth;
973 }
974}
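
The unclamped loop bounds above solve, for a fixed filter tap filter_x, the requirement that the sampled input column stay inside the image. With stride s, dilation d, and padding p (in LaTeX):

0 \le out_x \cdot s - p + d \cdot filter_x < input\_width
\;\Longrightarrow\;
\left\lceil \frac{p - d \cdot filter_x}{s} \right\rceil \le out_x < \left\lceil \frac{p + input\_width - d \cdot filter_x}{s} \right\rceil

The (pad_width - dilation_factor * filter_x + stride - 1) / stride expressions compute these ceilings for non-negative numerators (the stride-2 and stride-4 branches are the same formula with the constants spelled out); negative cases are harmless because the result is then clamped to the current [out_x_buffer_start, out_x_buffer_end) window.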

◆ FloatDepthwiseConvAccumRowGeneric()

void nnfw::cker::optimized::FloatDepthwiseConvAccumRowGeneric ( int  stride,
int  dilation_factor,
int  input_depth,
int  input_width,
const float *  input_data,
int  pad_width,
int  depth_multiplier,
int  filter_width,
const float *  filter_data,
int  out_x_buffer_start,
int  out_x_buffer_end,
int  output_depth,
float *  acc_buffer 
)
inline

Definition at line 977 of file DepthwiseConvFloat.h.

983{
984 const float *filter_base_ptr = filter_data;
985 for (int filter_x = 0; filter_x < filter_width; ++filter_x)
986 {
987 const int out_x_loop_start =
988 std::max(out_x_buffer_start, (pad_width - dilation_factor * filter_x + stride - 1) / stride);
989 const int out_x_loop_end =
990 std::min(out_x_buffer_end,
991 (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride);
992
993 float *acc_buffer_ptr = acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
994 const int in_x_origin = (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x;
995 const float *input_ptr = input_data + in_x_origin * input_depth;
996 const int input_ptr_increment = (stride - 1) * input_depth;
997 for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++)
998 {
999 const float *filter_ptr = filter_base_ptr;
1000 for (int ic = 0; ic < input_depth; ++ic)
1001 {
1002 const float input_val = *input_ptr++;
1003 for (int m = 0; m < depth_multiplier; m++)
1004 {
1005 const float filter_val = *filter_ptr++;
1006 *acc_buffer_ptr++ += filter_val * input_val;
1007 }
1008 }
1009 input_ptr += input_ptr_increment;
1010 }
1011 filter_base_ptr += output_depth;
1012 }
1013}


Referenced by DepthwiseConvImpl().

◆ getBinaryOpWithActivationImplFloat()

template<class FUNC >
BinaryOpImplFloatFuncs nnfw::cker::optimized::getBinaryOpWithActivationImplFloat ( const BinaryArithmeticOpParam &  params)
inline

Definition at line 676 of file BinaryArithmeticOps.h.

677{
678 if (params.float_activation_max == std::numeric_limits<float>::max())
679 if (params.float_activation_min == std::numeric_limits<float>::lowest())
680 return BinaryOpImplFloatFuncs(BinaryOpElementwise<FUNC, BinaryOpActivationFloatNone>,
681 BinaryOpScalarBroadcast<FUNC, BinaryOpActivationFloatNone>);
682 else
683 return BinaryOpImplFloatFuncs(BinaryOpElementwise<FUNC, BinaryOpActivationFloatMax>,
684 BinaryOpScalarBroadcast<FUNC, BinaryOpActivationFloatMax>);
685 else
686 return BinaryOpImplFloatFuncs(BinaryOpElementwise<FUNC, BinaryOpActivationFloatMinMax>,
687 BinaryOpScalarBroadcast<FUNC, BinaryOpActivationFloatMinMax>);
688}

References nnfw::cker::BinaryArithmeticOpParam::float_activation_max, and nnfw::cker::BinaryArithmeticOpParam::float_activation_min.
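
The first element of the returned pair is the elementwise kernel and the second is the scalar-broadcast kernel; Add, Sub, Mul and Div above use the first, while the broadcast dispatchers fall back to the second when one operand collapses to a single value. A minimal call-site sketch, assuming the cker headers are available and that params already carries the desired activation bounds:

// Hypothetical call site (not library code).
void AddWithSelectedImpl(const nnfw::cker::BinaryArithmeticOpParam &params, int size,
                         const float *input1_data, const float *input2_data, float *output_data)
{
  auto funcs = nnfw::cker::optimized::getBinaryOpWithActivationImplFloat<
    nnfw::cker::optimized::BinaryOpFuncAddFloat>(params);

  // Elementwise: both operands are full buffers of `size` floats.
  (*funcs.first)(size, params, input1_data, input2_data, output_data);

  // Scalar broadcast: input1 collapsed to a single value (shown back to back
  // here purely for illustration; it overwrites the previous result).
  (*funcs.second)(size, params, input1_data[0], input2_data, output_data);
}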

◆ Im2col()

template<typename T >
void nnfw::cker::optimized::Im2col ( const ConvParams &  params,
int  kheight,
int  kwidth,
uint8_t  zero_byte,
const Shape &  input_shape,
const T *  input_data,
const Shape &  output_shape,
T *  output_data 
)

Definition at line 222 of file OptimizedUtils.h.

225{
226 const int stride_width = params.stride_width;
227 const int stride_height = params.stride_height;
228 const int pad_width = params.padding_values.width;
229 const int pad_height = params.padding_values.height;
230 assert(input_shape.DimensionsCount() == 4);
231 assert(output_shape.DimensionsCount() == 4);
232
233 const int batches = MatchingDim(input_shape, 0, output_shape, 0);
234 const int input_depth = input_shape.Dims(3);
235 const int input_width = input_shape.Dims(2);
236 const int input_height = input_shape.Dims(1);
237 const int output_depth = output_shape.Dims(3);
238 const int output_width = output_shape.Dims(2);
239 const int output_height = output_shape.Dims(1);
240
241 int buffer_id = 0;
242 // Loop over the output nodes.
243 for (int b = 0; b < batches; ++b)
244 {
245 for (int h = 0; h < output_height; ++h)
246 {
247 for (int w = 0; w < output_width; ++w)
248 {
249 ExtractPatchIntoBufferColumn(input_shape, w, h, b, kheight, kwidth, stride_width,
250 stride_height, pad_width, pad_height, input_width,
251 input_height, input_depth, output_depth, buffer_id, input_data,
252 output_data, zero_byte);
253 ++buffer_id;
254 }
255 }
256 }
257}

References nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), ExtractPatchIntoBufferColumn(), nnfw::cker::PaddingValues::height, nnfw::cker::MatchingDim(), output_shape, nnfw::cker::ConvParams::padding_values, nnfw::cker::ConvParams::stride_height, nnfw::cker::ConvParams::stride_width, and nnfw::cker::PaddingValues::width.

Referenced by Conv().
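
The output_shape/output_data passed here describe the im2col buffer itself: its depth is kheight * kwidth * input_depth, one flattened patch per output pixel, and buffer_id simply counts output pixels in B x H x W order. A small shape sketch under hypothetical dimensions (a 1x8x8x3 input with a 3x3 kernel, stride 1, "same" padding):

// Hypothetical: the im2col buffer holds one 3*3*3 = 27-element patch per
// output pixel of the 8x8 output.
const nnfw::cker::Shape im2col_shape({1, 8, 8, 3 * 3 * 3});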

◆ Mul() [1/2]

void nnfw::cker::optimized::Mul ( const BinaryArithmeticOpParam &  params,
const Shape &  input1_shape,
const float *  input1_data,
const Shape &  input2_shape,
const float *  input2_data,
const Shape &  output_shape,
float *  output_data 
)
inline

Definition at line 1072 of file BinaryArithmeticOps.h.

1075{
1076 const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape);
1077 auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncMulFloat>(params);
1078 (*implFuncs.first)(flat_size, params, input1_data, input2_data, output_data);
1079}

References nnfw::cker::MatchingElementsSize(), and output_shape.

◆ Mul() [2/2]

template<typename T >
std::enable_if_t< is_quant8< T >::value > nnfw::cker::optimized::Mul ( const BinaryArithmeticOpParam &  params,
const Shape &  input1_shape,
const T *  input1_data,
const Shape &  input2_shape,
const T *  input2_data,
const Shape &  output_shape,
T *  output_data 
)
inline

Definition at line 1065 of file BinaryArithmeticOps.h.

1067{
1068 const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape);
1069 MulElementwise(flat_size, params, input1_data, input2_data, output_data);
1070}

References nnfw::cker::MatchingElementsSize(), MulElementwise(), and output_shape.

Referenced by nnfw::cker::BinaryArithmeticOp(), and nnfw::cker::BinaryArithmeticOp().

◆ MulElementwise() [1/2]

void nnfw::cker::optimized::MulElementwise ( int  size,
const BinaryArithmeticOpParam &  params,
const int8_t *  input1_data,
const int8_t *  input2_data,
int8_t *  output_data 
)
inline

Definition at line 974 of file BinaryArithmeticOps.h.

977{
978 int i = 0;
979#ifdef USE_NEON
980 const int16x8_t input1_offset_vector = vdupq_n_s16(params.input1_offset);
981 const int16x8_t input2_offset_vector = vdupq_n_s16(params.input2_offset);
982 const int16x8_t output_offset_vector = vdupq_n_s16(params.output_offset);
983 const auto output_activation_min_vector = vdupq_n_s8(params.quantized_activation_min);
984 const auto output_activation_max_vector = vdupq_n_s8(params.quantized_activation_max);
985 const int left_shift = std::max(0, params.output_shift);
986 const int right_shift = std::max(0, -params.output_shift);
987 const int32x4_t left_shift_vec = vdupq_n_s32(left_shift);
988 for (; i <= size - 16; i += 16)
989 {
990 // We load / store 16 at a time, multiplying as four sets of 4 int32s.
991 const int8x16_t input1_val_original = vld1q_s8(input1_data + i);
992 const int8x16_t input2_val_original = vld1q_s8(input2_data + i);
993
994 const int16x8_t input1_val_s16_high = vmovl_s8(vget_high_s8(input1_val_original));
995 const int16x8_t input1_val_s16_low = vmovl_s8(vget_low_s8(input1_val_original));
996
997 const int16x8_t input2_val_s16_high = vmovl_s8(vget_high_s8(input2_val_original));
998 const int16x8_t input2_val_s16_low = vmovl_s8(vget_low_s8(input2_val_original));
999 const int16x8_t input1_val_high = vaddq_s16(input1_val_s16_high, input1_offset_vector);
1000 const int16x8_t input2_val_high = vaddq_s16(input2_val_s16_high, input2_offset_vector);
1001 const int16x8_t input1_val_low = vaddq_s16(input1_val_s16_low, input1_offset_vector);
1002 const int16x8_t input2_val_low = vaddq_s16(input2_val_s16_low, input2_offset_vector);
1003 const int16x4_t input1_val_high_high = vget_high_s16(input1_val_high);
1004 const int16x4_t input1_val_high_low = vget_low_s16(input1_val_high);
1005 const int16x4_t input1_val_low_high = vget_high_s16(input1_val_low);
1006 const int16x4_t input1_val_low_low = vget_low_s16(input1_val_low);
1007 const int16x4_t input2_val_high_high = vget_high_s16(input2_val_high);
1008 const int16x4_t input2_val_high_low = vget_low_s16(input2_val_high);
1009 const int16x4_t input2_val_low_high = vget_high_s16(input2_val_low);
1010 const int16x4_t input2_val_low_low = vget_low_s16(input2_val_low);
1011
1012 auto p1 = vmull_s16(input2_val_high_high, input1_val_high_high);
1013 auto p2 = vmull_s16(input2_val_high_low, input1_val_high_low);
1014 auto p3 = vmull_s16(input2_val_low_high, input1_val_low_high);
1015 auto p4 = vmull_s16(input2_val_low_low, input1_val_low_low);
1016
1017 p1 = vshlq_s32(p1, left_shift_vec);
1018 p2 = vshlq_s32(p2, left_shift_vec);
1019 p3 = vshlq_s32(p3, left_shift_vec);
1020 p4 = vshlq_s32(p4, left_shift_vec);
1021
1022 p1 = vqrdmulhq_n_s32(p1, params.output_multiplier);
1023 p2 = vqrdmulhq_n_s32(p2, params.output_multiplier);
1024 p3 = vqrdmulhq_n_s32(p3, params.output_multiplier);
1025 p4 = vqrdmulhq_n_s32(p4, params.output_multiplier);
1026 using gemmlowp::RoundingDivideByPOT;
1027 p1 = RoundingDivideByPOT(p1, right_shift);
1028 p2 = RoundingDivideByPOT(p2, right_shift);
1029 p3 = RoundingDivideByPOT(p3, right_shift);
1030 p4 = RoundingDivideByPOT(p4, right_shift);
1031
1032 const auto p1_narrowed = vqmovn_s32(p1);
1033 const auto p2_narrowed = vqmovn_s32(p2);
1034 const auto p3_narrowed = vqmovn_s32(p3);
1035 const auto p4_narrowed = vqmovn_s32(p4);
1036
1037 const int16x8_t p_part1 =
1038 vaddq_s16(vcombine_s16(p2_narrowed, p1_narrowed), output_offset_vector);
1039 const int16x8_t p_part2 =
1040 vaddq_s16(vcombine_s16(p4_narrowed, p3_narrowed), output_offset_vector);
1041 const int8x16_t p = vcombine_s8(vqmovn_s16(p_part2), vqmovn_s16(p_part1));
1042
1043 const auto clamped =
1044 vmaxq_s8(output_activation_min_vector, vminq_s8(output_activation_max_vector, p));
1045 vst1q_s8(output_data + i, clamped);
1046 }
1047#endif // NEON
1048
1049 for (; i < size; ++i)
1050 {
1051 const int32_t input1_val = params.input1_offset + input1_data[i];
1052 const int32_t input2_val = params.input2_offset + input2_data[i];
1053 const int32_t unclamped_result =
1054 params.output_offset + MultiplyByQuantizedMultiplier(input1_val * input2_val,
1055 params.output_multiplier,
1056 params.output_shift);
1057 const int32_t clamped_output = std::min(
1058 params.quantized_activation_max, std::max(params.quantized_activation_min, unclamped_result));
1059 output_data[i] = static_cast<int8_t>(clamped_output);
1060 }
1061}

References nnfw::cker::BinaryArithmeticOpParam::input1_offset, nnfw::cker::BinaryArithmeticOpParam::input2_offset, nnfw::cker::MultiplyByQuantizedMultiplier(), nnfw::cker::BinaryArithmeticOpParam::output_multiplier, nnfw::cker::BinaryArithmeticOpParam::output_offset, nnfw::cker::BinaryArithmeticOpParam::output_shift, nnfw::cker::BinaryArithmeticOpParam::quantized_activation_max, nnfw::cker::BinaryArithmeticOpParam::quantized_activation_min, and size.

◆ MulElementwise() [2/2]

void nnfw::cker::optimized::MulElementwise ( int  size,
const BinaryArithmeticOpParam &  params,
const uint8_t *  input1_data,
const uint8_t *  input2_data,
uint8_t *  output_data 
)
inline

Definition at line 910 of file BinaryArithmeticOps.h.

913{
914 int i = 0;
915
916#ifdef USE_NEON
917 const auto input1_offset_vector = vdupq_n_s16(params.input1_offset);
918 const auto input2_offset_vector = vdupq_n_s16(params.input2_offset);
919 const auto output_offset_vector = vdupq_n_s16(params.output_offset);
920 const auto output_activation_min_vector = vdup_n_u8(params.quantized_activation_min);
921 const auto output_activation_max_vector = vdup_n_u8(params.quantized_activation_max);
922 const int left_shift = std::max(0, params.output_shift);
923 const int right_shift = std::max(0, -params.output_shift);
924 const int32x4_t left_shift_vec = vdupq_n_s32(left_shift);
925 for (; i <= size - 8; i += 8)
926 {
927 // We load / store 8 at a time, multiplying as two sets of 4 int32s.
928 const auto input1_val_original = vld1_u8(input1_data + i);
929 const auto input2_val_original = vld1_u8(input2_data + i);
930 const auto input1_val_s16 = vreinterpretq_s16_u16(vmovl_u8(input1_val_original));
931 const auto input2_val_s16 = vreinterpretq_s16_u16(vmovl_u8(input2_val_original));
932 const auto input1_val = vaddq_s16(input1_val_s16, input1_offset_vector);
933 const auto input2_val = vaddq_s16(input2_val_s16, input2_offset_vector);
934
935 const auto input1_val_low = vget_low_s16(input1_val);
936 const auto input1_val_high = vget_high_s16(input1_val);
937 const auto input2_val_low = vget_low_s16(input2_val);
938 const auto input2_val_high = vget_high_s16(input2_val);
939
940 auto p1 = vmull_s16(input2_val_low, input1_val_low);
941 auto p2 = vmull_s16(input2_val_high, input1_val_high);
942
943 p1 = vshlq_s32(p1, left_shift_vec);
944 p2 = vshlq_s32(p2, left_shift_vec);
945 p1 = vqrdmulhq_n_s32(p1, params.output_multiplier);
946 p2 = vqrdmulhq_n_s32(p2, params.output_multiplier);
947 using gemmlowp::RoundingDivideByPOT;
948 p1 = RoundingDivideByPOT(p1, right_shift);
949 p2 = RoundingDivideByPOT(p2, right_shift);
950
951 const auto p1_narrowed = vqmovn_s32(p1);
952 const auto p2_narrowed = vqmovn_s32(p2);
953 const auto p = vaddq_s16(vcombine_s16(p1_narrowed, p2_narrowed), output_offset_vector);
954 const auto clamped =
955 vmax_u8(output_activation_min_vector, vmin_u8(output_activation_max_vector, vqmovun_s16(p)));
956 vst1_u8(output_data + i, clamped);
957 }
958#endif // NEON
959
960 for (; i < size; ++i)
961 {
962 const int32_t input1_val = params.input1_offset + input1_data[i];
963 const int32_t input2_val = params.input2_offset + input2_data[i];
964 const int32_t unclamped_result =
965 params.output_offset + MultiplyByQuantizedMultiplier(input1_val * input2_val,
966 params.output_multiplier,
967 params.output_shift);
968 const int32_t clamped_output = std::min(
969 params.quantized_activation_max, std::max(params.quantized_activation_min, unclamped_result));
970 output_data[i] = static_cast<uint8_t>(clamped_output);
971 }
972}

References nnfw::cker::BinaryArithmeticOpParam::input1_offset, nnfw::cker::BinaryArithmeticOpParam::input2_offset, nnfw::cker::MultiplyByQuantizedMultiplier(), nnfw::cker::BinaryArithmeticOpParam::output_multiplier, nnfw::cker::BinaryArithmeticOpParam::output_offset, nnfw::cker::BinaryArithmeticOpParam::output_shift, nnfw::cker::BinaryArithmeticOpParam::quantized_activation_max, nnfw::cker::BinaryArithmeticOpParam::quantized_activation_min, and size.

Referenced by BroadcastMulDispatch(), and Mul().

◆ MulSimpleBroadcast() [1/2]

void nnfw::cker::optimized::MulSimpleBroadcast ( int  size,
const BinaryArithmeticOpParam &  params,
const int8_t  broadcast_value,
const int8_t *  input2_data,
int8_t *  output_data 
)
inline

Definition at line 1095 of file BinaryArithmeticOps.h.

1098{
1099 const int16_t input1_val = params.input1_offset + broadcast_value;
1100
1101 int i = 0;
1102#ifdef USE_NEON
1103 const auto input2_offset_vector = vdupq_n_s16(params.input2_offset);
1104 const auto output_offset_vector = vdupq_n_s16(params.output_offset);
1105 const auto output_activation_min_vector = vdupq_n_s8(params.quantized_activation_min);
1106 const auto output_activation_max_vector = vdupq_n_s8(params.quantized_activation_max);
1107 const int left_shift = std::max(0, params.output_shift);
1108 const int right_shift = std::max(0, -params.output_shift);
1109 const int32x4_t left_shift_vec = vdupq_n_s32(left_shift);
1110 for (; i <= size - 16; i += 16)
1111 {
1112 // We load / store 16 at a time, multiplying as four sets of 4 int32s.
1113 const auto input2_val_original = vld1q_s8(input2_data + i);
1114 const auto input2_val_s16_high = vmovl_s8(vget_high_s8(input2_val_original));
1115 const auto input2_val_s16_low = vmovl_s8(vget_low_s8(input2_val_original));
1116
1117 const auto input2_val_high = vaddq_s16(input2_val_s16_high, input2_offset_vector);
1118 const auto input2_val_low = vaddq_s16(input2_val_s16_low, input2_offset_vector);
1119
1120 const auto input2_val_low_low = vget_low_s16(input2_val_low);
1121 const auto input2_val_low_high = vget_high_s16(input2_val_low);
1122 const auto input2_val_high_low = vget_low_s16(input2_val_high);
1123 const auto input2_val_high_high = vget_high_s16(input2_val_high);
1124
1125 auto p1 = vmull_n_s16(input2_val_high_high, input1_val);
1126 auto p2 = vmull_n_s16(input2_val_high_low, input1_val);
1127 auto p3 = vmull_n_s16(input2_val_low_high, input1_val);
1128 auto p4 = vmull_n_s16(input2_val_low_low, input1_val);
1129
1130 p1 = vshlq_s32(p1, left_shift_vec);
1131 p2 = vshlq_s32(p2, left_shift_vec);
1132 p3 = vshlq_s32(p3, left_shift_vec);
1133 p4 = vshlq_s32(p4, left_shift_vec);
1134
1135 p1 = vqrdmulhq_n_s32(p1, params.output_multiplier);
1136 p2 = vqrdmulhq_n_s32(p2, params.output_multiplier);
1137 p3 = vqrdmulhq_n_s32(p3, params.output_multiplier);
1138 p4 = vqrdmulhq_n_s32(p4, params.output_multiplier);
1139 using gemmlowp::RoundingDivideByPOT;
1140 p1 = RoundingDivideByPOT(p1, right_shift);
1141 p2 = RoundingDivideByPOT(p2, right_shift);
1142 p3 = RoundingDivideByPOT(p3, right_shift);
1143 p4 = RoundingDivideByPOT(p4, right_shift);
1144
1145 const auto p1_narrowed = vqmovn_s32(p1);
1146 const auto p2_narrowed = vqmovn_s32(p2);
1147 const auto p3_narrowed = vqmovn_s32(p3);
1148 const auto p4_narrowed = vqmovn_s32(p4);
1149
1150 const int16x8_t p_part1 =
1151 vaddq_s16(vcombine_s16(p2_narrowed, p1_narrowed), output_offset_vector);
1152 const int16x8_t p_part2 =
1153 vaddq_s16(vcombine_s16(p4_narrowed, p3_narrowed), output_offset_vector);
1154 const int8x16_t p = vcombine_s8(vqmovn_s16(p_part2), vqmovn_s16(p_part1));
1155
1156 const auto clamped =
1157 vmaxq_s8(output_activation_min_vector, vminq_s8(output_activation_max_vector, p));
1158 vst1q_s8(output_data + i, clamped);
1159 }
1160#endif // NEON
1161
1162 for (; i < size; ++i)
1163 {
1164 const int32_t input2_val = params.input2_offset + input2_data[i];
1165 const int32_t unclamped_result =
1166 params.output_offset + MultiplyByQuantizedMultiplier(input1_val * input2_val,
1167 params.output_multiplier,
1168 params.output_shift);
1169 const int32_t clamped_output = std::min(
1170 params.quantized_activation_max, std::max(params.quantized_activation_min, unclamped_result));
1171 output_data[i] = static_cast<int8_t>(clamped_output);
1172 }
1173}

References nnfw::cker::BinaryArithmeticOpParam::input1_offset, nnfw::cker::BinaryArithmeticOpParam::input2_offset, nnfw::cker::MultiplyByQuantizedMultiplier(), nnfw::cker::BinaryArithmeticOpParam::output_multiplier, nnfw::cker::BinaryArithmeticOpParam::output_offset, nnfw::cker::BinaryArithmeticOpParam::output_shift, nnfw::cker::BinaryArithmeticOpParam::quantized_activation_max, nnfw::cker::BinaryArithmeticOpParam::quantized_activation_min, and size.

◆ MulSimpleBroadcast() [2/2]

void nnfw::cker::optimized::MulSimpleBroadcast ( int  size,
const BinaryArithmeticOpParam &  params,
const uint8_t  broadcast_value,
const uint8_t *  input2_data,
uint8_t *  output_data 
)
inline

Definition at line 1081 of file BinaryArithmeticOps.h.

1084{
1085 int i = 0;
1086 int32_t clamped_output;
1087 for (; i < size; ++i)
1088 {
1089 clamped_output = quant8_mul(params, broadcast_value, input2_data[i]);
1090 output_data[i] = static_cast<uint8_t>(clamped_output);
1091 }
1092}

References quant8_mul(), and size.

Referenced by BroadcastMulDispatch().

◆ quant8_mul()

template<typename T >
std::enable_if_t< is_quant8< T >::value, int32_t > nnfw::cker::optimized::quant8_mul ( const BinaryArithmeticOpParam &  params,
const T  input1_data,
const T  input2_data 
)
inline

Definition at line 896 of file BinaryArithmeticOps.h.

897{
898 const int32_t input1_val = params.input1_offset + input1_data;
899 const int32_t input2_val = params.input2_offset + input2_data;
900 const int32_t unclamped_result =
901 params.output_offset + MultiplyByQuantizedMultiplier(input1_val * input2_val,
902 params.output_multiplier,
903 params.output_shift);
904 const int32_t clamped_output = std::min(
905 params.quantized_activation_max, std::max(params.quantized_activation_min, unclamped_result));
906
907 return clamped_output;
908}

References nnfw::cker::BinaryArithmeticOpParam::input1_offset, nnfw::cker::BinaryArithmeticOpParam::input2_offset, nnfw::cker::MultiplyByQuantizedMultiplier(), nnfw::cker::BinaryArithmeticOpParam::output_multiplier, nnfw::cker::BinaryArithmeticOpParam::output_offset, nnfw::cker::BinaryArithmeticOpParam::output_shift, nnfw::cker::BinaryArithmeticOpParam::quantized_activation_max, and nnfw::cker::BinaryArithmeticOpParam::quantized_activation_min.

Referenced by BroadcastMulDispatch(), and MulSimpleBroadcast().
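
Restating the body above in LaTeX, with off the stored offsets, M = output_multiplier, s = output_shift, and MBQM denoting MultiplyByQuantizedMultiplier:

q_{out} = \mathrm{clamp}\big( \mathit{off}_{out} + \mathrm{MBQM}\big((q_1 + \mathit{off}_1)(q_2 + \mathit{off}_2),\, M,\, s\big) \big)

Assuming the usual convention in this code that the input offsets are the negated input zero points and off_out is the output zero point, M \cdot 2^{s} approximates S_1 S_2 / S_{out}, so the expression realizes q_{out} \approx z_{out} + \tfrac{S_1 S_2}{S_{out}} (q_1 - z_1)(q_2 - z_2), clamped to the activation range.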

◆ quant8_sum()

template<typename T >
std::enable_if_t< is_quant8< T >::value, int32_t > nnfw::cker::optimized::quant8_sum ( const BinaryArithmeticOpParam &  params,
const T  input1_data,
const T  input2_data 
)
inline

Definition at line 227 of file BinaryArithmeticOps.h.

228{
229 const int32_t input1_val = params.input1_offset + input1_data;
230 const int32_t input2_val = params.input2_offset + input2_data;
231 const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
232 const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
233 const int32_t scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp(
234 shifted_input1_val, params.input1_multiplier, params.input1_shift);
235 const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp(
236 shifted_input2_val, params.input2_multiplier, params.input2_shift);
237 const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
238 const int32_t raw_output = MultiplyByQuantizedMultiplierSmallerThanOneExp(
239 raw_sum, params.output_multiplier, params.output_shift) +
240 params.output_offset;
241 const int32_t clamped_output = std::min(params.quantized_activation_max,
242 std::max(params.quantized_activation_min, raw_output));
243 return clamped_output;
244}

References nnfw::cker::BinaryArithmeticOpParam::input1_multiplier, nnfw::cker::BinaryArithmeticOpParam::input1_offset, nnfw::cker::BinaryArithmeticOpParam::input1_shift, nnfw::cker::BinaryArithmeticOpParam::input2_multiplier, nnfw::cker::BinaryArithmeticOpParam::input2_offset, nnfw::cker::BinaryArithmeticOpParam::input2_shift, nnfw::cker::BinaryArithmeticOpParam::left_shift, nnfw::cker::MultiplyByQuantizedMultiplierSmallerThanOneExp(), nnfw::cker::BinaryArithmeticOpParam::output_multiplier, nnfw::cker::BinaryArithmeticOpParam::output_offset, nnfw::cker::BinaryArithmeticOpParam::output_shift, nnfw::cker::BinaryArithmeticOpParam::quantized_activation_max, and nnfw::cker::BinaryArithmeticOpParam::quantized_activation_min.

Referenced by AddScalarBroadcast(), and BroadcastAddDispatch().
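
Unlike multiplication, quantized addition cannot fold both input scales into one multiplier, so the routine first rescales each offset-adjusted input to a shared intermediate scale (amplified by 2^left_shift for precision), adds them, and rescales the sum to the output scale. As a LaTeX restatement of the code, with M_1, M_2, M_out the MultiplyByQuantizedMultiplierSmallerThanOneExp applications using the corresponding multiplier/shift pairs, off the stored offsets, and \ell = left_shift:

q_{out} = \mathrm{clamp}\Big( M_{out}\big( M_1\big((q_1 + \mathit{off}_1)\, 2^{\ell}\big) + M_2\big((q_2 + \mathit{off}_2)\, 2^{\ell}\big) \big) + \mathit{off}_{out} \Big)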

◆ Sub()

void nnfw::cker::optimized::Sub ( const BinaryArithmeticOpParam &  params,
const Shape &  input1_shape,
const float *  input1_data,
const Shape &  input2_shape,
const float *  input2_data,
const Shape &  output_shape,
float *  output_data 
)
inline

Definition at line 858 of file BinaryArithmeticOps.h.

861{
862 const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape);
863 auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncSubFloat>(params);
864 (*implFuncs.first)(flat_size, params, input1_data, input2_data, output_data);
865}

References nnfw::cker::MatchingElementsSize(), and output_shape.

Referenced by nnfw::cker::BinaryArithmeticOp().

Variable Documentation

◆ _gemmlowp_mutex

std::mutex nnfw::cker::optimized::_gemmlowp_mutex

Definition at line 45 of file Conv.h.

Referenced by Conv().