ONE - On-device Neural Engine
nnfw::cker::optimized_integer_ops::depthwise_conv Namespace Reference

Data Structures

struct  QuantizedDepthwiseConvKernel
 

Functions

template<bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
void QuantizedDepthwiseConvAccumRow (int stride, int dilation_factor, int input_depth, int input_width, const int8_t *input_data, int16_t input_offset, int pad_width, int depth_multiplier, int filter_width, const int8_t *filter_data, int out_x_buffer_start, int out_x_buffer_end, int output_depth, int32_t *acc_buffer)
 
void QuantizedDepthwiseConvAccumRowGeneric (int stride, int dilation_factor, int input_depth, int input_width, const int8_t *input_data, int16_t input_offset, int pad_width, int depth_multiplier, int filter_width, const int8_t *filter_data, int out_x_buffer_start, int out_x_buffer_end, int output_depth, int32_t *acc_buffer)
 
void DepthwiseConvInitAccBuffer (int num_output_pixels, int output_depth, const int32_t *bias_data, int32_t *acc_buffer)
 
void DepthwiseConvGeneral (const DepthwiseConvParams &params, const int32_t *output_multiplier, const int32_t *output_shift, const Shape &input_shape, const int8_t *input_data, const Shape &filter_shape, const int8_t *filter_data, const Shape &, const int32_t *bias_data, const Shape &output_shape, int8_t *output_data, int thread_start, int thread_end, int thread_dim)
 

Function Documentation

◆ DepthwiseConvGeneral()

void nnfw::cker::optimized_integer_ops::depthwise_conv::DepthwiseConvGeneral ( const DepthwiseConvParams &  params,
const int32_t *  output_multiplier,
const int32_t *  output_shift,
const Shape &  input_shape,
const int8_t *  input_data,
const Shape &  filter_shape,
const int8_t *  filter_data,
const Shape &  ,
const int32_t *  bias_data,
const Shape &  output_shape,
int8_t *  output_data,
int  thread_start,
int  thread_end,
int  thread_dim 
)
inline

Definition at line 1739 of file DepthwiseConvInt8.h.

1746{
1747 const int stride_width = params.stride_width;
1748 const int stride_height = params.stride_height;
1749 const int pad_width = params.padding_values.width;
1750 const int pad_height = params.padding_values.height;
1751 const int depth_multiplier = params.depth_multiplier;
1752 const int32_t output_activation_min = params.quantized_activation_min;
1753 const int32_t output_activation_max = params.quantized_activation_max;
1754 const int32_t input_offset = params.input_offset;
1755 const int32_t output_offset = params.output_offset;
1756 const int dilation_width_factor = params.dilation_width_factor;
1757 const int dilation_height_factor = params.dilation_height_factor;
1758 const int batches = MatchingDim(input_shape, 0, output_shape, 0);
1759 const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
1760 const int input_height = input_shape.Dims(1);
1761 const int input_width = input_shape.Dims(2);
1762 const int input_depth = input_shape.Dims(3);
1763 const int filter_height = filter_shape.Dims(1);
1764 const int filter_width = filter_shape.Dims(2);
1765 const int output_rows = output_shape.Dims(1);
1766 const int output_width = output_shape.Dims(2);
1767
1768 static const int kAccBufferMaxSize = 2048;
1769 int32_t acc_buffer[kAccBufferMaxSize];
1770 assert(kAccBufferMaxSize >= output_depth);
1771 const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth;
1772 [[maybe_unused]] const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth;
1773 assert(kOutputPixelsInAccBuffer * output_depth <= kAccBufferActualSize);
1774 assert(kAccBufferActualSize <= kAccBufferMaxSize);
1775 assert(kOutputPixelsInAccBuffer >= 1);
1776 assert(thread_dim == 0 || thread_dim == 1);
1777
1778 // row_accum_func will point to the core accumulation function to be used
1779 // for this DepthwiseConv op.
1780 using row_accum_func_t = decltype(&QuantizedDepthwiseConvAccumRowGeneric);
1781 row_accum_func_t row_accum_func = nullptr;
1782
1783#define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER) \
1784 if (!row_accum_func && (stride_width == 1 || ALLOW_STRIDED) && \
1785 (input_depth == FIXED_INPUT_DEPTH || FIXED_INPUT_DEPTH == 0) && \
1786 depth_multiplier == FIXED_DEPTH_MULTIPLIER) \
1787 { \
1788 row_accum_func = \
1789 QuantizedDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER>; \
1790 }
1791
1792#ifdef USE_NEON
1793 // We go over our list of kernels in decreasing order of preference
1794 // for the cases where multiple kernels could apply.
1795
1796 // Start with the fastest kernels: AllowStrided=false, fixed input depth.
1797
1808
1809 // Next come the strided kernels: AllowStrided=true, fixed input depth.
1810 // They are a bit less efficient, but allow stride!=1.
1811
1821
1822 // Finally, the kernels allowing a variable input depth;
1823 // these are the least efficient but most general kernels.
1824
1828#endif // USE_NEON
1829
1830 // No matching fast kernel found, use slow fallback.
1831 if (!row_accum_func)
1832 {
1833 row_accum_func = QuantizedDepthwiseConvAccumRowGeneric;
1834 }
1835
1836#undef TFMINI_USE_DEPTHWISECONV_KERNEL
1837
1838 const int input_height_stride = input_shape.Dims(3) * input_shape.Dims(2);
1839 const int input_batch_stride = input_height_stride * input_shape.Dims(1);
1840 const int filter_height_stride = filter_shape.Dims(3) * filter_shape.Dims(2);
1841
1842 // Now that we have determined row_accum_func, we can start work.
1843 int batch_start = 0;
1844 int batch_end = batches;
1845 int row_start = 0;
1846 int row_end = output_rows;
1847 int output_ptr_offset = 0;
1848
1849 switch (thread_dim)
1850 {
1851 case 0:
1852 assert(thread_start >= 0);
1853 assert(thread_end <= batches);
1854 batch_start = thread_start;
1855 batch_end = thread_end;
1856 output_ptr_offset = batch_start * FlatSizeSkipDim(output_shape, 0);
1857 break;
1858 case 1:
1859 assert(thread_start >= 0);
1860 assert(thread_end <= output_rows);
1861 row_start = thread_start;
1862 row_end = thread_end;
1863 output_ptr_offset = row_start * output_width * output_depth;
1864 break;
1865 }
1866
1867 int8_t *output_ptr = output_data + output_ptr_offset;
1868 int batch_step = (output_rows + row_start - row_end) * output_width * output_depth;
1869 for (int b = batch_start; b < batch_end; ++b)
1870 {
1871 for (int out_y = row_start; out_y < row_end; ++out_y)
1872 {
1873 const int in_y_origin = (out_y * stride_height) - pad_height;
1874 const int filter_y_start =
1875 std::max(0, (-in_y_origin + dilation_height_factor - 1) / dilation_height_factor);
1876 const int filter_y_end =
1877 std::min(filter_height, (input_height - in_y_origin + dilation_height_factor - 1) /
1878 dilation_height_factor);
1879 for (int out_x_buffer_start = 0; out_x_buffer_start < output_width;
1880 out_x_buffer_start += kOutputPixelsInAccBuffer)
1881 {
1882 const int out_x_buffer_end =
1883 std::min(output_width, out_x_buffer_start + kOutputPixelsInAccBuffer);
1884 // We call a 'pixel' a group of activations that share all but the
1885 // 'depth'/'channel' coordinate. num_output_pixels is the number of
1886 // output pixels that we will accumulate in this loop iteration.
1887 const int num_output_pixels = out_x_buffer_end - out_x_buffer_start;
1888 // Initialize our local accumulator with the bias values, so we don't
1889 // have to add them later.
1890 DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data, acc_buffer);
1891 // Accumulation loop. Most of the time should be spent in here.
1892 for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y)
1893 {
1894 const int in_y = in_y_origin + dilation_height_factor * filter_y;
1895 row_accum_func(stride_width, dilation_width_factor, input_depth, input_width,
1896 input_data + in_y * input_height_stride + b * input_batch_stride,
1897 input_offset, pad_width, depth_multiplier, filter_width,
1898 filter_data + filter_y * filter_height_stride, out_x_buffer_start,
1899 out_x_buffer_end, output_depth, acc_buffer);
1900 }
1901 // Finished accumulating int32_t values. Now we need to convert them to
1902 // the final 8-bit form and store them.
1903 const int num_output_values = output_depth * num_output_pixels;
1904
1905 Quantize(output_multiplier, output_shift, output_depth, num_output_values, output_offset,
1906 output_activation_min, output_activation_max, acc_buffer, output_ptr);
1907
1908 output_ptr += num_output_values;
1909 }
1910 }
1911 output_ptr += batch_step;
1912 }
1913}

References nnfw::cker::DepthwiseConvParams::depth_multiplier, DepthwiseConvInitAccBuffer(), nnfw::cker::DepthwiseConvParams::dilation_height_factor, nnfw::cker::DepthwiseConvParams::dilation_width_factor, nnfw::cker::Shape::Dims(), nnfw::cker::FlatSizeSkipDim(), nnfw::cker::PaddingValues::height, nnfw::cker::DepthwiseConvParams::input_offset, nnfw::cker::MatchingDim(), nnfw::cker::DepthwiseConvParams::output_offset, output_shape, nnfw::cker::DepthwiseConvParams::padding_values, nnfw::cker::Quantize(), nnfw::cker::DepthwiseConvParams::quantized_activation_max, nnfw::cker::DepthwiseConvParams::quantized_activation_min, QuantizedDepthwiseConvAccumRowGeneric(), nnfw::cker::DepthwiseConvParams::stride_height, nnfw::cker::DepthwiseConvParams::stride_width, TFMINI_USE_DEPTHWISECONV_KERNEL, and nnfw::cker::PaddingValues::width.

Referenced by nnfw::cker::optimized_integer_ops::DepthwiseConvWithRounding().
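
The snippet below is a minimal single-threaded usage sketch, not taken from the ONE sources: it fills a DepthwiseConvParams, builds the NHWC shapes, and calls DepthwiseConvGeneral over the whole batch range (thread_dim == 0 with [thread_start, thread_end) == [0, batches)). The include path and the requantization values are placeholders, and it assumes Shape keeps the initializer-list constructor and FlatSize() of the TFLite RuntimeShape it mirrors.

// Minimal single-threaded usage sketch (not from the ONE sources). The include
// path below is an assumption; the documented definitions live in DepthwiseConvInt8.h.
#include "cker/operation/optimized/integer_ops/DepthwiseConvInt8.h"

#include <cstdint>
#include <vector>

void DepthwiseConvGeneralExample()
{
  using namespace nnfw::cker;

  // NHWC tensors: 1x5x5x8 int8 input, 1x3x3x8 filter, depth_multiplier = 1,
  // stride 1 with SAME padding -> 1x5x5x8 output.
  const Shape input_shape{1, 5, 5, 8};
  const Shape filter_shape{1, 3, 3, 8};
  const Shape bias_shape{8};
  const Shape output_shape{1, 5, 5, 8};

  std::vector<int8_t> input(input_shape.FlatSize(), 1);
  std::vector<int8_t> filter(filter_shape.FlatSize(), 1);
  std::vector<int32_t> bias(8, 0);
  std::vector<int8_t> output(output_shape.FlatSize(), 0);

  // One requantization (multiplier, shift) pair per output channel; the values
  // here are placeholders (2^30 in Q31 with shift -1, i.e. a real scale of 0.25).
  std::vector<int32_t> output_multiplier(8, 1 << 30);
  std::vector<int32_t> output_shift(8, -1);

  DepthwiseConvParams params{};
  params.stride_width = 1;
  params.stride_height = 1;
  params.dilation_width_factor = 1;
  params.dilation_height_factor = 1;
  params.padding_values.width = 1;
  params.padding_values.height = 1;
  params.depth_multiplier = 1;
  params.input_offset = 0;  // negated input zero point (TFLite int8 convention)
  params.output_offset = 0; // output zero point
  params.quantized_activation_min = -128;
  params.quantized_activation_max = 127;

  // thread_dim == 0 partitions work over batches, so [0, 1) processes the
  // whole tensor on the calling thread.
  optimized_integer_ops::depthwise_conv::DepthwiseConvGeneral(
    params, output_multiplier.data(), output_shift.data(), input_shape, input.data(),
    filter_shape, filter.data(), bias_shape, bias.data(), output_shape, output.data(),
    /*thread_start=*/0, /*thread_end=*/1, /*thread_dim=*/0);
}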

◆ DepthwiseConvInitAccBuffer()

void nnfw::cker::optimized_integer_ops::depthwise_conv::DepthwiseConvInitAccBuffer ( int  num_output_pixels,
int  output_depth,
const int32_t *  bias_data,
int32_t *  acc_buffer 
)
inline

Definition at line 1649 of file DepthwiseConvInt8.h.

1651{
1652 int i = 0;
1653#ifdef USE_NEON
1654 if (output_depth == 1)
1655 {
1656 const int32x4_t b = vdupq_n_s32(bias_data[0]);
1657 for (; i <= num_output_pixels - 16; i += 16)
1658 {
1659 vst1q_s32(acc_buffer + i + 0, b);
1660 vst1q_s32(acc_buffer + i + 4, b);
1661 vst1q_s32(acc_buffer + i + 8, b);
1662 vst1q_s32(acc_buffer + i + 12, b);
1663 }
1664 for (; i <= num_output_pixels - 4; i += 4)
1665 {
1666 vst1q_s32(acc_buffer + i, b);
1667 }
1668 }
1669 else if (output_depth == 2)
1670 {
1671 int32x4_t b = vdupq_n_s32(bias_data[0]);
1672 b = vsetq_lane_s32(bias_data[1], b, 1);
1673 b = vsetq_lane_s32(bias_data[1], b, 3);
1674 for (; i <= num_output_pixels - 8; i += 8)
1675 {
1676 vst1q_s32(acc_buffer + 2 * i + 0, b);
1677 vst1q_s32(acc_buffer + 2 * i + 4, b);
1678 vst1q_s32(acc_buffer + 2 * i + 8, b);
1679 vst1q_s32(acc_buffer + 2 * i + 12, b);
1680 }
1681 for (; i <= num_output_pixels - 2; i += 2)
1682 {
1683 vst1q_s32(acc_buffer + 2 * i, b);
1684 }
1685 }
1686 else if (output_depth == 4)
1687 {
1688 const int32x4_t b = vld1q_s32(bias_data);
1689 for (; i <= num_output_pixels - 4; i += 4)
1690 {
1691 vst1q_s32(acc_buffer + 4 * i + 0, b);
1692 vst1q_s32(acc_buffer + 4 * i + 4, b);
1693 vst1q_s32(acc_buffer + 4 * i + 8, b);
1694 vst1q_s32(acc_buffer + 4 * i + 12, b);
1695 }
1696 for (; i < num_output_pixels; i++)
1697 {
1698 vst1q_s32(acc_buffer + 4 * i, b);
1699 }
1700 }
1701 else if (output_depth == 8)
1702 {
1703 const int32x4_t b0 = vld1q_s32(bias_data);
1704 const int32x4_t b1 = vld1q_s32(bias_data + 4);
1705 for (; i <= num_output_pixels - 2; i += 2)
1706 {
1707 vst1q_s32(acc_buffer + 8 * i + 0, b0);
1708 vst1q_s32(acc_buffer + 8 * i + 4, b1);
1709 vst1q_s32(acc_buffer + 8 * i + 8, b0);
1710 vst1q_s32(acc_buffer + 8 * i + 12, b1);
1711 }
1712 for (; i < num_output_pixels; i++)
1713 {
1714 vst1q_s32(acc_buffer + 8 * i + 0, b0);
1715 vst1q_s32(acc_buffer + 8 * i + 4, b1);
1716 }
1717 }
1718 else if (output_depth == 16)
1719 {
1720 const int32x4_t b0 = vld1q_s32(bias_data);
1721 const int32x4_t b1 = vld1q_s32(bias_data + 4);
1722 const int32x4_t b2 = vld1q_s32(bias_data + 8);
1723 const int32x4_t b3 = vld1q_s32(bias_data + 12);
1724 for (; i < num_output_pixels; i++)
1725 {
1726 vst1q_s32(acc_buffer + 16 * i + 0, b0);
1727 vst1q_s32(acc_buffer + 16 * i + 4, b1);
1728 vst1q_s32(acc_buffer + 16 * i + 8, b2);
1729 vst1q_s32(acc_buffer + 16 * i + 12, b3);
1730 }
1731 }
1732#endif
1733 for (; i < num_output_pixels; i++)
1734 {
1735 memcpy(acc_buffer + i * output_depth, bias_data, sizeof(acc_buffer[0]) * output_depth);
1736 }
1737}

Referenced by DepthwiseConvGeneral().
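
DepthwiseConvInitAccBuffer tiles the per-channel bias across num_output_pixels consecutive pixels of acc_buffer (laid out as [pixel][channel]); the NEON branches are fast paths for output depths 1, 2, 4, 8 and 16, and the trailing memcpy loop covers leftover pixels and every other depth. A scalar reference sketch of that behaviour (the helper and example names are illustrative, not part of the API):

#include <cstdint>
#include <cstring>
#include <vector>

// Scalar reference for what the initialization produces: acc_buffer is laid out
// as [pixel][channel] and every pixel starts from the per-channel bias.
void InitAccBufferReference(int num_output_pixels, int output_depth,
                            const int32_t *bias_data, int32_t *acc_buffer)
{
  for (int p = 0; p < num_output_pixels; ++p)
  {
    std::memcpy(acc_buffer + p * output_depth, bias_data, sizeof(int32_t) * output_depth);
  }
}

// Example: 3 pixels x 4 channels -> acc == {1,2,3,4, 1,2,3,4, 1,2,3,4}.
void InitAccBufferExample()
{
  const int32_t bias[4] = {1, 2, 3, 4};
  std::vector<int32_t> acc(3 * 4);
  InitAccBufferReference(3, 4, bias, acc.data());
}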

◆ QuantizedDepthwiseConvAccumRow()

template<bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
void nnfw::cker::optimized_integer_ops::depthwise_conv::QuantizedDepthwiseConvAccumRow ( int  stride,
int  dilation_factor,
int  input_depth,
int  input_width,
const int8_t *  input_data,
int16_t  input_offset,
int  pad_width,
int  depth_multiplier,
int  filter_width,
const int8_t *  filter_data,
int  out_x_buffer_start,
int  out_x_buffer_end,
int  output_depth,
int32_t *  acc_buffer 
)

Definition at line 1539 of file DepthwiseConvInt8.h.

1544{
1545 // Consistency check parameters. This is important in particular to ensure
1546 // that we keep the number of template instantiations minimal, so we don't
1547 // increase binary size unnecessarily.
1548 static_assert(kFixedDepthMultiplier || !kFixedInputDepth, "");
1549 static_assert(kFixedInputDepth || kAllowStrided, "");
1550 assert(stride == 1 || kAllowStrided);
1551 if (kFixedInputDepth)
1552 {
1553 assert(input_depth == kFixedInputDepth);
1554 }
1555 if (kFixedDepthMultiplier)
1556 {
1557 assert(depth_multiplier == kFixedDepthMultiplier);
1558 }
1559 assert(output_depth == input_depth * depth_multiplier);
1560 const int input_ptr_increment = stride * input_depth;
1561 const int8_t *filter_base_ptr = filter_data;
1562 for (int filter_x = 0; filter_x < filter_width; ++filter_x)
1563 {
1564 // For the current (filter_x, filter_y) point in the filter,
1565 // compute the boundaries of the corresponding output row segment.
1566 int out_x_loop_start_unclamped = 0;
1567 int out_x_loop_end_unclamped = 0;
1568 if (kAllowStrided)
1569 {
1570 if (stride == 2)
1571 {
1572 out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + 1) / 2;
1573 out_x_loop_end_unclamped = (pad_width + input_width - dilation_factor * filter_x + 1) / 2;
1574 }
1575 else if (stride == 4)
1576 {
1577 out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + 3) / 4;
1578 out_x_loop_end_unclamped = (pad_width + input_width - dilation_factor * filter_x + 3) / 4;
1579 }
1580 else
1581 {
1582 out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + stride - 1) / stride;
1583 out_x_loop_end_unclamped =
1584 (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride;
1585 }
1586 }
1587 else
1588 {
1589 out_x_loop_start_unclamped = pad_width - dilation_factor * filter_x;
1590 out_x_loop_end_unclamped = pad_width + input_width - dilation_factor * filter_x;
1591 }
1592 // The kernel will have to iterate on the segment of the
1593 // output row that starts at out_x_loop_start and ends at out_x_loop_end.
1594 const int out_x_loop_start = std::max(out_x_buffer_start, out_x_loop_start_unclamped);
1595 const int out_x_loop_end = std::min(out_x_buffer_end, out_x_loop_end_unclamped);
1596
1597 int32_t *acc_buffer_ptr = acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
1598 const int in_x_origin = (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x;
1599 const int8_t *input_ptr = input_data + in_x_origin * input_depth;
1600 const int num_output_pixels = out_x_loop_end - out_x_loop_start;
1601 QuantizedDepthwiseConvKernel<kAllowStrided, kFixedInputDepth, kFixedDepthMultiplier>::Run(
1602 num_output_pixels, input_depth, depth_multiplier, input_ptr, input_offset,
1603 input_ptr_increment, filter_base_ptr, acc_buffer_ptr);
1604 filter_base_ptr += output_depth;
1605 }
1606}
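
For each filter column, QuantizedDepthwiseConvAccumRow clamps the output-x range so that the sampled input column in_x = out_x * stride - pad_width + dilation_factor * filter_x stays inside [0, input_width); the stride == 2 and stride == 4 branches are just constant-folded versions of the general ceiling division. A small sketch of that boundary computation (helper names are illustrative, not part of the API):

#include <algorithm>
#include <cassert>

// Illustrative helper: the [start, end) range of out_x for one filter tap,
// mirroring the unclamped bounds computed in QuantizedDepthwiseConvAccumRow.
struct OutXRange
{
  int start;
  int end;
};

OutXRange ValidOutXRange(int stride, int dilation_factor, int filter_x, int pad_width,
                         int input_width, int out_x_buffer_start, int out_x_buffer_end)
{
  // Ceiling division by stride: the first out_x whose input column is >= 0, and
  // the first out_x whose input column is >= input_width.
  const int start_unclamped = (pad_width - dilation_factor * filter_x + stride - 1) / stride;
  const int end_unclamped =
    (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride;
  return {std::max(out_x_buffer_start, start_unclamped),
          std::min(out_x_buffer_end, end_unclamped)};
}

// Example: stride 2, dilation 1, pad 1, input_width 5, buffer covering out_x in [0, 3).
// For filter_x == 0, out_x == 0 would read input column -1, so the range starts at 1.
void ValidOutXRangeExample()
{
  const OutXRange r = ValidOutXRange(2, 1, /*filter_x=*/0, /*pad_width=*/1,
                                     /*input_width=*/5, /*out_x_buffer_start=*/0,
                                     /*out_x_buffer_end=*/3);
  assert(r.start == 1 && r.end == 3);
  (void)r;
}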

◆ QuantizedDepthwiseConvAccumRowGeneric()

void nnfw::cker::optimized_integer_ops::depthwise_conv::QuantizedDepthwiseConvAccumRowGeneric ( int  stride,
int  dilation_factor,
int  input_depth,
int  input_width,
const int8_t *  input_data,
int16_t  input_offset,
int  pad_width,
int  depth_multiplier,
int  filter_width,
const int8_t *  filter_data,
int  out_x_buffer_start,
int  out_x_buffer_end,
int  output_depth,
int32_t *  acc_buffer 
)
inline

Definition at line 1609 of file DepthwiseConvInt8.h.

1616{
1617 const int8_t *filter_base_ptr = filter_data;
1618 for (int filter_x = 0; filter_x < filter_width; ++filter_x)
1619 {
1620 const int out_x_loop_start =
1621 std::max(out_x_buffer_start, (pad_width - dilation_factor * filter_x + stride - 1) / stride);
1622 const int out_x_loop_end =
1623 std::min(out_x_buffer_end,
1624 (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride);
1625
1626 int32_t *acc_buffer_ptr = acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
1627 const int in_x_origin = (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x;
1628 const int8_t *input_ptr = input_data + in_x_origin * input_depth;
1629 const int input_ptr_increment = (stride - 1) * input_depth;
1630 for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++)
1631 {
1632 const int8_t *filter_ptr = filter_base_ptr;
1633 for (int ic = 0; ic < input_depth; ++ic)
1634 {
1635 const int16_t input_val = *input_ptr++ + input_offset;
1636 for (int m = 0; m < depth_multiplier; m++)
1637 {
1638 const int16_t filter_val = *filter_ptr++;
1639 *acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val;
1640 }
1641 }
1642 input_ptr += input_ptr_increment;
1643 }
1644 filter_base_ptr += output_depth;
1645 }
1646}


Referenced by DepthwiseConvGeneral().
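
The generic fallback supports any input depth, depth multiplier, stride and dilation: for every valid out_x it accumulates filter * (input + input_offset) into int32_t, with input channel ic feeding the depth_multiplier consecutive output channels starting at ic * depth_multiplier. A scalar sketch of that inner step for a single output pixel (the helper name is illustrative, not part of the API):

#include <cstdint>

// Illustrative helper: the per-pixel accumulation performed by the generic
// fallback. Input channel ic contributes to output channels
// [ic * depth_multiplier, (ic + 1) * depth_multiplier).
void AccumulateOnePixel(int input_depth, int depth_multiplier, int16_t input_offset,
                        const int8_t *input_ptr, const int8_t *filter_ptr,
                        int32_t *acc_buffer_ptr)
{
  for (int ic = 0; ic < input_depth; ++ic)
  {
    // Add the (negated) zero point before multiplying, as the fallback does.
    const int16_t input_val = static_cast<int16_t>(input_ptr[ic]) + input_offset;
    for (int m = 0; m < depth_multiplier; ++m)
    {
      const int16_t filter_val = filter_ptr[ic * depth_multiplier + m];
      acc_buffer_ptr[ic * depth_multiplier + m] += static_cast<int32_t>(filter_val) * input_val;
    }
  }
}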