{
  (void)bias_shape;
  const int stride_width = params.stride_width;
  const int stride_height = params.stride_height;
  const int pad_width = params.padding_values.width;
  const int pad_height = params.padding_values.height;
  const int depth_multiplier = params.depth_multiplier;
  const int32_t output_activation_min = params.quantized_activation_min;
  const int32_t output_activation_max = params.quantized_activation_max;
  const int32_t input_offset = params.input_offset;
  const int32_t filter_offset = params.weights_offset;
  const int32_t output_offset = params.output_offset;
  const int32_t output_multiplier = params.output_multiplier;
  const int output_shift = params.output_shift;
  const int dilation_width_factor = params.dilation_width_factor;
  const int dilation_height_factor = params.dilation_height_factor;
  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
  const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
  const int output_height = output_shape.Dims(1);
  const int output_width = output_shape.Dims(2);
  const int input_height = input_shape.Dims(1);
  const int input_width = input_shape.Dims(2);
  const int input_depth = input_shape.Dims(3);
  const int filter_height = filter_shape.Dims(1);
  const int filter_width = filter_shape.Dims(2);
#ifdef USE_NEON
  const bool shift_left = (output_shift > 0);
  const int32_t multiplier_power_of_two = shift_left ? (1 << output_shift) : 1;
#endif

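  // The int32 accumulators are staged in a fixed-size scratch buffer. Each output
  // pixel needs output_depth accumulators, so each pass over the inner x-loop below
  // covers at most kOutputPixelsInAccBuffer output pixels.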
  static const int kAccBufferMaxSize = 2048;
  int32_t acc_buffer[kAccBufferMaxSize];
  assert(kAccBufferMaxSize >= output_depth);
  const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth;
  [[maybe_unused]] const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth;
  assert(kOutputPixelsInAccBuffer * output_depth <= kAccBufferActualSize);
  assert(kAccBufferActualSize <= kAccBufferMaxSize);
  assert(kOutputPixelsInAccBuffer >= 1);
  assert(thread_dim == 0 || thread_dim == 1);

  // row_accum_func will point to the core accumulation function to be used
  // for this DepthwiseConv op.
  using row_accum_func_t = decltype(&QuantizedDepthwiseConvAccumRowGeneric);
  row_accum_func_t row_accum_func = nullptr;

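  // The TFMINI_USE_DEPTHWISECONV_KERNEL macro below registers a specialized
  // row-accumulation kernel: the first instantiation whose stride, input-depth and
  // depth-multiplier constraints match the current op is selected.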
#define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER) \
  if (!row_accum_func && (stride_width == 1 || ALLOW_STRIDED) &&                                  \
      (input_depth == FIXED_INPUT_DEPTH || FIXED_INPUT_DEPTH == 0) &&                             \
      depth_multiplier == FIXED_DEPTH_MULTIPLIER)                                                 \
  {                                                                                               \
    row_accum_func =                                                                              \
      QuantizedDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER>;   \
  }

#ifdef USE_NEON
  // Specialized TFMINI_USE_DEPTHWISECONV_KERNEL registrations are tried here in
  // decreasing order of preference: non-strided fixed-input-depth kernels first,
  // then strided fixed-input-depth kernels, and finally kernels allowing a variable
  // input depth, which are the most general but least efficient.
#endif // USE_NEON

  // No matching fast kernel found, use slow fallback.
  if (!row_accum_func)
  {
    row_accum_func = QuantizedDepthwiseConvAccumRowGeneric;
  }

#undef TFMINI_USE_DEPTHWISECONV_KERNEL

  const int input_height_stride = input_shape.Dims(3) * input_shape.Dims(2);
  const int input_batch_stride = input_height_stride * input_shape.Dims(1);
  const int filter_height_stride = filter_shape.Dims(3) * filter_shape.Dims(2);

  // Now that we have determined row_accum_func, we can start work.
  int batch_start = 0;
  int batch_end = batches;
  int row_start = 0;
  int row_end = output_height;
  int output_ptr_offset = 0;

  switch (thread_dim)
  {
    case 0:
      // Multithread along the batch dimension.
      assert(thread_start >= 0);
      assert(thread_end <= batches);
      batch_start = thread_start;
      batch_end = thread_end;
      output_ptr_offset = batch_start * FlatSizeSkipDim(output_shape, 0);
      break;
    case 1:
      // Multithread along the row dimension.
      assert(thread_start >= 0);
      assert(thread_end <= output_height);
      row_start = thread_start;
      row_end = thread_end;
      output_ptr_offset = row_start * output_width * output_depth;
      break;
  }

  uint8_t *output_ptr = output_data + output_ptr_offset;
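  // batch_step skips the output rows of each batch that this thread does not write
  // (when thread_dim == 1 only rows [row_start, row_end) are processed here), so that
  // output_ptr lands on this thread's first row of the next batch.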
  int batch_step = (output_height + row_start - row_end) * output_width * output_depth;
  for (int b = batch_start; b < batch_end; ++b)
  {
    for (int out_y = row_start; out_y < row_end; ++out_y)
    {
      const int in_y_origin = (out_y * stride_height) - pad_height;
      const int filter_y_start =
        std::max(0, (-in_y_origin + dilation_height_factor - 1) / dilation_height_factor);
      const int filter_y_end =
        std::min(filter_height, (input_height - in_y_origin + dilation_height_factor - 1) /
                                  dilation_height_factor);
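      // filter_y_start / filter_y_end clamp the filter rows to those that read
      // in-bounds input rows for this out_y, accounting for padding and dilation.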
      for (int out_x_buffer_start = 0; out_x_buffer_start < output_width;
           out_x_buffer_start += kOutputPixelsInAccBuffer)
      {
        const int out_x_buffer_end =
          std::min(output_width, out_x_buffer_start + kOutputPixelsInAccBuffer);
        // We call a 'pixel' a group of activations that share all but the
        // 'depth'/'channel' coordinate. num_output_pixels is the number of
        // output pixels that we will accumulate in this loop iteration.
        const int num_output_pixels = out_x_buffer_end - out_x_buffer_start;
        // Initialize our local accumulator with the bias values, so we don't
        // have to add them later.
        DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data, acc_buffer);
        // Accumulation loop. Most of the time should be spent in here.
        for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y)
        {
          const int in_y = in_y_origin + dilation_height_factor * filter_y;
          row_accum_func(stride_width, dilation_width_factor, input_depth, input_width,
                         input_data + in_y * input_height_stride + b * input_batch_stride,
                         input_offset, pad_width, depth_multiplier, filter_width,
                         filter_data + filter_y * filter_height_stride, filter_offset,
                         out_x_buffer_start, out_x_buffer_end, output_depth, acc_buffer);
        }
        // Finished accumulating int32 values. Now need to convert them to
        // the final 8-bit form and store them.
        const int num_output_values = output_depth * num_output_pixels;
        int i = 0;
#ifdef USE_NEON
        using gemmlowp::RoundingDivideByPOT;
        const int32x4_t output_offset_vec = vdupq_n_s32(output_offset);
        const int32x4_t output_activation_min_vec = vdupq_n_s32(output_activation_min);
        const int32x4_t output_activation_max_vec = vdupq_n_s32(output_activation_max);

        // Handle 16 values at once.
        // This allows us to issue 4 mutually independent int32
        // multiplications (vqrdmulh), which should alleviate most of their
        // high latency.
        for (; i <= num_output_values - 16; i += 16)
        {
          int32x4_t acc[4];
          for (int j = 0; j < 4; j++)
          {
            acc[j] = vld1q_s32(acc_buffer + i + 4 * j);
          }

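          // Requantize each accumulator: with a net right shift (shift_left == false),
          // apply the saturating rounding doubling multiply followed by a rounding
          // right shift by -output_shift; with a net left shift, pre-scale by
          // 2^output_shift before the doubling multiply.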
          if (!shift_left)
          {
            // Fixed-point multiplication.
            for (int j = 0; j < 4; j++)
            {
              acc[j] = vqrdmulhq_n_s32(acc[j], output_multiplier);
            }
            // Rounding right shift.
            for (int j = 0; j < 4; j++)
            {
              acc[j] = RoundingDivideByPOT(acc[j], -output_shift);
            }
          }
          else
          {
            // Fixed-point multiplication.
            for (int j = 0; j < 4; j++)
            {
              acc[j] = vmulq_n_s32(acc[j], multiplier_power_of_two);
              acc[j] = vqrdmulhq_n_s32(acc[j], output_multiplier);
            }
          }
          // Add the output offset.
          for (int j = 0; j < 4; j++)
          {
            acc[j] = vaddq_s32(acc[j], output_offset_vec);
          }
          // Apply the activation function.
          for (int j = 0; j < 4; j++)
          {
            acc[j] = vmaxq_s32(acc[j], output_activation_min_vec);
          }
          for (int j = 0; j < 4; j++)
          {
            acc[j] = vminq_s32(acc[j], output_activation_max_vec);
          }
          // Saturating cast to uint8 and store to destination.
          int16x4_t acc_s16[4];
          for (int j = 0; j < 4; j++)
          {
            acc_s16[j] = vqmovn_s32(acc[j]);
          }
          const int16x8_t res_s16_0 = vcombine_s16(acc_s16[0], acc_s16[1]);
          const int16x8_t res_s16_1 = vcombine_s16(acc_s16[2], acc_s16[3]);
          const uint8x8_t res_u8_0 = vqmovun_s16(res_s16_0);
          const uint8x8_t res_u8_1 = vqmovun_s16(res_s16_1);
          vst1q_u8(output_ptr, vcombine_u8(res_u8_0, res_u8_1));
          output_ptr += 16;
        }

        // Handle 8 values at once. Not as fast as the 16-value case, but
        // still faster than the scalar code.
        for (; i <= num_output_values - 8; i += 8)
        {
          int32x4_t acc0 = vld1q_s32(acc_buffer + i);
          int32x4_t acc1 = vld1q_s32(acc_buffer + i + 4);
          if (!shift_left)
          {
            // Fixed-point multiplication.
            acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
            acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
            // Rounding right shift.
            acc0 = RoundingDivideByPOT(acc0, -output_shift);
            acc1 = RoundingDivideByPOT(acc1, -output_shift);
          }
          else
          {
            // Fixed-point multiplication.
            acc0 = vmulq_n_s32(acc0, multiplier_power_of_two);
            acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);

            acc1 = vmulq_n_s32(acc1, multiplier_power_of_two);
            acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
          }
          // Add the output offset.
          acc0 = vaddq_s32(acc0, output_offset_vec);
          acc1 = vaddq_s32(acc1, output_offset_vec);
          // Apply the activation function.
          acc0 = vmaxq_s32(acc0, output_activation_min_vec);
          acc1 = vmaxq_s32(acc1, output_activation_min_vec);
          acc0 = vminq_s32(acc0, output_activation_max_vec);
          acc1 = vminq_s32(acc1, output_activation_max_vec);
          // Saturating cast to uint8 and store to destination.
          const int16x4_t acc0_s16 = vqmovn_s32(acc0);
          const int16x4_t acc1_s16 = vqmovn_s32(acc1);
          const int16x8_t res_s16 = vcombine_s16(acc0_s16, acc1_s16);
          const uint8x8_t res_u8 = vqmovun_s16(res_s16);
          vst1_u8(output_ptr, res_u8);
          output_ptr += 8;
        }

        // Handle 4 values at once. Now we're paying the full price of the
        // high latency of vqrdmulh. Also, storing only 4 bytes at the end
        // (without any alignment) can only be done 1 byte at a time.
        // Yet, that is still worth doing to minimize the amount of leftover
        // that will have to go through the very slow scalar code.
        for (; i <= num_output_values - 4; i += 4)
        {
          int32x4_t acc = vld1q_s32(acc_buffer + i);
          if (!shift_left)
          {
            // Fixed-point multiplication.
            acc = vqrdmulhq_n_s32(acc, output_multiplier);
            // Rounding right shift.
            acc = RoundingDivideByPOT(acc, -output_shift);
          }
          else
          {
            // Fixed-point multiplication.
            acc = vmulq_n_s32(acc, multiplier_power_of_two);
            acc = vqrdmulhq_n_s32(acc, output_multiplier);
          }
          // Add the output offset.
          acc = vaddq_s32(acc, output_offset_vec);
          // Apply the activation function.
          acc = vmaxq_s32(acc, output_activation_min_vec);
          acc = vminq_s32(acc, output_activation_max_vec);
          // Saturating cast to uint8 and store to destination.
          const int16x4_t acc_s16 = vqmovn_s32(acc);
          const int16x8_t res_s16 = vcombine_s16(acc_s16, acc_s16);
          const uint8x8_t res_u8 = vqmovun_s16(res_s16);
          vst1_lane_u8(output_ptr + 0, res_u8, 0);
          vst1_lane_u8(output_ptr + 1, res_u8, 1);
          vst1_lane_u8(output_ptr + 2, res_u8, 2);
          vst1_lane_u8(output_ptr + 3, res_u8, 3);
          output_ptr += 4;
        }
#endif // USE_NEON

        // Handle leftover values, one by one. This is very slow.
        for (; i < num_output_values; i++)
        {
          int32_t acc = acc_buffer[i];
          acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
          acc += output_offset;
          acc = std::max(acc, output_activation_min);
          acc = std::min(acc, output_activation_max);
          *output_ptr++ = static_cast<uint8_t>(acc);
        }
      }
    }
    output_ptr += batch_step;
  }
}