ONE - On-device Neural Engine
nnfw::cker::optimized_integer_ops Namespace Reference

Namespaces

namespace  depthwise_conv
 

Data Structures

struct  DepthwiseConvWorkerTask
 

Enumerations

enum class  DepthwiseConvOutputRounding { kNone = 0 , kAwayFromZero , kUpward }
 
enum class  DepthwiseConvDepthMultiplication { kNoMultiplication = 0 , kUnitInputDepth }
 

Functions

template<DepthwiseConvOutputRounding kOutputRounding>
void DepthwiseConvWithRounding (const DepthwiseConvParams &params, const int32_t *output_multiplier, const int32_t *output_shift, const Shape &input_shape, const int8_t *input_data, const Shape &filter_shape, const int8_t *filter_data, const Shape &bias_shape, const int32_t *bias_data, const Shape &output_shape, int8_t *output_data, int thread_start, int thread_end, int thread_dim)
 
void DepthwiseConvImpl (const DepthwiseConvParams &params, const int32_t *output_multiplier, const int32_t *output_shift, const Shape &input_shape, const int8_t *input_data, const Shape &filter_shape, const int8_t *filter_data, const Shape &bias_shape, const int32_t *bias_data, const Shape &output_shape, int8_t *output_data, int thread_start, int thread_end, int thread_dim)
 
int HowManyConvThreads (const Shape &output_shape, const Shape &filter_shape, int thread_dim)
 
void DepthwiseConvPerChannel (const DepthwiseConvParams &params, const int32_t *output_multiplier, const int32_t *output_shift, const Shape &input_shape, const int8_t *input_data, const Shape &filter_shape, const int8_t *filter_data, const Shape &bias_shape, const int32_t *bias_data, const Shape &output_shape, int8_t *output_data, ruy::Context *ruy_context)
 

Enumeration Type Documentation

◆ DepthwiseConvDepthMultiplication

Enumerator
kNoMultiplication 
kUnitInputDepth 

Definition at line 48 of file DepthwiseConvInt8.h.

49{
50 kNoMultiplication = 0, // Depth multiplier = 1.
51 kUnitInputDepth, // Input depth = 1, output depth = depth multiplier.
52};
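
The two cases mirror the shape relation the kernels assert later on: output_depth == input_depth * depth_multiplier. A minimal standalone sketch (illustrative only, not library code) of what each case means for a hypothetical layer:

#include <cassert>

int main()
{
  // kNoMultiplication: depth_multiplier == 1, so every input channel maps to
  // exactly one output channel (output_depth == input_depth).
  {
    const int input_depth = 8, depth_multiplier = 1;
    assert(input_depth * depth_multiplier == 8);
  }
  // kUnitInputDepth: input_depth == 1, so one input channel fans out to
  // depth_multiplier output channels (output_depth == depth_multiplier).
  {
    const int input_depth = 1, depth_multiplier = 8;
    assert(input_depth * depth_multiplier == 8);
  }
  return 0;
}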

◆ DepthwiseConvOutputRounding

Enumerator
kNone 
kAwayFromZero 
kUpward 

Definition at line 39 of file DepthwiseConvInt8.h.

40{
41 kNone = 0, // Invalid: specific method must be specified.
42 kAwayFromZero, // Original method: exact halves rounded away from zero.
43 kUpward, // Halves towards +infinity: adds 0.5 before truncate.
44 // This is where a future kNearestEven would be placed.
45};
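
The two valid modes differ only on exact halves. A minimal standalone sketch (using plain floating-point arithmetic for clarity, rather than the fixed-point shift the integer kernels use) of the behaviour the comments describe:

#include <cassert>
#include <cmath>
#include <cstdint>

// kAwayFromZero: exact halves move away from zero (what std::round does).
int32_t RoundAwayFromZero(double x) { return static_cast<int32_t>(std::round(x)); }

// kUpward: add 0.5, then take the floor, so exact halves move toward +infinity.
// In the integer kernels this corresponds to adding half before a right shift.
int32_t RoundUpward(double x) { return static_cast<int32_t>(std::floor(x + 0.5)); }

int main()
{
  assert(RoundAwayFromZero(2.5) == 3 && RoundUpward(2.5) == 3);  // positive halves agree
  assert(RoundAwayFromZero(-2.5) == -3);                         // away from zero
  assert(RoundUpward(-2.5) == -2);                               // toward +infinity
  assert(RoundAwayFromZero(1.2) == 1 && RoundUpward(1.2) == 1);  // non-halves agree
  return 0;
}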

Function Documentation

◆ DepthwiseConvImpl()

void nnfw::cker::optimized_integer_ops::DepthwiseConvImpl ( const DepthwiseConvParams &  params,
const int32_t *  output_multiplier,
const int32_t *  output_shift,
const Shape &  input_shape,
const int8_t *  input_data,
const Shape &  filter_shape,
const int8_t *  filter_data,
const Shape &  bias_shape,
const int32_t *  bias_data,
const Shape &  output_shape,
int8_t *  output_data,
int  thread_start,
int  thread_end,
int  thread_dim 
)
inline

Definition at line 2000 of file DepthwiseConvInt8.h.

2006{
2007 return DepthwiseConvWithRounding<DepthwiseConvOutputRounding::kAwayFromZero>(
2008 params, output_multiplier, output_shift, input_shape, input_data, filter_shape, filter_data,
2009 bias_shape, bias_data, output_shape, output_data, thread_start, thread_end, thread_dim);
2010}

References output_shape.

Referenced by DepthwiseConvPerChannel(), and nnfw::cker::optimized_integer_ops::DepthwiseConvWorkerTask< T, TS >::Run().

◆ DepthwiseConvPerChannel()

void nnfw::cker::optimized_integer_ops::DepthwiseConvPerChannel ( const DepthwiseConvParams &  params,
const int32_t *  output_multiplier,
const int32_t *  output_shift,
const Shape &  input_shape,
const int8_t *  input_data,
const Shape &  filter_shape,
const int8_t *  filter_data,
const Shape &  bias_shape,
const int32_t *  bias_data,
const Shape &  output_shape,
int8_t *  output_data,
ruy::Context *  ruy_context 
)
inline

Definition at line 2064 of file DepthwiseConvInt8.h.

2071{
2072 UNUSED_ALL(params, output_multiplier, output_shift, input_shape, input_data, filter_shape,
2073 filter_data, bias_shape, bias_data, output_shape, output_data, ruy_context);
2074
2075 assert(input_shape.DimensionsCount() == 4);
2076 assert(filter_shape.DimensionsCount() == 4);
2077 assert(output_shape.DimensionsCount() == 4);
2078
2079 const int output_batches = output_shape.Dims(0);
2080 const int output_rows = output_shape.Dims(1);
2081 int thread_count_batch = HowManyConvThreads(output_shape, filter_shape, 0);
2082 int thread_count_row = HowManyConvThreads(output_shape, filter_shape, 1);
2083 int thread_dim, thread_count, thread_dim_size;
2084 if (thread_count_batch > thread_count_row)
2085 {
2086 thread_dim = 0;
2087 thread_dim_size = output_batches;
2088 thread_count = thread_count_batch;
2089 }
2090 else
2091 {
2092 thread_dim = 1;
2093 thread_dim_size = output_rows;
2094 thread_count = thread_count_row;
2095 }
2096
2097 // NOTE Borrow RuyContext to get max_num_threads setting
2098 // TODO Define and use max_num_threads for CPU backend
2099 const int max_threads = ruy_context->max_num_threads();
2100 thread_count = std::max(1, std::min(thread_count, max_threads));
2101
2102 if (thread_count == 1)
2103 {
2104 DepthwiseConvImpl(params, output_multiplier, output_shift, input_shape, input_data,
2105 filter_shape, filter_data, bias_shape, bias_data, output_shape, output_data,
2106 /*thread_start=*/0,
2107 /*thread_end=*/output_rows, /*thread_dim=*/1);
2108 }
2109 else
2110 {
2111 std::vector<DepthwiseConvWorkerTask<int8_t, int32_t>> tasks;
2112 // TODO(b/131746020) don't create new heap allocations every time.
2113 // At least we make it a single heap allocation by using reserve().
2114 tasks.reserve(thread_count);
2115 int thread_start = 0;
2116 for (int i = 0; i < thread_count; ++i)
2117 {
2118 int thread_end = thread_start + (thread_dim_size - thread_start) / (thread_count - i);
2119 tasks.emplace_back(params, output_multiplier, output_shift, input_shape, input_data,
2120 filter_shape, filter_data, bias_shape, bias_data, output_shape,
2121 output_data, thread_start, thread_end, thread_dim);
2122 thread_start = thread_end;
2123 }
2124 cpu_backend_threadpool::Execute(tasks.size(), tasks.data(), ruy_context);
2125 }
2126}

References DepthwiseConvImpl(), nnfw::cker::Shape::DimensionsCount(), nnfw::cker::cpu_backend_threadpool::Execute(), HowManyConvThreads(), and output_shape.

Referenced by onert::backend::cpu::ops::DepthwiseConvolutionLayer::convQ8i().
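
The multi-threaded branch splits the chosen dimension into contiguous, near-even ranges using thread_end = thread_start + (thread_dim_size - thread_start) / (thread_count - i). A minimal standalone sketch (assuming 10 output rows and 3 worker tasks, not library code) of the ranges this produces:

#include <iostream>

int main()
{
  const int thread_dim_size = 10; // e.g. number of output rows
  const int thread_count = 3;     // after clamping against max_num_threads
  int thread_start = 0;
  for (int i = 0; i < thread_count; ++i)
  {
    // Remaining units divided by remaining tasks; later tasks absorb the remainder.
    const int thread_end = thread_start + (thread_dim_size - thread_start) / (thread_count - i);
    std::cout << "task " << i << ": [" << thread_start << ", " << thread_end << ")\n";
    thread_start = thread_end;
  }
  // Prints [0, 3), [3, 6), [6, 10); the union covers every row exactly once.
  return 0;
}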

◆ DepthwiseConvWithRounding()

template<DepthwiseConvOutputRounding kOutputRounding>
void nnfw::cker::optimized_integer_ops::DepthwiseConvWithRounding ( const DepthwiseConvParams &  params,
const int32_t *  output_multiplier,
const int32_t *  output_shift,
const Shape &  input_shape,
const int8_t *  input_data,
const Shape &  filter_shape,
const int8_t *  filter_data,
const Shape &  bias_shape,
const int32_t *  bias_data,
const Shape &  output_shape,
int8_t *  output_data,
int  thread_start,
int  thread_end,
int  thread_dim 
)
inline

Definition at line 1918 of file DepthwiseConvInt8.h.

1925{
1926 [[maybe_unused]] const int depth_multiplier = params.depth_multiplier;
1927 [[maybe_unused]] const int dilation_width_factor = params.dilation_width_factor;
1928 [[maybe_unused]] const int dilation_height_factor = params.dilation_height_factor;
1929 assert(dilation_width_factor >= 1);
1930 assert(dilation_height_factor >= 1);
1931 assert(input_shape.DimensionsCount() == 4);
1932 assert(filter_shape.DimensionsCount() == 4);
1933 assert(output_shape.DimensionsCount() == 4);
1934 [[maybe_unused]] const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
1935 [[maybe_unused]] const int input_depth = input_shape.Dims(3);
1936 assert(output_depth == input_depth * depth_multiplier);
1937 assert(bias_shape.FlatSize() == output_depth);
1938
1939// TODO Use below codes
1940#if 0
1941// Enable for arm64 except for the Nvidia Linux 4 Tegra (L4T) running on
1942// Jetson TX-2. This compiler does not support the offsetof() macro.
1943#if defined(__aarch64__) && !defined(GOOGLE_L4T)
1944#if defined(__ANDROID__) && defined(__clang__)
1945 CpuFlags cpu_flags;
1946 GetCpuFlags(&cpu_flags);
1947 const bool has_dot_product_instructions = cpu_flags.neon_dotprod;
1948
1949 // Dispatch to dot-product 3x3 kernels when supported.
1950 if (has_dot_product_instructions)
1951 {
1952 using optimized_ops::depthwise_conv::DotProduct3x3KernelType;
1953 DotProduct3x3KernelType kernel_type = optimized_ops::depthwise_conv::CategorizeDotProductKernel<
1954 optimized_ops::depthwise_conv::QuantizationType::kPerChannelInt8>(
1955 input_shape, filter_shape, output_shape, params, output_shift);
1956 if (kernel_type != DotProduct3x3KernelType::kNone)
1957 {
1958 DepthwiseConvParams params_copy = params;
1959 params_copy.output_shift_per_channel = output_shift;
1960 params_copy.output_multiplier_per_channel = output_multiplier;
1961 optimized_ops::depthwise_conv::DepthwiseConvDotProduct3x3PerChannel<
1962 DepthwiseConvImplementation::kUseNeon3x3DotProduct>(
1963 params_copy, input_shape, input_data, filter_shape, filter_data, bias_shape, bias_data,
1964 output_shape, output_data, thread_start, thread_end, thread_dim);
1965 return;
1966 }
1967 }
1968
1969#endif
1970 // Dispatch to non-dot-product 3x3 kernels when supported.
1971
1972 const int stride_width = params.stride_width;
1973 const int stride_height = params.stride_height;
1974 const int pad_width = params.padding_values.width;
1975 const int pad_height = params.padding_values.height;
1976
1977 // Call kernel optimized for depthwise convolutions using 3x3 filters if
1978 // parameters are supported.
1979 if (optimized_ops::depthwise_conv::Fast3x3FilterKernelSupported<
1980 optimized_ops::depthwise_conv::QuantizationType::kPerChannelInt8>(
1981 input_shape, filter_shape, stride_width, stride_height, dilation_width_factor,
1982 dilation_height_factor, pad_width, pad_height, depth_multiplier, output_shape, 0,
1983 output_shift))
1984 {
1985 optimized_ops::depthwise_conv::DepthwiseConv3x3FilterPerChannel<
1986 DepthwiseConvOutputRounding::kUpward>(
1987 params, output_multiplier, output_shift, input_shape, input_data, filter_shape, filter_data,
1988 bias_shape, bias_data, output_shape, output_data, thread_start, thread_end, thread_dim);
1989 return;
1990 }
1991#endif
1992
1993#endif /* end of if 0 */
1994
1995 depthwise_conv::DepthwiseConvGeneral(
1996 params, output_multiplier, output_shift, input_shape, input_data, filter_shape, filter_data,
1997 bias_shape, bias_data, output_shape, output_data, thread_start, thread_end, thread_dim);
1998}

References nnfw::cker::DepthwiseConvParams::depth_multiplier, nnfw::cker::optimized_integer_ops::depthwise_conv::DepthwiseConvGeneral(), nnfw::cker::DepthwiseConvParams::dilation_height_factor, nnfw::cker::DepthwiseConvParams::dilation_width_factor, nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), nnfw::cker::Shape::FlatSize(), nnfw::cker::PaddingValues::height, kUpward, nnfw::cker::MatchingDim(), output_shape, nnfw::cker::DepthwiseConvParams::padding_values, nnfw::cker::DepthwiseConvParams::stride_height, nnfw::cker::DepthwiseConvParams::stride_width, and nnfw::cker::PaddingValues::width.

◆ HowManyConvThreads()

int nnfw::cker::optimized_integer_ops::HowManyConvThreads ( const Shape &  output_shape,
const Shape &  filter_shape,
int  thread_dim 
)
inline

Definition at line 2051 of file DepthwiseConvInt8.h.

2052{
2053 constexpr int kMinMulPerThread = 8;
2054 const int output_units = output_shape.Dims(thread_dim);
2055 const int filter_height = filter_shape.Dims(1);
2056 const int filter_width = filter_shape.Dims(2);
2057 const int num_mul_per_unit =
2058 FlatSizeSkipDim(output_shape, thread_dim) * filter_height * filter_width;
2059 const int min_units_per_thread = kMinMulPerThread / num_mul_per_unit + 1;
2060 int thread_count = output_units / min_units_per_thread;
2061 return thread_count;
2062}

References nnfw::cker::Shape::Dims(), nnfw::cker::FlatSizeSkipDim(), and output_shape.

Referenced by DepthwiseConvPerChannel().
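
For intuition, a worked example of the heuristic (a sketch with hypothetical shapes, not library code): with a 1x16x16x8 output, a 3x3 filter and thread_dim = 1, each output row costs far more than kMinMulPerThread multiplies, so the heuristic allows one thread per row:

#include <cassert>

int main()
{
  constexpr int kMinMulPerThread = 8;
  const int output_units = 16;                // output_shape.Dims(1): rows
  const int flat_size_skip_rows = 1 * 16 * 8; // FlatSizeSkipDim(output_shape, 1)
  const int filter_height = 3, filter_width = 3;
  const int num_mul_per_unit = flat_size_skip_rows * filter_height * filter_width; // 1152 multiplies per row
  const int min_units_per_thread = kMinMulPerThread / num_mul_per_unit + 1;        // 8 / 1152 + 1 = 1
  const int thread_count = output_units / min_units_per_thread;                    // 16
  assert(thread_count == 16);
  // DepthwiseConvPerChannel() later clamps this against ruy_context->max_num_threads().
  return 0;
}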