ONE - On-device Neural Engine
nnfw::cker::optimized_integer_ops Namespace Reference

Namespaces

namespace  depthwise_conv
 

Data Structures

struct  DepthwiseConvWorkerTask
 

Enumerations

enum class  DepthwiseConvOutputRounding { kNone = 0 , kAwayFromZero , kUpward }
 
enum class  DepthwiseConvDepthMultiplication { kNoMultiplication = 0 , kUnitInputDepth }
 

Functions

template<DepthwiseConvOutputRounding kOutputRounding>
void DepthwiseConvWithRounding (const DepthwiseConvParams &params, const int32_t *output_multiplier, const int32_t *output_shift, const Shape &input_shape, const int8_t *input_data, const Shape &filter_shape, const int8_t *filter_data, const Shape &bias_shape, const int32_t *bias_data, const Shape &output_shape, int8_t *output_data, int thread_start, int thread_end, int thread_dim)
 
void DepthwiseConvImpl (const DepthwiseConvParams &params, const int32_t *output_multiplier, const int32_t *output_shift, const Shape &input_shape, const int8_t *input_data, const Shape &filter_shape, const int8_t *filter_data, const Shape &bias_shape, const int32_t *bias_data, const Shape &output_shape, int8_t *output_data, int thread_start, int thread_end, int thread_dim)
 
int HowManyConvThreads (const Shape &output_shape, const Shape &filter_shape, int thread_dim)
 
void DepthwiseConvPerChannel (const DepthwiseConvParams &params, const int32_t *output_multiplier, const int32_t *output_shift, const Shape &input_shape, const int8_t *input_data, const Shape &filter_shape, const int8_t *filter_data, const Shape &bias_shape, const int32_t *bias_data, const Shape &output_shape, int8_t *output_data, ruy::Context *ruy_context)
 

Enumeration Type Documentation

◆ DepthwiseConvDepthMultiplication

Enumerator
kNoMultiplication 
kUnitInputDepth 

Definition at line 48 of file DepthwiseConvInt8.h.

49{
50 kNoMultiplication = 0, // Depth multiplier = 1.
51 kUnitInputDepth, // Input depth = 1, output depth = depth multiplier.
52};
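
The two cases mirror the shape relation the kernels assert later on: output_depth == input_depth * depth_multiplier. A minimal standalone sketch (illustrative only, not library code) of what each case means for a hypothetical layer:

#include <cassert>

int main()
{
  // kNoMultiplication: depth_multiplier == 1, so every input channel maps to
  // exactly one output channel (output_depth == input_depth).
  {
    const int input_depth = 8, depth_multiplier = 1;
    assert(input_depth * depth_multiplier == 8);
  }
  // kUnitInputDepth: input_depth == 1, so one input channel fans out to
  // depth_multiplier output channels (output_depth == depth_multiplier).
  {
    const int input_depth = 1, depth_multiplier = 8;
    assert(input_depth * depth_multiplier == 8);
  }
  return 0;
}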

◆ DepthwiseConvOutputRounding

Enumerator
kNone 
kAwayFromZero 
kUpward 

Definition at line 39 of file DepthwiseConvInt8.h.

40{
41 kNone = 0, // Invalid: specific method must be specified.
42 kAwayFromZero, // Original method: exact halves rounded away from zero.
43 kUpward, // Halves towards +infinity: adds 0.5 before truncate.
44 // This is where a future kNearestEven would be placed.
45};
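
The two valid modes differ only on exact halves. A minimal standalone sketch (using plain floating-point arithmetic for clarity, rather than the fixed-point shift the integer kernels use) of the behaviour the comments describe:

#include <cassert>
#include <cmath>
#include <cstdint>

// kAwayFromZero: exact halves move away from zero (what std::round does).
int32_t RoundAwayFromZero(double x) { return static_cast<int32_t>(std::round(x)); }

// kUpward: add 0.5, then take the floor, so exact halves move toward +infinity.
// In the integer kernels this corresponds to adding half before a right shift.
int32_t RoundUpward(double x) { return static_cast<int32_t>(std::floor(x + 0.5)); }

int main()
{
  assert(RoundAwayFromZero(2.5) == 3 && RoundUpward(2.5) == 3);  // positive halves agree
  assert(RoundAwayFromZero(-2.5) == -3);                         // away from zero
  assert(RoundUpward(-2.5) == -2);                               // toward +infinity
  assert(RoundAwayFromZero(1.2) == 1 && RoundUpward(1.2) == 1);  // non-halves agree
  return 0;
}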

Function Documentation

◆ DepthwiseConvImpl()

void nnfw::cker::optimized_integer_ops::DepthwiseConvImpl ( const DepthwiseConvParams &  params,
const int32_t *  output_multiplier,
const int32_t *  output_shift,
const Shape &  input_shape,
const int8_t *  input_data,
const Shape &  filter_shape,
const int8_t *  filter_data,
const Shape &  bias_shape,
const int32_t *  bias_data,
const Shape &  output_shape,
int8_t *  output_data,
int  thread_start,
int  thread_end,
int  thread_dim 
)
inline

Definition at line 2000 of file DepthwiseConvInt8.h.

2006{
2007 return DepthwiseConvWithRounding<DepthwiseConvOutputRounding::kAwayFromZero>(
2008 params, output_multiplier, output_shift, input_shape, input_data, filter_shape, filter_data,
2009 bias_shape, bias_data, output_shape, output_data, thread_start, thread_end, thread_dim);
2010}

References output_shape.

Referenced by DepthwiseConvPerChannel(), and nnfw::cker::optimized_integer_ops::DepthwiseConvWorkerTask< T, TS >::Run().

◆ DepthwiseConvPerChannel()

void nnfw::cker::optimized_integer_ops::DepthwiseConvPerChannel ( const DepthwiseConvParams &  params,
const int32_t *  output_multiplier,
const int32_t *  output_shift,
const Shape &  input_shape,
const int8_t *  input_data,
const Shape &  filter_shape,
const int8_t *  filter_data,
const Shape &  bias_shape,
const int32_t *  bias_data,
const Shape &  output_shape,
int8_t *  output_data,
ruy::Context *  ruy_context 
)
inline

Definition at line 2064 of file DepthwiseConvInt8.h.

2071{
2072 UNUSED_ALL(params, output_multiplier, output_shift, input_shape, input_data, filter_shape,
2073 filter_data, bias_shape, bias_data, output_shape, output_data, ruy_context);
2074
2075 assert(input_shape.DimensionsCount() == 4);
2076 assert(filter_shape.DimensionsCount() == 4);
2077 assert(output_shape.DimensionsCount() == 4);
2078
2079 const int output_batches = output_shape.Dims(0);
2080 const int output_rows = output_shape.Dims(1);
2081 int thread_count_batch = HowManyConvThreads(output_shape, filter_shape, 0);
2082 int thread_count_row = HowManyConvThreads(output_shape, filter_shape, 1);
2083 int thread_dim, thread_count, thread_dim_size;
2084 if (thread_count_batch > thread_count_row)
2085 {
2086 thread_dim = 0;
2087 thread_dim_size = output_batches;
2088 thread_count = thread_count_batch;
2089 }
2090 else
2091 {
2092 thread_dim = 1;
2093 thread_dim_size = output_rows;
2094 thread_count = thread_count_row;
2095 }
2096
2097 // NOTE Borrow RuyContext to get max_num_threads setting
2098 // TODO Define and use max_num_threads for CPU backend
2099 const int max_threads = ruy_context->max_num_threads();
2100 thread_count = std::max(1, std::min(thread_count, max_threads));
2101
2102 if (thread_count == 1)
2103 {
2104 DepthwiseConvImpl(params, output_multiplier, output_shift, input_shape, input_data,
2105 filter_shape, filter_data, bias_shape, bias_data, output_shape, output_data,
2106 /*thread_start=*/0,
2107 /*thread_end=*/output_rows, /*thread_dim=*/1);
2108 }
2109 else
2110 {
2111 std::vector<DepthwiseConvWorkerTask<int8_t, int32_t>> tasks;
2112 // TODO(b/131746020) don't create new heap allocations every time.
2113 // At least we make it a single heap allocation by using reserve().
2114 tasks.reserve(thread_count);
2115 int thread_start = 0;
2116 for (int i = 0; i < thread_count; ++i)
2117 {
2118 int thread_end = thread_start + (thread_dim_size - thread_start) / (thread_count - i);
2119 tasks.emplace_back(params, output_multiplier, output_shift, input_shape, input_data,
2120 filter_shape, filter_data, bias_shape, bias_data, output_shape,
2121 output_data, thread_start, thread_end, thread_dim);
2122 thread_start = thread_end;
2123 }
2124 cpu_backend_threadpool::Execute(tasks.size(), tasks.data(), ruy_context);
2125 }
2126}

References DepthwiseConvImpl(), nnfw::cker::Shape::DimensionsCount(), nnfw::cker::cpu_backend_threadpool::Execute(), HowManyConvThreads(), and output_shape.

Referenced by onert::backend::cpu::ops::DepthwiseConvolutionLayer::convQ8i().
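
The multi-threaded branch splits the chosen dimension into contiguous, near-even ranges using thread_end = thread_start + (thread_dim_size - thread_start) / (thread_count - i). A minimal standalone sketch (assuming 10 output rows and 3 worker tasks, not library code) of the ranges this produces:

#include <iostream>

int main()
{
  const int thread_dim_size = 10; // e.g. number of output rows
  const int thread_count = 3;     // after clamping against max_num_threads
  int thread_start = 0;
  for (int i = 0; i < thread_count; ++i)
  {
    // Remaining units divided by remaining tasks; later tasks absorb the remainder.
    const int thread_end = thread_start + (thread_dim_size - thread_start) / (thread_count - i);
    std::cout << "task " << i << ": [" << thread_start << ", " << thread_end << ")\n";
    thread_start = thread_end;
  }
  // Prints [0, 3), [3, 6), [6, 10); the union covers every row exactly once.
  return 0;
}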

◆ DepthwiseConvWithRounding()

template<DepthwiseConvOutputRounding kOutputRounding>
void nnfw::cker::optimized_integer_ops::DepthwiseConvWithRounding ( const DepthwiseConvParams &  params,
const int32_t *  output_multiplier,
const int32_t *  output_shift,
const Shape &  input_shape,
const int8_t *  input_data,
const Shape &  filter_shape,
const int8_t *  filter_data,
const Shape &  bias_shape,
const int32_t *  bias_data,
const Shape &  output_shape,
int8_t *  output_data,
int  thread_start,
int  thread_end,
int  thread_dim 
)
inline

Definition at line 1918 of file DepthwiseConvInt8.h.

1925{
1926 [[maybe_unused]] const int depth_multiplier = params.depth_multiplier;
1927 [[maybe_unused]] const int dilation_width_factor = params.dilation_width_factor;
1928 [[maybe_unused]] const int dilation_height_factor = params.dilation_height_factor;
1929 assert(dilation_width_factor >= 1);
1930 assert(dilation_height_factor >= 1);
1931 assert(input_shape.DimensionsCount() == 4);
1932 assert(filter_shape.DimensionsCount() == 4);
1933 assert(output_shape.DimensionsCount() == 4);
1934 [[maybe_unused]] const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
1935 [[maybe_unused]] const int input_depth = input_shape.Dims(3);
1936 assert(output_depth == input_depth * depth_multiplier);
1937 assert(bias_shape.FlatSize() == output_depth);
1938
1939// TODO Use below codes
1940#if 0
1941// Enable for arm64 except for the Nvidia Linux 4 Tegra (L4T) running on
1942// Jetson TX-2. This compiler does not support the offsetof() macro.
1943#if defined(__aarch64__) && !defined(GOOGLE_L4T)
1944#if defined(__ANDROID__) && defined(__clang__)
1945 CpuFlags cpu_flags;
1946 GetCpuFlags(&cpu_flags);
1947 const bool has_dot_product_instructions = cpu_flags.neon_dotprod;
1948
1949 // Dispatch to dot-product 3x3 kernels when supported.
1950 if (has_dot_product_instructions)
1951 {
1952 using optimized_ops::depthwise_conv::DotProduct3x3KernelType;
1953 DotProduct3x3KernelType kernel_type = optimized_ops::depthwise_conv::CategorizeDotProductKernel<
1954 optimized_ops::depthwise_conv::QuantizationType::kPerChannelInt8>(
1955 input_shape, filter_shape, output_shape, params, output_shift);
1956 if (kernel_type != DotProduct3x3KernelType::kNone)
1957 {
1958 DepthwiseConvParams params_copy = params;
1959 params_copy.output_shift_per_channel = output_shift;
1960 params_copy.output_multiplier_per_channel = output_multiplier;
1961 optimized_ops::depthwise_conv::DepthwiseConvDotProduct3x3PerChannel<
1962 DepthwiseConvImplementation::kUseNeon3x3DotProduct>(
1963 params_copy, input_shape, input_data, filter_shape, filter_data, bias_shape, bias_data,
1964 output_shape, output_data, thread_start, thread_end, thread_dim);
1965 return;
1966 }
1967 }
1968
1969#endif
1970 // Dispatch to non-dot-product 3x3 kernels when supported.
1971
1972 const int stride_width = params.stride_width;
1973 const int stride_height = params.stride_height;
1974 const int pad_width = params.padding_values.width;
1975 const int pad_height = params.padding_values.height;
1976
1977 // Call kernel optimized for depthwise convolutions using 3x3 filters if
1978 // parameters are supported.
1979 if (optimized_ops::depthwise_conv::Fast3x3FilterKernelSupported<
1980 optimized_ops::depthwise_conv::QuantizationType::kPerChannelInt8>(
1981 input_shape, filter_shape, stride_width, stride_height, dilation_width_factor,
1982 dilation_height_factor, pad_width, pad_height, depth_multiplier, output_shape, 0,
1983 output_shift))
1984 {
1985 optimized_ops::depthwise_conv::DepthwiseConv3x3FilterPerChannel<
1986 DepthwiseConvOutputRounding::kUpward>(
1987 params, output_multiplier, output_shift, input_shape, input_data, filter_shape, filter_data,
1988 bias_shape, bias_data, output_shape, output_data, thread_start, thread_end, thread_dim);
1989 return;
1990 }
1991#endif
1992
1993#endif /* end of if 0 */
1994
1995 depthwise_conv::DepthwiseConvGeneral(
1996 params, output_multiplier, output_shift, input_shape, input_data, filter_shape, filter_data,
1997 bias_shape, bias_data, output_shape, output_data, thread_start, thread_end, thread_dim);
1998}

References nnfw::cker::DepthwiseConvParams::depth_multiplier, nnfw::cker::optimized_integer_ops::depthwise_conv::DepthwiseConvGeneral(), nnfw::cker::DepthwiseConvParams::dilation_height_factor, nnfw::cker::DepthwiseConvParams::dilation_width_factor, nnfw::cker::Shape::DimensionsCount(), nnfw::cker::Shape::Dims(), nnfw::cker::Shape::FlatSize(), nnfw::cker::PaddingValues::height, kUpward, nnfw::cker::MatchingDim(), output_shape, nnfw::cker::DepthwiseConvParams::padding_values, nnfw::cker::DepthwiseConvParams::stride_height, nnfw::cker::DepthwiseConvParams::stride_width, and nnfw::cker::PaddingValues::width.

◆ HowManyConvThreads()

int nnfw::cker::optimized_integer_ops::HowManyConvThreads ( const Shape &  output_shape,
const Shape &  filter_shape,
int  thread_dim 
)
inline

Definition at line 2051 of file DepthwiseConvInt8.h.

2052{
2053 constexpr int kMinMulPerThread = 8;
2054 const int output_units = output_shape.Dims(thread_dim);
2055 const int filter_height = filter_shape.Dims(1);
2056 const int filter_width = filter_shape.Dims(2);
2057 const int num_mul_per_unit =
2058 FlatSizeSkipDim(output_shape, thread_dim) * filter_height * filter_width;
2059 const int min_units_per_thread = kMinMulPerThread / num_mul_per_unit + 1;
2060 int thread_count = output_units / min_units_per_thread;
2061 return thread_count;
2062}

References nnfw::cker::Shape::Dims(), nnfw::cker::FlatSizeSkipDim(), and output_shape.

Referenced by DepthwiseConvPerChannel().
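
For intuition, a worked example of the heuristic (a sketch with hypothetical shapes, not library code): with a 1x16x16x8 output, a 3x3 filter and thread_dim = 1, each output row costs far more than kMinMulPerThread multiplies, so the heuristic allows one thread per row:

#include <cassert>

int main()
{
  constexpr int kMinMulPerThread = 8;
  const int output_units = 16;                // output_shape.Dims(1): rows
  const int flat_size_skip_rows = 1 * 16 * 8; // FlatSizeSkipDim(output_shape, 1)
  const int filter_height = 3, filter_width = 3;
  const int num_mul_per_unit = flat_size_skip_rows * filter_height * filter_width; // 1152 multiplies per row
  const int min_units_per_thread = kMinMulPerThread / num_mul_per_unit + 1;        // 8 / 1152 + 1 = 1
  const int thread_count = output_units / min_units_per_thread;                    // 16
  assert(thread_count == 16);
  // DepthwiseConvPerChannel() later clamps this against ruy_context->max_num_threads().
  return 0;
}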