ONE - On-device Neural Engine
nnfw::cker::optimized::depthwise_conv Namespace Reference

Data Structures

struct  QuantizedDepthwiseConvKernel
 

Functions

template<bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
void QuantizedDepthwiseConvAccumRow (int stride, int dilation_factor, int input_depth, int input_width, const uint8_t *input_data, int16_t input_offset, int pad_width, int depth_multiplier, int filter_width, const uint8_t *filter_data, int16_t filter_offset, int out_x_buffer_start, int out_x_buffer_end, int output_depth, int32_t *acc_buffer)
 
void QuantizedDepthwiseConvAccumRowGeneric (int stride, int dilation_factor, int input_depth, int input_width, const uint8_t *input_data, int16_t input_offset, int pad_width, int depth_multiplier, int filter_width, const uint8_t *filter_data, int16_t filter_offset, int out_x_buffer_start, int out_x_buffer_end, int output_depth, int32_t *acc_buffer)
 
void DepthwiseConvInitAccBuffer (int num_output_pixels, int output_depth, const int32_t *bias_data, int32_t *acc_buffer)
 
void DepthwiseConvGeneral (const DepthwiseConvParams &params, const Shape &input_shape, const uint8_t *input_data, const Shape &filter_shape, const uint8_t *filter_data, const Shape &bias_shape, const int32_t *bias_data, const Shape &output_shape, uint8_t *output_data, int thread_start, int thread_end, int thread_dim)
 

Function Documentation

◆ DepthwiseConvGeneral()

void nnfw::cker::optimized::depthwise_conv::DepthwiseConvGeneral ( const DepthwiseConvParams & params,
const Shape & input_shape,
const uint8_t *  input_data,
const Shape & filter_shape,
const uint8_t *  filter_data,
const Shape & bias_shape,
const int32_t *  bias_data,
const Shape & output_shape,
uint8_t *  output_data,
int  thread_start,
int  thread_end,
int  thread_dim 
)
inline

Definition at line 1814 of file DepthwiseConvUint8.h.

1820{
1821 (void)bias_shape;
1822 const int stride_width = params.stride_width;
1823 const int stride_height = params.stride_height;
1824 const int pad_width = params.padding_values.width;
1825 const int pad_height = params.padding_values.height;
1826 const int depth_multiplier = params.depth_multiplier;
1827 const int32_t output_activation_min = params.quantized_activation_min;
1828 const int32_t output_activation_max = params.quantized_activation_max;
1829 const int32_t input_offset = params.input_offset;
1830 const int32_t filter_offset = params.weights_offset;
1831 const int32_t output_offset = params.output_offset;
1832 const int32_t output_multiplier = params.output_multiplier;
1833 const int output_shift = params.output_shift;
1834 const int dilation_width_factor = params.dilation_width_factor;
1835 const int dilation_height_factor = params.dilation_height_factor;
1836 const int batches = MatchingDim(input_shape, 0, output_shape, 0);
1837 const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
1838 const int input_height = input_shape.Dims(1);
1839 const int input_width = input_shape.Dims(2);
1840 const int input_depth = input_shape.Dims(3);
1841 const int filter_height = filter_shape.Dims(1);
1842 const int filter_width = filter_shape.Dims(2);
1843 const int output_height = output_shape.Dims(1);
1844 const int output_width = output_shape.Dims(2);
1845#ifdef USE_NEON
1846 const bool shift_left = (output_shift > 0);
1847 const int32_t multiplier_power_of_two = shift_left ? (1 << output_shift) : 1;
1848#endif
1849
1850 static const int kAccBufferMaxSize = 2048;
1851 int32_t acc_buffer[kAccBufferMaxSize];
1852 assert(kAccBufferMaxSize >= output_depth);
1853 const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth;
1854 [[maybe_unused]] const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth;
1855 assert(kOutputPixelsInAccBuffer * output_depth <= kAccBufferActualSize);
1856 assert(kAccBufferActualSize <= kAccBufferMaxSize);
1857 assert(kOutputPixelsInAccBuffer >= 1);
1858 assert(thread_dim == 0 || thread_dim == 1);
1859
1860 // row_accum_func will point to the core accumulation function to be used
1861 // for this DepthwiseConv op.
1862 using row_accum_func_t = decltype(&QuantizedDepthwiseConvAccumRowGeneric);
1863 row_accum_func_t row_accum_func = nullptr;
1864
1865#define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER) \
1866 if (!row_accum_func && (stride_width == 1 || ALLOW_STRIDED) && \
1867 (input_depth == FIXED_INPUT_DEPTH || FIXED_INPUT_DEPTH == 0) && \
1868 depth_multiplier == FIXED_DEPTH_MULTIPLIER) \
1869 { \
1870 row_accum_func = \
1871 QuantizedDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER>; \
1872 }
1873
1874#ifdef USE_NEON
1875 // We go over our list of kernels by decreasing order of preference
1876 // for the cases where multiple kernels could apply.
1877
1878 // Start with the fastest kernels: AllowStrided=false, fixed input depth.
1879
1890
1891 // Next come the strided kernels: AllowStrided=true, fixed input depth.
1892 // They are a bit less efficient, but allow stride!=1.
1893
1903
1904 // Finally, the kernels allowing a variable input depth,
1905 // these are the least efficient but most general kernels.
1906
1910#endif // USE_NEON
1911
1912 // No matching fast kernel found, use slow fallback.
1913 if (!row_accum_func)
1914 {
1915 row_accum_func = QuantizedDepthwiseConvAccumRowGeneric;
1916 }
1917
1918#undef TFMINI_USE_DEPTHWISECONV_KERNEL
1919
1920 const int input_height_stride = input_shape.Dims(3) * input_shape.Dims(2);
1921 const int input_batch_stride = input_height_stride * input_shape.Dims(1);
1922 const int filter_height_stride = filter_shape.Dims(3) * filter_shape.Dims(2);
1923
1924 // Now that we have determined row_accum_func, we can start work.
1925 int batch_start = 0;
1926 int batch_end = batches;
1927 int row_start = 0;
1928 int row_end = output_height;
1929 int output_ptr_offset = 0;
1930
1931 switch (thread_dim)
1932 {
1933 case 0:
1934 // Multithread along the batch axis
1935 assert(thread_start >= 0);
1936 assert(thread_end <= batches);
1937 batch_start = thread_start;
1938 batch_end = thread_end;
1939 output_ptr_offset = batch_start * FlatSizeSkipDim(output_shape, 0);
1940 break;
1941 case 1:
1942 // Multithread along the row axis
1943 assert(thread_start >= 0);
1944 assert(thread_end <= output_height);
1945 row_start = thread_start;
1946 row_end = thread_end;
1947 output_ptr_offset = row_start * output_width * output_depth;
1948 break;
1949 }
1950
1951 uint8_t *output_ptr = output_data + output_ptr_offset;
1952 int batch_step = (output_height + row_start - row_end) * output_width * output_depth;
1953 for (int b = batch_start; b < batch_end; ++b)
1954 {
1955 for (int out_y = row_start; out_y < row_end; ++out_y)
1956 {
1957 const int in_y_origin = (out_y * stride_height) - pad_height;
1958 const int filter_y_start =
1959 std::max(0, (-in_y_origin + dilation_height_factor - 1) / dilation_height_factor);
1960 const int filter_y_end =
1961 std::min(filter_height, (input_height - in_y_origin + dilation_height_factor - 1) /
1962 dilation_height_factor);
1963 for (int out_x_buffer_start = 0; out_x_buffer_start < output_width;
1964 out_x_buffer_start += kOutputPixelsInAccBuffer)
1965 {
1966 const int out_x_buffer_end =
1967 std::min(output_width, out_x_buffer_start + kOutputPixelsInAccBuffer);
1966 // We call a 'pixel' a group of activations that share all but the
1969 // 'depth'/'channel' coordinate. num_output_pixels is the number of
1970 // output pixels that we will accumulate in this loop iteration.
1971 const int num_output_pixels = out_x_buffer_end - out_x_buffer_start;
1972 // Initialize our local accumulator with the bias values, so we don't
1973 // have to add them later.
1974 DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data, acc_buffer);
1975 // Accumulation loop. Most of the time should be spent in here.
1976 for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y)
1977 {
1978 const int in_y = in_y_origin + dilation_height_factor * filter_y;
1979 row_accum_func(stride_width, dilation_width_factor, input_depth, input_width,
1980 input_data + in_y * input_height_stride + b * input_batch_stride,
1981 input_offset, pad_width, depth_multiplier, filter_width,
1982 filter_data + filter_y * filter_height_stride, filter_offset,
1983 out_x_buffer_start, out_x_buffer_end, output_depth, acc_buffer);
1984 }
1985 // Finished accumulating int32_t values. Now need to convert them to
1986 // the final 8bit form and store them.
1987 const int num_output_values = output_depth * num_output_pixels;
1988 int i = 0;
1989#ifdef USE_NEON
1990 using gemmlowp::RoundingDivideByPOT;
1991 const int32x4_t output_offset_vec = vdupq_n_s32(output_offset);
1992 const int32x4_t output_activation_min_vec = vdupq_n_s32(output_activation_min);
1993 const int32x4_t output_activation_max_vec = vdupq_n_s32(output_activation_max);
1994 // Handle 16 values at once.
1995 // This allows us to issue 4 mutually independent int32
1996 // multiplications (vqrdmulh), which should alleviate most of their
1997 // high latency.
1998 for (; i <= num_output_values - 16; i += 16)
1999 {
2000 int32x4_t acc[4];
2001 for (int j = 0; j < 4; j++)
2002 {
2003 acc[j] = vld1q_s32(acc_buffer + i + 4 * j);
2004 }
2005
2006 if (!shift_left)
2007 {
2008 // Fixed-point multiplication.
2009 for (int j = 0; j < 4; j++)
2010 {
2011 acc[j] = vqrdmulhq_n_s32(acc[j], output_multiplier);
2012 }
2013 for (int j = 0; j < 4; j++)
2014 {
2015 acc[j] = RoundingDivideByPOT(acc[j], -output_shift);
2016 }
2017 }
2018 else
2019 {
2020 // Fixed-point multiplication.
2021 for (int j = 0; j < 4; j++)
2022 {
2023 acc[j] = vmulq_n_s32(acc[j], multiplier_power_of_two);
2024 acc[j] = vqrdmulhq_n_s32(acc[j], output_multiplier);
2025 }
2026 }
2027 // Add the output offset.
2028 for (int j = 0; j < 4; j++)
2029 {
2030 acc[j] = vaddq_s32(acc[j], output_offset_vec);
2031 }
2032 // Apply the activation function.
2033 for (int j = 0; j < 4; j++)
2034 {
2035 acc[j] = vmaxq_s32(acc[j], output_activation_min_vec);
2036 }
2037 for (int j = 0; j < 4; j++)
2038 {
2039 acc[j] = vminq_s32(acc[j], output_activation_max_vec);
2040 }
2041 // Saturating cast to uint8_t and store to destination.
2042 int16x4_t acc_s16[4];
2043 for (int j = 0; j < 4; j++)
2044 {
2045 acc_s16[j] = vqmovn_s32(acc[j]);
2046 }
2047 const int16x8_t res_s16_0 = vcombine_s16(acc_s16[0], acc_s16[1]);
2048 const int16x8_t res_s16_1 = vcombine_s16(acc_s16[2], acc_s16[3]);
2049 const uint8x8_t res_u8_0 = vqmovun_s16(res_s16_0);
2050 const uint8x8_t res_u8_1 = vqmovun_s16(res_s16_1);
2051 vst1q_u8(output_ptr, vcombine_u8(res_u8_0, res_u8_1));
2052 output_ptr += 16;
2053 }
2054 // Handle 8 values at once.
2055 // Not as good as 16 (now we're only issuing 2 mutually independent
2056 // vqrdmulh instructions, so we're probably paying for their high
2057 // latency).
2058 for (; i <= num_output_values - 8; i += 8)
2059 {
2060 int32x4_t acc0 = vld1q_s32(acc_buffer + i);
2061 int32x4_t acc1 = vld1q_s32(acc_buffer + i + 4);
2062 if (!shift_left)
2063 {
2064 // Fixed-point multiplication.
2065 acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
2066 acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
2067 // Rounding right shift.
2068 acc0 = RoundingDivideByPOT(acc0, -output_shift);
2069 acc1 = RoundingDivideByPOT(acc1, -output_shift);
2070 }
2071 else
2072 {
2073 // Fixed-point multiplication.
2074 acc0 = vmulq_n_s32(acc0, multiplier_power_of_two);
2075 acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
2076
2077 acc1 = vmulq_n_s32(acc1, multiplier_power_of_two);
2078 acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
2079 }
2080 // Add the output offset.
2081 acc0 = vaddq_s32(acc0, output_offset_vec);
2082 acc1 = vaddq_s32(acc1, output_offset_vec);
2083 // Apply the activation function.
2084 acc0 = vmaxq_s32(acc0, output_activation_min_vec);
2085 acc1 = vmaxq_s32(acc1, output_activation_min_vec);
2086 acc0 = vminq_s32(acc0, output_activation_max_vec);
2087 acc1 = vminq_s32(acc1, output_activation_max_vec);
2088 // Saturating cast to uint8_t and store to destination.
2089 const int16x4_t acc0_s16 = vqmovn_s32(acc0);
2090 const int16x4_t acc1_s16 = vqmovn_s32(acc1);
2091 const int16x8_t res_s16 = vcombine_s16(acc0_s16, acc1_s16);
2092 const uint8x8_t res_u8 = vqmovun_s16(res_s16);
2093 vst1_u8(output_ptr, res_u8);
2094 output_ptr += 8;
2095 }
2096 // Handle 4 values at once. Now we're paying the full price of the
2097 // high latency of vqrdmulh. Also, storing only 4 bytes at the end
2098 // (without any alignment) can only be done 1 byte at a time.
2099 // Yet, that is still worth doing to minimize the amount of leftover
2100 // that will have to go through the very slow scalar code.
2101 for (; i <= num_output_values - 4; i += 4)
2102 {
2103 int32x4_t acc = vld1q_s32(acc_buffer + i);
2104 if (!shift_left)
2105 {
2106 // Fixed-point multiplication.
2107 acc = vqrdmulhq_n_s32(acc, output_multiplier);
2108 // Rounding right shift.
2109 acc = RoundingDivideByPOT(acc, -output_shift);
2110 }
2111 else
2112 {
2113 // Fixed-point multiplication.
2114 acc = vmulq_n_s32(acc, multiplier_power_of_two);
2115 acc = vqrdmulhq_n_s32(acc, output_multiplier);
2116 }
2117 // Add the output offset.
2118 acc = vaddq_s32(acc, output_offset_vec);
2119 // Apply the activation function.
2120 acc = vmaxq_s32(acc, output_activation_min_vec);
2121 acc = vminq_s32(acc, output_activation_max_vec);
2122 // Saturating cast to uint8_t and store to destination.
2123 const int16x4_t acc_s16 = vqmovn_s32(acc);
2124 const int16x8_t res_s16 = vcombine_s16(acc_s16, acc_s16);
2125 const uint8x8_t res_u8 = vqmovun_s16(res_s16);
2126 vst1_lane_u8(output_ptr + 0, res_u8, 0);
2127 vst1_lane_u8(output_ptr + 1, res_u8, 1);
2128 vst1_lane_u8(output_ptr + 2, res_u8, 2);
2129 vst1_lane_u8(output_ptr + 3, res_u8, 3);
2130 output_ptr += 4;
2131 }
2132#endif // USE_NEON
2133
2134 // Handle leftover values, one by one. This is very slow.
2135 for (; i < num_output_values; i++)
2136 {
2137 int32_t acc = acc_buffer[i];
2138 acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
2139 acc += output_offset;
2140 acc = std::max(acc, output_activation_min);
2141 acc = std::min(acc, output_activation_max);
2142 *output_ptr++ = static_cast<uint8_t>(acc);
2143 }
2144 }
2145 }
2146 output_ptr += batch_step;
2147 }
2148}
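Both the NEON paths and the scalar leftover loop above apply the same per-value output transform: a fixed-point multiply by output_multiplier with output_shift, adding output_offset, clamping to [output_activation_min, output_activation_max], and a saturating cast to uint8_t. The following is a plain-C++ sketch of that transform under the assumption that cker's MultiplyByQuantizedMultiplier and gemmlowp's RoundingDivideByPOT follow the usual gemmlowp-style definitions; everything in the sketch namespace is illustrative, not the actual cker implementation.

#include <algorithm>
#include <cstdint>
#include <limits>

// Illustrative sketch only: plain-C++ stand-ins for the gemmlowp-style helpers
// that MultiplyByQuantizedMultiplier and RoundingDivideByPOT are assumed to use.
namespace sketch
{
inline int32_t SaturatingRoundingDoublingHighMul(int32_t a, int32_t b)
{
  const bool overflow = (a == b) && (a == std::numeric_limits<int32_t>::min());
  const int64_t ab = static_cast<int64_t>(a) * static_cast<int64_t>(b);
  const int32_t nudge = ab >= 0 ? (1 << 30) : (1 - (1 << 30));
  const int32_t ab_x2_high32 = static_cast<int32_t>((ab + nudge) / (int64_t{1} << 31));
  return overflow ? std::numeric_limits<int32_t>::max() : ab_x2_high32;
}

inline int32_t RoundingDivideByPOT(int32_t x, int exponent)
{
  const int32_t mask = static_cast<int32_t>((int64_t{1} << exponent) - 1);
  const int32_t remainder = x & mask;
  const int32_t threshold = (mask >> 1) + ((x < 0) ? 1 : 0);
  return (x >> exponent) + ((remainder > threshold) ? 1 : 0);
}

// Mirrors the scalar leftover loop: multiply, shift, add offset, clamp, cast.
inline uint8_t RequantizeToUint8(int32_t acc, int32_t output_multiplier, int output_shift,
                                 int32_t output_offset, int32_t activation_min,
                                 int32_t activation_max)
{
  const int left_shift = output_shift > 0 ? output_shift : 0;
  const int right_shift = output_shift > 0 ? 0 : -output_shift;
  acc = SaturatingRoundingDoublingHighMul(acc * (1 << left_shift), output_multiplier);
  acc = RoundingDivideByPOT(acc, right_shift);
  acc += output_offset;
  acc = std::max(acc, activation_min);
  acc = std::min(acc, activation_max);
  return static_cast<uint8_t>(acc);
}
} // namespace sketch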

References nnfw::cker::DepthwiseConvParams::depth_multiplier, DepthwiseConvInitAccBuffer(), nnfw::cker::DepthwiseConvParams::dilation_height_factor, nnfw::cker::DepthwiseConvParams::dilation_width_factor, nnfw::cker::Shape::Dims(), nnfw::cker::FlatSizeSkipDim(), nnfw::cker::PaddingValues::height, nnfw::cker::DepthwiseConvParams::input_offset, nnfw::cker::MatchingDim(), nnfw::cker::MultiplyByQuantizedMultiplier(), nnfw::cker::DepthwiseConvParams::output_multiplier, nnfw::cker::DepthwiseConvParams::output_offset, output_shape, nnfw::cker::DepthwiseConvParams::output_shift, nnfw::cker::DepthwiseConvParams::padding_values, nnfw::cker::DepthwiseConvParams::quantized_activation_max, nnfw::cker::DepthwiseConvParams::quantized_activation_min, QuantizedDepthwiseConvAccumRowGeneric(), nnfw::cker::DepthwiseConvParams::stride_height, nnfw::cker::DepthwiseConvParams::stride_width, TFMINI_USE_DEPTHWISECONV_KERNEL, nnfw::cker::DepthwiseConvParams::weights_offset, and nnfw::cker::PaddingValues::width.

Referenced by nnfw::cker::optimized::DepthwiseConvWithRounding().
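For reference, a minimal single-threaded call-site sketch follows. All shape and quantization values are made up for illustration, the header include is omitted, and it assumes Shape accepts an initializer list (as the TFLite RuntimeShape it mirrors does) and that DepthwiseConvParams is a plain default-constructible struct.

#include <cstdint>

// Illustrative call site; assumes the cker depthwise-conv headers are already included.
void RunDepthwiseExample(const uint8_t *input_data, const uint8_t *filter_data,
                         const int32_t *bias_data, uint8_t *output_data)
{
  using namespace nnfw::cker;

  DepthwiseConvParams params{};
  params.stride_width = 1;
  params.stride_height = 1;
  params.dilation_width_factor = 1;
  params.dilation_height_factor = 1;
  params.padding_values.width = 1;  // SAME padding for a 3x3 filter
  params.padding_values.height = 1;
  params.depth_multiplier = 1;
  params.input_offset = -128;            // negated input zero point (example value)
  params.weights_offset = -128;          // negated filter zero point (example value)
  params.output_offset = 128;            // output zero point (example value)
  params.output_multiplier = 1073741824; // example fixed-point multiplier (0.5 in Q31)
  params.output_shift = -7;              // negative => rounding right shift by 7
  params.quantized_activation_min = 0;
  params.quantized_activation_max = 255;

  // NHWC shapes; output depth == input depth * depth_multiplier.
  const Shape input_shape{1, 32, 32, 8};
  const Shape filter_shape{1, 3, 3, 8};
  const Shape bias_shape{8};
  const Shape output_shape{1, 32, 32, 8};

  // Single-threaded: one worker covering every batch along thread_dim == 0.
  optimized::depthwise_conv::DepthwiseConvGeneral(params, input_shape, input_data, filter_shape,
                                                  filter_data, bias_shape, bias_data, output_shape,
                                                  output_data, /*thread_start=*/0,
                                                  /*thread_end=*/1, /*thread_dim=*/0);
}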

◆ DepthwiseConvInitAccBuffer()

void nnfw::cker::optimized::depthwise_conv::DepthwiseConvInitAccBuffer ( int  num_output_pixels,
int  output_depth,
const int32_t *  bias_data,
int32_t *  acc_buffer 
)
inline

Definition at line 1724 of file DepthwiseConvUint8.h.

1726{
1727 int i = 0;
1728#ifdef USE_NEON
1729 if (output_depth == 1)
1730 {
1731 const int32x4_t b = vdupq_n_s32(bias_data[0]);
1732 for (; i <= num_output_pixels - 16; i += 16)
1733 {
1734 vst1q_s32(acc_buffer + i + 0, b);
1735 vst1q_s32(acc_buffer + i + 4, b);
1736 vst1q_s32(acc_buffer + i + 8, b);
1737 vst1q_s32(acc_buffer + i + 12, b);
1738 }
1739 for (; i <= num_output_pixels - 4; i += 4)
1740 {
1741 vst1q_s32(acc_buffer + i, b);
1742 }
1743 }
1744 else if (output_depth == 2)
1745 {
1746 int32x4_t b = vdupq_n_s32(bias_data[0]);
1747 b = vsetq_lane_s32(bias_data[1], b, 1);
1748 b = vsetq_lane_s32(bias_data[1], b, 3);
1749 for (; i <= num_output_pixels - 8; i += 8)
1750 {
1751 vst1q_s32(acc_buffer + 2 * i + 0, b);
1752 vst1q_s32(acc_buffer + 2 * i + 4, b);
1753 vst1q_s32(acc_buffer + 2 * i + 8, b);
1754 vst1q_s32(acc_buffer + 2 * i + 12, b);
1755 }
1756 for (; i <= num_output_pixels - 2; i += 2)
1757 {
1758 vst1q_s32(acc_buffer + 2 * i, b);
1759 }
1760 }
1761 else if (output_depth == 4)
1762 {
1763 const int32x4_t b = vld1q_s32(bias_data);
1764 for (; i <= num_output_pixels - 4; i += 4)
1765 {
1766 vst1q_s32(acc_buffer + 4 * i + 0, b);
1767 vst1q_s32(acc_buffer + 4 * i + 4, b);
1768 vst1q_s32(acc_buffer + 4 * i + 8, b);
1769 vst1q_s32(acc_buffer + 4 * i + 12, b);
1770 }
1771 for (; i < num_output_pixels; i++)
1772 {
1773 vst1q_s32(acc_buffer + 4 * i, b);
1774 }
1775 }
1776 else if (output_depth == 8)
1777 {
1778 const int32x4_t b0 = vld1q_s32(bias_data);
1779 const int32x4_t b1 = vld1q_s32(bias_data + 4);
1780 for (; i <= num_output_pixels - 2; i += 2)
1781 {
1782 vst1q_s32(acc_buffer + 8 * i + 0, b0);
1783 vst1q_s32(acc_buffer + 8 * i + 4, b1);
1784 vst1q_s32(acc_buffer + 8 * i + 8, b0);
1785 vst1q_s32(acc_buffer + 8 * i + 12, b1);
1786 }
1787 for (; i < num_output_pixels; i++)
1788 {
1789 vst1q_s32(acc_buffer + 8 * i + 0, b0);
1790 vst1q_s32(acc_buffer + 8 * i + 4, b1);
1791 }
1792 }
1793 else if (output_depth == 16)
1794 {
1795 const int32x4_t b0 = vld1q_s32(bias_data);
1796 const int32x4_t b1 = vld1q_s32(bias_data + 4);
1797 const int32x4_t b2 = vld1q_s32(bias_data + 8);
1798 const int32x4_t b3 = vld1q_s32(bias_data + 12);
1799 for (; i < num_output_pixels; i++)
1800 {
1801 vst1q_s32(acc_buffer + 16 * i + 0, b0);
1802 vst1q_s32(acc_buffer + 16 * i + 4, b1);
1803 vst1q_s32(acc_buffer + 16 * i + 8, b2);
1804 vst1q_s32(acc_buffer + 16 * i + 12, b3);
1805 }
1806 }
1807#endif
1808 for (; i < num_output_pixels; i++)
1809 {
1810 memcpy(acc_buffer + i * output_depth, bias_data, sizeof(acc_buffer[0]) * output_depth);
1811 }
1812}

Referenced by DepthwiseConvGeneral().
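Whichever NEON branch is taken, the observable result matches the scalar memcpy loop at the end: bias_data is replicated across all num_output_pixels pixels of the accumulator buffer. A reference sketch of that contract (the function name is illustrative):

#include <cstdint>
#include <vector>

// Reference behaviour: acc[p * output_depth + c] == bias_data[c] for every
// pixel p in [0, num_output_pixels) and channel c in [0, output_depth).
std::vector<int32_t> InitAccBufferReference(int num_output_pixels, int output_depth,
                                            const int32_t *bias_data)
{
  std::vector<int32_t> acc(static_cast<size_t>(num_output_pixels) * output_depth);
  for (int p = 0; p < num_output_pixels; ++p)
    for (int c = 0; c < output_depth; ++c)
      acc[p * output_depth + c] = bias_data[c];
  return acc;
}

// Example: bias {10, 20, 30} and 2 output pixels yield {10, 20, 30, 10, 20, 30}.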

◆ QuantizedDepthwiseConvAccumRow()

template<bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
void nnfw::cker::optimized::depthwise_conv::QuantizedDepthwiseConvAccumRow ( int  stride,
int  dilation_factor,
int  input_depth,
int  input_width,
const uint8_t *  input_data,
int16_t  input_offset,
int  pad_width,
int  depth_multiplier,
int  filter_width,
const uint8_t *  filter_data,
int16_t  filter_offset,
int  out_x_buffer_start,
int  out_x_buffer_end,
int  output_depth,
int32_t *  acc_buffer 
)

Definition at line 1612 of file DepthwiseConvUint8.h.

1618{
1619 // Sanity check parameters. This is important in particular to ensure
1620 // that we keep the number of template instantiations minimal, so we don't
1621 // increase binary size unnecessarily.
1622 static_assert(kFixedDepthMultiplier || !kFixedInputDepth, "");
1623 static_assert(kFixedInputDepth || kAllowStrided, "");
1624 assert(stride == 1 || kAllowStrided);
1625 if (kFixedInputDepth)
1626 {
1627 assert(input_depth == kFixedInputDepth);
1628 }
1629 if (kFixedDepthMultiplier)
1630 {
1631 assert(depth_multiplier == kFixedDepthMultiplier);
1632 }
1633 assert(output_depth == input_depth * depth_multiplier);
1634 const int input_ptr_increment = stride * input_depth;
1635 const uint8_t *filter_base_ptr = filter_data;
1636 for (int filter_x = 0; filter_x < filter_width; ++filter_x)
1637 {
1638 // For the current (filter_x, filter_y) point in the filter,
1639 // compute the boundaries of the corresponding output row segment.
1640 int out_x_loop_start_unclampled = 0;
1641 int out_x_loop_end_unclampled = 0;
1642 if (kAllowStrided)
1643 {
1644 if (stride == 2)
1645 {
1646 out_x_loop_start_unclampled = (pad_width - dilation_factor * filter_x + 1) / 2;
1647 out_x_loop_end_unclampled = (pad_width + input_width - dilation_factor * filter_x + 1) / 2;
1648 }
1649 else if (stride == 4)
1650 {
1651 out_x_loop_start_unclampled = (pad_width - dilation_factor * filter_x + 3) / 4;
1652 out_x_loop_end_unclampled = (pad_width + input_width - dilation_factor * filter_x + 3) / 4;
1653 }
1654 else
1655 {
1656 out_x_loop_start_unclampled =
1657 (pad_width - dilation_factor * filter_x + stride - 1) / stride;
1658 out_x_loop_end_unclampled =
1659 (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride;
1660 }
1661 }
1662 else
1663 {
1664 out_x_loop_start_unclampled = pad_width - dilation_factor * filter_x;
1665 out_x_loop_end_unclampled = pad_width + input_width - dilation_factor * filter_x;
1666 }
1667 // The kernel will have to iterate on the segment of the
1668 // output row that starts at out_x_loop_start and out_x_loop_end.
1669 const int out_x_loop_start = std::max(out_x_buffer_start, out_x_loop_start_unclampled);
1670 const int out_x_loop_end = std::min(out_x_buffer_end, out_x_loop_end_unclampled);
1671
1672 int32_t *acc_buffer_ptr = acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
1673 const int in_x_origin = (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x;
1674 const uint8_t *input_ptr = input_data + in_x_origin * input_depth;
1675 const int num_output_pixels = out_x_loop_end - out_x_loop_start;
1676 QuantizedDepthwiseConvKernel<kAllowStrided, kFixedInputDepth, kFixedDepthMultiplier>::Run(
1677 num_output_pixels, input_depth, depth_multiplier, input_ptr, input_offset,
1678 input_ptr_increment, filter_base_ptr, filter_offset, acc_buffer_ptr);
1679 filter_base_ptr += output_depth;
1680 }
1681}
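The unclamped loop bounds above select exactly the out_x values whose sampled input column, in_x = out_x * stride - pad_width + dilation_factor * filter_x, lies inside [0, input_width); the stride == 2 and stride == 4 branches are the same ceiling division with the constant folded in. A short illustrative restatement with a worked example (the names below are not from the source):

#include <algorithm>

struct OutXRange
{
  int start;
  int end; // exclusive
};

// Clamped out_x segment for one filter tap, matching the generic strided branch above.
inline OutXRange ClampedOutXRange(int stride, int dilation_factor, int filter_x, int pad_width,
                                  int input_width, int out_x_buffer_start, int out_x_buffer_end)
{
  // Ceiling division, as in (numerator + stride - 1) / stride in the code above.
  const int unclamped_start = (pad_width - dilation_factor * filter_x + stride - 1) / stride;
  const int unclamped_end =
    (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride;
  return {std::max(out_x_buffer_start, unclamped_start), std::min(out_x_buffer_end, unclamped_end)};
}

// Worked example: stride = 2, dilation_factor = 1, filter_x = 0, pad_width = 1, input_width = 8,
// buffer covering out_x in [0, 8) -> start = 1, end = 5. out_x = 0 would sample in_x = -1 and
// out_x = 5 would sample in_x = 9, both outside [0, 8), so the clamp excludes them.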

◆ QuantizedDepthwiseConvAccumRowGeneric()

void nnfw::cker::optimized::depthwise_conv::QuantizedDepthwiseConvAccumRowGeneric ( int  stride,
int  dilation_factor,
int  input_depth,
int  input_width,
const uint8_t *  input_data,
int16_t  input_offset,
int  pad_width,
int  depth_multiplier,
int  filter_width,
const uint8_t *  filter_data,
int16_t  filter_offset,
int  out_x_buffer_start,
int  out_x_buffer_end,
int  output_depth,
int32_t *  acc_buffer 
)
inline

Definition at line 1684 of file DepthwiseConvUint8.h.

1691{
1692 const uint8_t *filter_base_ptr = filter_data;
1693 for (int filter_x = 0; filter_x < filter_width; ++filter_x)
1694 {
1695 const int out_x_loop_start =
1696 std::max(out_x_buffer_start, (pad_width - dilation_factor * filter_x + stride - 1) / stride);
1697 const int out_x_loop_end =
1698 std::min(out_x_buffer_end,
1699 (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride);
1700
1701 int32_t *acc_buffer_ptr = acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
1702 const int in_x_origin = (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x;
1703 const uint8_t *input_ptr = input_data + in_x_origin * input_depth;
1704 const int input_ptr_increment = (stride - 1) * input_depth;
1705 for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++)
1706 {
1707 const uint8_t *filter_ptr = filter_base_ptr;
1708 for (int ic = 0; ic < input_depth; ++ic)
1709 {
1710 const int16_t input_val = *input_ptr++ + input_offset;
1711 for (int m = 0; m < depth_multiplier; m++)
1712 {
1713 const int16_t filter_val = *filter_ptr++ + filter_offset;
1714 *acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val;
1715 }
1716 }
1717 input_ptr += input_ptr_increment;
1718 }
1719 filter_base_ptr += output_depth;
1720 }
1721}


Referenced by DepthwiseConvGeneral().
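Stripped of the pointer arithmetic, the loops above accumulate, for one (filter_x, filter row) tap, acc[out_x][ic * depth_multiplier + m] += (input_val + input_offset) * (filter_val + filter_offset) over the clamped out_x segment. An index-based sketch of the same computation (the helper name is illustrative):

#include <cstdint>

// Index-based restatement of the inner accumulation in
// QuantizedDepthwiseConvAccumRowGeneric for a single (filter_x, filter row) tap.
// acc_buffer is laid out as [out_x - out_x_buffer_start][output_depth], with
// output_depth == input_depth * depth_multiplier.
inline void AccumulateOneTap(int out_x_loop_start, int out_x_loop_end, int out_x_buffer_start,
                             int stride, int pad_width, int dilation_factor, int filter_x,
                             int input_depth, int depth_multiplier, const uint8_t *input_row,
                             int16_t input_offset, const uint8_t *filter_tap,
                             int16_t filter_offset, int32_t *acc_buffer)
{
  const int output_depth = input_depth * depth_multiplier;
  for (int out_x = out_x_loop_start; out_x < out_x_loop_end; ++out_x)
  {
    const int in_x = out_x * stride - pad_width + dilation_factor * filter_x;
    int32_t *acc = acc_buffer + (out_x - out_x_buffer_start) * output_depth;
    for (int ic = 0; ic < input_depth; ++ic)
    {
      const int16_t input_val = input_row[in_x * input_depth + ic] + input_offset;
      for (int m = 0; m < depth_multiplier; ++m)
      {
        const int16_t filter_val = filter_tap[ic * depth_multiplier + m] + filter_offset;
        acc[ic * depth_multiplier + m] += static_cast<int32_t>(filter_val) * input_val;
      }
    }
  }
}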