{
  (void)bias_shape;
  const int stride_width = params.stride_width;
  const int stride_height = params.stride_height;
  const int pad_width = params.padding_values.width;
  const int pad_height = params.padding_values.height;
  const int depth_multiplier = params.depth_multiplier;
  const int32_t output_activation_min = params.quantized_activation_min;
  const int32_t output_activation_max = params.quantized_activation_max;
  const int32_t input_offset = params.input_offset;
  const int32_t filter_offset = params.weights_offset;
  const int32_t output_offset = params.output_offset;
  const int32_t output_multiplier = params.output_multiplier;
  const int output_shift = params.output_shift;
  const int dilation_width_factor = params.dilation_width_factor;
  const int dilation_height_factor = params.dilation_height_factor;
  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
  const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
  const int output_height = output_shape.Dims(1);
  const int output_width = output_shape.Dims(2);
  const int input_height = input_shape.Dims(1);
  const int input_width = input_shape.Dims(2);
  const int input_depth = input_shape.Dims(3);
  const int filter_height = filter_shape.Dims(1);
  const int filter_width = filter_shape.Dims(2);
#ifdef USE_NEON
  const bool shift_left = (output_shift > 0);
  const int32_t multiplier_power_of_two = shift_left ? (1 << output_shift) : 1;
#endif

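  // The int32 accumulators are staged in a fixed-size scratch buffer. Each output
  // pixel needs output_depth accumulators, so each pass over the inner x-loop below
  // covers at most kOutputPixelsInAccBuffer output pixels.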
  static const int kAccBufferMaxSize = 2048;
  int32_t acc_buffer[kAccBufferMaxSize];
  assert(kAccBufferMaxSize >= output_depth);
  const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth;
  [[maybe_unused]] const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth;
  assert(kOutputPixelsInAccBuffer * output_depth <= kAccBufferActualSize);
  assert(kAccBufferActualSize <= kAccBufferMaxSize);
  assert(kOutputPixelsInAccBuffer >= 1);
  assert(thread_dim == 0 || thread_dim == 1);

  // row_accum_func will point to the core accumulation function to be used
  // for this DepthwiseConv op.
  using row_accum_func_t = decltype(&QuantizedDepthwiseConvAccumRowGeneric);
  row_accum_func_t row_accum_func = nullptr;

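  // The TFMINI_USE_DEPTHWISECONV_KERNEL macro below registers a specialized
  // row-accumulation kernel: the first instantiation whose stride, input-depth and
  // depth-multiplier constraints match the current op is selected.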
#define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER) \
  if (!row_accum_func && (stride_width == 1 || ALLOW_STRIDED) &&                                  \
      (input_depth == FIXED_INPUT_DEPTH || FIXED_INPUT_DEPTH == 0) &&                             \
      depth_multiplier == FIXED_DEPTH_MULTIPLIER)                                                 \
  {                                                                                               \
    row_accum_func =                                                                              \
      QuantizedDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER>;   \
  }

#ifdef USE_NEON
  // Specialized TFMINI_USE_DEPTHWISECONV_KERNEL registrations are tried here in
  // decreasing order of preference: non-strided fixed-input-depth kernels first,
  // then strided fixed-input-depth kernels, and finally kernels allowing a variable
  // input depth, which are the most general but least efficient.
#endif // USE_NEON

  // No matching fast kernel found, use slow fallback.
  if (!row_accum_func)
  {
    row_accum_func = QuantizedDepthwiseConvAccumRowGeneric;
  }

#undef TFMINI_USE_DEPTHWISECONV_KERNEL

  const int input_height_stride = input_shape.Dims(3) * input_shape.Dims(2);
  const int input_batch_stride = input_height_stride * input_shape.Dims(1);
  const int filter_height_stride = filter_shape.Dims(3) * filter_shape.Dims(2);

  // Now that we have determined row_accum_func, we can start work.
  int batch_start = 0;
  int batch_end = batches;
  int row_start = 0;
  int row_end = output_height;
  int output_ptr_offset = 0;

  switch (thread_dim)
  {
    case 0:
      // Multithread along the batch dimension.
      assert(thread_start >= 0);
      assert(thread_end <= batches);
      batch_start = thread_start;
      batch_end = thread_end;
      output_ptr_offset = batch_start * FlatSizeSkipDim(output_shape, 0);
      break;
    case 1:
      // Multithread along the row dimension.
      assert(thread_start >= 0);
      assert(thread_end <= output_height);
      row_start = thread_start;
      row_end = thread_end;
      output_ptr_offset = row_start * output_width * output_depth;
      break;
  }

  uint8_t *output_ptr = output_data + output_ptr_offset;
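  // batch_step skips the output rows of each batch that this thread does not write
  // (when thread_dim == 1 only rows [row_start, row_end) are processed here), so that
  // output_ptr lands on this thread's first row of the next batch.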
  int batch_step = (output_height + row_start - row_end) * output_width * output_depth;
  for (int b = batch_start; b < batch_end; ++b)
  {
    for (int out_y = row_start; out_y < row_end; ++out_y)
    {
      const int in_y_origin = (out_y * stride_height) - pad_height;
      const int filter_y_start =
        std::max(0, (-in_y_origin + dilation_height_factor - 1) / dilation_height_factor);
      const int filter_y_end =
        std::min(filter_height, (input_height - in_y_origin + dilation_height_factor - 1) /
                                  dilation_height_factor);
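      // filter_y_start / filter_y_end clamp the filter rows to those that read
      // in-bounds input rows for this out_y, accounting for padding and dilation.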
      for (int out_x_buffer_start = 0; out_x_buffer_start < output_width;
           out_x_buffer_start += kOutputPixelsInAccBuffer)
      {
        const int out_x_buffer_end =
          std::min(output_width, out_x_buffer_start + kOutputPixelsInAccBuffer);
        // We call a 'pixel' a group of activations that share all but the
        // 'depth'/'channel' coordinate. num_output_pixels is the number of
        // output pixels that we will accumulate in this loop iteration.
        const int num_output_pixels = out_x_buffer_end - out_x_buffer_start;
        // Initialize our local accumulator with the bias values, so we don't
        // have to add them later.
        DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data, acc_buffer);
        // Accumulation loop. Most of the time should be spent in here.
        for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y)
        {
          const int in_y = in_y_origin + dilation_height_factor * filter_y;
          row_accum_func(stride_width, dilation_width_factor, input_depth, input_width,
                         input_data + in_y * input_height_stride + b * input_batch_stride,
                         input_offset, pad_width, depth_multiplier, filter_width,
                         filter_data + filter_y * filter_height_stride, filter_offset,
                         out_x_buffer_start, out_x_buffer_end, output_depth, acc_buffer);
        }
        // Finished accumulating int32 values. Now need to convert them to
        // the final 8-bit form and store them.
        const int num_output_values = output_depth * num_output_pixels;
        int i = 0;
#ifdef USE_NEON
        using gemmlowp::RoundingDivideByPOT;
        const int32x4_t output_offset_vec = vdupq_n_s32(output_offset);
        const int32x4_t output_activation_min_vec = vdupq_n_s32(output_activation_min);
        const int32x4_t output_activation_max_vec = vdupq_n_s32(output_activation_max);

        // Handle 16 values at once.
        // This allows us to issue 4 mutually independent int32
        // multiplications (vqrdmulh), which should alleviate most of their
        // high latency.
        for (; i <= num_output_values - 16; i += 16)
        {
          int32x4_t acc[4];
          for (int j = 0; j < 4; j++)
          {
            acc[j] = vld1q_s32(acc_buffer + i + 4 * j);
          }

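          // Requantize each accumulator: with a net right shift (shift_left == false),
          // apply the saturating rounding doubling multiply followed by a rounding
          // right shift by -output_shift; with a net left shift, pre-scale by
          // 2^output_shift before the doubling multiply.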
          if (!shift_left)
          {
            // Fixed-point multiplication.
            for (int j = 0; j < 4; j++)
            {
              acc[j] = vqrdmulhq_n_s32(acc[j], output_multiplier);
            }
            // Rounding right shift.
            for (int j = 0; j < 4; j++)
            {
              acc[j] = RoundingDivideByPOT(acc[j], -output_shift);
            }
          }
          else
          {
            // Fixed-point multiplication.
            for (int j = 0; j < 4; j++)
            {
              acc[j] = vmulq_n_s32(acc[j], multiplier_power_of_two);
              acc[j] = vqrdmulhq_n_s32(acc[j], output_multiplier);
            }
          }
          // Add the output offset.
          for (int j = 0; j < 4; j++)
          {
            acc[j] = vaddq_s32(acc[j], output_offset_vec);
          }
          // Apply the activation function.
          for (int j = 0; j < 4; j++)
          {
            acc[j] = vmaxq_s32(acc[j], output_activation_min_vec);
          }
          for (int j = 0; j < 4; j++)
          {
            acc[j] = vminq_s32(acc[j], output_activation_max_vec);
          }
          // Saturating cast to uint8 and store to destination.
          int16x4_t acc_s16[4];
          for (int j = 0; j < 4; j++)
          {
            acc_s16[j] = vqmovn_s32(acc[j]);
          }
          const int16x8_t res_s16_0 = vcombine_s16(acc_s16[0], acc_s16[1]);
          const int16x8_t res_s16_1 = vcombine_s16(acc_s16[2], acc_s16[3]);
          const uint8x8_t res_u8_0 = vqmovun_s16(res_s16_0);
          const uint8x8_t res_u8_1 = vqmovun_s16(res_s16_1);
          vst1q_u8(output_ptr, vcombine_u8(res_u8_0, res_u8_1));
          output_ptr += 16;
        }

        // Handle 8 values at once. Not as fast as the 16-value case, but
        // still faster than the scalar code.
        for (; i <= num_output_values - 8; i += 8)
        {
          int32x4_t acc0 = vld1q_s32(acc_buffer + i);
          int32x4_t acc1 = vld1q_s32(acc_buffer + i + 4);
          if (!shift_left)
          {
            // Fixed-point multiplication.
            acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
            acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
            // Rounding right shift.
            acc0 = RoundingDivideByPOT(acc0, -output_shift);
            acc1 = RoundingDivideByPOT(acc1, -output_shift);
          }
          else
          {
            // Fixed-point multiplication.
            acc0 = vmulq_n_s32(acc0, multiplier_power_of_two);
            acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);

            acc1 = vmulq_n_s32(acc1, multiplier_power_of_two);
            acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
          }
          // Add the output offset.
          acc0 = vaddq_s32(acc0, output_offset_vec);
          acc1 = vaddq_s32(acc1, output_offset_vec);
          // Apply the activation function.
          acc0 = vmaxq_s32(acc0, output_activation_min_vec);
          acc1 = vmaxq_s32(acc1, output_activation_min_vec);
          acc0 = vminq_s32(acc0, output_activation_max_vec);
          acc1 = vminq_s32(acc1, output_activation_max_vec);
          // Saturating cast to uint8 and store to destination.
          const int16x4_t acc0_s16 = vqmovn_s32(acc0);
          const int16x4_t acc1_s16 = vqmovn_s32(acc1);
          const int16x8_t res_s16 = vcombine_s16(acc0_s16, acc1_s16);
          const uint8x8_t res_u8 = vqmovun_s16(res_s16);
          vst1_u8(output_ptr, res_u8);
          output_ptr += 8;
        }

        // Handle 4 values at once. Now we're paying the full price of the
        // high latency of vqrdmulh. Also, storing only 4 bytes at the end
        // (without any alignment) can only be done 1 byte at a time.
        // Yet, that is still worth doing to minimize the amount of leftover
        // that will have to go through the very slow scalar code.
        for (; i <= num_output_values - 4; i += 4)
        {
          int32x4_t acc = vld1q_s32(acc_buffer + i);
          if (!shift_left)
          {
            // Fixed-point multiplication.
            acc = vqrdmulhq_n_s32(acc, output_multiplier);
            // Rounding right shift.
            acc = RoundingDivideByPOT(acc, -output_shift);
          }
          else
          {
            // Fixed-point multiplication.
            acc = vmulq_n_s32(acc, multiplier_power_of_two);
            acc = vqrdmulhq_n_s32(acc, output_multiplier);
          }
          // Add the output offset.
          acc = vaddq_s32(acc, output_offset_vec);
          // Apply the activation function.
          acc = vmaxq_s32(acc, output_activation_min_vec);
          acc = vminq_s32(acc, output_activation_max_vec);
          // Saturating cast to uint8 and store to destination.
          const int16x4_t acc_s16 = vqmovn_s32(acc);
          const int16x8_t res_s16 = vcombine_s16(acc_s16, acc_s16);
          const uint8x8_t res_u8 = vqmovun_s16(res_s16);
          vst1_lane_u8(output_ptr + 0, res_u8, 0);
          vst1_lane_u8(output_ptr + 1, res_u8, 1);
          vst1_lane_u8(output_ptr + 2, res_u8, 2);
          vst1_lane_u8(output_ptr + 3, res_u8, 3);
          output_ptr += 4;
        }
#endif // USE_NEON

        // Handle leftover values, one by one. This is very slow.
        for (; i < num_output_values; i++)
        {
          int32_t acc = acc_buffer[i];
          acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
          acc += output_offset;
          acc = std::max(acc, output_activation_min);
          acc = std::min(acc, output_activation_max);
          *output_ptr++ = static_cast<uint8_t>(acc);
        }
      }
    }
    output_ptr += batch_step;
  }
}