ONE - On-device Neural Engine
nnfw::cker::optimized_integer_ops::depthwise_conv Namespace Reference

Data Structures

struct  QuantizedDepthwiseConvKernel
 

Functions

template<bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
void QuantizedDepthwiseConvAccumRow (int stride, int dilation_factor, int input_depth, int input_width, const int8_t *input_data, int16_t input_offset, int pad_width, int depth_multiplier, int filter_width, const int8_t *filter_data, int out_x_buffer_start, int out_x_buffer_end, int output_depth, int32_t *acc_buffer)
 
void QuantizedDepthwiseConvAccumRowGeneric (int stride, int dilation_factor, int input_depth, int input_width, const int8_t *input_data, int16_t input_offset, int pad_width, int depth_multiplier, int filter_width, const int8_t *filter_data, int out_x_buffer_start, int out_x_buffer_end, int output_depth, int32_t *acc_buffer)
 
void DepthwiseConvInitAccBuffer (int num_output_pixels, int output_depth, const int32_t *bias_data, int32_t *acc_buffer)
 
void DepthwiseConvGeneral (const DepthwiseConvParams &params, const int32_t *output_multiplier, const int32_t *output_shift, const Shape &input_shape, const int8_t *input_data, const Shape &filter_shape, const int8_t *filter_data, const Shape &, const int32_t *bias_data, const Shape &output_shape, int8_t *output_data, int thread_start, int thread_end, int thread_dim)
 

Function Documentation

◆ DepthwiseConvGeneral()

void nnfw::cker::optimized_integer_ops::depthwise_conv::DepthwiseConvGeneral ( const DepthwiseConvParams &  params,
const int32_t *  output_multiplier,
const int32_t *  output_shift,
const Shape &  input_shape,
const int8_t *  input_data,
const Shape &  filter_shape,
const int8_t *  filter_data,
const Shape &  ,
const int32_t *  bias_data,
const Shape &  output_shape,
int8_t *  output_data,
int  thread_start,
int  thread_end,
int  thread_dim 
)
inline

Definition at line 1739 of file DepthwiseConvInt8.h.

1746{
1747 const int stride_width = params.stride_width;
1748 const int stride_height = params.stride_height;
1749 const int pad_width = params.padding_values.width;
1750 const int pad_height = params.padding_values.height;
1751 const int depth_multiplier = params.depth_multiplier;
1752 const int32_t output_activation_min = params.quantized_activation_min;
1753 const int32_t output_activation_max = params.quantized_activation_max;
1754 const int32_t input_offset = params.input_offset;
1755 const int32_t output_offset = params.output_offset;
1756 const int dilation_width_factor = params.dilation_width_factor;
1757 const int dilation_height_factor = params.dilation_height_factor;
1758 const int batches = MatchingDim(input_shape, 0, output_shape, 0);
1759 const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
1760 const int input_height = input_shape.Dims(1);
1761 const int input_width = input_shape.Dims(2);
1762 const int input_depth = input_shape.Dims(3);
1763 const int filter_height = filter_shape.Dims(1);
1764 const int filter_width = filter_shape.Dims(2);
1765 const int output_rows = output_shape.Dims(1);
1766 const int output_width = output_shape.Dims(2);
1767
1768 static const int kAccBufferMaxSize = 2048;
1769 int32_t acc_buffer[kAccBufferMaxSize];
1770 assert(kAccBufferMaxSize >= output_depth);
1771 const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth;
1772 [[maybe_unused]] const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth;
1773 assert(kOutputPixelsInAccBuffer * output_depth <= kAccBufferActualSize);
1774 assert(kAccBufferActualSize <= kAccBufferMaxSize);
1775 assert(kOutputPixelsInAccBuffer >= 1);
1776 assert(thread_dim == 0 || thread_dim == 1);
1777
1778 // row_accum_func will point to the core accumulation function to be used
1779 // for this DepthwiseConv op.
1780 using row_accum_func_t = decltype(&QuantizedDepthwiseConvAccumRowGeneric);
1781 row_accum_func_t row_accum_func = nullptr;
1782
1783#define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER) \
1784 if (!row_accum_func && (stride_width == 1 || ALLOW_STRIDED) && \
1785 (input_depth == FIXED_INPUT_DEPTH || FIXED_INPUT_DEPTH == 0) && \
1786 depth_multiplier == FIXED_DEPTH_MULTIPLIER) \
1787 { \
1788 row_accum_func = \
1789 QuantizedDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER>; \
1790 }
1791
1792#ifdef USE_NEON
1793 // We go over our list of kernels in decreasing order of preference
1794 // for the cases where multiple kernels could apply.
1795
1796 // Start with the fastest kernels: AllowStrided=false, fixed input depth.
1797
1808
1809 // Next come the strided kernels: AllowStrided=true, fixed input depth.
1810 // They are a bit less efficient, but allow stride!=1.
1811
1821
1822 // Finally, the kernels allowing a variable input depth;
1823 // these are the least efficient but most general kernels.
1824
1828#endif // USE_NEON
1829
1830 // No matching fast kernel found, use slow fallback.
1831 if (!row_accum_func)
1832 {
1833 row_accum_func = QuantizedDepthwiseConvAccumRowGeneric;
1834 }
1835
1836#undef TFMINI_USE_DEPTHWISECONV_KERNEL
1837
1838 const int input_height_stride = input_shape.Dims(3) * input_shape.Dims(2);
1839 const int input_batch_stride = input_height_stride * input_shape.Dims(1);
1840 const int filter_height_stride = filter_shape.Dims(3) * filter_shape.Dims(2);
1841
1842 // Now that we have determined row_accum_func, we can start work.
1843 int batch_start = 0;
1844 int batch_end = batches;
1845 int row_start = 0;
1846 int row_end = output_rows;
1847 int output_ptr_offset = 0;
1848
1849 switch (thread_dim)
1850 {
1851 case 0:
1852 assert(thread_start >= 0);
1853 assert(thread_end <= batches);
1854 batch_start = thread_start;
1855 batch_end = thread_end;
1856 output_ptr_offset = batch_start * FlatSizeSkipDim(output_shape, 0);
1857 break;
1858 case 1:
1859 assert(thread_start >= 0);
1860 assert(thread_end <= output_rows);
1861 row_start = thread_start;
1862 row_end = thread_end;
1863 output_ptr_offset = row_start * output_width * output_depth;
1864 break;
1865 }
1866
1867 int8_t *output_ptr = output_data + output_ptr_offset;
1868 int batch_step = (output_rows + row_start - row_end) * output_width * output_depth;
1869 for (int b = batch_start; b < batch_end; ++b)
1870 {
1871 for (int out_y = row_start; out_y < row_end; ++out_y)
1872 {
1873 const int in_y_origin = (out_y * stride_height) - pad_height;
1874 const int filter_y_start =
1875 std::max(0, (-in_y_origin + dilation_height_factor - 1) / dilation_height_factor);
1876 const int filter_y_end =
1877 std::min(filter_height, (input_height - in_y_origin + dilation_height_factor - 1) /
1878 dilation_height_factor);
1879 for (int out_x_buffer_start = 0; out_x_buffer_start < output_width;
1880 out_x_buffer_start += kOutputPixelsInAccBuffer)
1881 {
1882 const int out_x_buffer_end =
1883 std::min(output_width, out_x_buffer_start + kOutputPixelsInAccBuffer);
1884 // We call a 'pixel' a group of activations that share all but the
1885 // 'depth'/'channel' coordinate. num_output_pixels is the number of
1886 // output pixels that we will accumulate in this loop iteration.
1887 const int num_output_pixels = out_x_buffer_end - out_x_buffer_start;
1888 // Initialize our local accumulator with the bias values, so we don't
1889 // have to add them later.
1890 DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data, acc_buffer);
1891 // Accumulation loop. Most of the time should be spent in here.
1892 for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y)
1893 {
1894 const int in_y = in_y_origin + dilation_height_factor * filter_y;
1895 row_accum_func(stride_width, dilation_width_factor, input_depth, input_width,
1896 input_data + in_y * input_height_stride + b * input_batch_stride,
1897 input_offset, pad_width, depth_multiplier, filter_width,
1898 filter_data + filter_y * filter_height_stride, out_x_buffer_start,
1899 out_x_buffer_end, output_depth, acc_buffer);
1900 }
1901 // Finished accumulating int32_t values. Now we need to convert them to
1902 // the final 8-bit form and store them.
1903 const int num_output_values = output_depth * num_output_pixels;
1904
1905 Quantize(output_multiplier, output_shift, output_depth, num_output_values, output_offset,
1906 output_activation_min, output_activation_max, acc_buffer, output_ptr);
1907
1908 output_ptr += num_output_values;
1909 }
1910 }
1911 output_ptr += batch_step;
1912 }
1913}

References nnfw::cker::DepthwiseConvParams::depth_multiplier, DepthwiseConvInitAccBuffer(), nnfw::cker::DepthwiseConvParams::dilation_height_factor, nnfw::cker::DepthwiseConvParams::dilation_width_factor, nnfw::cker::Shape::Dims(), nnfw::cker::FlatSizeSkipDim(), nnfw::cker::PaddingValues::height, nnfw::cker::DepthwiseConvParams::input_offset, nnfw::cker::MatchingDim(), nnfw::cker::DepthwiseConvParams::output_offset, output_shape, nnfw::cker::DepthwiseConvParams::padding_values, nnfw::cker::Quantize(), nnfw::cker::DepthwiseConvParams::quantized_activation_max, nnfw::cker::DepthwiseConvParams::quantized_activation_min, QuantizedDepthwiseConvAccumRowGeneric(), nnfw::cker::DepthwiseConvParams::stride_height, nnfw::cker::DepthwiseConvParams::stride_width, TFMINI_USE_DEPTHWISECONV_KERNEL, and nnfw::cker::PaddingValues::width.

Referenced by nnfw::cker::optimized_integer_ops::DepthwiseConvWithRounding().
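
The snippet below is a minimal single-threaded usage sketch, not taken from the ONE sources: it fills a DepthwiseConvParams, builds the NHWC shapes, and calls DepthwiseConvGeneral over the whole batch range (thread_dim == 0 with [thread_start, thread_end) == [0, batches)). The include path and the requantization values are placeholders, and it assumes Shape keeps the initializer-list constructor and FlatSize() of the TFLite RuntimeShape it mirrors.

// Minimal single-threaded usage sketch (not from the ONE sources). The include
// path below is an assumption; the documented definitions live in DepthwiseConvInt8.h.
#include "cker/operation/optimized/integer_ops/DepthwiseConvInt8.h"

#include <cstdint>
#include <vector>

void DepthwiseConvGeneralExample()
{
  using namespace nnfw::cker;

  // NHWC tensors: 1x5x5x8 int8 input, 1x3x3x8 filter, depth_multiplier = 1,
  // stride 1 with SAME padding -> 1x5x5x8 output.
  const Shape input_shape{1, 5, 5, 8};
  const Shape filter_shape{1, 3, 3, 8};
  const Shape bias_shape{8};
  const Shape output_shape{1, 5, 5, 8};

  std::vector<int8_t> input(input_shape.FlatSize(), 1);
  std::vector<int8_t> filter(filter_shape.FlatSize(), 1);
  std::vector<int32_t> bias(8, 0);
  std::vector<int8_t> output(output_shape.FlatSize(), 0);

  // One requantization (multiplier, shift) pair per output channel; the values
  // here are placeholders (2^30 in Q31 with shift -1, i.e. a real scale of 0.25).
  std::vector<int32_t> output_multiplier(8, 1 << 30);
  std::vector<int32_t> output_shift(8, -1);

  DepthwiseConvParams params{};
  params.stride_width = 1;
  params.stride_height = 1;
  params.dilation_width_factor = 1;
  params.dilation_height_factor = 1;
  params.padding_values.width = 1;
  params.padding_values.height = 1;
  params.depth_multiplier = 1;
  params.input_offset = 0;  // negated input zero point (TFLite int8 convention)
  params.output_offset = 0; // output zero point
  params.quantized_activation_min = -128;
  params.quantized_activation_max = 127;

  // thread_dim == 0 partitions work over batches, so [0, 1) processes the
  // whole tensor on the calling thread.
  optimized_integer_ops::depthwise_conv::DepthwiseConvGeneral(
    params, output_multiplier.data(), output_shift.data(), input_shape, input.data(),
    filter_shape, filter.data(), bias_shape, bias.data(), output_shape, output.data(),
    /*thread_start=*/0, /*thread_end=*/1, /*thread_dim=*/0);
}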

◆ DepthwiseConvInitAccBuffer()

void nnfw::cker::optimized_integer_ops::depthwise_conv::DepthwiseConvInitAccBuffer ( int  num_output_pixels,
int  output_depth,
const int32_t *  bias_data,
int32_t *  acc_buffer 
)
inline

Definition at line 1649 of file DepthwiseConvInt8.h.

1651{
1652 int i = 0;
1653#ifdef USE_NEON
1654 if (output_depth == 1)
1655 {
1656 const int32x4_t b = vdupq_n_s32(bias_data[0]);
1657 for (; i <= num_output_pixels - 16; i += 16)
1658 {
1659 vst1q_s32(acc_buffer + i + 0, b);
1660 vst1q_s32(acc_buffer + i + 4, b);
1661 vst1q_s32(acc_buffer + i + 8, b);
1662 vst1q_s32(acc_buffer + i + 12, b);
1663 }
1664 for (; i <= num_output_pixels - 4; i += 4)
1665 {
1666 vst1q_s32(acc_buffer + i, b);
1667 }
1668 }
1669 else if (output_depth == 2)
1670 {
1671 int32x4_t b = vdupq_n_s32(bias_data[0]);
1672 b = vsetq_lane_s32(bias_data[1], b, 1);
1673 b = vsetq_lane_s32(bias_data[1], b, 3);
1674 for (; i <= num_output_pixels - 8; i += 8)
1675 {
1676 vst1q_s32(acc_buffer + 2 * i + 0, b);
1677 vst1q_s32(acc_buffer + 2 * i + 4, b);
1678 vst1q_s32(acc_buffer + 2 * i + 8, b);
1679 vst1q_s32(acc_buffer + 2 * i + 12, b);
1680 }
1681 for (; i <= num_output_pixels - 2; i += 2)
1682 {
1683 vst1q_s32(acc_buffer + 2 * i, b);
1684 }
1685 }
1686 else if (output_depth == 4)
1687 {
1688 const int32x4_t b = vld1q_s32(bias_data);
1689 for (; i <= num_output_pixels - 4; i += 4)
1690 {
1691 vst1q_s32(acc_buffer + 4 * i + 0, b);
1692 vst1q_s32(acc_buffer + 4 * i + 4, b);
1693 vst1q_s32(acc_buffer + 4 * i + 8, b);
1694 vst1q_s32(acc_buffer + 4 * i + 12, b);
1695 }
1696 for (; i < num_output_pixels; i++)
1697 {
1698 vst1q_s32(acc_buffer + 4 * i, b);
1699 }
1700 }
1701 else if (output_depth == 8)
1702 {
1703 const int32x4_t b0 = vld1q_s32(bias_data);
1704 const int32x4_t b1 = vld1q_s32(bias_data + 4);
1705 for (; i <= num_output_pixels - 2; i += 2)
1706 {
1707 vst1q_s32(acc_buffer + 8 * i + 0, b0);
1708 vst1q_s32(acc_buffer + 8 * i + 4, b1);
1709 vst1q_s32(acc_buffer + 8 * i + 8, b0);
1710 vst1q_s32(acc_buffer + 8 * i + 12, b1);
1711 }
1712 for (; i < num_output_pixels; i++)
1713 {
1714 vst1q_s32(acc_buffer + 8 * i + 0, b0);
1715 vst1q_s32(acc_buffer + 8 * i + 4, b1);
1716 }
1717 }
1718 else if (output_depth == 16)
1719 {
1720 const int32x4_t b0 = vld1q_s32(bias_data);
1721 const int32x4_t b1 = vld1q_s32(bias_data + 4);
1722 const int32x4_t b2 = vld1q_s32(bias_data + 8);
1723 const int32x4_t b3 = vld1q_s32(bias_data + 12);
1724 for (; i < num_output_pixels; i++)
1725 {
1726 vst1q_s32(acc_buffer + 16 * i + 0, b0);
1727 vst1q_s32(acc_buffer + 16 * i + 4, b1);
1728 vst1q_s32(acc_buffer + 16 * i + 8, b2);
1729 vst1q_s32(acc_buffer + 16 * i + 12, b3);
1730 }
1731 }
1732#endif
1733 for (; i < num_output_pixels; i++)
1734 {
1735 memcpy(acc_buffer + i * output_depth, bias_data, sizeof(acc_buffer[0]) * output_depth);
1736 }
1737}

Referenced by DepthwiseConvGeneral().
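
DepthwiseConvInitAccBuffer tiles the per-channel bias across num_output_pixels consecutive pixels of acc_buffer (laid out as [pixel][channel]); the NEON branches are fast paths for output depths 1, 2, 4, 8 and 16, and the trailing memcpy loop covers leftover pixels and every other depth. A scalar reference sketch of that behaviour (the helper and example names are illustrative, not part of the API):

#include <cstdint>
#include <cstring>
#include <vector>

// Scalar reference for what the initialization produces: acc_buffer is laid out
// as [pixel][channel] and every pixel starts from the per-channel bias.
void InitAccBufferReference(int num_output_pixels, int output_depth,
                            const int32_t *bias_data, int32_t *acc_buffer)
{
  for (int p = 0; p < num_output_pixels; ++p)
  {
    std::memcpy(acc_buffer + p * output_depth, bias_data, sizeof(int32_t) * output_depth);
  }
}

// Example: 3 pixels x 4 channels -> acc == {1,2,3,4, 1,2,3,4, 1,2,3,4}.
void InitAccBufferExample()
{
  const int32_t bias[4] = {1, 2, 3, 4};
  std::vector<int32_t> acc(3 * 4);
  InitAccBufferReference(3, 4, bias, acc.data());
}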

◆ QuantizedDepthwiseConvAccumRow()

template<bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
void nnfw::cker::optimized_integer_ops::depthwise_conv::QuantizedDepthwiseConvAccumRow ( int  stride,
int  dilation_factor,
int  input_depth,
int  input_width,
const int8_t *  input_data,
int16_t  input_offset,
int  pad_width,
int  depth_multiplier,
int  filter_width,
const int8_t *  filter_data,
int  out_x_buffer_start,
int  out_x_buffer_end,
int  output_depth,
int32_t *  acc_buffer 
)

Definition at line 1539 of file DepthwiseConvInt8.h.

1544{
1545 // Consistency check parameters. This is important in particular to ensure
1546 // that we keep the number of template instantiations minimal, so we don't
1547 // increase binary size unnecessarily.
1548 static_assert(kFixedDepthMultiplier || !kFixedInputDepth, "");
1549 static_assert(kFixedInputDepth || kAllowStrided, "");
1550 assert(stride == 1 || kAllowStrided);
1551 if (kFixedInputDepth)
1552 {
1553 assert(input_depth == kFixedInputDepth);
1554 }
1555 if (kFixedDepthMultiplier)
1556 {
1557 assert(depth_multiplier == kFixedDepthMultiplier);
1558 }
1559 assert(output_depth == input_depth * depth_multiplier);
1560 const int input_ptr_increment = stride * input_depth;
1561 const int8_t *filter_base_ptr = filter_data;
1562 for (int filter_x = 0; filter_x < filter_width; ++filter_x)
1563 {
1564 // For the current (filter_x, filter_y) point in the filter,
1565 // compute the boundaries of the corresponding output row segment.
1566 int out_x_loop_start_unclamped = 0;
1567 int out_x_loop_end_unclamped = 0;
1568 if (kAllowStrided)
1569 {
1570 if (stride == 2)
1571 {
1572 out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + 1) / 2;
1573 out_x_loop_end_unclamped = (pad_width + input_width - dilation_factor * filter_x + 1) / 2;
1574 }
1575 else if (stride == 4)
1576 {
1577 out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + 3) / 4;
1578 out_x_loop_end_unclamped = (pad_width + input_width - dilation_factor * filter_x + 3) / 4;
1579 }
1580 else
1581 {
1582 out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + stride - 1) / stride;
1583 out_x_loop_end_unclamped =
1584 (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride;
1585 }
1586 }
1587 else
1588 {
1589 out_x_loop_start_unclamped = pad_width - dilation_factor * filter_x;
1590 out_x_loop_end_unclamped = pad_width + input_width - dilation_factor * filter_x;
1591 }
1592 // The kernel will have to iterate on the segment of the
1593 // output row that starts at out_x_loop_start and ends at out_x_loop_end.
1594 const int out_x_loop_start = std::max(out_x_buffer_start, out_x_loop_start_unclamped);
1595 const int out_x_loop_end = std::min(out_x_buffer_end, out_x_loop_end_unclamped);
1596
1597 int32_t *acc_buffer_ptr = acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
1598 const int in_x_origin = (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x;
1599 const int8_t *input_ptr = input_data + in_x_origin * input_depth;
1600 const int num_output_pixels = out_x_loop_end - out_x_loop_start;
1601 QuantizedDepthwiseConvKernel<kAllowStrided, kFixedInputDepth, kFixedDepthMultiplier>::Run(
1602 num_output_pixels, input_depth, depth_multiplier, input_ptr, input_offset,
1603 input_ptr_increment, filter_base_ptr, acc_buffer_ptr);
1604 filter_base_ptr += output_depth;
1605 }
1606}
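
For each filter column, QuantizedDepthwiseConvAccumRow clamps the output-x range so that the sampled input column in_x = out_x * stride - pad_width + dilation_factor * filter_x stays inside [0, input_width); the stride == 2 and stride == 4 branches are just constant-folded versions of the general ceiling division. A small sketch of that boundary computation (helper names are illustrative, not part of the API):

#include <algorithm>
#include <cassert>

// Illustrative helper: the [start, end) range of out_x for one filter tap,
// mirroring the unclamped bounds computed in QuantizedDepthwiseConvAccumRow.
struct OutXRange
{
  int start;
  int end;
};

OutXRange ValidOutXRange(int stride, int dilation_factor, int filter_x, int pad_width,
                         int input_width, int out_x_buffer_start, int out_x_buffer_end)
{
  // Ceiling division by stride: the first out_x whose input column is >= 0, and
  // the first out_x whose input column is >= input_width.
  const int start_unclamped = (pad_width - dilation_factor * filter_x + stride - 1) / stride;
  const int end_unclamped =
    (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride;
  return {std::max(out_x_buffer_start, start_unclamped),
          std::min(out_x_buffer_end, end_unclamped)};
}

// Example: stride 2, dilation 1, pad 1, input_width 5, buffer covering out_x in [0, 3).
// For filter_x == 0, out_x == 0 would read input column -1, so the range starts at 1.
void ValidOutXRangeExample()
{
  const OutXRange r = ValidOutXRange(2, 1, /*filter_x=*/0, /*pad_width=*/1,
                                     /*input_width=*/5, /*out_x_buffer_start=*/0,
                                     /*out_x_buffer_end=*/3);
  assert(r.start == 1 && r.end == 3);
  (void)r;
}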

◆ QuantizedDepthwiseConvAccumRowGeneric()

void nnfw::cker::optimized_integer_ops::depthwise_conv::QuantizedDepthwiseConvAccumRowGeneric ( int  stride,
int  dilation_factor,
int  input_depth,
int  input_width,
const int8_t *  input_data,
int16_t  input_offset,
int  pad_width,
int  depth_multiplier,
int  filter_width,
const int8_t *  filter_data,
int  out_x_buffer_start,
int  out_x_buffer_end,
int  output_depth,
int32_t *  acc_buffer 
)
inline

Definition at line 1609 of file DepthwiseConvInt8.h.

1616{
1617 const int8_t *filter_base_ptr = filter_data;
1618 for (int filter_x = 0; filter_x < filter_width; ++filter_x)
1619 {
1620 const int out_x_loop_start =
1621 std::max(out_x_buffer_start, (pad_width - dilation_factor * filter_x + stride - 1) / stride);
1622 const int out_x_loop_end =
1623 std::min(out_x_buffer_end,
1624 (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride);
1625
1626 int32_t *acc_buffer_ptr = acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
1627 const int in_x_origin = (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x;
1628 const int8_t *input_ptr = input_data + in_x_origin * input_depth;
1629 const int input_ptr_increment = (stride - 1) * input_depth;
1630 for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++)
1631 {
1632 const int8_t *filter_ptr = filter_base_ptr;
1633 for (int ic = 0; ic < input_depth; ++ic)
1634 {
1635 const int16_t input_val = *input_ptr++ + input_offset;
1636 for (int m = 0; m < depth_multiplier; m++)
1637 {
1638 const int16_t filter_val = *filter_ptr++;
1639 *acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val;
1640 }
1641 }
1642 input_ptr += input_ptr_increment;
1643 }
1644 filter_base_ptr += output_depth;
1645 }
1646}


Referenced by DepthwiseConvGeneral().
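
The generic fallback supports any input depth, depth multiplier, stride and dilation: for every valid out_x it accumulates filter * (input + input_offset) into int32_t, with input channel ic feeding the depth_multiplier consecutive output channels starting at ic * depth_multiplier. A scalar sketch of that inner step for a single output pixel (the helper name is illustrative, not part of the API):

#include <cstdint>

// Illustrative helper: the per-pixel accumulation performed by the generic
// fallback. Input channel ic contributes to output channels
// [ic * depth_multiplier, (ic + 1) * depth_multiplier).
void AccumulateOnePixel(int input_depth, int depth_multiplier, int16_t input_offset,
                        const int8_t *input_ptr, const int8_t *filter_ptr,
                        int32_t *acc_buffer_ptr)
{
  for (int ic = 0; ic < input_depth; ++ic)
  {
    // Add the (negated) zero point before multiplying, as the fallback does.
    const int16_t input_val = static_cast<int16_t>(input_ptr[ic]) + input_offset;
    for (int m = 0; m < depth_multiplier; ++m)
    {
      const int16_t filter_val = filter_ptr[ic * depth_multiplier + m];
      acc_buffer_ptr[ic * depth_multiplier + m] += static_cast<int32_t>(filter_val) * input_val;
    }
  }
}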