ONE - On-device Neural Engine
Loading...
Searching...
No Matches
nnfw::cker::depthwise_conv_op Namespace Reference

Namespaces

namespace  functor
 

Data Structures

struct  DepthwiseConv2DKernel
 
struct  LaunchDepthwiseConvBackpropFilterOp
 
struct  LaunchDepthwiseConvBackpropFilterOp< CPUDevice, T >
 
struct  LaunchDepthwiseConvBackpropInputOp
 
struct  LaunchDepthwiseConvBackpropInputOp< CPUDevice, T >
 
struct  LaunchDepthwiseConvOp
 
struct  LaunchDepthwiseConvOp< CPUDevice, T >
 

Typedefs

using CPUDevice = Eigen::ThreadPoolDevice
 

Functions

template<typename T >
void CopyOutputBackpropRegion (int, int, int, int, int filter_rows_, int filter_cols_, int, int stride_, int pad_rows_, int pad_cols_, int out_rows_, int out_cols_, int out_depth, const int64_t padded_filter_inner_dim_size, const int64_t in_r, const int64_t in_c, const T *out_backprop, T *buffer)
 
template<typename T >
void ComputeBackpropInput (int, int, int in_cols, int in_depth_, int filter_rows, int filter_cols, int depth_multiplier_, int, int, int, int, int, int out_depth_, const int64_t padded_filter_inner_dim_size, const int64_t in_r, const int64_t in_c, const T *filter, const T *buffer, T *out_buffer, T *output)
 
template<typename T >
void DepthwiseConvBackpropInputReference (int batch, int in_rows, int in_cols, int in_depth, int out_rows, int out_cols, int out_depth, int stride, int depth_multiplier, int filter_rows, int filter_cols, int pad_rows, int pad_cols, const T *out_backprop, const T *filter, T *in_backprop)
 
template<typename T >
void ComputeBackpropFilter (int, int, int, int, int filter_rows, int filter_cols, int, int, int, int, int out_rows, int out_cols, int out_depth_, const int64_t padded_out_depth_size, const int64_t out_r, const int64_t out_c, const T *out_backprop, const T *input_buffer, T *output_buffer)
 
template<typename T >
void DepthwiseConvBackpropFilterReference (int batch, int in_rows, int in_cols, int in_depth, int out_rows, int out_cols, int out_depth, int stride, int depth_multiplier, int filter_rows, int filter_cols, int pad_rows, int pad_cols, const T *out_backprop, const T *input, T *filter_backprop)
 

Typedef Documentation

◆ CPUDevice

using nnfw::cker::depthwise_conv_op::CPUDevice = typedef Eigen::ThreadPoolDevice

Definition at line 338 of file depthwise_conv_op.h.

Function Documentation

◆ ComputeBackpropFilter()

template<typename T >
void nnfw::cker::depthwise_conv_op::ComputeBackpropFilter ( int  ,
int  ,
int  ,
int  ,
int  filter_rows,
int  filter_cols,
int  ,
int  ,
int  ,
int  ,
int  out_rows,
int  out_cols,
int  out_depth_,
const int64_t  padded_out_depth_size,
const int64_t  out_r,
const int64_t  out_c,
const T *  out_backprop,
const T *  input_buffer,
T *  output_buffer 
)

Definition at line 898 of file depthwise_conv_op.h.

903{
904 typedef typename Eigen::internal::packet_traits<T>::type Packet;
905 static const int64_t kPacketSize = (sizeof(Packet) / sizeof(T));
906 // Calculate vectorized size of 'padded_out_depth_size'.
907 const int64_t out_depth = out_depth_;
908 const int64_t filter_spatial_size = filter_rows * filter_cols;
909 const int64_t output_vectorized_size = (padded_out_depth_size / kPacketSize) * kPacketSize;
910 const int64_t base_output_index = (out_r * out_cols + out_c) * out_depth;
911 // Determine whether we can execute fast or slow code path.
912 const int64_t output_image_size = out_rows * out_cols * out_depth;
913 const int64_t output_last_vector_index =
914 output_image_size - (filter_spatial_size * padded_out_depth_size);
915 const bool fast_path = base_output_index <= output_last_vector_index;
916
917 if (fast_path)
918 {
919 // TODO(andydavis) Process multiple inputs in 'input_buffer' so we can
920 // amortize the cost of 'output_buffer' load store in the loop below.
921 for (int i = 0; i < output_vectorized_size; i += kPacketSize)
922 {
923 // Load vector register from 'out_backprop'.
924 const auto out_bprop_block =
925 Eigen::internal::ploadu<Packet>(out_backprop + base_output_index + i);
926 for (int j = 0; j < filter_spatial_size; ++j)
927 {
928 const int64_t index = i + j * padded_out_depth_size;
929 // Load vector register from 'input_buffer'.
930 const auto input_block = Eigen::internal::ploadu<Packet>(input_buffer + index);
931 // Load output block into vector register.
932 auto out_block_data = output_buffer + index;
933 auto out_block = Eigen::internal::ploadu<Packet>(out_block_data);
934 // Vector multiply-add.
935 out_block = Eigen::internal::pmadd<Packet>(out_bprop_block, input_block, out_block);
936 // Store 'out_block' back to memory.
937 Eigen::internal::pstoreu<T>(out_block_data, out_block);
938 }
939 }
940 }
941 else
942 {
943 // Slow path (can't do vector reads from non-padded 'out_backprop').
944 for (int i = 0; i < output_vectorized_size; i += kPacketSize)
945 {
946 // Calculate safe read size from 'out_backprop'.
947 const int64_t out_bprop_index = base_output_index + i;
948 const int64_t out_bprop_limit = std::min(output_image_size, out_bprop_index + kPacketSize);
949 T out_buf[kPacketSize];
950 memset(&out_buf, 0, kPacketSize * sizeof(T));
951 const int64_t scalar_size = out_bprop_limit - out_bprop_index;
952 for (int64_t j = 0; j < scalar_size; ++j)
953 {
954 out_buf[j] = out_backprop[out_bprop_index + j];
955 }
956 // Load vector register from 'out_buf'.
957 const auto out_bprop_block = Eigen::internal::ploadu<Packet>(out_buf);
958 for (int j = 0; j < filter_spatial_size; ++j)
959 {
960 const int64_t index = i + j * padded_out_depth_size;
961 // Load vector register from 'input_buffer'.
962 const auto input_block = Eigen::internal::ploadu<Packet>(input_buffer + index);
963 // Load output block into vector register.
964 auto out_block_data = output_buffer + index;
965 auto out_block = Eigen::internal::ploadu<Packet>(out_block_data);
966 // Vector multiply-add.
967 out_block = Eigen::internal::pmadd<Packet>(out_bprop_block, input_block, out_block);
968 // Store 'out_block' back to memory.
969 Eigen::internal::pstoreu<T>(out_block_data, out_block);
970 }
971 }
972 }
973}
loco::GraphInputIndex index(const TFPlaceholder *node)
Definition TFNode.cpp:54

Referenced by nnfw::cker::depthwise_conv_op::LaunchDepthwiseConvBackpropFilterOp< CPUDevice, T >::operator()().

◆ ComputeBackpropInput()

template<typename T >
void nnfw::cker::depthwise_conv_op::ComputeBackpropInput ( int  ,
int  ,
int  in_cols,
int  in_depth_,
int  filter_rows,
int  filter_cols,
int  depth_multiplier_,
int  ,
int  ,
int  ,
int  ,
int  ,
int  out_depth_,
const int64_t  padded_filter_inner_dim_size,
const int64_t  in_r,
const int64_t  in_c,
const T *  filter,
const T *  buffer,
T *  out_buffer,
T *  output 
)

Definition at line 636 of file depthwise_conv_op.h.

641{
642 typedef typename Eigen::internal::packet_traits<T>::type Packet;
643 static const int64_t kPacketSize = (sizeof(Packet) / sizeof(T));
644
645 const int64_t in_depth = in_depth_;
646 const int64_t depth_multiplier = depth_multiplier_;
647 const int64_t out_depth = out_depth_;
648 const int64_t filter_spatial_size = filter_rows * filter_cols;
649
650 // Calculate vectorized and scalar lengths of 'out_depth'.
651 const int64_t output_vectorized_size = (out_depth / kPacketSize) * kPacketSize;
652 const int64_t output_scalar_size = out_depth % kPacketSize;
653
654 // Calculate base index at which to begin writing output.
655 const int64_t base_output_index = (in_r * in_cols + in_c) * in_depth;
656
657 // Calculate vectorized and scalar lengths for 'depth_multiplier'. This is
658 // used to efficiently reduce output when 'depth_multiplier' > kPacketSize.
659 const int64_t dm_vectorized_size = (depth_multiplier / kPacketSize) * kPacketSize;
660 const int64_t dm_scalar_size = depth_multiplier % kPacketSize;
661
662 for (int i = 0; i < output_vectorized_size; i += kPacketSize)
663 {
664 // Reset accumulator.
665 auto vaccum = Eigen::internal::pset1<Packet>(static_cast<T>(0));
666 for (int j = 0; j < filter_spatial_size; ++j)
667 {
668 // Calculate index.
669 const int64_t index = i + j * padded_filter_inner_dim_size;
670 // Load filter.
671 const auto filter_block = Eigen::internal::ploadu<Packet>(filter + index);
672 // Load input.
673 const auto data_block = Eigen::internal::ploadu<Packet>(buffer + index);
674 // Vector multiply-add.
675 vaccum = Eigen::internal::pmadd<Packet>(filter_block, data_block, vaccum);
676 }
677 if (depth_multiplier == 1)
678 {
679 // Write directly to the output.
680 Eigen::internal::pstoreu<T>(output + base_output_index + i, vaccum);
681 }
682 else
683 {
684 // Buffer output for subsequent reduction step.
685 Eigen::internal::pstoreu<T>(out_buffer + i, vaccum);
686 }
687 }
688
689 if (output_scalar_size > 0)
690 {
691 auto vaccum = Eigen::internal::pset1<Packet>(static_cast<T>(0));
692 for (int j = 0; j < filter_spatial_size; ++j)
693 {
694 const int64_t index = output_vectorized_size + j * padded_filter_inner_dim_size;
695 const auto filter_block = Eigen::internal::ploadu<Packet>(filter + index);
696 const auto data_block = Eigen::internal::ploadu<Packet>(buffer + index);
697 vaccum = Eigen::internal::pmadd<Packet>(filter_block, data_block, vaccum);
698 }
699 // Load accumulator into an array and loop through output.
700 T out_buf[kPacketSize];
701 Eigen::internal::pstoreu<T>(out_buf, vaccum);
702 if (depth_multiplier == 1)
703 {
704 // Write directly to the output.
705 for (int j = 0; j < output_scalar_size; ++j)
706 {
707 output[base_output_index + output_vectorized_size + j] = out_buf[j];
708 }
709 }
710 else
711 {
712 // Buffer output for subsequent reduction step.
713 for (int j = 0; j < output_scalar_size; ++j)
714 {
715 out_buffer[output_vectorized_size + j] = out_buf[j];
716 }
717 }
718 }
719
720 // Iterate over 'in_depth', reduce over 'depth_multiplier', write 'output'.
721 if (depth_multiplier > 1)
722 {
723 for (int64_t d = 0; d < in_depth; ++d)
724 {
725 const int64_t index = d * depth_multiplier;
726 T accum = static_cast<T>(0);
727 for (int64_t dm = 0; dm < dm_vectorized_size; dm += kPacketSize)
728 {
729 const auto v = Eigen::internal::ploadu<Packet>(out_buffer + index + dm);
730 accum += Eigen::internal::predux(v);
731 }
732 // Copy scalar portion of replicated output.
733 for (int64_t dm = 0; dm < dm_scalar_size; ++dm)
734 {
735 accum += out_buffer[index + dm_vectorized_size + dm];
736 }
737 // Copy to output.
738 output[base_output_index + d] = accum;
739 }
740 }
741}

◆ CopyOutputBackpropRegion()

template<typename T >
void nnfw::cker::depthwise_conv_op::CopyOutputBackpropRegion ( int  ,
int  ,
int  ,
int  ,
int  filter_rows_,
int  filter_cols_,
int  ,
int  stride_,
int  pad_rows_,
int  pad_cols_,
int  out_rows_,
int  out_cols_,
int  out_depth,
const int64_t  padded_filter_inner_dim_size,
const int64_t  in_r,
const int64_t  in_c,
const T *  out_backprop,
T *  buffer 
)

Definition at line 537 of file depthwise_conv_op.h.

542{
543 typedef typename Eigen::internal::packet_traits<T>::type Packet;
544 static const int64_t kPacketSize = (sizeof(Packet) / sizeof(T));
545
546 const int64_t stride = stride_;
547 const int64_t filter_rows = filter_rows_;
548 const int64_t filter_cols = filter_cols_;
549 const int64_t pad_rows = pad_rows_;
550 const int64_t pad_cols = pad_cols_;
551 const int64_t out_rows = out_rows_;
552 const int64_t out_cols = out_cols_;
553
554 // Calculate the output spatial region which uses point (in_r, in_c) as input.
555 const int64_t out_r_start =
556 std::max(static_cast<int64_t>(0), (in_r - filter_rows + pad_rows + stride) / stride);
557 const int64_t out_r_end = std::min(out_rows - 1, (in_r + pad_rows) / stride);
558 const int64_t out_c_start =
559 std::max(static_cast<int64_t>(0), (in_c - filter_cols + pad_cols + stride) / stride);
560 const int64_t out_c_end = std::min(out_cols - 1, (in_c + pad_cols) / stride);
561
562 // Zero-pad 'buffer' if output region is smaller than filter spatial size.
563 const int64_t filter_spatial_size = filter_rows * filter_cols;
564 if ((out_r_end - out_r_start + 1) < filter_rows || (out_c_end - out_c_start + 1) < filter_cols)
565 {
566 memset(buffer, 0, filter_spatial_size * padded_filter_inner_dim_size * sizeof(T));
567 }
568
569 // Calculate vectorized and scalar (residual) lengths for 'out_depth'.
570 const int64_t vectorized_size = (out_depth / kPacketSize) * kPacketSize;
571 const int64_t scalar_size = out_depth % kPacketSize;
572 const int64_t pad_size = scalar_size > 0 ? kPacketSize - scalar_size : 0;
573
574 for (int out_r = out_r_start; out_r <= out_r_end; ++out_r)
575 {
576 const int64_t f_r = in_r + pad_rows - out_r * stride;
577 for (int out_c = out_c_start; out_c <= out_c_end; ++out_c)
578 {
579 const int64_t f_c = in_c + pad_cols - out_c * stride;
580 const int64_t buf_base = (f_r * filter_cols + f_c) * padded_filter_inner_dim_size;
581 // Calculate index into 'out_backprop' for coordinate (out_r, out_c).
582 auto *out_bprop = out_backprop + (out_r * out_cols + out_c) * out_depth;
583
584 // Copy vectorized portion of inner dimension into 'buffer'.
585 for (int64_t d = 0; d < vectorized_size; d += kPacketSize)
586 {
587 auto v = Eigen::internal::ploadu<Packet>(out_bprop + d);
588 Eigen::internal::pstoreu<T>(buffer + buf_base + d, v);
589 }
590 // Copy scalar portion of out_bprop to 'buffer'
591 for (int64_t d = 0; d < scalar_size; ++d)
592 {
593 buffer[buf_base + vectorized_size + d] = out_bprop[vectorized_size + d];
594 }
595 // Pad to vector-register width (if needed).
596 for (int64_t d = 0; d < pad_size; ++d)
597 {
598 buffer[buf_base + vectorized_size + scalar_size + d] = static_cast<T>(0);
599 }
600 }
601 }
602}

◆ DepthwiseConvBackpropFilterReference()

template<typename T >
void nnfw::cker::depthwise_conv_op::DepthwiseConvBackpropFilterReference ( int  batch,
int  in_rows,
int  in_cols,
int  in_depth,
int  out_rows,
int  out_cols,
int  out_depth,
int  stride,
int  depth_multiplier,
int  filter_rows,
int  filter_cols,
int  pad_rows,
int  pad_cols,
const T *  out_backprop,
const T *  input,
T *  filter_backprop 
)

Definition at line 1085 of file depthwise_conv_op.h.

1090{
1091 int num_filter_backprop = filter_rows * filter_cols * in_depth * depth_multiplier;
1092 memset(filter_backprop, 0, num_filter_backprop * sizeof(T));
1093 // Naive for loop as a reference point without concerns about performance.
1094 for (int b = 0; b < batch; ++b)
1095 {
1096 for (int out_r = 0; out_r < out_rows; ++out_r)
1097 {
1098 for (int out_c = 0; out_c < out_cols; ++out_c)
1099 {
1100 for (int out_d = 0; out_d < out_depth; ++out_d)
1101 {
1102 const int in_d = out_d / depth_multiplier;
1103 const int dm = out_d % depth_multiplier;
1104 const int in_r_start = out_r * stride - pad_rows;
1105 const int in_c_start = out_c * stride - pad_cols;
1106
1107 for (int f_r = 0; f_r < filter_rows; ++f_r)
1108 {
1109 for (int f_c = 0; f_c < filter_cols; ++f_c)
1110 {
1111 const int in_r = in_r_start + f_r;
1112 const int in_c = in_c_start + f_c;
1113
1114 if (in_r >= 0 && in_r < in_rows && in_c >= 0 && in_c < in_cols)
1115 {
1116 int out_backprop_offset =
1117 out_d + out_depth * (out_c + out_cols * (out_r + out_rows * b));
1118 int input_offset = in_d + in_depth * (in_c + in_cols * (in_r + in_rows * b));
1119 int filter_backprop_offset =
1120 dm + depth_multiplier * (in_d + in_depth * (f_c + filter_cols * f_r));
1121 filter_backprop[filter_backprop_offset] +=
1122 input[input_offset] * out_backprop[out_backprop_offset];
1123 }
1124 }
1125 }
1126 }
1127 }
1128 }
1129 }
1130}

◆ DepthwiseConvBackpropInputReference()

template<typename T >
void nnfw::cker::depthwise_conv_op::DepthwiseConvBackpropInputReference ( int  batch,
int  in_rows,
int  in_cols,
int  in_depth,
int  out_rows,
int  out_cols,
int  out_depth,
int  stride,
int  depth_multiplier,
int  filter_rows,
int  filter_cols,
int  pad_rows,
int  pad_cols,
const T *  out_backprop,
const T *  filter,
T *  in_backprop 
)

Definition at line 820 of file depthwise_conv_op.h.

825{
826 // Naive for loop as a reference point without concerns about performance.
827 for (int b = 0; b < batch; ++b)
828 {
829 for (int in_r = 0; in_r < in_rows; ++in_r)
830 {
831 for (int in_c = 0; in_c < in_cols; ++in_c)
832 {
833 for (int in_d = 0; in_d < in_depth; ++in_d)
834 {
835 T sum = 0;
836 const int out_d_start = in_d * depth_multiplier;
837 const int out_d_end = out_d_start + depth_multiplier;
838
839 for (int out_d = out_d_start; out_d < out_d_end; ++out_d)
840 {
841 const int out_r_start = std::max(0, (in_r - filter_rows + pad_rows + stride) / stride);
842 const int out_r_end = std::min(out_rows - 1, (in_r + pad_rows) / stride);
843
844 for (int out_r = out_r_start; out_r <= out_r_end; ++out_r)
845 {
846 const int out_c_start =
847 std::max(0, (in_c - filter_cols + pad_cols + stride) / stride);
848 const int out_c_end = std::min(out_cols - 1, (in_c + pad_cols) / stride);
849
850 for (int out_c = out_c_start; out_c <= out_c_end; ++out_c)
851 {
852 int f_r = in_r + pad_rows - out_r * stride;
853 int f_c = in_c + pad_cols - out_c * stride;
854 int filter_dm = out_d - out_d_start;
855 int out_backprop_offset =
856 out_d + out_depth * (out_c + out_cols * (out_r + out_rows * b));
857 int filter_offset =
858 filter_dm + depth_multiplier * (in_d + in_depth * (f_c + filter_cols * f_r));
859 sum += out_backprop[out_backprop_offset] * filter[filter_offset];
860 }
861 }
862 }
863
864 int in_backprop_offset = in_d + in_depth * (in_c + in_cols * (in_r + in_rows * b));
865 in_backprop[in_backprop_offset] = sum;
866 }
867 }
868 }
869 }
870}