18#ifndef __NNFW_CKER_EIGEN_DEPTHWISE_CONV_OP_H__
19#define __NNFW_CKER_EIGEN_DEPTHWISE_CONV_OP_H__
22#define EIGEN_USE_THREADS
25#include "unsupported/Eigen/CXX11/Tensor"
33namespace depthwise_conv_op
38 void operator()(
int batch,
int in_rows,
int in_cols,
int in_depth,
int filter_rows,
39 int filter_cols,
int depth_multiplier,
int stride,
int pad_rows,
int pad_cols,
40 int out_rows,
int out_cols,
int out_depth,
const T *input,
41 const T *depthwise_filter, T *padded_filter_data,
bool pad_filter, T *in_buf,
47 void operator()(
int batch,
int in_rows,
int in_cols,
int in_depth,
int filter_rows,
48 int filter_cols,
int depth_multiplier,
int stride,
int pad_rows,
int pad_cols,
49 int out_rows,
int out_cols,
int out_depth,
const T *out_backprop,
const T *filter,
55 void operator()(
int batch,
int in_rows,
int in_cols,
int in_depth,
int filter_rows,
56 int filter_cols,
int depth_multiplier,
int stride,
int pad_rows,
int pad_cols,
57 int out_rows,
int out_cols,
int out_depth,
const T *out_backprop,
const T *input,
84 void operator()(
int,
int,
int,
int,
int filter_rows,
int filter_cols,
int,
int,
int,
int,
int,
85 int,
int out_depth,
const T *filter, T *padded_filter)
87 typedef typename Eigen::internal::packet_traits<T>::type Packet;
88 static const int64_t kPacketSize = (
sizeof(Packet) /
sizeof(T));
91 const int64_t filter_inner_dim_size = out_depth;
92 const int64_t vectorized_size = (filter_inner_dim_size / kPacketSize) * kPacketSize;
93 const int64_t scalar_size = filter_inner_dim_size - vectorized_size;
95 const int64_t pad_size = scalar_size > 0 ? kPacketSize - scalar_size : 0;
96 const int64_t padded_filter_stride = vectorized_size + kPacketSize;
98 const int64_t filter_spatial_size = filter_rows * filter_cols;
99 for (int64_t i = 0; i < filter_spatial_size; ++i)
101 const int64_t input_base = i * filter_inner_dim_size;
102 const int64_t output_base = i * padded_filter_stride;
104 for (int64_t j = 0; j < vectorized_size; j += kPacketSize)
106 const auto v = Eigen::internal::ploadu<Packet>(filter + input_base + j);
107 Eigen::internal::pstoreu<T>(padded_filter + output_base + j, v);
110 for (int64_t j = 0; j < scalar_size; ++j)
112 padded_filter[output_base + vectorized_size + j] = filter[input_base + vectorized_size + j];
115 for (int64_t j = 0; j < pad_size; ++j)
117 padded_filter[output_base + vectorized_size + scalar_size + j] =
static_cast<T
>(0);
147 void operator()(
int,
int in_rows,
int in_cols,
int in_depth,
int filter_rows,
int filter_cols,
148 int depth_multiplier,
int stride,
int pad_rows,
int pad_cols,
int,
int,
149 int out_depth,
const int64_t padded_filter_inner_dim_size,
const int64_t out_r,
150 const int64_t out_c,
const T *input, T *input_buffer)
152 typedef typename Eigen::internal::packet_traits<T>::type Packet;
153 static const int64_t kPacketSize = Eigen::internal::packet_traits<T>::size;
155 const int64_t kDepth = depth_multiplier;
157 const int64_t input_vectorized_size = (in_depth / kPacketSize) * kPacketSize;
158 const int64_t input_scalar_size = in_depth - input_vectorized_size;
161 const int64_t output_scalar_size = out_depth % kPacketSize;
162 const int64_t output_pad_size = output_scalar_size > 0 ? kPacketSize - output_scalar_size : 0;
167 auto *in_buf = input_buffer;
168 const int64_t in_r_start = out_r * stride - pad_rows;
169 const int64_t in_c_start = out_c * stride - pad_cols;
172 if (kDepth > 1 && kDepth <= kPacketSize)
174 for (int64_t f_r = 0; f_r < filter_rows; ++f_r)
176 const int64_t in_r = in_r_start + f_r;
178 for (int64_t f_c = 0; f_c < filter_cols; ++f_c)
180 const int64_t in_c = in_c_start + f_c;
182 if (in_r >= 0 && in_r < in_rows && in_c >= 0 && in_c < in_cols)
184 const auto *in = input + (in_r * in_cols + in_c) * in_depth;
185 int64_t limit = in_depth;
189 if (f_c == filter_cols - 1)
191 limit -= (kPacketSize - kDepth) / kDepth + 1;
198 for (int64_t d = 0; d < limit; d++)
200 const auto p = Eigen::internal::pset1<Packet>(in[d]);
201 Eigen::internal::pstoreu<T>(in_buf, p);
206 for (int64_t d = limit; d < in_depth; d++)
208 const auto value = in[d];
209 for (int64_t dm = 0; dm < kDepth; dm++)
217 for (int64_t d = 0; d < output_pad_size; ++d)
219 in_buf[d] =
static_cast<T
>(0);
221 in_buf += output_pad_size;
226 memset(in_buf, 0,
sizeof(T) * padded_filter_inner_dim_size);
227 in_buf += padded_filter_inner_dim_size;
232 else if (kDepth > kPacketSize)
237 const int64_t dm_vectorized_size = (kDepth / kPacketSize) * kPacketSize;
239 for (int64_t f_r = 0; f_r < filter_rows; ++f_r)
241 const int64_t in_r = in_r_start + f_r;
243 for (int64_t f_c = 0; f_c < filter_cols; ++f_c)
245 const int64_t in_c = in_c_start + f_c;
247 if (in_r >= 0 && in_r < in_rows && in_c >= 0 && in_c < in_cols)
249 const auto *in = input + (in_r * in_cols + in_c) * in_depth;
251 for (int64_t d = 0; d < in_depth; d++)
253 const auto p = Eigen::internal::pset1<Packet>(in[d]);
254 for (int64_t dm = 0; dm < dm_vectorized_size; dm += kPacketSize)
256 Eigen::internal::pstoreu<T>(in_buf + dm, p);
259 Eigen::internal::pstoreu<T>(in_buf + kDepth - kPacketSize, p);
263 for (int64_t d = 0; d < output_pad_size; ++d)
265 in_buf[d] =
static_cast<T
>(0);
267 in_buf += output_pad_size;
272 memset(in_buf, 0,
sizeof(T) * padded_filter_inner_dim_size);
273 in_buf += padded_filter_inner_dim_size;
278 else if (kDepth == 1)
280 for (int64_t f_r = 0; f_r < filter_rows; ++f_r)
282 const int64_t in_r = in_r_start + f_r;
284 for (int64_t f_c = 0; f_c < filter_cols; ++f_c)
286 const int64_t in_c = in_c_start + f_c;
288 if (in_r >= 0 && in_r < in_rows && in_c >= 0 && in_c < in_cols)
290 const auto *in = input + (in_r * in_cols + in_c) * in_depth;
291 for (int64_t d = 0; d < input_vectorized_size; d += kPacketSize)
293 const auto p = Eigen::internal::ploadu<Packet>(in + d);
294 Eigen::internal::pstoreu<T>(in_buf, p);
295 in_buf += kPacketSize;
297 for (int64_t d = 0; d < input_scalar_size; ++d)
299 T v = in[input_vectorized_size + d];
302 in_buf += input_scalar_size;
305 for (int64_t d = 0; d < output_pad_size; ++d)
307 in_buf[d] =
static_cast<T
>(0);
309 in_buf += output_pad_size;
314 memset(in_buf, 0,
sizeof(T) * padded_filter_inner_dim_size);
315 in_buf += padded_filter_inner_dim_size;
334namespace depthwise_conv_op
364 static void Run(
int filter_rows,
int filter_cols,
int out_cols,
int out_depth,
365 const int64_t padded_filter_inner_dim_size,
const int64_t out_r,
366 const int64_t out_c,
const T *filter,
const T *input_buffer, T *output)
368 typedef typename Eigen::internal::packet_traits<T>::type Packet;
369 static const int64_t kPacketSize = (
sizeof(Packet) /
sizeof(T));
371 const int64_t filter_spatial_size =
static_cast<int64_t
>(filter_rows) * filter_cols;
372 const int64_t output_scalar_size = out_depth % kPacketSize;
373 const int64_t output_vectorized_size = (out_depth / kPacketSize) * kPacketSize;
374 const int64_t base_output_index = (out_r * out_cols + out_c) * out_depth;
376 for (
int i = 0; i < output_vectorized_size; i += kPacketSize)
379 auto vaccum = Eigen::internal::pset1<Packet>(
static_cast<T
>(0));
380 for (
int j = 0; j < filter_spatial_size; ++j)
383 const int64_t index = i + j * padded_filter_inner_dim_size;
387 const auto filter_block = Eigen::internal::ploadu<Packet>(filter + index);
389 const auto data_block = Eigen::internal::ploadu<Packet>(input_buffer + index);
391 vaccum = Eigen::internal::pmadd<Packet>(filter_block, data_block, vaccum);
394 Eigen::internal::pstoreu<T>(output + base_output_index + i, vaccum);
397 if (output_scalar_size > 0)
399 auto vaccum = Eigen::internal::pset1<Packet>(
static_cast<T
>(0));
400 for (
int j = 0; j < filter_spatial_size; ++j)
402 const int64_t index = output_vectorized_size + j * padded_filter_inner_dim_size;
403 const auto filter_block = Eigen::internal::ploadu<Packet>(filter + index);
404 const auto data_block = Eigen::internal::ploadu<Packet>(input_buffer + index);
405 vaccum = Eigen::internal::pmadd<Packet>(filter_block, data_block, vaccum);
408 T out_buf[kPacketSize];
409 Eigen::internal::pstoreu<T>(out_buf, vaccum);
410 const int64_t last_output_index = base_output_index + output_vectorized_size;
411 for (
int j = 0; j < output_scalar_size; ++j)
413 output[last_output_index + j] = out_buf[j];
// LaunchDepthwiseConvOp (CPU specialization): forward depthwise convolution
// launcher.  Pads the filter, stages per-output-row input regions into a
// per-thread buffer, and runs the packet kernel sharded over an Eigen thread
// pool.  NOTE(review): this span is extraction-garbled — the struct header,
// braces and several statements were dropped; code lines below are kept
// verbatim.
432 typedef typename Eigen::internal::packet_traits<T>::type
Packet;
434 void operator()(
int batch,
int in_rows,
int in_cols,
int in_depth,
int filter_rows,
435 int filter_cols,
int depth_multiplier,
int stride,
int pad_rows,
int pad_cols,
436 int out_rows,
int out_cols,
int out_depth,
const T *input,
437 const T *depthwise_filter, T *padded_filter_data,
bool pad_filter, T *in_buf,
// NOTE(review): the callee line for the next two argument lines was dropped
// by the extraction — presumably a filter-padding functor writing
// 'padded_filter_data'; confirm against the upstream file.
448 batch, in_rows, in_cols, in_depth, filter_rows, filter_cols, depth_multiplier, stride,
449 pad_rows, pad_cols, out_rows, out_cols, out_depth, depthwise_filter, padded_filter_data);
// Use the register-width-padded filter only when padding was requested.
451 const T *filter_data = pad_filter ? padded_filter_data : depthwise_filter;
// Shard body: processes shard indices [start, limit) over batch * out_rows.
// NOTE(review): 'd' (the Eigen device) is captured, but its declaration was
// dropped by the extraction — presumably obtained via GetThreadPoolDevice().
454 auto shard = [d, in_rows, in_cols, in_depth, out_rows, out_cols, out_depth, batch, filter_rows,
455 filter_cols, depth_multiplier, stride, pad_rows, pad_cols, input, filter_data,
456 in_buf, output](int64_t start, int64_t limit) {
// Thread id is shifted by one so every caller gets a distinct scratch slot
// (the assert below bounds it to [0, numThreads]).
457 int cur_id = d.currentThreadId() + 1;
458 assert(cur_id >= 0 && cur_id < d.numThreads() + 1);
// Vector-register width in elements of T.
460 static const int64_t kPacketSize = (
sizeof(
Packet) /
sizeof(T));
461 const int64_t input_image_size =
static_cast<int64_t
>(in_rows) * in_cols * in_depth;
462 const int64_t output_image_size =
static_cast<int64_t
>(out_rows) * out_cols * out_depth;
463 const int64_t filter_spatial_size =
static_cast<int64_t
>(filter_rows) * filter_cols;
// Inner dimension rounded up to a whole number of packets.
464 const int64_t padded_filter_inner_dim_size =
465 ((out_depth + kPacketSize - 1) / kPacketSize) * kPacketSize;
466 const int64_t padded_filter_size = filter_spatial_size * padded_filter_inner_dim_size;
// Per-thread input staging buffer inside 'in_buf'.
468 T *input_buffer_data = in_buf + cur_id * padded_filter_size;
// Each shard index i maps to (batch b, output row out_r).
470 for (int64_t i = start; i < limit; ++i)
472 const int64_t b = i / out_rows;
473 const int64_t in_base = b * input_image_size;
474 const int64_t out_base = b * output_image_size;
476 const int64_t out_r = i % out_rows;
478 for (int64_t out_c = 0; out_c < out_cols; ++out_c)
// NOTE(review): dropped callee line — presumably the input-copy functor
// filling 'input_buffer_data' for (out_r, out_c).
482 batch, in_rows, in_cols, in_depth, filter_rows, filter_cols, depth_multiplier, stride,
483 pad_rows, pad_cols, out_rows, out_cols, out_depth, padded_filter_inner_dim_size, out_r,
484 out_c, input + in_base, input_buffer_data);
// NOTE(review): dropped callee line — presumably DepthwiseConv2DKernel<T>::Run.
488 padded_filter_inner_dim_size, out_r, out_c, filter_data,
489 input_buffer_data, output + out_base);
// One shard per (batch, output row) pair.
494 const int64_t total_shards =
static_cast<int64_t
>(batch) * out_rows;
// Empirical cost model for the Eigen scheduler.
498 const float kCostMultiplier = 2.5f;
502 const int64_t shard_cost = kCostMultiplier * out_cols * out_depth;
504 const int64_t input_bytes =
static_cast<int64_t
>(in_rows) * in_cols * in_depth *
sizeof(T);
505 const int64_t output_bytes =
static_cast<int64_t
>(out_rows) * out_cols * out_depth *
sizeof(T);
506 const Eigen::TensorOpCost cost(input_bytes, output_bytes, shard_cost);
507 d.parallelFor(total_shards, cost, shard);
538 int stride_,
int pad_rows_,
int pad_cols_,
int out_rows_,
539 int out_cols_,
int out_depth,
540 const int64_t padded_filter_inner_dim_size,
const int64_t in_r,
541 const int64_t in_c,
const T *out_backprop, T *buffer)
543 typedef typename Eigen::internal::packet_traits<T>::type Packet;
544 static const int64_t kPacketSize = (
sizeof(Packet) /
sizeof(T));
546 const int64_t stride = stride_;
547 const int64_t filter_rows = filter_rows_;
548 const int64_t filter_cols = filter_cols_;
549 const int64_t pad_rows = pad_rows_;
550 const int64_t pad_cols = pad_cols_;
551 const int64_t out_rows = out_rows_;
552 const int64_t out_cols = out_cols_;
555 const int64_t out_r_start =
556 std::max(
static_cast<int64_t
>(0), (in_r - filter_rows + pad_rows + stride) / stride);
557 const int64_t out_r_end = std::min(out_rows - 1, (in_r + pad_rows) / stride);
558 const int64_t out_c_start =
559 std::max(
static_cast<int64_t
>(0), (in_c - filter_cols + pad_cols + stride) / stride);
560 const int64_t out_c_end = std::min(out_cols - 1, (in_c + pad_cols) / stride);
563 const int64_t filter_spatial_size = filter_rows * filter_cols;
564 if ((out_r_end - out_r_start + 1) < filter_rows || (out_c_end - out_c_start + 1) < filter_cols)
566 memset(buffer, 0, filter_spatial_size * padded_filter_inner_dim_size *
sizeof(T));
570 const int64_t vectorized_size = (out_depth / kPacketSize) * kPacketSize;
571 const int64_t scalar_size = out_depth % kPacketSize;
572 const int64_t pad_size = scalar_size > 0 ? kPacketSize - scalar_size : 0;
574 for (
int out_r = out_r_start; out_r <= out_r_end; ++out_r)
576 const int64_t f_r = in_r + pad_rows - out_r * stride;
577 for (
int out_c = out_c_start; out_c <= out_c_end; ++out_c)
579 const int64_t f_c = in_c + pad_cols - out_c * stride;
580 const int64_t buf_base = (f_r * filter_cols + f_c) * padded_filter_inner_dim_size;
582 auto *out_bprop = out_backprop + (out_r * out_cols + out_c) * out_depth;
585 for (int64_t d = 0; d < vectorized_size; d += kPacketSize)
587 auto v = Eigen::internal::ploadu<Packet>(out_bprop + d);
588 Eigen::internal::pstoreu<T>(buffer + buf_base + d, v);
591 for (int64_t d = 0; d < scalar_size; ++d)
593 buffer[buf_base + vectorized_size + d] = out_bprop[vectorized_size + d];
596 for (int64_t d = 0; d < pad_size; ++d)
598 buffer[buf_base + vectorized_size + scalar_size + d] =
static_cast<T
>(0);
637 int depth_multiplier_,
int,
int,
int,
int,
int,
int out_depth_,
638 const int64_t padded_filter_inner_dim_size,
const int64_t in_r,
639 const int64_t in_c,
const T *filter,
const T *buffer, T *out_buffer,
642 typedef typename Eigen::internal::packet_traits<T>::type Packet;
643 static const int64_t kPacketSize = (
sizeof(Packet) /
sizeof(T));
645 const int64_t in_depth = in_depth_;
646 const int64_t depth_multiplier = depth_multiplier_;
647 const int64_t out_depth = out_depth_;
648 const int64_t filter_spatial_size = filter_rows * filter_cols;
651 const int64_t output_vectorized_size = (out_depth / kPacketSize) * kPacketSize;
652 const int64_t output_scalar_size = out_depth % kPacketSize;
655 const int64_t base_output_index = (in_r * in_cols + in_c) * in_depth;
659 const int64_t dm_vectorized_size = (depth_multiplier / kPacketSize) * kPacketSize;
660 const int64_t dm_scalar_size = depth_multiplier % kPacketSize;
662 for (
int i = 0; i < output_vectorized_size; i += kPacketSize)
665 auto vaccum = Eigen::internal::pset1<Packet>(
static_cast<T
>(0));
666 for (
int j = 0; j < filter_spatial_size; ++j)
669 const int64_t index = i + j * padded_filter_inner_dim_size;
671 const auto filter_block = Eigen::internal::ploadu<Packet>(filter + index);
673 const auto data_block = Eigen::internal::ploadu<Packet>(buffer + index);
675 vaccum = Eigen::internal::pmadd<Packet>(filter_block, data_block, vaccum);
677 if (depth_multiplier == 1)
680 Eigen::internal::pstoreu<T>(output + base_output_index + i, vaccum);
685 Eigen::internal::pstoreu<T>(out_buffer + i, vaccum);
689 if (output_scalar_size > 0)
691 auto vaccum = Eigen::internal::pset1<Packet>(
static_cast<T
>(0));
692 for (
int j = 0; j < filter_spatial_size; ++j)
694 const int64_t index = output_vectorized_size + j * padded_filter_inner_dim_size;
695 const auto filter_block = Eigen::internal::ploadu<Packet>(filter + index);
696 const auto data_block = Eigen::internal::ploadu<Packet>(buffer + index);
697 vaccum = Eigen::internal::pmadd<Packet>(filter_block, data_block, vaccum);
700 T out_buf[kPacketSize];
701 Eigen::internal::pstoreu<T>(out_buf, vaccum);
702 if (depth_multiplier == 1)
705 for (
int j = 0; j < output_scalar_size; ++j)
707 output[base_output_index + output_vectorized_size + j] = out_buf[j];
713 for (
int j = 0; j < output_scalar_size; ++j)
715 out_buffer[output_vectorized_size + j] = out_buf[j];
721 if (depth_multiplier > 1)
723 for (int64_t d = 0; d < in_depth; ++d)
725 const int64_t index = d * depth_multiplier;
726 T accum =
static_cast<T
>(0);
727 for (int64_t dm = 0; dm < dm_vectorized_size; dm += kPacketSize)
729 const auto v = Eigen::internal::ploadu<Packet>(out_buffer + index + dm);
730 accum += Eigen::internal::predux(v);
733 for (int64_t dm = 0; dm < dm_scalar_size; ++dm)
735 accum += out_buffer[index + dm_vectorized_size + dm];
738 output[base_output_index + d] = accum;
// LaunchDepthwiseConvBackpropInputOp (CPU specialization): input-gradient
// launcher.  Pads the filter, then per batch image copies each input
// location's contributing out_backprop region into a per-thread buffer and
// runs ComputeBackpropInput, sharded over an Eigen thread pool.
// NOTE(review): this span is extraction-garbled — struct header, braces and
// several statements were dropped; code lines below are kept verbatim.
747 typedef typename Eigen::internal::packet_traits<T>::type
Packet;
749 void operator()(
int batch,
int in_rows,
int in_cols,
int in_depth,
int filter_rows,
750 int filter_cols,
int depth_multiplier,
int stride,
int pad_rows,
int pad_cols,
751 int out_rows,
int out_cols,
int out_depth,
const T *out_backprop,
752 const T *depthwise_filter, T *padded_filter_data, T *in_backprop,
bool pad_filter,
753 T *out_bprop, T *in_bprop)
// NOTE(review): the callee line for the next two argument lines was dropped —
// presumably a filter-padding functor writing 'padded_filter_data'.
762 batch, in_rows, in_cols, in_depth, filter_rows, filter_cols, depth_multiplier, stride,
763 pad_rows, pad_cols, out_rows, out_cols, out_depth, depthwise_filter, padded_filter_data);
// Use the register-width-padded filter only when padding was requested.
765 const T *filter_data = pad_filter ? padded_filter_data : depthwise_filter;
// Shard body: processes batch images [start, limit).
// NOTE(review): 'd' (the Eigen device) is captured, but its declaration was
// dropped by the extraction.
768 auto shard = [d, in_rows, in_cols, in_depth, out_rows, out_cols, out_depth, batch, filter_rows,
769 filter_cols, depth_multiplier, stride, pad_rows, pad_cols, out_backprop,
770 filter_data, in_backprop, out_bprop, in_bprop](int64_t start, int64_t limit) {
// Vector-register width in elements of T.
771 static const int64_t kPacketSize = (
sizeof(
Packet) /
sizeof(T));
773 const int64_t input_image_size = in_rows * in_cols * in_depth;
774 const int64_t output_image_size = out_rows * out_cols * out_depth;
775 const int64_t filter_spatial_size = filter_rows * filter_cols;
// Inner dimension rounded up to a whole number of packets.
776 const int64_t padded_filter_inner_dim_size =
777 ((out_depth + kPacketSize - 1) / kPacketSize) * kPacketSize;
778 const int64_t out_bprop_size = filter_spatial_size * padded_filter_inner_dim_size;
// Thread id is shifted by one so each caller gets a distinct scratch slot.
780 int cur_id = d.currentThreadId() + 1;
781 assert(cur_id >= 0 && cur_id < d.numThreads() + 1);
// Per-thread staging buffers.
784 T *out_bprop_buf = out_bprop + cur_id * out_bprop_size;
787 T *in_bprop_buf = in_bprop + cur_id * padded_filter_inner_dim_size;
789 for (int64_t b = start; b < limit; ++b)
791 for (int64_t in_r = 0; in_r < in_rows; ++in_r)
793 for (int64_t in_c = 0; in_c < in_cols; ++in_c)
// Stage the out_backprop region feeding (in_r, in_c).
796 CopyOutputBackpropRegion<T>(batch, in_rows, in_cols, in_depth, filter_rows, filter_cols,
797 depth_multiplier, stride, pad_rows, pad_cols, out_rows,
798 out_cols, out_depth, padded_filter_inner_dim_size, in_r,
799 in_c, out_backprop + b * output_image_size, out_bprop_buf);
// Compute the input gradient for (in_r, in_c) from the staged region.
802 ComputeBackpropInput<T>(
803 batch, in_rows, in_cols, in_depth, filter_rows, filter_cols, depth_multiplier, stride,
804 pad_rows, pad_cols, out_rows, out_cols, out_depth, padded_filter_inner_dim_size, in_r,
805 in_c, filter_data, out_bprop_buf, in_bprop_buf, in_backprop + b * input_image_size);
// Cost model for the Eigen scheduler; one shard per batch image.
811 const int64_t input_bytes = out_rows * out_cols * out_depth *
sizeof(T);
812 const int64_t output_bytes = in_rows * in_cols * in_depth *
sizeof(T);
813 const int64_t compute_cycles = in_rows * in_cols * out_depth * batch;
814 const Eigen::TensorOpCost cost(input_bytes, output_bytes, compute_cycles);
815 d.parallelFor(batch, cost, shard);
// Reference (naive) implementation of the depthwise-convolution input
// gradient: for every input element, sums filter * out_backprop over the
// output positions and depth-multiplier channels it contributed to.
// Intended as a correctness baseline, not for performance.
// NOTE(review): the signature head and the `T sum` / `filter_offset`
// declaration lines were dropped by the extraction; reconstructed here.
template <typename T>
void DepthwiseConvBackpropInputReference(int batch, int in_rows, int in_cols, int in_depth,
                                         int out_rows, int out_cols, int out_depth, int stride,
                                         int depth_multiplier, int filter_rows, int filter_cols,
                                         int pad_rows, int pad_cols, const T *out_backprop,
                                         const T *filter, T *in_backprop)
{
  for (int b = 0; b < batch; ++b)
  {
    for (int in_r = 0; in_r < in_rows; ++in_r)
    {
      for (int in_c = 0; in_c < in_cols; ++in_c)
      {
        for (int in_d = 0; in_d < in_depth; ++in_d)
        {
          T sum = static_cast<T>(0);
          // Output channels fed by this input channel.
          const int out_d_start = in_d * depth_multiplier;
          const int out_d_end = out_d_start + depth_multiplier;
          for (int out_d = out_d_start; out_d < out_d_end; ++out_d)
          {
            // Output rows/cols whose receptive field covers (in_r, in_c).
            const int out_r_start = std::max(0, (in_r - filter_rows + pad_rows + stride) / stride);
            const int out_r_end = std::min(out_rows - 1, (in_r + pad_rows) / stride);
            for (int out_r = out_r_start; out_r <= out_r_end; ++out_r)
            {
              const int out_c_start =
                std::max(0, (in_c - filter_cols + pad_cols + stride) / stride);
              const int out_c_end = std::min(out_cols - 1, (in_c + pad_cols) / stride);
              for (int out_c = out_c_start; out_c <= out_c_end; ++out_c)
              {
                // Filter tap linking (in_r, in_c) to (out_r, out_c).
                int f_r = in_r + pad_rows - out_r * stride;
                int f_c = in_c + pad_cols - out_c * stride;
                int filter_dm = out_d - out_d_start;
                int out_backprop_offset =
                  out_d + out_depth * (out_c + out_cols * (out_r + out_rows * b));
                int filter_offset =
                  filter_dm + depth_multiplier * (in_d + in_depth * (f_c + filter_cols * f_r));
                sum += out_backprop[out_backprop_offset] * filter[filter_offset];
              }
            }
          }
          int in_backprop_offset = in_d + in_depth * (in_c + in_cols * (in_r + in_rows * b));
          in_backprop[in_backprop_offset] = sum;
        }
      }
    }
  }
}
898void ComputeBackpropFilter(
int,
int,
int,
int,
int filter_rows,
int filter_cols,
int,
int,
int,
int,
899 int out_rows,
int out_cols,
int out_depth_,
900 const int64_t padded_out_depth_size,
const int64_t out_r,
901 const int64_t out_c,
const T *out_backprop,
const T *input_buffer,
904 typedef typename Eigen::internal::packet_traits<T>::type Packet;
905 static const int64_t kPacketSize = (
sizeof(Packet) /
sizeof(T));
907 const int64_t out_depth = out_depth_;
908 const int64_t filter_spatial_size = filter_rows * filter_cols;
909 const int64_t output_vectorized_size = (padded_out_depth_size / kPacketSize) * kPacketSize;
910 const int64_t base_output_index = (out_r * out_cols + out_c) * out_depth;
912 const int64_t output_image_size = out_rows * out_cols * out_depth;
913 const int64_t output_last_vector_index =
914 output_image_size - (filter_spatial_size * padded_out_depth_size);
915 const bool fast_path = base_output_index <= output_last_vector_index;
921 for (
int i = 0; i < output_vectorized_size; i += kPacketSize)
924 const auto out_bprop_block =
925 Eigen::internal::ploadu<Packet>(out_backprop + base_output_index + i);
926 for (
int j = 0; j < filter_spatial_size; ++j)
928 const int64_t index = i + j * padded_out_depth_size;
930 const auto input_block = Eigen::internal::ploadu<Packet>(input_buffer + index);
932 auto out_block_data = output_buffer + index;
933 auto out_block = Eigen::internal::ploadu<Packet>(out_block_data);
935 out_block = Eigen::internal::pmadd<Packet>(out_bprop_block, input_block, out_block);
937 Eigen::internal::pstoreu<T>(out_block_data, out_block);
944 for (
int i = 0; i < output_vectorized_size; i += kPacketSize)
947 const int64_t out_bprop_index = base_output_index + i;
948 const int64_t out_bprop_limit = std::min(output_image_size, out_bprop_index + kPacketSize);
949 T out_buf[kPacketSize];
950 memset(&out_buf, 0, kPacketSize *
sizeof(T));
951 const int64_t scalar_size = out_bprop_limit - out_bprop_index;
952 for (int64_t j = 0; j < scalar_size; ++j)
954 out_buf[j] = out_backprop[out_bprop_index + j];
957 const auto out_bprop_block = Eigen::internal::ploadu<Packet>(out_buf);
958 for (
int j = 0; j < filter_spatial_size; ++j)
960 const int64_t index = i + j * padded_out_depth_size;
962 const auto input_block = Eigen::internal::ploadu<Packet>(input_buffer + index);
964 auto out_block_data = output_buffer + index;
965 auto out_block = Eigen::internal::ploadu<Packet>(out_block_data);
967 out_block = Eigen::internal::pmadd<Packet>(out_bprop_block, input_block, out_block);
969 Eigen::internal::pstoreu<T>(out_block_data, out_block);
// LaunchDepthwiseConvBackpropFilterOp (CPU specialization): filter-gradient
// launcher.  Per batch image, stages input regions and accumulates partial
// filter gradients into per-image buffers inside 'padded_filter_data'
// (sharded over an Eigen thread pool), then reduces the per-image buffers
// into 'filter_backprop'.  NOTE(review): this span is extraction-garbled —
// struct header, braces and several statements were dropped; code lines
// below are kept verbatim.
977 typedef typename Eigen::internal::packet_traits<T>::type
Packet;
979 void operator()(
int batch,
int in_rows,
int in_cols,
int in_depth,
int filter_rows,
980 int filter_cols,
int depth_multiplier,
int stride,
int pad_rows,
int pad_cols,
981 int out_rows,
int out_cols,
int out_depth,
const T *out_backprop,
const T *input,
982 T *filter_backprop, T *padded_filter_data, T *in_bprop)
// Vector-register width in elements of T.
986 static const int64_t kPacketSize = (
sizeof(
Packet) /
sizeof(T));
988 const int64_t filter_spatial_size = filter_rows * filter_cols;
// Inner dimension rounded up to a whole number of packets.
989 const int64_t padded_out_depth_size =
990 ((out_depth + kPacketSize - 1) / kPacketSize) * kPacketSize;
// Per-image partial gradients are staged in 'padded_filter_data'.
992 T *output_buffer_data = padded_filter_data;
// Shard body: processes batch images [start, limit); captures by reference.
996 auto shard = [&](int64_t start, int64_t limit) {
997 static const int64_t kPacketSize = (
sizeof(
Packet) /
sizeof(T));
998 const int64_t filter_spatial_size = filter_rows * filter_cols;
999 const int64_t padded_out_depth_size =
1000 ((out_depth + kPacketSize - 1) / kPacketSize) * kPacketSize;
// Thread id is shifted by one so each caller gets a distinct scratch slot.
// NOTE(review): the declaration of the Eigen device 'd' was dropped by the
// extraction.
1002 int cur_id = d.currentThreadId() + 1;
1003 assert(cur_id >= 0 && cur_id < d.numThreads() + 1);
1005 const int64_t input_image_size = in_rows * in_cols * in_depth;
1006 const int64_t output_image_size = out_rows * out_cols * out_depth;
1007 const int64_t padded_filter_size = filter_spatial_size * padded_out_depth_size;
// Per-thread input staging buffer inside 'in_bprop'.
1009 T *input_buffer_data = in_bprop + cur_id * padded_filter_size;
1011 for (
int b = start; b < limit; ++b)
// Zero this image's partial-gradient buffer before accumulating.
1014 auto *output_buffer = output_buffer_data + b * padded_filter_size;
1015 memset(output_buffer, 0, padded_filter_size *
sizeof(T));
1017 for (
int out_r = 0; out_r < out_rows; ++out_r)
1019 for (
int out_c = 0; out_c < out_cols; ++out_c)
// NOTE(review): dropped callee line — presumably the input-copy functor
// filling 'input_buffer_data' for (out_r, out_c).
1023 batch, in_rows, in_cols, in_depth, filter_rows, filter_cols, depth_multiplier, stride,
1024 pad_rows, pad_cols, out_rows, out_cols, out_depth, padded_out_depth_size, out_r,
1025 out_c, input + b * input_image_size, input_buffer_data);
// NOTE(review): dropped callee line — presumably ComputeBackpropFilter<T>.
1028 batch, in_rows, in_cols, in_depth, filter_rows, filter_cols, depth_multiplier, stride,
1029 pad_rows, pad_cols, out_rows, out_cols, out_depth, padded_out_depth_size, out_r,
1030 out_c, out_backprop + b * output_image_size, input_buffer_data, output_buffer);
// Cost model for the Eigen scheduler; one shard per batch image.
1036 const int64_t input_bytes = in_rows * in_cols * in_depth *
sizeof(T);
1037 const int64_t output_bytes = out_rows * out_cols * out_depth *
sizeof(T);
1038 const int64_t compute_cycles = out_rows * out_cols * out_depth * batch;
1039 const Eigen::TensorOpCost cost(input_bytes, output_bytes, compute_cycles);
1040 d.parallelFor(batch, cost, shard);
// Reduce the per-image partial gradients into 'filter_backprop'.
1044 const int64_t vectorized_size = (out_depth / kPacketSize) * kPacketSize;
1045 const int64_t scalar_size = out_depth - vectorized_size;
1046 const int64_t padded_filter_size = filter_spatial_size * padded_out_depth_size;
1047 memset(filter_backprop, 0, filter_spatial_size * out_depth *
sizeof(T));
1049 for (int64_t i = 0; i < filter_spatial_size; ++i)
1051 const int64_t buffer_base = i * padded_out_depth_size;
1052 const int64_t output_base = i * out_depth;
// Vectorized accumulation over the batch.
1054 for (int64_t j = 0; j < vectorized_size; j += kPacketSize)
1057 auto out_block_data = filter_backprop + output_base + j;
1058 auto out_block = Eigen::internal::ploadu<Packet>(out_block_data);
1059 for (
int b = 0; b < batch; ++b)
1062 const auto *output_buffer = output_buffer_data + b * padded_filter_size;
1063 const auto v = Eigen::internal::ploadu<Packet>(output_buffer + buffer_base + j);
1065 out_block = Eigen::internal::padd<Packet>(out_block, v);
1068 Eigen::internal::pstoreu<T>(out_block_data, out_block);
// Scalar accumulation for the residual depth.
1071 for (int64_t j = 0; j < scalar_size; ++j)
1073 for (
int b = 0; b < batch; ++b)
1075 const auto *output_buffer = output_buffer_data + b * padded_filter_size;
1076 filter_backprop[output_base + vectorized_size + j] +=
1077 output_buffer[buffer_base + vectorized_size + j];
// Reference (naive) implementation of the depthwise-convolution filter
// gradient: zero-initializes 'filter_backprop', then accumulates
// input * out_backprop over every (batch, output position, filter tap).
// Intended as a correctness baseline, not for performance.
// NOTE(review): the function-name line was dropped by the extraction;
// reconstructed from the outline signature.
template <typename T>
void DepthwiseConvBackpropFilterReference(int batch, int in_rows, int in_cols, int in_depth,
                                          int out_rows, int out_cols, int out_depth, int stride,
                                          int depth_multiplier, int filter_rows, int filter_cols,
                                          int pad_rows, int pad_cols, const T *out_backprop,
                                          const T *input, T *filter_backprop)
{
  int num_filter_backprop = filter_rows * filter_cols * in_depth * depth_multiplier;
  memset(filter_backprop, 0, num_filter_backprop * sizeof(T));

  for (int b = 0; b < batch; ++b)
  {
    for (int out_r = 0; out_r < out_rows; ++out_r)
    {
      for (int out_c = 0; out_c < out_cols; ++out_c)
      {
        for (int out_d = 0; out_d < out_depth; ++out_d)
        {
          // Input channel and depth-multiplier slot for this output channel.
          const int in_d = out_d / depth_multiplier;
          const int dm = out_d % depth_multiplier;
          // Top-left input coordinate of this output's receptive field.
          const int in_r_start = out_r * stride - pad_rows;
          const int in_c_start = out_c * stride - pad_cols;

          for (int f_r = 0; f_r < filter_rows; ++f_r)
          {
            for (int f_c = 0; f_c < filter_cols; ++f_c)
            {
              const int in_r = in_r_start + f_r;
              const int in_c = in_c_start + f_c;
              // Taps falling outside the image contribute nothing.
              if (in_r >= 0 && in_r < in_rows && in_c >= 0 && in_c < in_cols)
              {
                int out_backprop_offset =
                  out_d + out_depth * (out_c + out_cols * (out_r + out_rows * b));
                int input_offset = in_d + in_depth * (in_c + in_cols * (in_r + in_rows * b));
                int filter_backprop_offset =
                  dm + depth_multiplier * (in_d + in_depth * (f_c + filter_cols * f_r));
                filter_backprop[filter_backprop_offset] +=
                  input[input_offset] * out_backprop[out_backprop_offset];
              }
            }
          }
        }
      }
    }
  }
}
Eigen::ThreadPoolDevice CPUDevice
void CopyOutputBackpropRegion(int, int, int, int, int filter_rows_, int filter_cols_, int, int stride_, int pad_rows_, int pad_cols_, int out_rows_, int out_cols_, int out_depth, const int64_t padded_filter_inner_dim_size, const int64_t in_r, const int64_t in_c, const T *out_backprop, T *buffer)
void ComputeBackpropFilter(int, int, int, int, int filter_rows, int filter_cols, int, int, int, int, int out_rows, int out_cols, int out_depth_, const int64_t padded_out_depth_size, const int64_t out_r, const int64_t out_c, const T *out_backprop, const T *input_buffer, T *output_buffer)
void DepthwiseConvBackpropInputReference(int batch, int in_rows, int in_cols, int in_depth, int out_rows, int out_cols, int out_depth, int stride, int depth_multiplier, int filter_rows, int filter_cols, int pad_rows, int pad_cols, const T *out_backprop, const T *filter, T *in_backprop)
void DepthwiseConvBackpropFilterReference(int batch, int in_rows, int in_cols, int in_depth, int out_rows, int out_cols, int out_depth, int stride, int depth_multiplier, int filter_rows, int filter_cols, int pad_rows, int pad_cols, const T *out_backprop, const T *input, T *filter_backprop)
void ComputeBackpropInput(int, int, int in_cols, int in_depth_, int filter_rows, int filter_cols, int depth_multiplier_, int, int, int, int, int, int out_depth_, const int64_t padded_filter_inner_dim_size, const int64_t in_r, const int64_t in_c, const T *filter, const T *buffer, T *out_buffer, T *output)
const Eigen::ThreadPoolDevice * GetThreadPoolDevice()
static void Run(int filter_rows, int filter_cols, int out_cols, int out_depth, const int64_t padded_filter_inner_dim_size, const int64_t out_r, const int64_t out_c, const T *filter, const T *input_buffer, T *output)
void operator()(int batch, int in_rows, int in_cols, int in_depth, int filter_rows, int filter_cols, int depth_multiplier, int stride, int pad_rows, int pad_cols, int out_rows, int out_cols, int out_depth, const T *out_backprop, const T *input, T *filter_backprop, T *padded_filter_data, T *in_bprop)
Eigen::internal::packet_traits< T >::type Packet
void operator()(int batch, int in_rows, int in_cols, int in_depth, int filter_rows, int filter_cols, int depth_multiplier, int stride, int pad_rows, int pad_cols, int out_rows, int out_cols, int out_depth, const T *out_backprop, const T *input, T *filter_backprop)
void operator()(int batch, int in_rows, int in_cols, int in_depth, int filter_rows, int filter_cols, int depth_multiplier, int stride, int pad_rows, int pad_cols, int out_rows, int out_cols, int out_depth, const T *input, const T *depthwise_filter, T *padded_filter_data, bool pad_filter, T *in_buf, T *output)
Eigen::internal::packet_traits< T >::type Packet
void operator()(int batch, int in_rows, int in_cols, int in_depth, int filter_rows, int filter_cols, int depth_multiplier, int stride, int pad_rows, int pad_cols, int out_rows, int out_cols, int out_depth, const T *input, const T *depthwise_filter, T *padded_filter_data, bool pad_filter, T *in_buf, T *output)
void operator()(int, int, int, int, int filter_rows, int filter_cols, int, int, int, int, int, int, int out_depth, const T *filter, T *padded_filter)