ONE - On-device Neural Engine
depthwise_conv_op.h
/*
 * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
 * Copyright 2015 The TensorFlow Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef __NNFW_CKER_EIGEN_DEPTHWISE_CONV_OP_H__
#define __NNFW_CKER_EIGEN_DEPTHWISE_CONV_OP_H__

// From tensorflow/core/kernels/depthwise_conv_grad_op.cc
#define EIGEN_USE_THREADS

#include <algorithm> // std::max, std::min
#include <cassert>   // assert
#include <cstdint>   // int64_t
#include <cstring>   // memset
#include <thread>
#include "unsupported/Eigen/CXX11/Tensor"
// Declares eigen_support::GetThreadPoolDevice(), used by the launchers below
// (include path assumed from the cker source layout).
#include "cker/eigen/EigenSupport.h"

// From tensorflow/core/kernels/depthwise_conv_op.h
namespace nnfw
{
namespace cker
{
namespace depthwise_conv_op
{

template <typename Device, typename T> struct LaunchDepthwiseConvOp
{
  void operator()(int batch, int in_rows, int in_cols, int in_depth, int filter_rows,
                  int filter_cols, int depth_multiplier, int stride, int pad_rows, int pad_cols,
                  int out_rows, int out_cols, int out_depth, const T *input,
                  const T *depthwise_filter, T *padded_filter_data, bool pad_filter, T *in_buf,
                  T *output);
};

template <typename Device, typename T> struct LaunchDepthwiseConvBackpropInputOp
{
  void operator()(int batch, int in_rows, int in_cols, int in_depth, int filter_rows,
                  int filter_cols, int depth_multiplier, int stride, int pad_rows, int pad_cols,
                  int out_rows, int out_cols, int out_depth, const T *out_backprop, const T *filter,
                  T *in_backprop);
};

template <typename Device, typename T> struct LaunchDepthwiseConvBackpropFilterOp
{
  void operator()(int batch, int in_rows, int in_cols, int in_depth, int filter_rows,
                  int filter_cols, int depth_multiplier, int stride, int pad_rows, int pad_cols,
                  int out_rows, int out_cols, int out_depth, const T *out_backprop, const T *input,
                  T *filter_backprop);
};
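
// Hedged illustration (not part of the upstream API): every kernel in this
// file rounds the innermost (channel) dimension up to the Eigen packet width
// before doing vectorized work. 'ExamplePaddedInnerDimSize' is a hypothetical
// helper that shows the size math the launchers below repeat inline.
template <typename T> inline int64_t ExamplePaddedInnerDimSize(int64_t out_depth)
{
  typedef typename Eigen::internal::packet_traits<T>::type Packet;
  // Number of T elements per vector register.
  const int64_t kPacketSize = sizeof(Packet) / sizeof(T);
  // Round up to the next packet multiple, e.g. out_depth = 6 -> 8 when
  // kPacketSize == 4.
  return ((out_depth + kPacketSize - 1) / kPacketSize) * kPacketSize;
}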

namespace functor
{

// Pads 'filter' to vector-register boundary along its inner dimension:
// filter_inner_dim_size = in_depth * depth_multiplier
// Requires 'filter' to have the following storage order:
// [filter_rows, filter_cols, in_depth, depth_multiplier]
// Returns zero-padded filter in 'padded_filter'.
//
// EX:
// in_depth = 3, depth_multiplier = 2, filter [2, 2], register_width = 4
// So we have a total of 3 * 2 = 6 filters, each of spatial size 2 x 2.
//
// filter [rows, cols, in_depth, depth_multiplier]
// [u0, v0, w0, x0] [y0, z0, u1, v1] [w1, x1, y1, z1]
// [u2, v2, w2, x2] [y2, z2, u3, v3] [w3, x3, y3, z3]
//
// padded_filter [rows, cols, in_depth, depth_multiplier]
// [u0, v0, w0, x0] [y0, z0, 0, 0] [u1, v1, w1, x1] [y1, z1, 0, 0]
// [u2, v2, w2, x2] [y2, z2, 0, 0] [u3, v3, w3, x3] [y3, z3, 0, 0]

template <typename T> struct DepthwiseFilterPadOp
{
  void operator()(int, int, int, int, int filter_rows, int filter_cols, int, int, int, int, int,
                  int, int out_depth, const T *filter, T *padded_filter)
  {
    typedef typename Eigen::internal::packet_traits<T>::type Packet;
    static const int64_t kPacketSize = (sizeof(Packet) / sizeof(T));

    // Calculate vectorized and scalar lengths of filter's inner dimension.
    const int64_t filter_inner_dim_size = out_depth;
    const int64_t vectorized_size = (filter_inner_dim_size / kPacketSize) * kPacketSize;
    const int64_t scalar_size = filter_inner_dim_size - vectorized_size;
    // Calculate required padding and padded output buffer stride.
    const int64_t pad_size = scalar_size > 0 ? kPacketSize - scalar_size : 0;
    const int64_t padded_filter_stride = vectorized_size + kPacketSize;

    const int64_t filter_spatial_size = filter_rows * filter_cols;
    for (int64_t i = 0; i < filter_spatial_size; ++i)
    {
      const int64_t input_base = i * filter_inner_dim_size;
      const int64_t output_base = i * padded_filter_stride;
      // Write vectorized length of filter's inner dimension to output.
      for (int64_t j = 0; j < vectorized_size; j += kPacketSize)
      {
        const auto v = Eigen::internal::ploadu<Packet>(filter + input_base + j);
        Eigen::internal::pstoreu<T>(padded_filter + output_base + j, v);
      }
      // Write scalar length of filter's inner dimension to output.
      for (int64_t j = 0; j < scalar_size; ++j)
      {
        padded_filter[output_base + vectorized_size + j] = filter[input_base + vectorized_size + j];
      }
      // Pad the remainder of output to vector-register boundary.
      for (int64_t j = 0; j < pad_size; ++j)
      {
        padded_filter[output_base + vectorized_size + scalar_size + j] = static_cast<T>(0);
      }
    }
  }
};
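
// Hedged usage sketch (illustration only, not upstream API): pads the
// 2 x 2 x (in_depth = 3) x (depth_multiplier = 2) filter from the example
// above. Only filter_rows, filter_cols and out_depth are read by the functor;
// the remaining arguments are passed as zeros.
inline void ExampleDepthwiseFilterPad()
{
  const int filter_rows = 2, filter_cols = 2, out_depth = 6; // in_depth * depth_multiplier
  float filter[2 * 2 * 6] = {}; // [filter_rows, filter_cols, in_depth, depth_multiplier]
  // Each spatial position occupies vectorized_size + kPacketSize elements in
  // the padded output; 128 floats is a safe upper bound for any float packet
  // width Eigen selects here (up to 16 elements).
  float padded[128] = {};
  DepthwiseFilterPadOp<float>()(0, 0, 0, 0, filter_rows, filter_cols, 0, 0, 0, 0, 0, 0, out_depth,
                                filter, padded);
}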

// Copies data from local region in 'input' specified by 'out_r' and 'out_c'
// to 'input_buffer'. The copied data is replicated by factor
// 'depth_multiplier', and padded to vector register-width boundaries so
// that it is aligned for efficient traversal and vector multiply-add by the
// depthwise kernel.
//
// EX:
// in_depth = 3, depth_multiplier = 2, filter [2, 2], register_width = 4
//
// input: [batch, in_rows, in_cols, in_depth]
//
// [a0, a1, a2, b0, b1, b2, ..., e0, e1, e2, f0, f1, f2, ...]
//
// input_buffer (register boundaries shown):
// [a0, a0, a1, a1] [a2, a2, 0, 0] in_row = 0, in_col = 0
// [b0, b0, b1, b1] [b2, b2, 0, 0] in_row = 0, in_col = 1
// [e0, e0, e1, e1] [e2, e2, 0, 0] in_row = 1, in_col = 0
// [f0, f0, f1, f1] [f2, f2, 0, 0] in_row = 1, in_col = 1
//
// Returns replicated and padded data from specified input region in
// 'input_buffer'.

template <typename T> struct DepthwiseInputCopyOp
{
  void operator()(int, int in_rows, int in_cols, int in_depth, int filter_rows, int filter_cols,
                  int depth_multiplier, int stride, int pad_rows, int pad_cols, int, int,
                  int out_depth, const int64_t padded_filter_inner_dim_size, const int64_t out_r,
                  const int64_t out_c, const T *input, T *input_buffer)
  {
    typedef typename Eigen::internal::packet_traits<T>::type Packet;
    static const int64_t kPacketSize = Eigen::internal::packet_traits<T>::size;

    const int64_t kDepth = depth_multiplier;
    // Calculate vectorized and scalar (residual) lengths for 'in_depth'.
    const int64_t input_vectorized_size = (in_depth / kPacketSize) * kPacketSize;
    const int64_t input_scalar_size = in_depth - input_vectorized_size;

    // Calculate output padding length.
    const int64_t output_scalar_size = out_depth % kPacketSize;
    const int64_t output_pad_size = output_scalar_size > 0 ? kPacketSize - output_scalar_size : 0;

    // Iterate through all rows x cols reading 'in_depth' from 'input' and
    // replicating by 'depth_multiplier' into 'input_buffer' (otherwise
    // zero-padding input buffer as needed).
    auto *in_buf = input_buffer;
    const int64_t in_r_start = out_r * stride - pad_rows;
    const int64_t in_c_start = out_c * stride - pad_cols;

    // TODO: add a ploaddup variant for depth == 2 if needed.
    if (kDepth > 1 && kDepth <= kPacketSize)
    {
      for (int64_t f_r = 0; f_r < filter_rows; ++f_r)
      {
        const int64_t in_r = in_r_start + f_r;

        for (int64_t f_c = 0; f_c < filter_cols; ++f_c)
        {
          const int64_t in_c = in_c_start + f_c;

          if (in_r >= 0 && in_r < in_rows && in_c >= 0 && in_c < in_cols)
          {
            const auto *in = input + (in_r * in_cols + in_c) * in_depth;
            int64_t limit = in_depth;
            // The vector store below writes up to kPacketSize elements past
            // the current position. That is safe on every iteration except
            // the last column, where no later iteration overwrites the excess
            // with correct values, so the vectorized limit is reduced there.
            if (f_c == filter_cols - 1)
            {
              limit -= (kPacketSize - kDepth) / kDepth + 1;
              if (limit < 0)
              {
                limit = 0;
              }
            }
            // Copy vectorized portion of inner dimension.
            for (int64_t d = 0; d < limit; d++)
            {
              const auto p = Eigen::internal::pset1<Packet>(in[d]);
              Eigen::internal::pstoreu<T>(in_buf, p);
              in_buf += kDepth;
            }

            // Copy the scalar portion.
            for (int64_t d = limit; d < in_depth; d++)
            {
              const auto value = in[d];
              for (int64_t dm = 0; dm < kDepth; dm++)
              {
                in_buf[dm] = value;
              }
              in_buf += kDepth;
            }

            // Pad the remainder of the output to vector register boundary.
            for (int64_t d = 0; d < output_pad_size; ++d)
            {
              in_buf[d] = static_cast<T>(0);
            }
            in_buf += output_pad_size;
          }
          else
          {
            // Zero pad.
            memset(in_buf, 0, sizeof(T) * padded_filter_inner_dim_size);
            in_buf += padded_filter_inner_dim_size;
          }
        }
      }
    }
    else if (kDepth > kPacketSize)
    {
      // Calculate the vectorized length of 'depth_multiplier'. This is used
      // to efficiently replicate data when 'depth_multiplier' > kPacketSize.
      const int64_t dm_vectorized_size = (kDepth / kPacketSize) * kPacketSize;

      for (int64_t f_r = 0; f_r < filter_rows; ++f_r)
      {
        const int64_t in_r = in_r_start + f_r;

        for (int64_t f_c = 0; f_c < filter_cols; ++f_c)
        {
          const int64_t in_c = in_c_start + f_c;

          if (in_r >= 0 && in_r < in_rows && in_c >= 0 && in_c < in_cols)
          {
            const auto *in = input + (in_r * in_cols + in_c) * in_depth;
            // Copy vectorized portion of inner dimension.
            for (int64_t d = 0; d < in_depth; d++)
            {
              const auto p = Eigen::internal::pset1<Packet>(in[d]);
              for (int64_t dm = 0; dm < dm_vectorized_size; dm += kPacketSize)
              {
                Eigen::internal::pstoreu<T>(in_buf + dm, p);
              }
              // Overlapping store for the remainder.
              Eigen::internal::pstoreu<T>(in_buf + kDepth - kPacketSize, p);
              in_buf += kDepth;
            }
            // Pad the remainder of the output to vector register boundary.
            for (int64_t d = 0; d < output_pad_size; ++d)
            {
              in_buf[d] = static_cast<T>(0);
            }
            in_buf += output_pad_size;
          }
          else
          {
            // Zero pad.
            memset(in_buf, 0, sizeof(T) * padded_filter_inner_dim_size);
            in_buf += padded_filter_inner_dim_size;
          }
        }
      }
    }
    else if (kDepth == 1)
    {
      for (int64_t f_r = 0; f_r < filter_rows; ++f_r)
      {
        const int64_t in_r = in_r_start + f_r;

        for (int64_t f_c = 0; f_c < filter_cols; ++f_c)
        {
          const int64_t in_c = in_c_start + f_c;

          if (in_r >= 0 && in_r < in_rows && in_c >= 0 && in_c < in_cols)
          {
            const auto *in = input + (in_r * in_cols + in_c) * in_depth;
            for (int64_t d = 0; d < input_vectorized_size; d += kPacketSize)
            {
              const auto p = Eigen::internal::ploadu<Packet>(in + d);
              Eigen::internal::pstoreu<T>(in_buf, p);
              in_buf += kPacketSize;
            }
            for (int64_t d = 0; d < input_scalar_size; ++d)
            {
              T v = in[input_vectorized_size + d];
              in_buf[d] = v;
            }
            in_buf += input_scalar_size;

            // Pad the remainder of the output to vector register boundary.
            for (int64_t d = 0; d < output_pad_size; ++d)
            {
              in_buf[d] = static_cast<T>(0);
            }
            in_buf += output_pad_size;
          }
          else
          {
            // Zero pad.
            memset(in_buf, 0, sizeof(T) * padded_filter_inner_dim_size);
            in_buf += padded_filter_inner_dim_size;
          }
        }
      }
    }
  }
};
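
// Hedged usage sketch (illustration only, not upstream API): fills a padded,
// depth-replicated buffer for output position (out_r, out_c) = (0, 0),
// matching the in_depth = 3, depth_multiplier = 2 example above.
inline void ExampleDepthwiseInputCopy()
{
  const int in_rows = 2, in_cols = 2, in_depth = 3;
  const int filter_rows = 2, filter_cols = 2, depth_multiplier = 2;
  const int stride = 1, pad_rows = 0, pad_cols = 0;
  const int out_depth = in_depth * depth_multiplier; // 6
  const int64_t kPacketSize = Eigen::internal::packet_traits<float>::size;
  const int64_t padded_inner_dim = ((out_depth + kPacketSize - 1) / kPacketSize) * kPacketSize;
  float input[2 * 2 * 3] = {}; // [in_rows, in_cols, in_depth], batch omitted
  // Needs filter_rows * filter_cols * padded_inner_dim elements; 128 is a
  // safe upper bound for any float packet width Eigen selects here.
  float buffer[128] = {};
  DepthwiseInputCopyOp<float>()(0, in_rows, in_cols, in_depth, filter_rows, filter_cols,
                                depth_multiplier, stride, pad_rows, pad_cols, 0, 0, out_depth,
                                padded_inner_dim, /*out_r=*/0, /*out_c=*/0, input, buffer);
}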

} // namespace functor
} // namespace depthwise_conv_op
} // namespace cker
} // namespace nnfw

// From tensorflow/core/kernels/depthwise_conv_op.cc
// From tensorflow/core/kernels/depthwise_conv_grad_op.cc
namespace nnfw
{
namespace cker
{
namespace depthwise_conv_op
{

// Enable CPUDevice only for depthwise_conv_op
using CPUDevice = Eigen::ThreadPoolDevice;

// Computes the vectorized product of 'input_buffer' and 'filter' and stores
// result in 'output' at location specified by 'out_r' and 'out_c'.
//
// EX:
// in_depth = 3, depth_multiplier = 2, filter [2, 2], register_width = 4
// Both 'input_buffer' and 'filter' are padded to register-width boundaries.
//
// input_buffer [rows, cols, in_depth, depth_multiplier]
// [a0, a0, a1, a1] [a2, a2, 0, 0] [b0, b0, b1, b1] [b2, b2, 0, 0]
// [e0, e0, e1, e1] [e2, e2, 0, 0] [f0, f0, f1, f1] [f2, f2, 0, 0]
//
// filter [rows, cols, in_depth, depth_multiplier]
// [u0, v0, w0, x0] [y0, z0, 0, 0] [u1, v1, w1, x1] [y1, z1, 0, 0]
// [u2, v2, w2, x2] [y2, z2, 0, 0] [u3, v3, w3, x3] [y3, z3, 0, 0]
//
// First output register [in_depth, depth_multiplier]
// [q0, q1, q2, q3] = ([a0, a0, a1, a1] x [u0, v0, w0, x0]) +
//                    ([b0, b0, b1, b1] x [u1, v1, w1, x1]) +
//                    ([e0, e0, e1, e1] x [u2, v2, w2, x2]) +
//                    ([f0, f0, f1, f1] x [u3, v3, w3, x3])
//
// TODO(andydavis) Experiment with processing multiple inputs per input buffer.
template <typename T> struct DepthwiseConv2DKernel
{
  static void Run(int filter_rows, int filter_cols, int out_cols, int out_depth,
                  const int64_t padded_filter_inner_dim_size, const int64_t out_r,
                  const int64_t out_c, const T *filter, const T *input_buffer, T *output)
  {
    typedef typename Eigen::internal::packet_traits<T>::type Packet;
    static const int64_t kPacketSize = (sizeof(Packet) / sizeof(T));

    const int64_t filter_spatial_size = static_cast<int64_t>(filter_rows) * filter_cols;
    const int64_t output_scalar_size = out_depth % kPacketSize;
    const int64_t output_vectorized_size = (out_depth / kPacketSize) * kPacketSize;
    const int64_t base_output_index = (out_r * out_cols + out_c) * out_depth;

    for (int i = 0; i < output_vectorized_size; i += kPacketSize)
    {
      // Reset accumulator.
      auto vaccum = Eigen::internal::pset1<Packet>(static_cast<T>(0));
      for (int j = 0; j < filter_spatial_size; ++j)
      {
        // Calculate index.
        const int64_t index = i + j * padded_filter_inner_dim_size;
        // Load filter.
        // TODO(andydavis) Unroll 'out_c' loop in caller so we can load
        // multiple inputs here to amortize the cost of each filter block load.
        const auto filter_block = Eigen::internal::ploadu<Packet>(filter + index);
        // Load input.
        const auto data_block = Eigen::internal::ploadu<Packet>(input_buffer + index);
        // Vector multiply-add.
        vaccum = Eigen::internal::pmadd<Packet>(filter_block, data_block, vaccum);
      }
      // Store vector accumulator to output.
      Eigen::internal::pstoreu<T>(output + base_output_index + i, vaccum);
    }

    if (output_scalar_size > 0)
    {
      auto vaccum = Eigen::internal::pset1<Packet>(static_cast<T>(0));
      for (int j = 0; j < filter_spatial_size; ++j)
      {
        const int64_t index = output_vectorized_size + j * padded_filter_inner_dim_size;
        const auto filter_block = Eigen::internal::ploadu<Packet>(filter + index);
        const auto data_block = Eigen::internal::ploadu<Packet>(input_buffer + index);
        vaccum = Eigen::internal::pmadd<Packet>(filter_block, data_block, vaccum);
      }
      // Load accumulator into an array and loop through output.
      T out_buf[kPacketSize];
      Eigen::internal::pstoreu<T>(out_buf, vaccum);
      const int64_t last_output_index = base_output_index + output_vectorized_size;
      for (int j = 0; j < output_scalar_size; ++j)
      {
        output[last_output_index + j] = out_buf[j];
      }
    }
  }
};
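
// Hedged micro-example (illustration only) of the Eigen packet primitives the
// kernel above is built from: ploadu (unaligned load), pmadd (multiply-add)
// and pstoreu (unaligned store). Computes out[0..kPacketSize) = a * b
// element-wise; all three pointers must cover at least one full packet.
inline void ExamplePacketMultiplyAdd(const float *a, const float *b, float *out)
{
  typedef Eigen::internal::packet_traits<float>::type Packet;
  const auto pa = Eigen::internal::ploadu<Packet>(a);
  const auto pb = Eigen::internal::ploadu<Packet>(b);
  auto acc = Eigen::internal::pset1<Packet>(0.0f);    // zeroed accumulator
  acc = Eigen::internal::pmadd<Packet>(pa, pb, acc);  // acc = pa * pb + acc
  Eigen::internal::pstoreu<float>(out, acc);
}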

// Computes the depthwise conv2d of 'input' by 'depthwise_filter' and stores
// the result in 'output'. This implementation trades the cost of copying
// small patches of the input for better data alignment, which enables
// vectorized load/store and multiply-add operations (see comments at
// DepthwiseInputCopyOp and DepthwiseConv2DKernel for details).
//
// TODO(andydavis) Evaluate the performance of processing multiple input
// patches in the inner loop.
// TODO(andydavis) Consider a zero-copy implementation for the case when
// 'in_depth' is a multiple of register width, and 'depth_multiplier' is one.
// TODO(andydavis) Evaluate the performance of alternative implementations.
template <typename T> struct LaunchDepthwiseConvOp<CPUDevice, T>
{
  typedef typename Eigen::internal::packet_traits<T>::type Packet;

  void operator()(int batch, int in_rows, int in_cols, int in_depth, int filter_rows,
                  int filter_cols, int depth_multiplier, int stride, int pad_rows, int pad_cols,
                  int out_rows, int out_cols, int out_depth, const T *input,
                  const T *depthwise_filter, T *padded_filter_data, bool pad_filter, T *in_buf,
                  T *output)
  {
    const Eigen::ThreadPoolDevice &d = *eigen_support::GetThreadPoolDevice();

    // Pad 'depthwise_filter' to vector register width (if needed).
    if (pad_filter)
    {
      // Write out padded filter.
      functor::DepthwiseFilterPadOp<T>()(
        batch, in_rows, in_cols, in_depth, filter_rows, filter_cols, depth_multiplier, stride,
        pad_rows, pad_cols, out_rows, out_cols, out_depth, depthwise_filter, padded_filter_data);
    }
    const T *filter_data = pad_filter ? padded_filter_data : depthwise_filter;

    // Computes one shard of depthwise conv2d output.
    auto shard = [d, in_rows, in_cols, in_depth, out_rows, out_cols, out_depth, batch, filter_rows,
                  filter_cols, depth_multiplier, stride, pad_rows, pad_cols, input, filter_data,
                  in_buf, output](int64_t start, int64_t limit) {
      int cur_id = d.currentThreadId() + 1;
      assert(cur_id >= 0 && cur_id < d.numThreads() + 1);

      static const int64_t kPacketSize = (sizeof(Packet) / sizeof(T));
      const int64_t input_image_size = static_cast<int64_t>(in_rows) * in_cols * in_depth;
      const int64_t output_image_size = static_cast<int64_t>(out_rows) * out_cols * out_depth;
      const int64_t filter_spatial_size = static_cast<int64_t>(filter_rows) * filter_cols;
      const int64_t padded_filter_inner_dim_size =
        ((out_depth + kPacketSize - 1) / kPacketSize) * kPacketSize;
      const int64_t padded_filter_size = filter_spatial_size * padded_filter_inner_dim_size;

      T *input_buffer_data = in_buf + cur_id * padded_filter_size;

      for (int64_t i = start; i < limit; ++i)
      {
        const int64_t b = i / out_rows;
        const int64_t in_base = b * input_image_size;
        const int64_t out_base = b * output_image_size;

        const int64_t out_r = i % out_rows;

        for (int64_t out_c = 0; out_c < out_cols; ++out_c)
        {
          // Populate 'input_buffer_data' with data from local input region.
          functor::DepthwiseInputCopyOp<T>()(
            batch, in_rows, in_cols, in_depth, filter_rows, filter_cols, depth_multiplier, stride,
            pad_rows, pad_cols, out_rows, out_cols, out_depth, padded_filter_inner_dim_size, out_r,
            out_c, input + in_base, input_buffer_data);

          // Process buffered input across all filters and store to output.
          DepthwiseConv2DKernel<T>::Run(filter_rows, filter_cols, out_cols, out_depth,
                                        padded_filter_inner_dim_size, out_r, out_c, filter_data,
                                        input_buffer_data, output + out_base);
        }
      }
    };

    const int64_t total_shards = static_cast<int64_t>(batch) * out_rows;

    // Empirically tested to give reasonable performance boosts at batch size 1
    // without reducing throughput at batch size 32.
    const float kCostMultiplier = 2.5f;

    // TODO(andydavis): Estimate shard cost (in cycles) based on the number of
    // flops/loads/stores required to compute one shard.
    const int64_t shard_cost = kCostMultiplier * out_cols * out_depth;

    const int64_t input_bytes = static_cast<int64_t>(in_rows) * in_cols * in_depth * sizeof(T);
    const int64_t output_bytes = static_cast<int64_t>(out_rows) * out_cols * out_depth * sizeof(T);
    const Eigen::TensorOpCost cost(input_bytes, output_bytes, shard_cost);
    d.parallelFor(total_shards, cost, shard);
  }
};
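
// Hedged sizing sketch (illustration only, not upstream API): the caller owns
// 'padded_filter_data' and 'in_buf'. Following the size math in the shard
// lambda above, 'padded_filter_data' needs filter_rows * filter_cols *
// padded_inner_dim elements when pad_filter is true, and 'in_buf' needs one
// such buffer per worker thread plus one, because the lambda indexes it with
// currentThreadId() + 1.
inline int64_t ExampleInputBufferSize(int filter_rows, int filter_cols, int out_depth,
                                      int num_threads)
{
  const int64_t kPacketSize = Eigen::internal::packet_traits<float>::size;
  const int64_t padded_inner_dim = ((out_depth + kPacketSize - 1) / kPacketSize) * kPacketSize;
  const int64_t padded_filter_size =
    static_cast<int64_t>(filter_rows) * filter_cols * padded_inner_dim;
  return (static_cast<int64_t>(num_threads) + 1) * padded_filter_size;
}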

// Copies data from local region in 'out_backprop' into 'buffer'.
// The local region coordinates are calculated as the set of output points which
// used the input point ('in_r', 'in_c') as input during the forward pass.
// Rather than spatially reversing the filter, the input is reversed during
// the copy. The copied data is padded to vector register-width boundaries so
// that it is aligned for efficient traversal and vector multiply-add by the
// depthwise input kernel.
//
// EX:
// in_depth = 3, depth_multiplier = 2, filter [2, 2], register_width = 4
//
// 'out_backprop': [batch, out_rows, out_cols, out_depth]
//
// [a00, a01, a10, a11] [a20, a21, b00, b01]
// [b10, b11, b20, b21] [...]
// [e00, e01, e10, e11] [e20, e21, f00, f01]
// [f10, f11, f20, f21] [...]
//
// 'buffer' (register boundaries shown):
//
// [f00, f01, f10, f11] [f20, f21, 0, 0] in_row = 0, in_col = 0
// [e00, e01, e10, e11] [e20, e21, 0, 0] in_row = 0, in_col = 1
// [b00, b01, b10, b11] [b20, b21, 0, 0] in_row = 1, in_col = 0
// [a00, a01, a10, a11] [a20, a21, 0, 0] in_row = 1, in_col = 1
//
template <typename T>
void CopyOutputBackpropRegion(int, int, int, int, int filter_rows_, int filter_cols_, int,
                              int stride_, int pad_rows_, int pad_cols_, int out_rows_,
                              int out_cols_, int out_depth,
                              const int64_t padded_filter_inner_dim_size, const int64_t in_r,
                              const int64_t in_c, const T *out_backprop, T *buffer)
{
  typedef typename Eigen::internal::packet_traits<T>::type Packet;
  static const int64_t kPacketSize = (sizeof(Packet) / sizeof(T));

  const int64_t stride = stride_;
  const int64_t filter_rows = filter_rows_;
  const int64_t filter_cols = filter_cols_;
  const int64_t pad_rows = pad_rows_;
  const int64_t pad_cols = pad_cols_;
  const int64_t out_rows = out_rows_;
  const int64_t out_cols = out_cols_;

  // Calculate the output spatial region which used point (in_r, in_c) as input.
  const int64_t out_r_start =
    std::max(static_cast<int64_t>(0), (in_r - filter_rows + pad_rows + stride) / stride);
  const int64_t out_r_end = std::min(out_rows - 1, (in_r + pad_rows) / stride);
  const int64_t out_c_start =
    std::max(static_cast<int64_t>(0), (in_c - filter_cols + pad_cols + stride) / stride);
  const int64_t out_c_end = std::min(out_cols - 1, (in_c + pad_cols) / stride);

  // Zero-pad 'buffer' if output region is smaller than filter spatial size.
  const int64_t filter_spatial_size = filter_rows * filter_cols;
  if ((out_r_end - out_r_start + 1) < filter_rows || (out_c_end - out_c_start + 1) < filter_cols)
  {
    memset(buffer, 0, filter_spatial_size * padded_filter_inner_dim_size * sizeof(T));
  }

  // Calculate vectorized and scalar (residual) lengths for 'out_depth'.
  const int64_t vectorized_size = (out_depth / kPacketSize) * kPacketSize;
  const int64_t scalar_size = out_depth % kPacketSize;
  const int64_t pad_size = scalar_size > 0 ? kPacketSize - scalar_size : 0;

  for (int out_r = out_r_start; out_r <= out_r_end; ++out_r)
  {
    const int64_t f_r = in_r + pad_rows - out_r * stride;
    for (int out_c = out_c_start; out_c <= out_c_end; ++out_c)
    {
      const int64_t f_c = in_c + pad_cols - out_c * stride;
      const int64_t buf_base = (f_r * filter_cols + f_c) * padded_filter_inner_dim_size;
      // Calculate index into 'out_backprop' for coordinate (out_r, out_c).
      auto *out_bprop = out_backprop + (out_r * out_cols + out_c) * out_depth;

      // Copy vectorized portion of inner dimension into 'buffer'.
      for (int64_t d = 0; d < vectorized_size; d += kPacketSize)
      {
        auto v = Eigen::internal::ploadu<Packet>(out_bprop + d);
        Eigen::internal::pstoreu<T>(buffer + buf_base + d, v);
      }
      // Copy scalar portion of out_bprop to 'buffer'.
      for (int64_t d = 0; d < scalar_size; ++d)
      {
        buffer[buf_base + vectorized_size + d] = out_bprop[vectorized_size + d];
      }
      // Pad to vector-register width (if needed).
      for (int64_t d = 0; d < pad_size; ++d)
      {
        buffer[buf_base + vectorized_size + scalar_size + d] = static_cast<T>(0);
      }
    }
  }
}
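
// Worked example of the region bounds above (illustration only): with
// filter_rows = 2, stride = 1, pad_rows = 0, out_rows = 2 and in_r = 1,
//   out_r_start = max(0, (1 - 2 + 0 + 1) / 1) = 0
//   out_r_end   = min(2 - 1, (1 + 0) / 1)     = 1
// so output rows 0 and 1 both consumed input row 1 in the forward pass, and
// both contribute to its gradient.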

// Computes the vectorized product of 'buffer' and 'filter' and stores
// result in 'output' at location computed from 'in_r' and 'in_c'.
// If depth_multiplier is > 1, the intermediate output is reduced along
// the depth_multiplier dimension.
//
// EX:
// in_depth = 3, depth_multiplier = 2, filter [2, 2], register_width = 4
// Both 'input_buffer' and 'filter' are padded to register-width boundaries.
//
// 'buffer' [rows, cols, in_depth, depth_multiplier]
//
// [f00, f01, f10, f11] [f20, f21, 0, 0] in_row = 0, in_col = 0
// [e00, e01, e10, e11] [e20, e21, 0, 0] in_row = 0, in_col = 1
// [b00, b01, b10, b11] [b20, b21, 0, 0] in_row = 1, in_col = 0
// [a00, a01, a10, a11] [a20, a21, 0, 0] in_row = 1, in_col = 1
//
// filter [rows, cols, in_depth, depth_multiplier]
// [u0, v0, w0, x0] [y0, z0, 0, 0] [u1, v1, w1, x1] [y1, z1, 0, 0]
// [u2, v2, w2, x2] [y2, z2, 0, 0] [u3, v3, w3, x3] [y3, z3, 0, 0]
//
// First output register [in_depth, depth_multiplier]
// [q00, q01, q10, q11] = ([f00, f01, f10, f11] x [u0, v0, w0, x0]) +
//                        ([e00, e01, e10, e11] x [u1, v1, w1, x1]) +
//                        ([b00, b01, b10, b11] x [u2, v2, w2, x2]) +
//                        ([a00, a01, a10, a11] x [u3, v3, w3, x3])
//
// Reduction step along depth-multiplier dimension:
//
// [q00, q01, q10, q11] [q20, q21, 0, 0] -> [r0, r1, r2, 0]
//

template <typename T>
void ComputeBackpropInput(int, int, int in_cols, int in_depth_, int filter_rows, int filter_cols,
                          int depth_multiplier_, int, int, int, int, int, int out_depth_,
                          const int64_t padded_filter_inner_dim_size, const int64_t in_r,
                          const int64_t in_c, const T *filter, const T *buffer, T *out_buffer,
                          T *output)
{
  typedef typename Eigen::internal::packet_traits<T>::type Packet;
  static const int64_t kPacketSize = (sizeof(Packet) / sizeof(T));

  const int64_t in_depth = in_depth_;
  const int64_t depth_multiplier = depth_multiplier_;
  const int64_t out_depth = out_depth_;
  const int64_t filter_spatial_size = filter_rows * filter_cols;

  // Calculate vectorized and scalar lengths of 'out_depth'.
  const int64_t output_vectorized_size = (out_depth / kPacketSize) * kPacketSize;
  const int64_t output_scalar_size = out_depth % kPacketSize;

  // Calculate base index at which to begin writing output.
  const int64_t base_output_index = (in_r * in_cols + in_c) * in_depth;

  // Calculate vectorized and scalar lengths for 'depth_multiplier'. This is
  // used to efficiently reduce output when 'depth_multiplier' > kPacketSize.
  const int64_t dm_vectorized_size = (depth_multiplier / kPacketSize) * kPacketSize;
  const int64_t dm_scalar_size = depth_multiplier % kPacketSize;

  for (int i = 0; i < output_vectorized_size; i += kPacketSize)
  {
    // Reset accumulator.
    auto vaccum = Eigen::internal::pset1<Packet>(static_cast<T>(0));
    for (int j = 0; j < filter_spatial_size; ++j)
    {
      // Calculate index.
      const int64_t index = i + j * padded_filter_inner_dim_size;
      // Load filter.
      const auto filter_block = Eigen::internal::ploadu<Packet>(filter + index);
      // Load input.
      const auto data_block = Eigen::internal::ploadu<Packet>(buffer + index);
      // Vector multiply-add.
      vaccum = Eigen::internal::pmadd<Packet>(filter_block, data_block, vaccum);
    }
    if (depth_multiplier == 1)
    {
      // Write directly to the output.
      Eigen::internal::pstoreu<T>(output + base_output_index + i, vaccum);
    }
    else
    {
      // Buffer output for subsequent reduction step.
      Eigen::internal::pstoreu<T>(out_buffer + i, vaccum);
    }
  }

  if (output_scalar_size > 0)
  {
    auto vaccum = Eigen::internal::pset1<Packet>(static_cast<T>(0));
    for (int j = 0; j < filter_spatial_size; ++j)
    {
      const int64_t index = output_vectorized_size + j * padded_filter_inner_dim_size;
      const auto filter_block = Eigen::internal::ploadu<Packet>(filter + index);
      const auto data_block = Eigen::internal::ploadu<Packet>(buffer + index);
      vaccum = Eigen::internal::pmadd<Packet>(filter_block, data_block, vaccum);
    }
    // Load accumulator into an array and loop through output.
    T out_buf[kPacketSize];
    Eigen::internal::pstoreu<T>(out_buf, vaccum);
    if (depth_multiplier == 1)
    {
      // Write directly to the output.
      for (int j = 0; j < output_scalar_size; ++j)
      {
        output[base_output_index + output_vectorized_size + j] = out_buf[j];
      }
    }
    else
    {
      // Buffer output for subsequent reduction step.
      for (int j = 0; j < output_scalar_size; ++j)
      {
        out_buffer[output_vectorized_size + j] = out_buf[j];
      }
    }
  }

  // Iterate over 'in_depth', reduce over 'depth_multiplier', write 'output'.
  if (depth_multiplier > 1)
  {
    for (int64_t d = 0; d < in_depth; ++d)
    {
      const int64_t index = d * depth_multiplier;
      T accum = static_cast<T>(0);
      for (int64_t dm = 0; dm < dm_vectorized_size; dm += kPacketSize)
      {
        const auto v = Eigen::internal::ploadu<Packet>(out_buffer + index + dm);
        accum += Eigen::internal::predux(v);
      }
      // Accumulate the scalar remainder of the replicated output.
      for (int64_t dm = 0; dm < dm_scalar_size; ++dm)
      {
        accum += out_buffer[index + dm_vectorized_size + dm];
      }
      // Copy to output.
      output[base_output_index + d] = accum;
    }
  }
}
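
// Worked example of the reduction step above (illustration only): with
// in_depth = 3 and depth_multiplier = 2, 'out_buffer' holds
// [q00, q01, q10, q11, q20, q21] and the reduced gradient written to
// 'output' is
//   r0 = q00 + q01,  r1 = q10 + q11,  r2 = q20 + q21
// i.e. each input channel sums the contributions of its depth_multiplier
// output channels.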

// Computes the depthwise conv2d backprop input of 'out_backprop' by
// 'depthwise_filter' and stores the result in 'in_backprop'.
template <typename T> struct LaunchDepthwiseConvBackpropInputOp<CPUDevice, T>
{
  typedef typename Eigen::internal::packet_traits<T>::type Packet;

  void operator()(int batch, int in_rows, int in_cols, int in_depth, int filter_rows,
                  int filter_cols, int depth_multiplier, int stride, int pad_rows, int pad_cols,
                  int out_rows, int out_cols, int out_depth, const T *out_backprop,
                  const T *depthwise_filter, T *padded_filter_data, T *in_backprop, bool pad_filter,
                  T *out_bprop, T *in_bprop)
  {
    const Eigen::ThreadPoolDevice &d = *eigen_support::GetThreadPoolDevice();

    // Pad 'depthwise_filter' to vector register width (if needed).
    if (pad_filter)
    {
      // Write out padded filter.
      functor::DepthwiseFilterPadOp<T>()(
        batch, in_rows, in_cols, in_depth, filter_rows, filter_cols, depth_multiplier, stride,
        pad_rows, pad_cols, out_rows, out_cols, out_depth, depthwise_filter, padded_filter_data);
    }
    const T *filter_data = pad_filter ? padded_filter_data : depthwise_filter;

    // Computes one shard of depthwise conv2d backprop input.
    auto shard = [d, in_rows, in_cols, in_depth, out_rows, out_cols, out_depth, batch, filter_rows,
                  filter_cols, depth_multiplier, stride, pad_rows, pad_cols, out_backprop,
                  filter_data, in_backprop, out_bprop, in_bprop](int64_t start, int64_t limit) {
      static const int64_t kPacketSize = (sizeof(Packet) / sizeof(T));

      const int64_t input_image_size = in_rows * in_cols * in_depth;
      const int64_t output_image_size = out_rows * out_cols * out_depth;
      const int64_t filter_spatial_size = filter_rows * filter_cols;
      const int64_t padded_filter_inner_dim_size =
        ((out_depth + kPacketSize - 1) / kPacketSize) * kPacketSize;
      const int64_t out_bprop_size = filter_spatial_size * padded_filter_inner_dim_size;

      int cur_id = d.currentThreadId() + 1;
      assert(cur_id >= 0 && cur_id < d.numThreads() + 1);

      // Use out_bprop buffer to copy regions from 'out_backprop'.
      T *out_bprop_buf = out_bprop + cur_id * out_bprop_size;

      // Use in_bprop buffer for intermediate results.
      T *in_bprop_buf = in_bprop + cur_id * padded_filter_inner_dim_size;

      for (int64_t b = start; b < limit; ++b)
      {
        for (int64_t in_r = 0; in_r < in_rows; ++in_r)
        {
          for (int64_t in_c = 0; in_c < in_cols; ++in_c)
          {
            // Populate 'out_bprop_buf' from local 'out_backprop' region.
            CopyOutputBackpropRegion<T>(batch, in_rows, in_cols, in_depth, filter_rows, filter_cols,
                                        depth_multiplier, stride, pad_rows, pad_cols, out_rows,
                                        out_cols, out_depth, padded_filter_inner_dim_size, in_r,
                                        in_c, out_backprop + b * output_image_size, out_bprop_buf);

            // Compute depthwise backprop input.
            ComputeBackpropInput<T>(
              batch, in_rows, in_cols, in_depth, filter_rows, filter_cols, depth_multiplier, stride,
              pad_rows, pad_cols, out_rows, out_cols, out_depth, padded_filter_inner_dim_size, in_r,
              in_c, filter_data, out_bprop_buf, in_bprop_buf, in_backprop + b * input_image_size);
          }
        }
      }
    };

    const int64_t input_bytes = out_rows * out_cols * out_depth * sizeof(T);
    const int64_t output_bytes = in_rows * in_cols * in_depth * sizeof(T);
    const int64_t compute_cycles = in_rows * in_cols * out_depth * batch;
    const Eigen::TensorOpCost cost(input_bytes, output_bytes, compute_cycles);
    d.parallelFor(batch, cost, shard);
  }
};

template <typename T>
void DepthwiseConvBackpropInputReference(int batch, int in_rows, int in_cols, int in_depth,
                                         int out_rows, int out_cols, int out_depth, int stride,
                                         int depth_multiplier, int filter_rows, int filter_cols,
                                         int pad_rows, int pad_cols, const T *out_backprop,
                                         const T *filter, T *in_backprop)
{
  // Naive for loop as a reference point without concerns about performance.
  for (int b = 0; b < batch; ++b)
  {
    for (int in_r = 0; in_r < in_rows; ++in_r)
    {
      for (int in_c = 0; in_c < in_cols; ++in_c)
      {
        for (int in_d = 0; in_d < in_depth; ++in_d)
        {
          T sum = 0;
          const int out_d_start = in_d * depth_multiplier;
          const int out_d_end = out_d_start + depth_multiplier;

          for (int out_d = out_d_start; out_d < out_d_end; ++out_d)
          {
            const int out_r_start = std::max(0, (in_r - filter_rows + pad_rows + stride) / stride);
            const int out_r_end = std::min(out_rows - 1, (in_r + pad_rows) / stride);

            for (int out_r = out_r_start; out_r <= out_r_end; ++out_r)
            {
              const int out_c_start =
                std::max(0, (in_c - filter_cols + pad_cols + stride) / stride);
              const int out_c_end = std::min(out_cols - 1, (in_c + pad_cols) / stride);

              for (int out_c = out_c_start; out_c <= out_c_end; ++out_c)
              {
                int f_r = in_r + pad_rows - out_r * stride;
                int f_c = in_c + pad_cols - out_c * stride;
                int filter_dm = out_d - out_d_start;
                int out_backprop_offset =
                  out_d + out_depth * (out_c + out_cols * (out_r + out_rows * b));
                int filter_offset =
                  filter_dm + depth_multiplier * (in_d + in_depth * (f_c + filter_cols * f_r));
                sum += out_backprop[out_backprop_offset] * filter[filter_offset];
              }
            }
          }

          int in_backprop_offset = in_d + in_depth * (in_c + in_cols * (in_r + in_rows * b));
          in_backprop[in_backprop_offset] = sum;
        }
      }
    }
  }
}
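
// Hedged usage sketch (illustration only, not upstream API): a 1-batch 2x2x1
// input, a 2x2 depthwise filter, stride 1 and no padding yield a single 1x1x1
// output, so every input pixel receives the one output gradient scaled by its
// filter tap.
inline void ExampleBackpropInputReference()
{
  const float out_backprop[1] = {1.0f};             // d(loss)/d(output)
  const float filter[4] = {1.0f, 2.0f, 3.0f, 4.0f}; // [2, 2, 1, 1]
  float in_backprop[4] = {};
  DepthwiseConvBackpropInputReference<float>(
    /*batch=*/1, /*in_rows=*/2, /*in_cols=*/2, /*in_depth=*/1, /*out_rows=*/1, /*out_cols=*/1,
    /*out_depth=*/1, /*stride=*/1, /*depth_multiplier=*/1, /*filter_rows=*/2, /*filter_cols=*/2,
    /*pad_rows=*/0, /*pad_cols=*/0, out_backprop, filter, in_backprop);
  // in_backprop now holds {1, 2, 3, 4}.
}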

// Kernels to compute the gradients of the filters for depthwise convolution.

// Computes filter backprop using 'out_backprop' and 'input_buffer', storing the
// result in 'output_buffer' at an index computed from 'out_r' and 'out_c'.
//
// EX:
// in_depth = 3, depth_multiplier = 2, filter [2, 2], register_width = 4
// Both 'input_buffer' and 'filter' are padded to register-width boundaries.
//
// 'input_buffer' [rows, cols, in_depth, depth_multiplier]
//
// [f00, f01, f10, f11] [f20, f21, 0, 0] in_row = 0, in_col = 0
// [e00, e01, e10, e11] [e20, e21, 0, 0] in_row = 0, in_col = 1
// [b00, b01, b10, b11] [b20, b21, 0, 0] in_row = 1, in_col = 0
// [a00, a01, a10, a11] [a20, a21, 0, 0] in_row = 1, in_col = 1
//
// 'out_backprop' [out_rows, out_cols, in_depth, depth_multiplier]
//
// [q00, q01, q10, q11] [q20, q21, r00, r01]
// [r10, r11, r20, r21] [s00, s01, s10, s11]
// [s20, s21, t00, t01] [t10, t11, t20, t21]
//
// First output register of 'filter_backprop'
// [u0, v0, w0, x0] += ([f00, f01, f10, f11] x [q00, q01, q10, q11])
//
template <typename T>
void ComputeBackpropFilter(int, int, int, int, int filter_rows, int filter_cols, int, int, int, int,
                           int out_rows, int out_cols, int out_depth_,
                           const int64_t padded_out_depth_size, const int64_t out_r,
                           const int64_t out_c, const T *out_backprop, const T *input_buffer,
                           T *output_buffer)
{
  typedef typename Eigen::internal::packet_traits<T>::type Packet;
  static const int64_t kPacketSize = (sizeof(Packet) / sizeof(T));
  // Calculate vectorized size of 'padded_out_depth_size'.
  const int64_t out_depth = out_depth_;
  const int64_t filter_spatial_size = filter_rows * filter_cols;
  const int64_t output_vectorized_size = (padded_out_depth_size / kPacketSize) * kPacketSize;
  const int64_t base_output_index = (out_r * out_cols + out_c) * out_depth;
  // Determine whether we can execute fast or slow code path.
  const int64_t output_image_size = out_rows * out_cols * out_depth;
  const int64_t output_last_vector_index =
    output_image_size - (filter_spatial_size * padded_out_depth_size);
  const bool fast_path = base_output_index <= output_last_vector_index;

  if (fast_path)
  {
    // TODO(andydavis) Process multiple inputs in 'input_buffer' so we can
    // amortize the cost of 'output_buffer' load store in the loop below.
    for (int i = 0; i < output_vectorized_size; i += kPacketSize)
    {
      // Load vector register from 'out_backprop'.
      const auto out_bprop_block =
        Eigen::internal::ploadu<Packet>(out_backprop + base_output_index + i);
      for (int j = 0; j < filter_spatial_size; ++j)
      {
        const int64_t index = i + j * padded_out_depth_size;
        // Load vector register from 'input_buffer'.
        const auto input_block = Eigen::internal::ploadu<Packet>(input_buffer + index);
        // Load output block into vector register.
        auto out_block_data = output_buffer + index;
        auto out_block = Eigen::internal::ploadu<Packet>(out_block_data);
        // Vector multiply-add.
        out_block = Eigen::internal::pmadd<Packet>(out_bprop_block, input_block, out_block);
        // Store 'out_block' back to memory.
        Eigen::internal::pstoreu<T>(out_block_data, out_block);
      }
    }
  }
  else
  {
    // Slow path (can't do vector reads from the non-padded 'out_backprop').
    for (int i = 0; i < output_vectorized_size; i += kPacketSize)
    {
      // Calculate safe read size from 'out_backprop'.
      const int64_t out_bprop_index = base_output_index + i;
      const int64_t out_bprop_limit = std::min(output_image_size, out_bprop_index + kPacketSize);
      T out_buf[kPacketSize];
      memset(&out_buf, 0, kPacketSize * sizeof(T));
      const int64_t scalar_size = out_bprop_limit - out_bprop_index;
      for (int64_t j = 0; j < scalar_size; ++j)
      {
        out_buf[j] = out_backprop[out_bprop_index + j];
      }
      // Load vector register from 'out_buf'.
      const auto out_bprop_block = Eigen::internal::ploadu<Packet>(out_buf);
      for (int j = 0; j < filter_spatial_size; ++j)
      {
        const int64_t index = i + j * padded_out_depth_size;
        // Load vector register from 'input_buffer'.
        const auto input_block = Eigen::internal::ploadu<Packet>(input_buffer + index);
        // Load output block into vector register.
        auto out_block_data = output_buffer + index;
        auto out_block = Eigen::internal::ploadu<Packet>(out_block_data);
        // Vector multiply-add.
        out_block = Eigen::internal::pmadd<Packet>(out_bprop_block, input_block, out_block);
        // Store 'out_block' back to memory.
        Eigen::internal::pstoreu<T>(out_block_data, out_block);
      }
    }
  }
}
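
// Worked example of the path selection above (illustration only): for a
// 2 x 2 x (out_depth = 6) output with a 2 x 2 filter and
// padded_out_depth_size = 8,
//   output_image_size        = 2 * 2 * 6    = 24
//   output_last_vector_index = 24 - (4 * 8) = -8
// so base_output_index can never be <= -8 and every output position takes the
// slow path, which bounds its packet reads by 'output_image_size'.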

template <typename T> struct LaunchDepthwiseConvBackpropFilterOp<CPUDevice, T>
{
  typedef typename Eigen::internal::packet_traits<T>::type Packet;

  void operator()(int batch, int in_rows, int in_cols, int in_depth, int filter_rows,
                  int filter_cols, int depth_multiplier, int stride, int pad_rows, int pad_cols,
                  int out_rows, int out_cols, int out_depth, const T *out_backprop, const T *input,
                  T *filter_backprop, T *padded_filter_data, T *in_bprop)
  {
    const Eigen::ThreadPoolDevice &d = *eigen_support::GetThreadPoolDevice();

    static const int64_t kPacketSize = (sizeof(Packet) / sizeof(T));

    const int64_t filter_spatial_size = filter_rows * filter_cols;
    const int64_t padded_out_depth_size =
      ((out_depth + kPacketSize - 1) / kPacketSize) * kPacketSize;

    T *output_buffer_data = padded_filter_data;

    // Computes one shard of depthwise conv2d backprop filter.
    auto shard = [&](int64_t start, int64_t limit) {
      static const int64_t kPacketSize = (sizeof(Packet) / sizeof(T));
      const int64_t filter_spatial_size = filter_rows * filter_cols;
      const int64_t padded_out_depth_size =
        ((out_depth + kPacketSize - 1) / kPacketSize) * kPacketSize;

      int cur_id = d.currentThreadId() + 1;
      assert(cur_id >= 0 && cur_id < d.numThreads() + 1);

      const int64_t input_image_size = in_rows * in_cols * in_depth;
      const int64_t output_image_size = out_rows * out_cols * out_depth;
      const int64_t padded_filter_size = filter_spatial_size * padded_out_depth_size;

      T *input_buffer_data = in_bprop + cur_id * padded_filter_size;

      for (int b = start; b < limit; ++b)
      {
        // Initialize 'output_buffer' for 'b'.
        auto *output_buffer = output_buffer_data + b * padded_filter_size;
        memset(output_buffer, 0, padded_filter_size * sizeof(T));

        for (int out_r = 0; out_r < out_rows; ++out_r)
        {
          for (int out_c = 0; out_c < out_cols; ++out_c)
          {
            // Populate 'input_buffer_data' with data from local input region.
            functor::DepthwiseInputCopyOp<T>()(
              batch, in_rows, in_cols, in_depth, filter_rows, filter_cols, depth_multiplier,
              stride, pad_rows, pad_cols, out_rows, out_cols, out_depth, padded_out_depth_size,
              out_r, out_c, input + b * input_image_size, input_buffer_data);
            // Compute depthwise backprop filter.
            ComputeBackpropFilter<T>(
              batch, in_rows, in_cols, in_depth, filter_rows, filter_cols, depth_multiplier,
              stride, pad_rows, pad_cols, out_rows, out_cols, out_depth, padded_out_depth_size,
              out_r, out_c, out_backprop + b * output_image_size, input_buffer_data,
              output_buffer);
          }
        }
      }
    };

    const int64_t input_bytes = in_rows * in_cols * in_depth * sizeof(T);
    const int64_t output_bytes = out_rows * out_cols * out_depth * sizeof(T);
    const int64_t compute_cycles = out_rows * out_cols * out_depth * batch;
    const Eigen::TensorOpCost cost(input_bytes, output_bytes, compute_cycles);
    d.parallelFor(batch, cost, shard);

    // Accumulate the per-batch 'output_buffer' shards into 'filter_backprop'.
    const int64_t vectorized_size = (out_depth / kPacketSize) * kPacketSize;
    const int64_t scalar_size = out_depth - vectorized_size;
    const int64_t padded_filter_size = filter_spatial_size * padded_out_depth_size;
    memset(filter_backprop, 0, filter_spatial_size * out_depth * sizeof(T));

    for (int64_t i = 0; i < filter_spatial_size; ++i)
    {
      const int64_t buffer_base = i * padded_out_depth_size;
      const int64_t output_base = i * out_depth;
      // Write vectorized length of filter's inner dimension to output.
      for (int64_t j = 0; j < vectorized_size; j += kPacketSize)
      {
        // Load data from 'filter_backprop' into vector register.
        auto out_block_data = filter_backprop + output_base + j;
        auto out_block = Eigen::internal::ploadu<Packet>(out_block_data);
        for (int b = 0; b < batch; ++b)
        {
          // Load data from 'output_buffer' for 'b'.
          const auto *output_buffer = output_buffer_data + b * padded_filter_size;
          const auto v = Eigen::internal::ploadu<Packet>(output_buffer + buffer_base + j);
          // Add 'v' to 'out_block'.
          out_block = Eigen::internal::padd<Packet>(out_block, v);
        }
        // Store 'out_block' back to memory.
        Eigen::internal::pstoreu<T>(out_block_data, out_block);
      }
      // Write scalar length of filter's inner dimension to output.
      for (int64_t j = 0; j < scalar_size; ++j)
      {
        for (int b = 0; b < batch; ++b)
        {
          const auto *output_buffer = output_buffer_data + b * padded_filter_size;
          filter_backprop[output_base + vectorized_size + j] +=
            output_buffer[buffer_base + vectorized_size + j];
        }
      }
    }
  }
};
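
// Hedged sizing note (illustration only): in the launcher above,
// 'padded_filter_data' doubles as the per-batch accumulation buffer, so it
// must hold batch * filter_rows * filter_cols * padded_out_depth_size
// elements, and 'in_bprop' must hold one padded input buffer per worker
// thread plus one, as it is indexed with currentThreadId() + 1.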

template <typename T>
void DepthwiseConvBackpropFilterReference(int batch, int in_rows, int in_cols, int in_depth,
                                          int out_rows, int out_cols, int out_depth, int stride,
                                          int depth_multiplier, int filter_rows, int filter_cols,
                                          int pad_rows, int pad_cols, const T *out_backprop,
                                          const T *input, T *filter_backprop)
{
  int num_filter_backprop = filter_rows * filter_cols * in_depth * depth_multiplier;
  memset(filter_backprop, 0, num_filter_backprop * sizeof(T));
  // Naive for loop as a reference point without concerns about performance.
  for (int b = 0; b < batch; ++b)
  {
    for (int out_r = 0; out_r < out_rows; ++out_r)
    {
      for (int out_c = 0; out_c < out_cols; ++out_c)
      {
        for (int out_d = 0; out_d < out_depth; ++out_d)
        {
          const int in_d = out_d / depth_multiplier;
          const int dm = out_d % depth_multiplier;
          const int in_r_start = out_r * stride - pad_rows;
          const int in_c_start = out_c * stride - pad_cols;

          for (int f_r = 0; f_r < filter_rows; ++f_r)
          {
            for (int f_c = 0; f_c < filter_cols; ++f_c)
            {
              const int in_r = in_r_start + f_r;
              const int in_c = in_c_start + f_c;

              if (in_r >= 0 && in_r < in_rows && in_c >= 0 && in_c < in_cols)
              {
                int out_backprop_offset =
                  out_d + out_depth * (out_c + out_cols * (out_r + out_rows * b));
                int input_offset = in_d + in_depth * (in_c + in_cols * (in_r + in_rows * b));
                int filter_backprop_offset =
                  dm + depth_multiplier * (in_d + in_depth * (f_c + filter_cols * f_r));
                filter_backprop[filter_backprop_offset] +=
                  input[input_offset] * out_backprop[out_backprop_offset];
              }
            }
          }
        }
      }
    }
  }
}
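
// Hedged usage sketch (illustration only, not upstream API): a 1-batch 2x2x1
// input with a 2x2 depthwise filter, stride 1 and no padding yields a single
// 1x1x1 output, so each filter tap accumulates input * gradient exactly once.
inline void ExampleBackpropFilterReference()
{
  const float out_backprop[1] = {2.0f};            // d(loss)/d(output)
  const float input[4] = {1.0f, 2.0f, 3.0f, 4.0f}; // [1, 2, 2, 1]
  float filter_backprop[4] = {};
  DepthwiseConvBackpropFilterReference<float>(
    /*batch=*/1, /*in_rows=*/2, /*in_cols=*/2, /*in_depth=*/1, /*out_rows=*/1, /*out_cols=*/1,
    /*out_depth=*/1, /*stride=*/1, /*depth_multiplier=*/1, /*filter_rows=*/2, /*filter_cols=*/2,
    /*pad_rows=*/0, /*pad_cols=*/0, out_backprop, input, filter_backprop);
  // filter_backprop now holds {2, 4, 6, 8}.
}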

} // namespace depthwise_conv_op
} // namespace cker
} // namespace nnfw

#endif // __NNFW_CKER_EIGEN_DEPTHWISE_CONV_OP_H__