ONE - On-device Neural Engine
Loading...
Searching...
No Matches
nnfw::cker::depthwise_conv_op::DepthwiseConv2DKernel< T > Struct Template Reference

#include <depthwise_conv_op.h>

Static Public Member Functions

static void Run (int filter_rows, int filter_cols, int out_cols, int out_depth, const int64_t padded_filter_inner_dim_size, const int64_t out_r, const int64_t out_c, const T *filter, const T *input_buffer, T *output)
 

Detailed Description

template<typename T>
struct nnfw::cker::depthwise_conv_op::DepthwiseConv2DKernel< T >

Definition at line 362 of file depthwise_conv_op.h.

Member Function Documentation

◆ Run()

template<typename T >
static void nnfw::cker::depthwise_conv_op::DepthwiseConv2DKernel< T >::Run ( int  filter_rows,
int  filter_cols,
int  out_cols,
int  out_depth,
const int64_t  padded_filter_inner_dim_size,
const int64_t  out_r,
const int64_t  out_c,
const T *  filter,
const T *  input_buffer,
T *  output 
)
inline static

Definition at line 364 of file depthwise_conv_op.h.

367 {
// Computes the 'out_depth' output values for the single output position
// (out_r, out_c). For each channel offset it accumulates, over all
// filter_rows * filter_cols filter taps, the product of the matching
// 'filter' and 'input_buffer' elements, then writes the sums to 'output'
// starting at base_output_index.
//
// 'filter' and 'input_buffer' are read with the same index, so both are
// expected to share one layout: tap j starts j * padded_filter_inner_dim_size
// elements from the base pointer.
//
// Packet is Eigen's packet type for T; kPacketSize is how many T values
// fit in one packet.
368 typedef typename Eigen::internal::packet_traits<T>::type Packet;
369 static const int64_t kPacketSize = (sizeof(Packet) / sizeof(T));
370
371 const int64_t filter_spatial_size = static_cast<int64_t>(filter_rows) * filter_cols;
// Split out_depth into a part that is a whole number of packets
// (output_vectorized_size) and the leftover channels (output_scalar_size).
372 const int64_t output_scalar_size = out_depth % kPacketSize;
373 const int64_t output_vectorized_size = (out_depth / kPacketSize) * kPacketSize;
// Offset of this output position in 'output': out_cols positions per row,
// out_depth values per position.
374 const int64_t base_output_index = (out_r * out_cols + out_c) * out_depth;
375
// Full-packet path: one packet of kPacketSize channels per iteration.
// NOTE(review): 'i' and 'j' are int while their loop bounds are int64_t;
// confirm callers never exceed the int range.
376 for (int i = 0; i < output_vectorized_size; i += kPacketSize)
377 {
378 // Reset accumulator.
379 auto vaccum = Eigen::internal::pset1<Packet>(static_cast<T>(0));
380 for (int j = 0; j < filter_spatial_size; ++j)
381 {
382 // Calculate index.
383 const int64_t index = i + j * padded_filter_inner_dim_size;
384 // Load filter.
385 // TODO(andydavis) Unroll 'out_c' loop in caller so we can load
386 // multiple inputs here to amortize the cost of each filter block load.
387 const auto filter_block = Eigen::internal::ploadu<Packet>(filter + index);
388 // Load input.
389 const auto data_block = Eigen::internal::ploadu<Packet>(input_buffer + index);
390 // Vector multiply-add.
391 vaccum = Eigen::internal::pmadd<Packet>(filter_block, data_block, vaccum);
392 }
393 // Store vector accumulator to output.
394 Eigen::internal::pstoreu<T>(output + base_output_index + i, vaccum);
395 }
396
// Tail path: handles the last output_scalar_size channels that do not fill
// a whole packet. The arithmetic is the same as above and still uses
// full-packet loads; only the store differs.
// NOTE(review): each load below reads kPacketSize elements starting at
// output_vectorized_size, although only output_scalar_size lanes are used.
// Presumably 'filter' and 'input_buffer' are padded (see
// padded_filter_inner_dim_size) so these reads stay in bounds - confirm
// against the caller.
397 if (output_scalar_size > 0)
398 {
399 auto vaccum = Eigen::internal::pset1<Packet>(static_cast<T>(0));
400 for (int j = 0; j < filter_spatial_size; ++j)
401 {
402 const int64_t index = output_vectorized_size + j * padded_filter_inner_dim_size;
403 const auto filter_block = Eigen::internal::ploadu<Packet>(filter + index);
404 const auto data_block = Eigen::internal::ploadu<Packet>(input_buffer + index);
405 vaccum = Eigen::internal::pmadd<Packet>(filter_block, data_block, vaccum);
406 }
407 // Load accumulator into an array and loop through output.
// The packet is stored to a local buffer first so that only the first
// output_scalar_size values are copied into 'output'.
408 T out_buf[kPacketSize];
409 Eigen::internal::pstoreu<T>(out_buf, vaccum);
410 const int64_t last_output_index = base_output_index + output_vectorized_size;
411 for (int j = 0; j < output_scalar_size; ++j)
412 {
413 output[last_output_index + j] = out_buf[j];
414 }
415 }
416 }
loco::GraphInputIndex index(const TFPlaceholder *node)
Definition TFNode.cpp:54

Referenced by nnfw::cker::depthwise_conv_op::LaunchDepthwiseConvOp< CPUDevice, T >::operator()().


The documentation for this struct was generated from the following file: