18#ifndef __NNFW_CKER_DEPTHWISE_CONV_H__
19#define __NNFW_CKER_DEPTHWISE_CONV_H__
48 const T *input_data,
const Shape &filter_shape,
const T *filter_data,
50 T *output_data,
int thread_start,
int thread_end,
int thread_dim)
51 : params_(params), input_shape_(input_shape), input_data_(input_data),
52 filter_shape_(filter_shape), filter_data_(filter_data), bias_shape_(bias_shape),
53 bias_data_(bias_data), output_shape_(
output_shape), output_data_(output_data),
54 thread_start_(thread_start), thread_end_(thread_end), thread_dim_(thread_dim)
61 bias_shape_, bias_data_, output_shape_, output_data_,
62 thread_start_, thread_end_, thread_dim_);
67 const Shape &input_shape_;
69 const Shape &filter_shape_;
70 const T *filter_data_;
71 const Shape &bias_shape_;
73 const Shape &output_shape_;
85 static constexpr int kMinMulPerThread = 1 << 13;
86 const int filter_height = filter_shape.
Dims(1);
87 const int filter_width = filter_shape.
Dims(2);
88 const int num_muls =
output_shape.FlatSize() * filter_height * filter_width;
91 int thread_count = std::max(1, num_muls / kMinMulPerThread);
97 assert(thread_count >= 2);
100 if (batches < thread_count)
111 if (batches >= 2 * thread_count)
120 return ((batches % thread_count) == 0);
123template <
typename T,
typename TS>
125 const T *input_data,
const Shape &filter_shape,
const T *filter_data,
127 T *output_data, ruy::Context *ruy_context)
137 const auto max_threads = (ruy_context ==
nullptr) ? 1 : ruy_context->max_num_threads();
139 thread_count = std::max(1, std::min(thread_count, max_threads));
142 if (std::is_floating_point<T>::value)
144 thread_count = std::min(thread_count, 2);
150 if (thread_count == 1)
153 bias_shape, bias_data,
output_shape, output_data, 0, output_height,
158 int thread_dim, thread_dim_size;
162 thread_dim_size = output_batches;
167 thread_dim_size = output_height;
170 std::vector<DepthwiseConvWorkerTask<T, TS>> tasks;
173 tasks.reserve(thread_count);
174 int thread_start = 0;
175 for (
int i = 0; i < thread_count; ++i)
177 int thread_end = thread_start + (thread_dim_size - thread_start) / (thread_count - i);
178 tasks.emplace_back(params, input_shape, input_data, filter_shape, filter_data, bias_shape,
179 bias_data,
output_shape, output_data, thread_start, thread_end, thread_dim);
180 thread_start = thread_end;
186 const float *input_data,
const Shape &filter_shape,
const float *filter_data,
187 const Shape &bias_shape,
const float *bias_data,
float *padded_filter_data,
192 throw std::runtime_error(
"Not support different length strides");
195 throw std::runtime_error{
"Not support dilation other than 1."};
198 const int input_depth = input_shape.
Dims(3);
200 const int input_height = input_shape.
Dims(1);
201 const int input_width = input_shape.
Dims(2);
202 const int filter_height = filter_shape.
Dims(1);
203 const int filter_width = filter_shape.
Dims(2);
214 batch, input_height, input_width, input_depth, filter_height, filter_width, depth_multiplier,
215 stride, pad_height, pad_width, output_height, output_width, output_depth, input_data,
216 filter_data, padded_filter_data, pad_filter, filter_buffers_data, output_data);
218 if (bias_data !=
nullptr)
220 bias_op::biasHelper<float>(bias_shape, bias_data,
output_shape, output_data, activation_min,
int32_t DimensionsCount() const
int32_t Dims(int i) const
const luci_interpreter::RuntimeShape output_shape
void Execute(int tasks_count, TaskType *tasks, ruy::Context *ruy_context)
void DepthwiseConvImpl(const DepthwiseConvParams ¶ms, const Shape &input_shape, const float *input_data, const Shape &filter_shape, const float *filter_data, const Shape &bias_shape, const float *bias_data, const Shape &output_shape, float *output_data, int thread_start, int thread_end, int thread_dim)
void DepthwiseConv(const DepthwiseConvParams ¶ms, const Shape &input_shape, const T *input_data, const Shape &filter_shape, const T *filter_data, const Shape &bias_shape, const TS *bias_data, const Shape &output_shape, T *output_data, ruy::Context *ruy_context)
int MatchingDim(const Shape &shape1, int index1, const Shape &shape2, int index2)
void DepthwiseConvOp(const DepthwiseConvParams ¶ms, const Shape &input_shape, const float *input_data, const Shape &filter_shape, const float *filter_data, const Shape &bias_shape, const float *bias_data, float *padded_filter_data, bool pad_filter, float *filter_buffers_data, const Shape &output_shape, float *output_data)
int HowManyConvThreads(const Shape &output_shape, const Shape &filter_shape)
bool MultithreadAlongBatches(int thread_count, int batches)
float float_activation_min
int16_t dilation_height_factor
int16_t dilation_width_factor
float float_activation_max
PaddingValues padding_values
DepthwiseConvWorkerTask(const DepthwiseConvParams ¶ms, const Shape &input_shape, const T *input_data, const Shape &filter_shape, const T *filter_data, const Shape &bias_shape, const TS *bias_data, const Shape &output_shape, T *output_data, int thread_start, int thread_end, int thread_dim)