ONE - On-device Neural Engine
DepthwiseConv.h
/*
 * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
 * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef __NNFW_CKER_DEPTHWISE_CONV_H__
#define __NNFW_CKER_DEPTHWISE_CONV_H__

#include "cker/Shape.h"
#include "cker/Types.h"
#include "cker/Utils.h"
#include "cker/neon/neon_check.h"
#include "cker/CpuBackendThreadpool.h"
#include "cker/operation/optimized/DepthwiseConvFloat.h"
#include "cker/operation/optimized/DepthwiseConvUint8.h"
#include "cker/eigen/depthwise_conv_op.h"
#include "cker/eigen/bias_op.h"

namespace nnfw
{
namespace cker
{

// TODO(luwa): add multithread to per-channel depthwise_conv
// DepthwiseConv can run with multiple threads along the dimension specified by
// thread_dim. Each thread processes the output elements whose index along
// thread_dim lies in the half-open range [thread_start, thread_end).
// For example, with thread_start = 2, thread_end = 6, and thread_dim = 1, a
// task computes DepthwiseConv for output_data[:, 2:6, :, :].
template <typename T, typename TS> struct DepthwiseConvWorkerTask : cpu_backend_threadpool::Task
{
  DepthwiseConvWorkerTask(const DepthwiseConvParams &params, const Shape &input_shape,
                          const T *input_data, const Shape &filter_shape, const T *filter_data,
                          const Shape &bias_shape, const TS *bias_data, const Shape &output_shape,
                          T *output_data, int thread_start, int thread_end, int thread_dim)
    : params_(params), input_shape_(input_shape), input_data_(input_data),
      filter_shape_(filter_shape), filter_data_(filter_data), bias_shape_(bias_shape),
      bias_data_(bias_data), output_shape_(output_shape), output_data_(output_data),
      thread_start_(thread_start), thread_end_(thread_end), thread_dim_(thread_dim)
  {
  }

  void Run() override
  {
    optimized::DepthwiseConvImpl(params_, input_shape_, input_data_, filter_shape_, filter_data_,
                                 bias_shape_, bias_data_, output_shape_, output_data_,
                                 thread_start_, thread_end_, thread_dim_);
  }

private:
  const DepthwiseConvParams &params_;
  const Shape &input_shape_;
  const T *input_data_;
  const Shape &filter_shape_;
  const T *filter_data_;
  const Shape &bias_shape_;
  const TS *bias_data_;
  const Shape &output_shape_;
  T *output_data_;
  // const CpuFlags& cpu_flags_;
  int thread_start_;
  int thread_end_;
  int thread_dim_;
};

inline int HowManyConvThreads(const Shape &output_shape, const Shape &filter_shape)
{
  // How many scalar multiplications are needed to make it worth using one
  // more thread.
  static constexpr int kMinMulPerThread = 1 << 13; // 8k
  const int filter_height = filter_shape.Dims(1);
  const int filter_width = filter_shape.Dims(2);
  const int num_muls = output_shape.FlatSize() * filter_height * filter_width;
  // Avoid a real runtime division by dividing by a compile-time constant
  // (a power of two, so the compiler can reduce it to a shift).
  int thread_count = std::max(1, num_muls / kMinMulPerThread);
  return thread_count;
}
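
// Worked example (hypothetical shapes, not taken from this file): a
// 1x32x32x64 output convolved with a 3x3 filter costs
// 1 * 32 * 32 * 64 * 3 * 3 = 589,824 multiplications, so the heuristic above
// suggests 589824 / 8192 = 72 threads before DepthwiseConv() below clamps
// this to the actual thread pool size.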

inline bool MultithreadAlongBatches(int thread_count, int batches)
{
  assert(thread_count >= 2);
  // If there are fewer batch entries than the number of threads we want to
  // use, it is better to multithread within each batch entry.
  if (batches < thread_count)
  {
    return false;
  }
  // If there are at least 2 batch entries for each thread, batch-wise
  // multithreading is safe to use: every thread gets approximately the same
  // number of batch entries, so the load balancing is reasonable, and any
  // residual imbalance is offset by the inherent advantage of batch-wise
  // multithreading (each thread works on larger buffers with less
  // boundary-handling overhead).
  if (batches >= 2 * thread_count)
  {
    return true;
  }
  // In the limiting case where there is at least one, but not much more than
  // one, batch entry per thread, per-batch multithreading is still a good
  // idea when the number of batch entries is a multiple of the number of
  // threads, so that every thread processes the same number of batch entries.
  return ((batches % thread_count) == 0);
}
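
// Decision examples (hypothetical values) for thread_count = 4:
//   batches = 3 -> false (fewer batches than threads; split along height)
//   batches = 8 -> true  (at least 2 batches per thread)
//   batches = 5 -> false (5 % 4 != 0; the split would be uneven)
//   batches = 4 -> true  (exactly one batch per thread, evenly divided)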

template <typename T, typename TS>
inline void DepthwiseConv(const DepthwiseConvParams &params, const Shape &input_shape,
                          const T *input_data, const Shape &filter_shape, const T *filter_data,
                          const Shape &bias_shape, const TS *bias_data, const Shape &output_shape,
                          T *output_data, ruy::Context *ruy_context)
{
  assert(input_shape.DimensionsCount() == 4);
  assert(filter_shape.DimensionsCount() == 4);
  assert(output_shape.DimensionsCount() == 4);

  int thread_count = HowManyConvThreads(output_shape, filter_shape);

  // NOTE Borrow the ruy::Context to read its max_num_threads setting
  // TODO Define and use max_num_threads for the CPU backend
  const auto max_threads = (ruy_context == nullptr) ? 1 : ruy_context->max_num_threads();

  thread_count = std::max(1, std::min(thread_count, max_threads));
  // Cap the number of threads at 2 for the float path to avoid a performance
  // regression (b/132294857).
  if (std::is_floating_point<T>::value)
  {
    thread_count = std::min(thread_count, 2);
  }

  const int output_batches = output_shape.Dims(0);
  const int output_height = output_shape.Dims(1);

  if (thread_count == 1)
  {
    optimized::DepthwiseConvImpl(params, input_shape, input_data, filter_shape, filter_data,
                                 bias_shape, bias_data, output_shape, output_data, 0, output_height,
                                 1);
    return;
  }

  int thread_dim, thread_dim_size;
  if (MultithreadAlongBatches(thread_count, output_batches))
  {
    thread_dim = 0;
    thread_dim_size = output_batches;
  }
  else
  {
    thread_dim = 1;
    thread_dim_size = output_height;
  }

  std::vector<DepthwiseConvWorkerTask<T, TS>> tasks;
  // TODO(b/131746020) don't create new heap allocations every time.
  // At least we make it a single heap allocation by using reserve().
  tasks.reserve(thread_count);
  int thread_start = 0;
  for (int i = 0; i < thread_count; ++i)
  {
    int thread_end = thread_start + (thread_dim_size - thread_start) / (thread_count - i);
    tasks.emplace_back(params, input_shape, input_data, filter_shape, filter_data, bias_shape,
                       bias_data, output_shape, output_data, thread_start, thread_end, thread_dim);
    thread_start = thread_end;
  }
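  // Partition example (hypothetical values): with thread_dim_size = 10 and
  // thread_count = 4, the loop above produces the half-open ranges
  // [0,2), [2,4), [4,7) and [7,10); later tasks absorb the remainder.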
  cpu_backend_threadpool::Execute(tasks.size(), tasks.data(), ruy_context);
}
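
// Usage sketch (illustrative only; the shapes, pointer variables and the
// ruy::Context lifetime below are assumptions, not taken from this file):
//
//   DepthwiseConvParams params{};
//   params.stride_height = params.stride_width = 1;
//   params.dilation_height_factor = params.dilation_width_factor = 1;
//   params.depth_multiplier = 1;
//   params.float_activation_min = -std::numeric_limits<float>::infinity();
//   params.float_activation_max = std::numeric_limits<float>::infinity();
//
//   Shape input_shape{1, 32, 32, 8};   // NHWC
//   Shape filter_shape{1, 3, 3, 8};    // 1 x H x W x (input_depth * multiplier)
//   Shape bias_shape{8};
//   Shape output_shape{1, 30, 30, 8};  // valid padding, stride 1
//
//   ruy::Context ruy_context;          // owns the worker thread pool
//   DepthwiseConv<float, float>(params, input_shape, input_data, filter_shape,
//                               filter_data, bias_shape, bias_data, output_shape,
//                               output_data, &ruy_context);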

inline void DepthwiseConvOp(const DepthwiseConvParams &params, const Shape &input_shape,
                            const float *input_data, const Shape &filter_shape,
                            const float *filter_data, const Shape &bias_shape,
                            const float *bias_data, float *padded_filter_data, bool pad_filter,
                            float *filter_buffers_data, const Shape &output_shape,
                            float *output_data)
{
  if (params.stride_height != params.stride_width)
    throw std::runtime_error("Unequal strides are not supported");

  if (params.dilation_height_factor != 1 || params.dilation_width_factor != 1)
    throw std::runtime_error("Dilation other than 1 is not supported");

  const int batch = MatchingDim(input_shape, 0, output_shape, 0);
  const int input_depth = input_shape.Dims(3);
  const int output_depth = output_shape.Dims(3);
  const int input_height = input_shape.Dims(1);
  const int input_width = input_shape.Dims(2);
  const int filter_height = filter_shape.Dims(1);
  const int filter_width = filter_shape.Dims(2);
  const int output_height = output_shape.Dims(1);
  const int output_width = output_shape.Dims(2);
  const int stride = params.stride_height;
  const int depth_multiplier = params.depth_multiplier;
  const int pad_height = params.padding_values.height;
  const int pad_width = params.padding_values.width;
  const float activation_min = params.float_activation_min;
  const float activation_max = params.float_activation_max;

  depthwise_conv_op::LaunchDepthwiseConvOp<Eigen::ThreadPoolDevice, float>()(
    batch, input_height, input_width, input_depth, filter_height, filter_width, depth_multiplier,
    stride, pad_height, pad_width, output_height, output_width, output_depth, input_data,
    filter_data, padded_filter_data, pad_filter, filter_buffers_data, output_data);

  if (bias_data != nullptr)
  {
    bias_op::biasHelper<float>(bias_shape, bias_data, output_shape, output_data, activation_min,
                               activation_max);
  }
}

} // namespace cker
} // namespace nnfw

#endif // __NNFW_CKER_DEPTHWISE_CONV_H__