ONE - On-device Neural Engine
PermuteLayer.cc
/*
 * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "PermuteLayer.h"

#include <ruy/context.h> // from @ruy

namespace onert
{
namespace backend
{
namespace builtin
{
namespace kernel
{

PermuteLayer::PermuteLayer(const std::vector<ITensor *> &src_tensors,
                           const std::vector<ITensor *> &dst_tensors,
                           const std::vector<ir::PermuteType> &types,
                           const std::shared_ptr<ExternalContext> &external_context)
  : _external_context{external_context}, _tasks_map{}
{
  assert(src_tensors.size() == dst_tensors.size());
  assert(src_tensors.size() == types.size());
  _src_tensors = src_tensors;
  _dst_tensors = dst_tensors;
  _permute_types = types;
  _src_tensors_offsets.resize(src_tensors.size());
  _dst_tensors_offsets.resize(dst_tensors.size());
  // Defensive: keeps _permute_types sized consistently even when the asserts above
  // are compiled out in release builds
  _permute_types.resize(src_tensors.size());
}

void PermuteLayer::optimize()
{
  // Remove no-op entries: pairs whose tensors are identical or nullptr
  auto src_it = _src_tensors.begin();
  auto dst_it = _dst_tensors.begin();
  auto src_offsets_it = _src_tensors_offsets.begin();
  auto dst_offsets_it = _dst_tensors_offsets.begin();
  auto type_it = _permute_types.begin();
  while (src_it != _src_tensors.end())
  {
    if ((*src_it == *dst_it) || (*src_it == nullptr || *dst_it == nullptr))
    {
      src_it = _src_tensors.erase(src_it);
      dst_it = _dst_tensors.erase(dst_it);
      src_offsets_it = _src_tensors_offsets.erase(src_offsets_it);
      dst_offsets_it = _dst_tensors_offsets.erase(dst_offsets_it);
      type_it = _permute_types.erase(type_it);
    }
    else
    {
      auto src = *src_it;
      auto dst = *dst_it;
      src_offsets_it->resize(0);
      dst_offsets_it->resize(0);
      if (underlying_type(src->data_type()) != underlying_type(dst->data_type()))
      {
        // Differing underlying types are handled by permute() in run(); skip task
        // creation here, advancing all iterators first so the loop still terminates
        src_it++;
        dst_it++;
        src_offsets_it++;
        dst_offsets_it++;
        type_it++;
        continue;
      }
      const auto permute_type = *type_it;

      // TODO Support different types
      auto fn = [&](backend::ITensor &src_tensor) {
        dst->access([&](backend::ITensor &dst_tensor) {
          // NOTE The buffers of both tensors may still be nullptr at this point
          const auto data_size = ir::sizeOfDataType(src_tensor.data_type());

          if (permute_type == ir::PermuteType::COPY)
          {
            if (!src_tensor.has_padding() && !dst_tensor.has_padding())
            {
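              // Neither tensor is padded: both buffers are contiguous, so the copy
              // can be split into flat byte ranges and handed to worker threads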
              const auto num_elements = src_tensor.getShape().num_elements();
              const int thread_count =
                _external_context->ruy_context()->max_num_threads() < static_cast<int>(num_elements)
                  ? _external_context->ruy_context()->max_num_threads()
                  : num_elements;

              std::vector<PermuteWorkerTask> tasks;
              auto start = 0;
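              // Split [0, num_elements) into nearly equal chunks. The formula
              // (num_elements - start) / (thread_count - i) spreads the remainder
              // over the trailing chunks, e.g. 10 elements on 4 threads -> 2, 2, 3, 3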
              for (auto i = 0; i < thread_count; ++i)
              {
                int end = start + (num_elements - start) / (thread_count - i);
                tasks.emplace_back(src_tensor.buffer(), dst_tensor.buffer(), start * data_size,
                                   start * data_size, (end - start) * data_size);
                start = end;
              }
              assert(tasks.size() >= 1);
              _tasks_map[src] = std::move(tasks);
            }
            else
            {
              auto loop_shape = src_tensor.getShape();

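              // With padding, bytes are only contiguous along the innermost axis.
              // Fix that axis to 1 in the loop shape and copy one innermost run
              // (copy_len bytes) per loop iteration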
              auto copy_axis = loop_shape.rank() - 1;
              copy_axis = copy_axis < 0 ? 1 : copy_axis;
              const auto copy_len = loop_shape.dim(copy_axis) * data_size;
              loop_shape.dim(copy_axis) = 1;

              appendPermuteTasks(src, dst, loop_shape, copy_len, permute_type);
            }
          }
          else
          {
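            // Layout permutation (NHWC <-> NCHW) relocates every element
            // individually, so each copy is a single element (copy_len bytes)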
            assert(src_tensor.getShape().rank() == 4 &&
                   (permute_type == ir::PermuteType::NHWC_TO_NCHW ||
                    permute_type == ir::PermuteType::NCHW_TO_NHWC));
            const auto loop_shape = src_tensor.getShape();
            const auto copy_len = data_size;

            appendPermuteTasks(src, dst, loop_shape, copy_len, permute_type);
          }
        });
      };
      src->access(fn);
      src_it++;
      dst_it++;
      src_offsets_it++;
      dst_offsets_it++;
      type_it++;
    }
  }
}

void PermuteLayer::appendPermuteTasks(const ITensor *src_tensor, ITensor *dst_tensor,
                                      const ir::Shape &loop_shape, size_t size,
                                      const ir::PermuteType &permute_type)
{
  size_t distributed_dim = 0;
  auto src_shape = src_tensor->getShape();
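  // Choose the dimension to distribute across threads: for COPY, the largest
  // dimension except the innermost one (which is copied as a contiguous run);
  // otherwise the outermost dimension (distributed_dim stays 0)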
  if (permute_type == ir::PermuteType::COPY)
  {
    for (int i = 1; i < src_shape.rank() - 1; ++i)
    {
      distributed_dim = src_shape.dim(distributed_dim) < src_shape.dim(i) ? i : distributed_dim;
    }
  }
  const auto distributed_dim_val = src_shape.dim(distributed_dim);
  const int thread_count =
    _external_context->ruy_context()->max_num_threads() < static_cast<int>(distributed_dim_val)
      ? _external_context->ruy_context()->max_num_threads()
      : distributed_dim_val;
  // NOTE Do not remove this assertion: creating more tasks than max_num_threads would
  //      degrade performance by forcing new threads to be created in the context's thread pool
  assert(thread_count <= _external_context->ruy_context()->max_num_threads());

  std::vector<PermuteWorkerTask> tasks;
  int start = 0;
  auto one_thread_loop_shape = loop_shape;
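  // Slice the distributed dimension into nearly equal per-thread ranges, using
  // the same remainder-spreading split as the contiguous COPY path above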
  for (auto i = 0; i < thread_count; ++i)
  {
    ir::Coordinates start_coords(one_thread_loop_shape.rank());
    start_coords.set(distributed_dim, start);
    int end = start + (distributed_dim_val - start) / (thread_count - i);
    one_thread_loop_shape.dim(distributed_dim) = end - start;
    tasks.emplace_back(*src_tensor, *dst_tensor, start_coords, one_thread_loop_shape, size,
                       permute_type);
    start = end;
  }
  assert(tasks.size() >= 1);
  _tasks_map[src_tensor] = std::move(tasks);
}

void PermuteLayer::runPermuteTasks(backend::ITensor *src, uint8_t *dst_buffer)
{
  assert(src->getShape().num_elements() * ir::sizeOfDataType(src->data_type()) <=
         src->total_size());
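  // Reuse the tasks prepared in optimize(): patch in the current buffer addresses,
  // then run all tasks in parallel on ruy's thread pool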
  std::vector<PermuteWorkerTask> &tasks = _tasks_map.at(src);
  for (size_t i = 0; i < tasks.size(); ++i)
  {
    tasks.at(i).setBuffers(src->buffer(), dst_buffer);
  }
  assert(tasks.size() >= 1);
  _external_context->ruy_context()->mutable_thread_pool()->Execute(tasks.size(), tasks.data());
}

void PermuteLayer::run()
{
  assert(_src_tensors.size() == _dst_tensors.size());
  // PermuteLayer infers dynamic shapes inside itself whenever run() is called, for the
  // following reasons:
  // 1. PermuteLayer has to access the dynamic tensor manager for the input/output
  //    tensors of other backends
  // 2. Control-flow operations (If/While) use this layer to copy tensors between
  //    subgraphs (possibly on other backends)
  // 3. Placing the inference code here avoids duplicating it for the two reasons above

  // Infer and apply the output shape when the input or output is dynamic
  for (size_t i = 0; i < _src_tensors.size(); ++i)
  {
    auto dst_tensor = _dst_tensors.at(i);
    auto src_tensor = _src_tensors.at(i);
    auto permute_type = _permute_types.at(i);
    if (src_tensor->is_dynamic() || dst_tensor->is_dynamic())
    {
      // Get the input shape
      auto src_shape = src_tensor->getShape();

      // Compute the output shape, then apply it and (re)allocate the output buffer
      ir::Shape new_shape = ir::convertShape(src_shape, permute_type);

      try
      {
        if (!dst_tensor->applyShape(new_shape))
          throw std::runtime_error{
            "Error: PermuteLayer: output's TensorManager does not support dynamic tensor"};
        assert(dst_tensor->buffer() != nullptr);
      }
      catch (const std::out_of_range &e)
      {
        std::cerr << "Error: out_of_range in PermuteLayer: output's TensorManager does not support "
                     "dynamic tensor"
                  << '\n';
        throw;
      }
    }
    assert(ir::convertShape(src_tensor->getShape(), permute_type) == dst_tensor->getShape());
  }
  assert(_src_tensors.size() == _dst_tensors.size());
  assert(_src_tensors.size() == _src_tensors_offsets.size());
  assert(_dst_tensors.size() == _dst_tensors_offsets.size());
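  // Walk the five parallel vectors together; each position describes one
  // src -> dst copy (tensors, scratch offset vectors, and permute type)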
  auto src_it = _src_tensors.begin();
  auto dst_it = _dst_tensors.begin();
  auto src_offsets_it = _src_tensors_offsets.begin();
  auto dst_offsets_it = _dst_tensors_offsets.begin();
  auto type_it = _permute_types.begin();
  while (src_it != _src_tensors.end())
  {
    auto src = *src_it;
    auto dst = *dst_it;
    auto &src_offsets = *src_offsets_it;
    auto &dst_offsets = *dst_offsets_it;
    auto permute_type = *type_it;

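    // A zero-sized source must pair with a zero-sized destination; nothing to copy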
    if (src->total_size() == 0)
    {
      assert(dst->total_size() == 0);
    }
    else
    {
      if (src != dst)
      {
        // Conditions to run permutation with multithreading:
        // 1. The tasks for multithreading were created in optimize()
        // 2. The number of tasks is greater than 1
        // 3. Neither tensor is dynamic
        // 4. Both tensors have the same underlying data type
        if (_tasks_map.find(src) == _tasks_map.end() || _tasks_map.at(src).size() == 1 ||
            src->is_dynamic() || dst->is_dynamic() ||
            underlying_type(src->data_type()) != underlying_type(dst->data_type()))
        {
          permute(src, dst, src->getShape().rank(), src_offsets, dst_offsets, permute_type);
        }
        // If dst is a subtensor, we have to use clEnqueueMapBuffer instead of clEnqueueWriteBuffer
        else if (dst->needMemoryMap() && !dst->is_subtensor())
        {
          if (!src->has_padding() && !dst->has_padding() && permute_type == ir::PermuteType::COPY)
          {
            // A single bulk write is more efficient than multi-threading here
            src->access([&](backend::ITensor &) { dst->enqueueWriteBuffer(src->buffer(), false); });
          }
          else
          {
            // TODO Optimize this block for the case where dst has a large padding size
            // NOTE Use resize() rather than reserve() so that writing through data()
            //      stays within the vector's valid size
            _buffers_map[dst].resize(dst->total_size());
            auto dst_buffer = _buffers_map[dst].data();

            src->access([&](backend::ITensor &) { runPermuteTasks(src, dst_buffer); });
            dst->enqueueWriteBuffer(dst_buffer, false);
          }
        }
        else if (src->needMemoryMap() && !src->is_subtensor() && !src->has_padding() &&
                 !dst->has_padding() && permute_type == ir::PermuteType::COPY)
        {
          // A single bulk read is more efficient than multi-threading here
          assert(!dst->needMemoryMap());
          dst->access([&](backend::ITensor &) { src->enqueueReadBuffer(dst->buffer(), true); });
        }
        else
        {
          auto fn = [&](backend::ITensor &) {
            dst->access([&](backend::ITensor &) { runPermuteTasks(src, dst->buffer()); });
          };
          src->access(fn);
        }
      }
    }
    src_it++;
    dst_it++;
    src_offsets_it++;
    dst_offsets_it++;
    type_it++;
  }
}

} // namespace kernel
} // namespace builtin
} // namespace backend
} // namespace onert