#include <ruy/context.h>

// ...

PermuteLayer::PermuteLayer(const std::vector<ITensor *> &src_tensors,
                           const std::vector<ITensor *> &dst_tensors,
                           const std::vector<ir::PermuteType> &types,
                           const std::shared_ptr<ExternalContext> &external_context)
  : _external_context{external_context}, _tasks_map{}
{
  assert(src_tensors.size() == dst_tensors.size());
  assert(src_tensors.size() == types.size());
  // ...
  // Inside the loop that pairs each source tensor with its destination and permute type:
    if ((*src_it == *dst_it) || (*src_it == nullptr || *dst_it == nullptr))
    // ...
      src_offsets_it->resize(0);
      dst_offsets_it->resize(0);
    // ...
    const auto permute_type = *type_it;
    // ...
    // Neither tensor is padded: split a flat byte copy of all elements across the threads.
    if ((!src_tensor.has_padding() && !dst_tensor.has_padding()))
    {
      const auto num_elements = src_tensor.getShape().num_elements();
      const int thread_count =
        _external_context->ruy_context()->max_num_threads() < static_cast<int>(num_elements)
          ? _external_context->ruy_context()->max_num_threads()
          : num_elements;

      std::vector<PermuteWorkerTask> tasks;
      int start = 0;
      for (auto i = 0; i < thread_count; ++i)
      {
        int end = start + (num_elements - start) / (thread_count - i);
        tasks.emplace_back(src_tensor.buffer(), dst_tensor.buffer(), start * data_size,
                           start * data_size, (end - start) * data_size);
        start = end;
      }
      assert(tasks.size() >= 1);
      _tasks_map[src] = std::move(tasks);
    }
    // ...
    // Copy row by row: treat the innermost axis as one contiguous chunk and distribute
    // the remaining loop dimensions.
    {
      auto loop_shape = src_tensor.getShape();

      auto copy_axis = loop_shape.rank() - 1;
      copy_axis = copy_axis < 0 ? 1 : copy_axis;
      const auto copy_len = loop_shape.dim(copy_axis) * data_size;
      loop_shape.dim(copy_axis) = 1;

      appendPermuteTasks(src, dst, loop_shape, copy_len, permute_type);
    }
    // ...
    // Rank-4 layout change (the assert requires rank 4): copy one element at a time.
    {
      assert(src_tensor.getShape().rank() == 4 && /* ... */);
      // ...
      const auto loop_shape = src_tensor.getShape();
      const auto copy_len = data_size;

      appendPermuteTasks(src, dst, loop_shape, copy_len, permute_type);
    }
  // ...
}
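As an aside, the split both task-building loops rely on, end = start + (remaining) / (threads left), always produces contiguous ranges whose sizes differ by at most one, with later ranges absorbing the remainder. A minimal, self-contained sketch of the same arithmetic (not code from the layer; splitEvenly is an illustrative name):

#include <cstdio>
#include <utility>
#include <vector>

// Splits `total` units of work into `thread_count` contiguous [start, end) ranges.
// Range sizes differ by at most one; later ranges absorb the remainder.
std::vector<std::pair<int, int>> splitEvenly(int total, int thread_count)
{
  std::vector<std::pair<int, int>> ranges;
  int start = 0;
  for (int i = 0; i < thread_count; ++i)
  {
    const int end = start + (total - start) / (thread_count - i);
    ranges.emplace_back(start, end);
    start = end;
  }
  return ranges;
}

int main()
{
  for (const auto &[b, e] : splitEvenly(10, 4))
    std::printf("[%d, %d) ", b, e);
  std::printf("\n"); // prints: [0, 2) [2, 4) [4, 7) [7, 10)
}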
void PermuteLayer::appendPermuteTasks(const ITensor *src_tensor, ITensor *dst_tensor,
                                      const ir::Shape &loop_shape, size_t size,
                                      const ir::PermuteType &permute_type)
{
  size_t distributed_dim = 0;
  auto src_shape = src_tensor->getShape();
  // ...
  // Pick the largest dimension (excluding the innermost one) to split across threads.
  for (int i = 1; i < src_shape.rank() - 1; ++i)
  {
    distributed_dim = src_shape.dim(distributed_dim) < src_shape.dim(i) ? i : distributed_dim;
  }
  // ...
  const auto distributed_dim_val = src_shape.dim(distributed_dim);
  const int thread_count =
    _external_context->ruy_context()->max_num_threads() < static_cast<int>(distributed_dim_val)
      ? _external_context->ruy_context()->max_num_threads()
      : distributed_dim_val;
  // ...
  assert(thread_count <= _external_context->ruy_context()->max_num_threads());

  std::vector<PermuteWorkerTask> tasks;
  int start = 0;
  auto one_thread_loop_shape = loop_shape;
  for (auto i = 0; i < thread_count; ++i)
  {
    // ...
    start_coords.set(distributed_dim, start);
    int end = start + (distributed_dim_val - start) / (thread_count - i);
    one_thread_loop_shape.dim(distributed_dim) = end - start;
    tasks.emplace_back(*src_tensor, *dst_tensor, start_coords, one_thread_loop_shape, size,
                       permute_type);
    start = end;
  }
  assert(tasks.size() >= 1);
  _tasks_map[src_tensor] = std::move(tasks);
}
void PermuteLayer::runPermuteTasks(backend::ITensor *src, uint8_t *dst_buffer)
{
  // ...
  std::vector<PermuteWorkerTask> &tasks = _tasks_map.at(src);
  for (size_t i = 0; i < tasks.size(); ++i)
  {
    tasks.at(i).setBuffers(src->buffer(), dst_buffer);
  }
  assert(tasks.size() >= 1);
  _external_context->ruy_context()->mutable_thread_pool()->Execute(tasks.size(), tasks.data());
}
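For reference, a minimal sketch of how such tasks are handed to ruy's thread pool. It assumes only what the call above already uses (ruy::Context::mutable_thread_pool() and ThreadPool::Execute()) plus ruy's ruy::Task base class with a Run() override, which PermuteWorkerTask evidently provides; CopySliceTask and runCopyTasks are illustrative names, not part of the layer:

#include <ruy/context.h>
#include <ruy/thread_pool.h>

#include <cstdint>
#include <cstring>
#include <vector>

// Illustrative worker: copies one byte range, roughly what the flat-copy tasks
// built in the constructor do.
struct CopySliceTask : ruy::Task
{
  const uint8_t *src = nullptr;
  uint8_t *dst = nullptr;
  size_t offset = 0;
  size_t size = 0;

  void Run() override { std::memcpy(dst + offset, src + offset, size); }
};

void runCopyTasks(ruy::Context *ctx, std::vector<CopySliceTask> &tasks, const uint8_t *src,
                  uint8_t *dst)
{
  if (tasks.empty())
    return;
  // Rebind the buffers right before execution, mirroring setBuffers() in runPermuteTasks().
  for (auto &task : tasks)
  {
    task.src = src;
    task.dst = dst;
  }
  // Each task runs exactly once, distributed over the pool's worker threads.
  ctx->mutable_thread_pool()->Execute(static_cast<int>(tasks.size()), tasks.data());
}

The per-run rebinding matters because tensor buffers can change between executions, which appears to be why runPermuteTasks() re-calls setBuffers() on every run before dispatching.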
// ...

// Execution: first propagate the source shape to the destination (possibly re-allocating it),
// then choose how to perform each copy.
    auto src_shape = src_tensor->getShape();
    // ...
      throw std::runtime_error{
        "Error: PermuteLayer: output's TensorManager does not support dynamic tensor"};
    // ...
    assert(dst_tensor->buffer() != nullptr);
    // ...
    catch (const std::out_of_range &e)
    {
      std::cerr << "Error: out_of_range in PermuteLayer: output's TensorManager does not support "
                // ...
    }
    // ...
    auto &src_offsets = *src_offsets_it;
    auto &dst_offsets = *dst_offsets_it;
    auto permute_type = *type_it;
    // ...
    // Nothing to copy for empty tensors.
    if (src->total_size() == 0)
    {
      assert(dst->total_size() == 0);
      // ...
    }
    // ...
    // Fall back to the generic permute() when no parallel tasks were prepared, only one task
    // exists, or either tensor is dynamic (among further conditions elided here).
    if (_tasks_map.find(src) == _tasks_map.end() || _tasks_map.at(src).size() == 1 ||
        src->is_dynamic() || dst->is_dynamic() || /* ... */)
    {
      permute(src, dst, src->getShape().rank(), src_offsets, dst_offsets, permute_type);
    }
    // The destination needs memory mapping: upload with enqueueWriteBuffer, either straight
    // from the source buffer or via a staging buffer.
    else if (dst->needMemoryMap() && !dst->is_subtensor())
    {
      // ...
      src->access([&](backend::ITensor &) { dst->enqueueWriteBuffer(src->buffer(), false); });
      // ...
      dst->enqueueWriteBuffer(dst_buffer, false);
      // ...
    }
    // The source needs memory mapping and is unpadded: read directly into the destination.
    else if (src->needMemoryMap() && !src->is_subtensor() && !src->has_padding() && /* ... */)
    {
      // ...
      assert(!dst->needMemoryMap());
      dst->access([&](backend::ITensor &) { src->enqueueReadBuffer(dst->buffer(), true); });
    }
    // Otherwise run the pre-built worker tasks on the thread pool.
    else
    {
      // ...
      dst->access([&](backend::ITensor &) { runPermuteTasks(src, dst->buffer()); });
    }
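To summarize the dispatch above: the precomputed parallel tasks are used only when they exist, number more than one, and neither tensor is dynamic (plus the conditions elided above); otherwise execution falls back to the generic permute() path or to the memory-mapped read/write paths. A small sketch of that first decision with stand-in types; Tensor, Task, and useParallelTasks are hypothetical names, not onert types:

#include <cstdio>
#include <unordered_map>
#include <vector>

struct Tensor { bool dynamic = false; }; // stand-in for backend::ITensor
struct Task {};                          // stand-in for PermuteWorkerTask

using TaskMap = std::unordered_map<const Tensor *, std::vector<Task>>;

// The inverse of the fallback condition above: parallel tasks are usable only when they
// were prepared, there is more than one of them, and both tensors are static.
bool useParallelTasks(const TaskMap &tasks_map, const Tensor *src, const Tensor *dst)
{
  const auto it = tasks_map.find(src);
  if (it == tasks_map.end() || it->second.size() == 1)
    return false; // nothing prepared, or not worth threading
  if (src->dynamic || dst->dynamic)
    return false; // shapes may have changed since the tasks were built
  return true;
}

int main()
{
  Tensor src, dst;
  TaskMap tasks_map;
  tasks_map[&src] = std::vector<Task>(4);
  std::printf("use parallel tasks: %d\n", useParallelTasks(tasks_map, &src, &dst)); // prints 1
}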
Referenced declarations:

  virtual bool is_dynamic() const = 0
      Return true if the tensor needs dynamic allocation, meaning that during compile time the output shape...
  virtual bool has_padding() const = 0
  virtual bool applyShape(const ir::Shape &)
      Set the shape to shape and possibly re-allocate the buffer.
  virtual uint8_t *buffer() const = 0
  virtual ir::Shape getShape() const = 0
      Get ir::Shape of tensor.
  PermuteLayer(const std::vector<ITensor *> &src_tensors, const std::vector<ITensor *> &dst_tensors, const std::vector<ir::PermuteType> &types, const std::shared_ptr<ExternalContext> &external_context)
  std::vector<std::vector<size_t>> _dst_tensors_offsets
  std::unordered_map<const backend::ITensor *, std::vector<uint8_t>> _buffers_map
  std::vector<ir::PermuteType> _permute_types
  void permute(backend::ITensor *src_tensor, backend::ITensor *dst_tensor, size_t rank, std::vector<size_t> &src_offsets, std::vector<size_t> &dst_offsets, const ir::PermuteType &permute_type)
  const std::type_info &underlying_type(ir::DataType type) const
  std::vector<std::vector<size_t>> _src_tensors_offsets
  std::vector<backend::ITensor *> _src_tensors
  std::vector<backend::ITensor *> _dst_tensors
  Class to represent position (offset) of tensor. Assume that the front is higher dimensional...
  size_t sizeOfDataType(DataType data_type)
  Shape convertShape(const Shape &shape, const PermuteType &type)
      Converts shape when its rank is 4.