ONE/kernel_2_permute_layer_8cc_source.html

/*

 * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved

 *

 * Licensed under the Apache License, Version 2.0 (the "License");

 * you may not use this file except in compliance with the License.

 * You may obtain a copy of the License at

 *

 *      http://www.apache.org/licenses/LICENSE-2.0

 *

 * Unless required by applicable law or agreed to in writing, software

 * distributed under the License is distributed on an "AS IS" BASIS,

 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 * See the License for the specific language governing permissions and

 * limitations under the License.

 */


#include "PermuteLayer.h"


#include <ruy/context.h> // from @ruy


namespace onert::backend::builtin::kernel

{


// Builds a permute kernel over paired tensors: the i-th source is copied to the i-th destination
// using the i-th permute type.
//
// src_tensors      : source tensor of each copy
// dst_tensors      : destination tensor of each copy (same length as src_tensors)
// types            : permute type of each copy (same length as src_tensors)
// external_context : context whose ruy thread pool is used to execute the copy tasks
//
// One (initially empty) offset vector is allocated per source and per destination tensor.
PermuteLayer::PermuteLayer(const std::vector<ITensor *> &src_tensors,
                           const std::vector<ITensor *> &dst_tensors,
                           const std::vector<ir::PermuteType> &types,
                           const std::shared_ptr<ExternalContext> &external_context)
  : _external_context{external_context}, _tasks_map{}
{
  const auto num_sources = src_tensors.size();
  assert(num_sources == dst_tensors.size());
  assert(num_sources == types.size());

  // Keep private copies of the caller's lists; optimize() may later erase entries from them
  _src_tensors = src_tensors;
  _dst_tensors = dst_tensors;
  _permute_types = types;

  _src_tensors_offsets.resize(num_sources);
  _dst_tensors_offsets.resize(dst_tensors.size());
  // Keeps the type list as long as the source list even when the assertions are compiled out
  _permute_types.resize(num_sources);
}


void PermuteLayer::optimize()

{

  // Remove copying of tensor as nullptr

  auto src_it = _src_tensors.begin();

  auto dst_it = _dst_tensors.begin();

  auto src_offsets_it = _src_tensors_offsets.begin();

  auto dst_offsets_it = _dst_tensors_offsets.begin();

  auto type_it = _permute_types.begin();

  while (src_it != _src_tensors.end())

  {

    if ((*src_it == *dst_it) || (*src_it == nullptr || *dst_it == nullptr))

    {

      src_it = _src_tensors.erase(src_it);

      dst_it = _dst_tensors.erase(dst_it);

      src_offsets_it = _src_tensors_offsets.erase(src_offsets_it);

      dst_offsets_it = _dst_tensors_offsets.erase(dst_offsets_it);

      type_it = _permute_types.erase(type_it);

    }

    else

    {

      auto src = *src_it;

      auto dst = *dst_it;

      src_offsets_it->resize(0);

      dst_offsets_it->resize(0);

      const auto permute_type = *type_it;


      src_it++;

      dst_it++;

      src_offsets_it++;

      dst_offsets_it++;

      type_it++;


      if (underlying_type(src->data_type()) != underlying_type(dst->data_type()))

        continue;


      // TODO Support different types

      auto fn = [&](backend::ITensor &src_tensor) {

        dst->access([&](backend::ITensor &dst_tensor) {

          // NOTE The buffer of both tensor can be nullptr in this step

          const auto data_size = ir::sizeOfDataType(src_tensor.data_type());


          if (permute_type == ir::PermuteType::SAME)

          {

            if ((!src_tensor.has_padding() && !dst_tensor.has_padding()))

            {

              const auto num_elements = src_tensor.getShape().num_elements();

              const int thread_count =

                _external_context->ruy_context()->max_num_threads() < static_cast<int>(num_elements)

                  ? _external_context->ruy_context()->max_num_threads()

                  : num_elements;


              std::vector<PermuteWorkerTask> tasks;

              auto start = 0;

              for (auto i = 0; i < thread_count; ++i)

              {

                int end = start + (num_elements - start) / (thread_count - i);

                tasks.emplace_back(src_tensor.buffer(), dst_tensor.buffer(), start * data_size,

                                   start * data_size, (end - start) * data_size);

                start = end;

              }

              assert(tasks.size() >= 1);

              _tasks_map[src] = std::move(tasks);

            }

            else

            {

              auto loop_shape = src_tensor.getShape();


              auto copy_axis = loop_shape.rank() - 1;

              copy_axis = copy_axis < 0 ? 1 : copy_axis;

              const auto copy_len = loop_shape.dim(copy_axis) * data_size;

              loop_shape.dim(copy_axis) = 1;


              appendPermuteTasks(src, dst, loop_shape, copy_len, permute_type);

            }

          }

          else

          {

            assert(src_tensor.getShape().rank() == 4 &&

                   (permute_type == ir::PermuteType::NHWC_TO_NCHW ||

                    permute_type == ir::PermuteType::NCHW_TO_NHWC));

            const auto loop_shape = src_tensor.getShape();

            const auto copy_len = data_size;


            appendPermuteTasks(src, dst, loop_shape, copy_len, permute_type);

          }

        });

      };

      src->access(fn);

    }

  }

}


// Builds the per-thread tasks for one (source, destination) pair and stores them in _tasks_map
// under the source tensor, replacing any list stored there before.
//
// src_tensor   : tensor to read from; also the key of the stored task list
// dst_tensor   : tensor to write to
// loop_shape   : iteration space handed to the tasks; each task gets a copy whose extent along
//                the split axis is narrowed to that task's share
// size         : copy size handed to every task
// permute_type : permutation handed to every task; for SAME the split axis is chosen among the
//                source dimensions, otherwise axis 0 is split
//
// NOTE(review): the split extent is read from the source shape while the per-task extent is
// written into a copy of loop_shape - confirm callers never collapse the split axis in
// loop_shape.
void PermuteLayer::appendPermuteTasks(const ITensor *src_tensor, ITensor *dst_tensor,
                                      const ir::Shape &loop_shape, size_t size,
                                      const ir::PermuteType &permute_type)
{
  auto src_shape = src_tensor->getShape();

  // Axis along which the work is divided between threads
  size_t split_axis = 0;
  if (permute_type == ir::PermuteType::SAME)
  {
    // Pick the largest dimension, ignoring the last axis; ties keep the lower axis
    for (int axis = 1; axis < src_shape.rank() - 1; ++axis)
    {
      if (src_shape.dim(split_axis) < src_shape.dim(axis))
        split_axis = axis;
    }
  }

  const auto split_extent = src_shape.dim(split_axis);
  const int max_threads = _external_context->ruy_context()->max_num_threads();
  const int thread_count =
    max_threads < static_cast<int>(split_extent) ? max_threads : split_extent;
  // NOTE Do not remove this assertion. It would cause performance degradation by new threads to be
  // created in the context's thread pool
  assert(thread_count <= max_threads);

  std::vector<PermuteWorkerTask> worker_tasks;
  auto per_task_shape = loop_shape;
  int range_begin = 0;
  for (auto worker = 0; worker < thread_count; ++worker)
  {
    // Spread what is left of the split axis evenly over the remaining workers
    const int range_end = range_begin + (split_extent - range_begin) / (thread_count - worker);

    ir::Coordinates first_coords(per_task_shape.rank());
    first_coords.set(split_axis, range_begin);
    per_task_shape.dim(split_axis) = range_end - range_begin;

    worker_tasks.emplace_back(*src_tensor, *dst_tensor, first_coords, per_task_shape, size,
                              permute_type);
    range_begin = range_end;
  }
  assert(worker_tasks.size() >= 1);
  _tasks_map[src_tensor] = std::move(worker_tasks);
}


// Runs the tasks previously stored for `src` on the ruy thread pool.
//
// src        : source tensor; its current buffer is read and it must have an entry in _tasks_map
//              (std::out_of_range is thrown by the map lookup otherwise)
// dst_buffer : memory the tasks write into
void PermuteLayer::runPermuteTasks(backend::ITensor *src, uint8_t *dst_buffer)
{
  assert(src->getShape().num_elements() * ir::sizeOfDataType(src->data_type()) <=
         src->total_size());

  auto &prepared_tasks = _tasks_map.at(src);

  // Point every task at the buffers to use for this run before dispatching
  for (auto &task : prepared_tasks)
  {
    task.setBuffers(src->buffer(), dst_buffer);
  }

  assert(prepared_tasks.size() >= 1);
  _external_context->ruy_context()->mutable_thread_pool()->Execute(prepared_tasks.size(),
                                                                   prepared_tasks.data());
}


void PermuteLayer::run()

{

  assert(_src_tensors.size() == _dst_tensors.size());

  // PermuteLayer infers dynamic shape inside itself whenever run is called for the following

  // reasons:

  // 1. PermuteLayer has to access dynamic tensor manager for input/output tensors of other backends

  // 2. Other controlflow operation(If/While) uses this layout for copying tensors of other

  // subgraphs(with other backends)

  // 3. This infering code is placed here to avoid duplicated code that can be caused by above 2

  // reasons


  // check if output is not dynamic

  for (size_t i = 0; i < _src_tensors.size(); ++i)

  {

    auto dst_tensor = _dst_tensors.at(i);

    auto src_tensor = _src_tensors.at(i);

    auto permute_type = _permute_types.at(i);

    if (src_tensor->is_dynamic() || dst_tensor->is_dynamic())

    {

      // getting output shape

      auto src_shape = src_tensor->getShape();


      // set output shape and output buffer

      ir::Shape new_shape = ir::convertShape(src_shape, permute_type);


      try

      {

        if (!dst_tensor->applyShape(new_shape))

          throw std::runtime_error{

            "Error: PermuteLayer: output's TensorManager does not support dynamic tensor"};

        assert(dst_tensor->buffer() != nullptr);

      }

      catch (const std::out_of_range &e)

      {

        std::cerr << "Error: out_of_range in PermuteLayer: output's TensorManager does not support "

                     "dynamic tensor"

                  << '\n';

        throw;

      }

    }

    assert(ir::convertShape(src_tensor->getShape(), permute_type) == dst_tensor->getShape());

  }

  assert(_src_tensors.size() == _dst_tensors.size());

  assert(_src_tensors.size() == _src_tensors_offsets.size());

  assert(_dst_tensors.size() == _dst_tensors_offsets.size());

  auto src_it = _src_tensors.begin();

  auto dst_it = _dst_tensors.begin();

  auto src_offsets_it = _src_tensors_offsets.begin();

  auto dst_offsets_it = _dst_tensors_offsets.begin();

  auto type_it = _permute_types.begin();

  while (src_it != _src_tensors.end())

  {

    auto src = *src_it;

    auto dst = *dst_it;

    auto &src_offsets = *src_offsets_it;

    auto &dst_offsets = *dst_offsets_it;

    auto permute_type = *type_it;


    if (src->total_size() == 0)

    {

      assert(dst->total_size() == 0);

    }

    else

    {

      if (src != dst)

      {

        // Conditions to run permutation with multithreading

        // 1. The tasks for multithreathing was created

        // 2. The tasks's size > 1

        // 3. Both tensors are not dynamic

        // 4. Data types of both tensors are different

        if (_tasks_map.find(src) == _tasks_map.end() || _tasks_map.at(src).size() == 1 ||

            src->is_dynamic() || dst->is_dynamic() ||

            underlying_type(src->data_type()) != underlying_type(dst->data_type()))

        {

          permute(src, dst, src->getShape().rank(), src_offsets, dst_offsets, permute_type);

        }

        // If dst is subtensor, we have to use clEnqueueMapBuffer instead of clEnqueueWirteBuffer

        else if (dst->needMemoryMap() && !dst->is_subtensor())

        {

          if (!src->has_padding() && !dst->has_padding() && permute_type == ir::PermuteType::SAME)

          {

            // This is more effective than multi-threading

            src->access([&](backend::ITensor &) { dst->enqueueWriteBuffer(src->buffer(), false); });

          }

          else

          {

            // TODO Optimize this block in case of that padding size of dst is big.

            _buffers_map[dst].reserve(dst->total_size());

            auto dst_buffer = _buffers_map[dst].data();


            src->access([&](backend::ITensor &) { runPermuteTasks(src, dst_buffer); });

            dst->enqueueWriteBuffer(dst_buffer, false);

          }

        }

        else if (src->needMemoryMap() && !src->is_subtensor() && !src->has_padding() &&

                 !dst->has_padding() && permute_type == ir::PermuteType::SAME)

        {

          // This is more effective than multi-threading

          assert(!dst->needMemoryMap());

          dst->access([&](backend::ITensor &) { src->enqueueReadBuffer(dst->buffer(), true); });

        }

        else

        {

          auto fn = [&](backend::ITensor &) {

            dst->access([&](backend::ITensor &) { runPermuteTasks(src, dst->buffer()); });

          };

          src->access(fn);

        }

      }

    }

    src_it++;

    dst_it++;

    src_offsets_it++;

    dst_offsets_it++;

    type_it++;

  }

}


} // namespace onert::backend::builtin::kernel

onert::backend::ITensor
Definition ITensor.h:34

onert::backend::builtin::kernel::PermuteLayer::optimize
void optimize() override
Definition PermuteLayer.cc:40

onert::backend::builtin::kernel::PermuteLayer::PermuteLayer
PermuteLayer(const std::vector< ITensor * > &src_tensors, const std::vector< ITensor * > &dst_tensors, const std::vector< ir::PermuteType > &types, const std::shared_ptr< ExternalContext > &external_context)
Definition PermuteLayer.cc:24

onert::backend::builtin::kernel::PermuteLayer::run
void run() override
Definition PermuteLayer.cc:184

onert::exec::IPermuteFunction::_dst_tensors_offsets
std::vector< std::vector< size_t > > _dst_tensors_offsets
Definition IPermuteFunction.h:243

onert::exec::IPermuteFunction::_buffers_map
std::unordered_map< const backend::ITensor *, std::vector< uint8_t > > _buffers_map
Definition IPermuteFunction.h:245

onert::exec::IPermuteFunction::_permute_types
std::vector< ir::PermuteType > _permute_types
Definition IPermuteFunction.h:244

onert::exec::IPermuteFunction::permute
void permute(backend::ITensor *src_tensor, backend::ITensor *dst_tensor, size_t rank, std::vector< size_t > &src_offsets, std::vector< size_t > &dst_offsets, const ir::PermuteType &permute_type)
Definition IPermuteFunction.cc:238

onert::exec::IPermuteFunction::underlying_type
const std::type_info & underlying_type(ir::DataType type) const
Definition IPermuteFunction.cc:288

onert::exec::IPermuteFunction::_src_tensors_offsets
std::vector< std::vector< size_t > > _src_tensors_offsets
Definition IPermuteFunction.h:242

onert::exec::IPermuteFunction::_src_tensors
std::vector< backend::ITensor * > _src_tensors
Definition IPermuteFunction.h:240

onert::exec::IPermuteFunction::_dst_tensors
std::vector< backend::ITensor * > _dst_tensors
Definition IPermuteFunction.h:241

onert::ir::Coordinates
Class to represent position(offset) of tensor.  Assume that the front is higher dimensional....
Definition Coordinates.h:35

onert::ir::operation::Shape
Definition Shape.h:28

onert::backend::builtin::kernel
Definition CallLayer.cc:20

onert::ir::sizeOfDataType
size_t sizeOfDataType(DataType data_type)
Definition DataType.cc:27

onert::ir::convertShape
Shape convertShape(const Shape &shape, const PermuteType &type)
Converts shape when its rank is 4.
Definition Shape.cc:62

onert::ir::PermuteType
PermuteType
Definition Layout.h:36

onert::ir::PermuteType::NHWC_TO_NCHW
@ NHWC_TO_NCHW

onert::ir::PermuteType::NCHW_TO_NHWC
@ NCHW_TO_NHWC

onert::ir::PermuteType::SAME
@ SAME

size
int32_t size[5]
Definition Slice.cpp:35

src_tensor
CLTensor src_tensor
Definition Convolution.cpp:291

dst_tensor
CLTensor dst_tensor
Definition Convolution.cpp:292

PermuteLayer.h