#include <arm_compute/runtime/CL/CLFunctions.h>

#include "ir/DataType.h"

using ::onert::backend::acl_common::asAclFunction;
KernelGenerator::KernelGenerator(
  const ir::Graph &graph, const std::shared_ptr<TensorBuilder> &tensor_builder,
  const std::shared_ptr<acl_common::AclTensorRegistry<TensorManager>> &tensor_reg)
  : basic::KernelGeneratorBase{graph}, _ctx(graph.operands()), _operations_ctx(graph.operations()),
    _tensor_builder(tensor_builder), _tensor_reg(tensor_reg)
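// From KernelGenerator::generate(ir::OperationIndex): the kernel emitted for each operation is
// returned wrapped in an exec::FunctionSequence, and dynamic shape inference is explicitly
// disabled for this backend.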
  auto ret = std::make_unique<exec::FunctionSequence>();
  ret->enableDynamicShapeInferer(false);
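// BatchToSpaceND: the crops operand must be a constant of all zeros and the block size must be
// a compile-time constant; the node is lowered to CLBatchToSpaceLayer with the block
// width/height taken from the BLOCK_SIZE operand.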
  const auto block_size_index{
  const auto NNApiInputs = 2;
  if (!_ctx.at(crops_index).isConstant())
    throw std::runtime_error("Non-constant crops NYI for acl_cl backend BatchToSpaceND");
  auto crops = _ctx.at(crops_index).asVector<int32_t>();
  for (auto &&crop : crops)
    if (crop != 0)
      throw std::runtime_error("Non-zero crops NYI for acl_cl backend BatchToSpaceND");
  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  if (!_ctx.at(block_size_index).data())
    throw std::runtime_error("ACL CL does not support dynamic block size for BatchToSpaceND");
  auto block = _ctx.at(block_size_index).asVector<int32_t>();
  int32_t height = block[0];
  int32_t width = block[1];
  auto fn = acl_common::generateLayer<arm_compute::CLBatchToSpaceLayer>(
    ifm_tensor->handle(), width, height, ofm_tensor->handle());
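// BinaryArithmetic: dispatches on node.param().arithmetic_type and emits CLArithmeticAddition,
// CLArithmeticSubtraction, CLPixelWiseMultiplication or CLArithmeticDivision. The fused
// activation is converted to an ACL ActivationLayerInfo (act_info) and folded into the
// generated layer.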
void KernelGenerator::visit(const ir::operation::BinaryArithmetic &node)
  const auto ofm_index{node.getOutputs().at(0)};
  const auto activation = node.param().activation;
  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index);
  auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index);
  std::unique_ptr<arm_compute::IFunction> fn;
  switch (node.param().arithmetic_type)
      arm_compute::CLArithmeticAddition::validate(lhs_tensor->info(), rhs_tensor->info(),
                                                  arm_compute::ConvertPolicy::SATURATE, act_info)
      fn = acl_common::generateLayer<arm_compute::CLArithmeticAddition>(
        lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(),
        arm_compute::ConvertPolicy::SATURATE, act_info);
      arm_compute::CLArithmeticSubtraction::validate(lhs_tensor->info(), rhs_tensor->info(),
                                                     arm_compute::ConvertPolicy::SATURATE, act_info)
      fn = acl_common::generateLayer<arm_compute::CLArithmeticSubtraction>(
        lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(),
        arm_compute::ConvertPolicy::SATURATE, act_info);
      arm_compute::CLPixelWiseMultiplication::validate(
        lhs_tensor->info(), rhs_tensor->info(), ofm_tensor->info(), 1.0,
        arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_NEAREST_EVEN,
      fn = acl_common::generateLayer<arm_compute::CLPixelWiseMultiplication>(
        lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), 1.0,
        arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_NEAREST_EVEN,
      arm_compute::CLArithmeticDivision::validate(lhs_tensor->info(), rhs_tensor->info(),
                                                  ofm_tensor->info(), act_info)
      fn = acl_common::generateLayer<arm_compute::CLArithmeticDivision>(
        lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), act_info);
      assert(false && "The BinaryArithmetic operation supports only binary arithmetic operations");
void KernelGenerator::visit(const ir::operation::Conv2D &node)
  using ir::operation::Conv2D;
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(Conv2D::Input::INPUT)};
  const auto ker_index{node.getInputs().at(Conv2D::Input::KERNEL)};
  const auto bias_index{node.getInputs().at(Conv2D::Input::BIAS)};
  const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature();
  const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature();
  const auto &ker_shape = _ctx.at(ker_index).shape();
  const auto ker_height = ker_shape.dim(1);
  const auto ker_width = ker_shape.dim(2);
  const auto stride = node.param().stride;
    ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, ker_width, ker_height);
  const auto activation = node.param().activation;
  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  auto ker_tensor = _tensor_reg->getAclTensor(ker_index);
  auto bias_tensor = _tensor_reg->getAclTensor(bias_index);
  auto fn = acl_common::generateLayer<arm_compute::CLConvolutionLayer>(
    _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), ifm_tensor->handle(),
    ker_tensor->handle(), bias_tensor->handle(), ofm_tensor->handle(), conv_info,
    ::arm_compute::WeightsInfo(), ::arm_compute::Size2D(1U, 1U), act_info);
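// DepthwiseConv2D: same padding computation as Conv2D, plus the channel multiplier and the
// dilation factors from the node parameters; lowered to CLDepthwiseConvolutionLayer.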
void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node)
  using ir::operation::DepthwiseConv2D;
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(DepthwiseConv2D::Input::INPUT)};
  const auto ker_index{node.getInputs().at(DepthwiseConv2D::Input::KERNEL)};
  const auto bias_index{node.getInputs().at(DepthwiseConv2D::Input::BIAS)};
  const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature();
  const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature();
  const auto &ker_shape = _ctx.at(ker_index).shape();
  const auto ker_height = ker_shape.dim(1);
  const auto ker_width = ker_shape.dim(2);
  const auto stride = node.param().stride;
  const auto dilation = node.param().dilation;
    dilation.width_factor, dilation.height_factor);
  const auto multiplier = node.param().multiplier;
  const auto activation = node.param().activation;
  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  auto ker_tensor = _tensor_reg->getAclTensor(ker_index);
  auto bias_tensor = _tensor_reg->getAclTensor(bias_index);
  auto fn = acl_common::generateLayer<arm_compute::CLDepthwiseConvolutionLayer>(
    ifm_tensor->handle(), ker_tensor->handle(), bias_tensor->handle(), ofm_tensor->handle(),
    conv_info, multiplier, act_info, dilation_info);
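// Concat: if every input is already registered as a sub-tensor of the output, the concatenation
// is a no-op (NopFunction). A single input degenerates to CLCopy; otherwise a CLConcatenateLayer
// is generated over the collected input handles.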
void KernelGenerator::visit(const ir::operation::Concat &node)
  const auto ofm_index{node.getOutputs().at(0)};
  std::vector<ir::OperandIndex> input_indexes;
  for (const auto &input : node.getInputs())
    input_indexes.emplace_back(input);
  const auto axis = node.param().axis;
  bool eliminated = _tensor_builder->areSubTensorsOf(ofm_index, node.getInputs());
    VERBOSE(acl_cl_KernelGenerator_Concat) << "Concat eliminated" << std::endl;
    _return_fn = std::make_unique<exec::NopFunction>();
  std::vector<const ::arm_compute::ICLTensor *> input_tensors;
  for (const auto &ifm_ind : input_indexes)
    input_tensors.emplace_back(_tensor_reg->getAclTensor(ifm_ind)->handle());
  std::unique_ptr<::arm_compute::IFunction> fn;
  if (input_indexes.size() < 2)
    ::arm_compute::ICLTensor *input_tensor =
      _tensor_reg->getAclTensor(input_indexes.at(0))->handle();
    fn = acl_common::generateLayer<arm_compute::CLCopy>(input_tensor, output_tensor->handle());
    const auto rank = _ctx.at(ofm_index).shape().rank();
    fn = acl_common::generateLayer<::arm_compute::CLConcatenateLayer>(
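// FullyConnected: the 16x1Float32 weight format is rejected, then the kernel is built by the
// shared acl_common::kernelGenFullyConnected() helper; the fused activation is appended as a
// second function in the returned FunctionSequence.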
void KernelGenerator::visit(const ir::operation::FullyConnected &node)
  const auto output_index{node.getOutputs().at(0)};
  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  const auto activation = node.param().activation;
    throw std::runtime_error(
      "KernelGenerator(acl_cl): FullyConnected 16x1Float32 weights is not supported.");
    node, _ctx, _tensor_builder, _tensor_reg);
  _return_fn = std::make_unique<exec::FunctionSequence>(
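// Reduce: mean reductions map to CLReduceMean; every other reduce type goes through
// CLReduceOperation with the reduction converted by acl_common::convertReduceType(). The axes
// operand is translated from IR axes to ACL axes before configuring the layer.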
void KernelGenerator::visit(const ir::operation::Reduce &node)
  const auto output_index{node.getOutputs().at(0)};
  const auto keep_dims{node.param().keep_dims};
  const auto reduce_type = node.param().reduce_type;
  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto input_tensor = _tensor_reg->getAclTensor(input_index);
  const auto &axes = _ctx.at(axes_index);
  const auto input_rank = _ctx.at(input_index).shape().rank();
  std::unique_ptr<arm_compute::IFunction> fn;
    fn = acl_common::generateLayer<arm_compute::CLReduceMean>(input_tensor->handle(), acl_axes,
    fn = acl_common::generateLayer<arm_compute::CLReduceOperation>(
      _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
void KernelGenerator::visit(const ir::operation::Reshape &node)
  const auto output_index{node.getOutputs().at(0)};
  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto input_tensor = _tensor_reg->getAclTensor(input_index);
  auto fn = acl_common::generateLayer<arm_compute::CLReshapeLayer>(input_tensor->handle(),
void KernelGenerator::visit(const ir::operation::Squeeze &node)
  const auto output_index{node.getOutputs().at(0)};
  const auto dims{node.param().dims};
  const auto ndim{node.param().ndim};
  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto input_tensor = _tensor_reg->getAclTensor(input_index);
  auto fn = acl_common::generateLayer<arm_compute::CLReshapeLayer>(input_tensor->handle(),
void KernelGenerator::visit(const ir::operation::Softmax &node)
  const auto output_index{node.getOutputs().at(0)};
  const auto beta = node.param().beta;
  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto input_tensor = _tensor_reg->getAclTensor(input_index);
  auto fn = acl_common::generateLayer<arm_compute::CLSoftmaxLayer>(
    _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
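// Slice: the begin and size operands must be constant INT32 tensors whose element count equals
// the input rank. Each (begin, size) pair is converted into start/end coordinates on the
// corresponding ACL axis and handed to CLSlice.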
void KernelGenerator::visit(const ir::operation::Slice &node)
  const auto output_index{node.getOutputs().at(0)};
  auto outputData_tensor = _tensor_reg->getAclTensor(output_index);
  auto inputData_tensor = _tensor_reg->getAclTensor(input_index);
  int input_rank = _ctx.at(input_index).shape().rank();
  std::vector<int32_t> starts;
  std::vector<int32_t> ends;
  starts.resize(input_rank, 0);
  ends.resize(input_rank, 0);
  assert(_ctx.at(begins_index).data());
  assert(_ctx.at(sizes_index).data());
  auto beginData_base = _ctx.at(begins_index).data()->base();
  auto sizeData_base = _ctx.at(sizes_index).data()->base();
  [[maybe_unused]] const int beginData_size = _ctx.at(begins_index).shape().num_elements();
  [[maybe_unused]] const int sizeData_size = _ctx.at(sizes_index).shape().num_elements();
  assert(_ctx.at(begins_index).typeInfo().type() == DataType::INT32);
  assert(_ctx.at(sizes_index).typeInfo().type() == DataType::INT32);
  assert(beginData_size == input_rank);
  assert(sizeData_size == input_rank);
  assert(beginData_base != nullptr);
  for (int n = 0; n < input_rank; ++n)
    int32_t begin_value = *(reinterpret_cast<const int32_t *>(beginData_base) + n);
    starts[axis] = begin_value;
    int32_t size_value = *(reinterpret_cast<const int32_t *>(sizeData_base) + n);
    ends[axis] = begin_value + size_value;
  ::arm_compute::Coordinates starts_set;
  ::arm_compute::Coordinates ends_set;
  for (size_t i = 0; i < starts.size(); ++i)
    starts_set.set(i, starts[i]);
    ends_set.set(i, ends[i]);
  auto fn = acl_common::generateLayer<arm_compute::CLSlice>(
    inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set);
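// StridedSlice: starts/ends/strides must be constant INT32 operands of length input_rank. The
// begin/end/shrink-axis masks are reordered to ACL axis order with ReorderBits, and when the
// frontend rank disagrees with the ACL tensor-info rank, dimension correction is temporarily
// disabled around the CLStridedSlice configuration and restored afterwards.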
void KernelGenerator::visit(const ir::operation::StridedSlice &node)
  const auto output_index{node.getOutputs().at(0)};
  auto outputData_tensor = _tensor_reg->getAclTensor(output_index);
  auto inputData_tensor = _tensor_reg->getAclTensor(input_index);
  int input_rank = _ctx.at(input_index).shape().rank();
  std::vector<int32_t> starts;
  std::vector<int32_t> ends;
  std::vector<int32_t> strides;
  starts.resize(input_rank, 0);
  ends.resize(input_rank, 0);
  strides.resize(input_rank, 0);
  assert(_ctx.at(starts_index).data());
  assert(_ctx.at(ends_index).data());
  assert(_ctx.at(strides_index).data());
  auto startData_base = _ctx.at(starts_index).data()->base();
  auto endData_base = _ctx.at(ends_index).data()->base();
  auto stridesData_base = _ctx.at(strides_index).data()->base();
  [[maybe_unused]] const int startData_size = _ctx.at(starts_index).shape().num_elements();
  [[maybe_unused]] const int endData_size = _ctx.at(ends_index).shape().num_elements();
  [[maybe_unused]] const int stridesData_size = _ctx.at(strides_index).shape().num_elements();
  assert(_ctx.at(starts_index).typeInfo().type() == DataType::INT32);
  assert(_ctx.at(ends_index).typeInfo().type() == DataType::INT32);
  assert(_ctx.at(strides_index).typeInfo().type() == DataType::INT32);
  assert(startData_size == input_rank);
  assert(endData_size == input_rank);
  assert(stridesData_size == input_rank);
  assert(startData_base != nullptr);
  for (int n = 0; n < input_rank; ++n)
    int32_t start_value = *(reinterpret_cast<const int32_t *>(startData_base) + n);
    starts[axis] = start_value;
    int32_t end_value = *(reinterpret_cast<const int32_t *>(endData_base) + n);
    ends[axis] = end_value;
    int32_t strides_value = *(reinterpret_cast<const int32_t *>(stridesData_base) + n);
    strides[axis] = strides_value;
  const auto begin_mask = acl_common::ReorderBits<int32_t>(node.param().begin_mask, input_rank);
  const auto end_mask = acl_common::ReorderBits<int32_t>(node.param().end_mask, input_rank);
  const auto shrink_axis_mask =
    acl_common::ReorderBits<int32_t>(node.param().shrink_axis_mask, input_rank);
  ::arm_compute::Coordinates starts_set;
  ::arm_compute::Coordinates ends_set;
  ::arm_compute::BiStrides strides_set;
  for (size_t i = 0; i < starts.size(); ++i)
    starts_set.set(i, starts[i]);
    ends_set.set(i, ends[i]);
    strides_set.set(i, strides[i]);
  if (inputData_tensor->num_dimensions() != inputData_tensor->info()->num_dimensions())
  auto fn = acl_common::generateLayer<arm_compute::CLStridedSlice>(
    inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set, strides_set,
    begin_mask, end_mask, shrink_axis_mask);
  if (inputData_tensor->dimension(0) == 1)
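// Transpose: an empty permutation operand means "reverse all dimensions". Depending on the
// effective permutation the node becomes a CLCopy (trivial case), a CLTranspose (2-D swap) or a
// CLPermute with the permutation vector converted to ACL order.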
void KernelGenerator::visit(const ir::operation::Transpose &node)
  const auto ofm_idx{node.getOutputs().at(0)};
  const auto rank = _ctx.at(ifm_idx).shape().rank();
  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx);
  const auto &perms = _ctx.at(perm_idx);
  std::vector<int32_t> pv;
  if (perms.shape() == ir::Shape{0})
    std::iota(pv.begin(), pv.end(), 0);
    std::reverse(pv.begin(), pv.end());
    pv = _ctx.at(perm_idx).asVector<int32_t>();
  std::unique_ptr<arm_compute::IFunction> fn;
    fn = acl_common::generateLayer<arm_compute::CLCopy>(ifm_tensor->handle(), ofm_tensor->handle());
    assert(pv.size() == 2 && pv.at(0) == 1 && pv.at(1) == 0);
    fn = acl_common::generateLayer<arm_compute::CLTranspose>(ifm_tensor->handle(),
                                                             ofm_tensor->handle());
    fn = acl_common::generateLayer<arm_compute::CLPermute>(ifm_tensor->handle(),
                                                           ofm_tensor->handle(), backend_pv);
void KernelGenerator::visit(const ir::operation::ElementwiseActivation &node)
  const auto ofm_index{node.getOutputs().at(0)};
  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  const ::arm_compute::ActivationLayerInfo act_info =
  auto fn = acl_common::generateLayer<arm_compute::CLActivationLayer>(
    ifm_tensor->handle(), ofm_tensor->handle(), act_info);
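// ElementwiseBinary: node.param().op_type selects between CLBinaryLogicalOp, CLBitwiseOr,
// CLElementwiseMax and CLElementwiseMin; anything else is reported as an unsupported
// elementwise-binary operation.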
void KernelGenerator::visit(const ir::operation::ElementwiseBinary &node)
  const auto output_index{node.getOutputs().at(0)};
  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index);
  auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index);
  std::unique_ptr<arm_compute::IFunction> fn;
  switch (node.param().op_type)
      fn = acl_common::generateLayer<arm_compute::CLBinaryLogicalOp>(
        lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle(),
      fn = acl_common::generateLayer<arm_compute::CLBitwiseOr>(
        lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
      fn = acl_common::generateLayer<arm_compute::CLElementwiseMax>(
        lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
      fn = acl_common::generateLayer<arm_compute::CLElementwiseMin>(
        lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
      std::string err_msg("acl_cl KernelGenerator : " + node.name() +
                          " is not an elementwise-binary operation");
      assert(false && err_msg.c_str());
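// ElementwiseUnary: dispatches on op_type. ABS and SQRT reuse CLActivationLayer with the
// matching activation function; CAST picks CLCopy (same type), CLCastBool (BOOL8 input) or
// CLCast with saturation; the remaining ops map one-to-one to CLDequantizationLayer, CLExpLayer,
// CLFloor, CLBitwiseNot, CLNeg and CLRsqrtLayer.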
void KernelGenerator::visit(const ir::operation::ElementwiseUnary &node)
  const auto output_index{node.getOutputs().at(0)};
  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto input_tensor = _tensor_reg->getAclTensor(input_index);
  std::unique_ptr<arm_compute::IFunction> fn;
  switch (node.param().op_type)
      const ::arm_compute::ActivationLayerInfo act_info{
        ::arm_compute::ActivationLayerInfo::ActivationFunction::ABS};
      fn = acl_common::generateLayer<arm_compute::CLActivationLayer>(
        fn = acl_common::generateLayer<arm_compute::CLCopy>(input_tensor->handle(),
      else if (_ctx.at(input_index).typeInfo().type() == ir::DataType::BOOL8)
        fn = acl_common::generateLayer<arm_compute::CLCastBool>(input_tensor->handle(),
        fn = acl_common::generateLayer<arm_compute::CLCast>(
          input_tensor->handle(), output_tensor->handle(), arm_compute::ConvertPolicy::SATURATE);
      fn = acl_common::generateLayer<arm_compute::CLDequantizationLayer>(input_tensor->handle(),
      fn = acl_common::generateLayer<arm_compute::CLExpLayer>(input_tensor->handle(),
      fn = acl_common::generateLayer<arm_compute::CLFloor>(input_tensor->handle(),
      fn = acl_common::generateLayer<arm_compute::CLBitwiseNot>(input_tensor->handle(),
      fn = acl_common::generateLayer<arm_compute::CLNeg>(input_tensor->handle(),
      fn = acl_common::generateLayer<arm_compute::CLRsqrtLayer>(input_tensor->handle(),
      const ::arm_compute::ActivationLayerInfo act_info{
        ::arm_compute::ActivationLayerInfo::ActivationFunction::SQRT};
      fn = acl_common::generateLayer<arm_compute::CLActivationLayer>(
      throw std::runtime_error("acl_cl KernelGenerator : " + node.name() + " is not supported yet");
void KernelGenerator::visit(const ir::operation::ExpandDims &node)
  const auto output_index{node.getOutputs().at(0)};
  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto input_tensor = _tensor_reg->getAclTensor(input_index);
  auto fn = acl_common::generateLayer<arm_compute::CLReshapeLayer>(input_tensor->handle(),
void KernelGenerator::visit(const ir::operation::InstanceNorm &node)
  const auto ofm_index{node.getOutputs().at(0)};
  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  auto gamma_tensor = _tensor_reg->getAclTensor(gamma_index);
  auto beta_tensor = _tensor_reg->getAclTensor(beta_index);
  auto epsilon = node.param().epsilon;
  auto activation = node.param().activation;
  auto fn = acl_common::generateLayer<arm_compute::CLInstanceNormalizationLayerEx>(
    ifm_tensor->handle(), ofm_tensor->handle(), gamma_tensor->handle(), beta_tensor->handle(),
  _return_fn = std::make_unique<exec::FunctionSequence>(
void KernelGenerator::visit(const ir::operation::LSTM &node)
    ::arm_compute::CLLSTMLayer>(node, _ctx, _tensor_reg);
void KernelGenerator::visit(const ir::operation::Comparison &node)
  const auto output_index{node.getOutputs().at(0)};
  const auto comparison_type = node.param().comparison_type;
  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto input0_tensor = _tensor_reg->getAclTensor(input0_index);
  auto input1_tensor = _tensor_reg->getAclTensor(input1_index);
  auto fn = acl_common::generateLayer<arm_compute::CLComparison>(
    input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle(),
    (arm_compute::ComparisonOperation)comparison_type);
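// OneHot: when the off-value operand is a compile-time constant it is folded directly into the
// CLOneHot configuration; otherwise its tensor handle is passed as an additional input. An axis
// of -1 is resolved to output_rank - 1.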
void KernelGenerator::visit(const ir::operation::OneHot &node)
  const auto output_idx{node.getOutputs().at(0)};
  const auto depth = _ctx.at(depth_idx).asScalar<int32_t>();
  auto indices_tensor = _tensor_reg->getAclTensor(indices_idx);
  auto onvalue_tensor = _tensor_reg->getAclTensor(onvalue_idx);
  const size_t output_rank = _ctx.at(output_idx).shape().rank();
  int32_t axis = node.param().axis == -1 ? output_rank - 1 : node.param().axis;
  std::unique_ptr<::arm_compute::IFunction> fn;
  const auto &offvalue = _ctx.at(offvalue_idx);
  if (offvalue.isConstant())
    fn = acl_common::generateLayer<arm_compute::CLOneHot>(
      indices_tensor->handle(), onvalue_tensor->handle(), output_tensor->handle(),
    auto offvalue_tensor = _tensor_reg->getAclTensor(offvalue_idx);
    fn = acl_common::generateLayer<arm_compute::CLOneHot>(
      indices_tensor->handle(), onvalue_tensor->handle(), offvalue_tensor->handle(),
      output_tensor->handle(), static_cast<uint32_t>(depth), axis);
void KernelGenerator::visit(const ir::operation::Pack &node)
  const auto output_index{node.getOutputs().at(0)};
  auto axis{node.param().axis};
  const auto output_rank = _ctx.at(output_index).shape().rank();
  std::vector<ir::OperandIndex> input_indexes;
  for (const auto &input_index : node.getInputs())
  auto output = _tensor_reg->getAclTensor(output_index)->handle();
  std::vector<arm_compute::ICLTensor *> inputs;
  for (const auto &input_index : input_indexes)
  for (const auto &input_index : input_indexes)
    const auto &input_tensor = _tensor_reg->getAclTensor(input_index);
    if (input_tensor->num_dimensions() != input_tensor->info()->num_dimensions())
  auto fn = acl_common::generateLayer<arm_compute::CLStackLayer>(inputs, axis, output);
  for (const auto &input_index : input_indexes)
    const auto &input_tensor = _tensor_reg->getAclTensor(input_index);
    if (input_tensor->dimension(0) == 1)
void KernelGenerator::visit(const ir::operation::Pool2D &node)
  auto raw_fn = acl_common::kernelGenPool2D<::arm_compute::CLPoolingLayer>(
  const auto ofm_index{node.getOutputs().at(0)};
  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  const auto activation = node.param().activation;
  _return_fn = std::make_unique<exec::FunctionSequence>(
void KernelGenerator::visit(const ir::operation::ResizeBilinear &node)
  const auto ofm_index{node.getOutputs().at(0)};
  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  auto fn = acl_common::generateLayer<arm_compute::CLScale>(
    ifm_tensor->handle(), ofm_tensor->handle(),
    ::arm_compute::ScaleKernelInfo{
      ::arm_compute::InterpolationPolicy::BILINEAR, ::arm_compute::BorderMode::REPLICATE,
      ::arm_compute::PixelValue(0.f), ::arm_compute::SamplingPolicy::TOP_LEFT});
void KernelGenerator::visit(const ir::operation::ResizeNearestNeighbor &node)
  const auto ofm_index{node.getOutputs().at(0)};
  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  auto fn = acl_common::generateLayer<arm_compute::CLScale>(
    ifm_tensor->handle(), ofm_tensor->handle(),
    ::arm_compute::ScaleKernelInfo{
      ::arm_compute::InterpolationPolicy::NEAREST_NEIGHBOR, ::arm_compute::BorderMode::REPLICATE,
      ::arm_compute::PixelValue(0.f), ::arm_compute::SamplingPolicy::TOP_LEFT});
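// RNN: the previous hidden state is first copied to the hidden-state output with CLCopy, then
// CLRNNLayer consumes input, weights, recurrent weights, bias and the copied hidden state, with
// the fused activation converted to an ACL ActivationLayerInfo.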
void KernelGenerator::visit(const ir::operation::RNN &node)
  const auto hidden_state_out_index{
  const auto recurrent_weights_index{
  const auto activation = node.param().activation;
  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto hidden_state_out_tensor = _tensor_reg->getAclTensor(hidden_state_out_index);
  auto input_tensor = _tensor_reg->getAclTensor(input_index);
  auto weights_tensor = _tensor_reg->getAclTensor(weights_index);
  auto recurrent_weights_tensor = _tensor_reg->getAclTensor(recurrent_weights_index);
  auto bias_tensor = _tensor_reg->getAclTensor(bias_index);
  auto hidden_state_in_tensor = _tensor_reg->getAclTensor(hidden_state_in_index);
  auto copy_layer = acl_common::generateLayer<arm_compute::CLCopy>(
    hidden_state_in_tensor->handle(), hidden_state_out_tensor->handle());
  auto fn = acl_common::generateLayer<arm_compute::CLRNNLayer>(
    _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
    weights_tensor->handle(), recurrent_weights_tensor->handle(), bias_tensor->handle(),
    hidden_state_out_tensor->handle(), output_tensor->handle(), act_info);
void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node)
  const auto ofm_index{node.getOutputs().at(0)};
  const auto block_size_index{
  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index);
  auto paddings_tensor = _tensor_reg->getAclTensor(paddings_index);
  assert(_ctx.at(block_size_index).data());
  assert(_ctx.at(paddings_index).data());
  auto fn = acl_common::generateLayer<arm_compute::CLSpaceToBatchLayer>(
    ifm_tensor->handle(), block_size_tensor->handle(), paddings_tensor->handle(),
    ofm_tensor->handle());
void KernelGenerator::visit(const ir::operation::SpaceToDepth &node)
  const auto ofm_index{node.getOutputs().at(0)};
  auto block_size = node.param().block_size;
  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  auto fn = acl_common::generateLayer<arm_compute::CLSpaceToDepthLayer>(
    ifm_tensor->handle(), ofm_tensor->handle(), block_size);
void KernelGenerator::visit(const ir::operation::EmbeddingLookup &node)
  const auto output_index{node.getOutputs().at(0)};
  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index);
  auto values_tensor = _tensor_reg->getAclTensor(values_index);
  auto fn = acl_common::generateLayer<arm_compute::CLEmbeddingLookup>(
    values_tensor->handle(), output_tensor->handle(), lookups_tensor->handle());
void KernelGenerator::visit(const ir::operation::L2Normalization &node)
  const auto ofm_index{node.getOutputs().at(0)};
  const auto &ifm_shape = _ctx.at(ifm_index).shape();
  const auto normalization_axis = _ctx.at(ifm_index).shape().rank() - 1;
    2 * ifm_shape.dim(normalization_axis) + 1;
  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  const auto norm_info = ::arm_compute::NormalizationLayerInfo(::arm_compute::NormType::CROSS_MAP,
                                                               radius, alpha, beta, bias, false);
  auto fn = acl_common::generateLayer<arm_compute::CLNormalizationLayer>(
    ifm_tensor->handle(), ofm_tensor->handle(), norm_info);
void KernelGenerator::visit(const ir::operation::HashtableLookup &node)
  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto hits_tensor = _tensor_reg->getAclTensor(hits_index);
  auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index);
  auto keys_tensor = _tensor_reg->getAclTensor(keys_index);
  auto values_tensor = _tensor_reg->getAclTensor(values_index);
  auto fn = acl_common::generateLayer<arm_compute::CLHashtableLookup>(
    lookups_tensor->handle(), keys_tensor->handle(), values_tensor->handle(),
void KernelGenerator::visit(const ir::operation::PReLU &node)
  const auto ofm_index{node.getOutputs().at(0)};
  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  auto alpha_tensor = _tensor_reg->getAclTensor(alpha_index);
  auto fn = acl_common::generateLayer<arm_compute::CLPReluLayer>(
    ifm_tensor->handle(), alpha_tensor->handle(), ofm_tensor->handle());
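// TransposeConv: invalid_horizontal/invalid_vertical are computed from the output, input and
// kernel shapes together with the stride, and passed to CLTransposeConvLayer along with a null
// bias; scratch memory comes from the internal buffer manager.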
void KernelGenerator::visit(const ir::operation::TransposeConv &node)
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature();
  const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature();
  const auto ker_shape = _ctx.at(ker_index).shape().asFeature();
  const auto stride = node.param().stride;
    ker_shape.W, ker_shape.H);
  uint32_t invalid_horizontal = 0;
  uint32_t invalid_vertical = 0;
    invalid_horizontal =
      ofm_shape.W - (1 + (ifm_shape.W - 1) * stride.horizontal) - (ker_shape.W - 1);
    invalid_vertical = ofm_shape.H - (1 + (ifm_shape.H - 1) * stride.vertical) - (ker_shape.H - 1);
  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  auto ker_tensor = _tensor_reg->getAclTensor(ker_index);
  auto fn = acl_common::generateLayer<arm_compute::CLTransposeConvLayer>(
    _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), ifm_tensor->handle(),
    ker_tensor->handle(), nullptr, ofm_tensor->handle(), tconv_info, invalid_horizontal,
void KernelGenerator::visit(const ir::operation::SquaredDifference &node)
  const auto ofm_index{node.getOutputs().at(0)};
  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index);
  auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index);
  auto fn = acl_common::generateLayer<arm_compute::CLElementwiseSquaredDiff>(
    lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());
void KernelGenerator::visit(const ir::operation::TopKV2 &node)
  const auto outputIndices_index{
  assert(_ctx.at(inputData_index).shape().rank() == 1 ||
         _ctx.at(inputData_index).shape().rank() == 2);
  const auto k = node.param().k;
  auto values_tensor = _tensor_reg->getAclTensor(outputValues_index);
  auto indices_tensor = _tensor_reg->getAclTensor(outputIndices_index);
  auto input_tensor = _tensor_reg->getAclTensor(inputData_index);
  auto fn = acl_common::generateLayer<arm_compute::CLTopKV2>(
    input_tensor->handle(), k, values_tensor->handle(), indices_tensor->handle());
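// Gather: a negative axis is normalized against the input rank and converted to the ACL axis.
// If ACL has dropped trailing unit dimensions of the input or indices tensors (frontend rank !=
// ACL info rank), dimension correction is disabled while CLGatherEx is configured and restored
// afterwards.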
void KernelGenerator::visit(const ir::operation::Gather &node)
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
  const auto axis_raw = node.param().axis;
  const auto axis_value = (axis_raw < 0 ? (ifm_rank + axis_raw) : axis_raw);
  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  auto indices_tensor = _tensor_reg->getAclTensor(indices_index);
  size_t n = ifm_rank;
  assert(n == ifm_tensor->num_dimensions());
  size_t k = _ctx.at(indices_index).shape().rank();
  assert(k == indices_tensor->num_dimensions());
  if (n != ifm_tensor->info()->num_dimensions())
  if (k != indices_tensor->info()->num_dimensions())
  auto fn = acl_common::generateLayer<arm_compute::CLGatherEx>(
    ifm_tensor->handle(), indices_tensor->handle(), ofm_tensor->handle(), axis);
  if (ifm_tensor->dimension(0) == 1)
  if (indices_tensor->dimension(0) == 1)
void KernelGenerator::visit(const ir::operation::ArgMinMax &node)
  const auto ofm_index{node.getOutputs().at(0)};
  auto ifm_shape = _ctx.at(ifm_index).shape();
  auto ofm_shape = _ctx.at(ofm_index).shape();
  assert((ifm_shape.rank() - 1) == ofm_shape.rank());
  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
  int axis_value = _ctx.at(axis_index).asScalar<int32_t>();
    axis_value += ifm_rank;
  auto reduce_type = node.param().is_arg_max ? ::arm_compute::ReductionOperation::ARG_IDX_MAX
                                             : ::arm_compute::ReductionOperation::ARG_IDX_MIN;
  auto fn = acl_common::generateLayer<arm_compute::CLArgMinMaxLayer>(
    ifm_tensor->handle(), acl_axis, ofm_tensor->handle(), reduce_type);
void KernelGenerator::visit(const ir::operation::LocalResponseNormalization &node)
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{
  auto radius = node.param().radius;
  auto alpha = node.param().alpha;
  auto beta = node.param().beta;
  auto bias = node.param().bias;
  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  const auto norm_info = ::arm_compute::NormalizationLayerInfo(
    ::arm_compute::NormType::CROSS_MAP, radius * 2 + 1, alpha, beta, bias, false);
  auto fn = acl_common::generateLayer<arm_compute::CLNormalizationLayer>(
    ifm_tensor->handle(), ofm_tensor->handle(), norm_info);
void KernelGenerator::visit(const ir::operation::DepthToSpace &node)
  const auto output_index{node.getOutputs().at(0)};
  auto block_size = node.param().block_size;
  assert(block_size > 0);
  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto input_tensor = _tensor_reg->getAclTensor(input_index);
  auto fn = acl_common::generateLayer<arm_compute::CLDepthToSpaceLayer>(
    input_tensor->handle(), output_tensor->handle(), block_size);
void KernelGenerator::visit(const ir::operation::Split &node)
  assert(node.param().num_splits == static_cast<int>(node.getOutputs().size()));
  if (!_ctx.at(axis_index).isConstant())
    throw std::runtime_error("Non-constant axis_index NYI for acl_cl backend");
  const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
  std::vector<ir::OperandIndex> output_indexes;
  for (const auto &output : node.getOutputs())
    output_indexes.emplace_back(output);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  std::vector<arm_compute::ICLTensor *> output_tensors;
  for (const auto &ofm_ind : output_indexes)
    output_tensors.emplace_back(_tensor_reg->getAclTensor(ofm_ind)->handle());
  auto axis = _ctx.at(axis_index).asScalar<int32_t>();
    acl_common::generateLayer<arm_compute::CLSplit>(ifm_tensor->handle(), output_tensors, axis);
void KernelGenerator::visit(const ir::operation::SplitV &node)
  assert(node.param().num_splits == static_cast<int>(node.getOutputs().size()));
  const size_t ifm_rank = _ctx.at(ifm_index).shape().rank();
  std::vector<ir::OperandIndex> output_indexes;
  for (const auto &output : node.getOutputs())
    output_indexes.emplace_back(output);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  auto size_split_tensor = _tensor_reg->getAclTensor(size_split_index);
  std::vector<arm_compute::ICLTensor *> output_tensors;
  for (const auto &ofm_ind : output_indexes)
    output_tensors.emplace_back(_tensor_reg->getAclTensor(ofm_ind)->handle());
  auto fn = std::make_unique<arm_compute::CLSplitVEx>();
  const auto &split_dim_op = _ctx.at(split_dim_index);
  if (split_dim_op.isConstant())
    int32_t split_dim = split_dim_op.asScalar<int32_t>();
    uint32_t split_dim_revised = (split_dim < 0) ? (split_dim + ifm_rank) : split_dim;
    if (ifm_tensor->num_dimensions() != ifm_tensor->info()->num_dimensions())
    fn->configure(ifm_tensor->handle(), size_split_tensor->handle(), split_dim_revised,
                  output_tensors, node.param().num_splits);
    if (ifm_tensor->dimension(0) == 1)
    throw std::runtime_error("Non-constant split_dim NYI for acl_cl backend");
void KernelGenerator::visit(const ir::operation::Unpack &node)
  auto axis{node.param().axis};
  const auto input_rank = _ctx.at(input_index).shape().rank();
  std::vector<ir::OperandIndex> output_indexes;
  for (const auto &output_index : node.getOutputs())
    output_indexes.emplace_back(output_index);
  auto input_tensor = _tensor_reg->getAclTensor(input_index);
  std::vector<arm_compute::ICLTensor *> outputs;
  for (const auto &output_index : output_indexes)
    outputs.emplace_back(_tensor_reg->getAclTensor(output_index)->handle());
  if (input_tensor->num_dimensions() != input_tensor->info()->num_dimensions())
    acl_common::generateLayer<arm_compute::CLUnstack>(input_tensor->handle(), outputs, axis);
  if (input_tensor->dimension(0) == 1)
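// Pad: the pad operand must be constant; each (before, after) pair is written into an ACL
// PaddingList entry on the corresponding axis, and the padding value is a zero PixelValue
// carrying the input's data type and quantization info, consumed by CLPadLayerEx.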
void KernelGenerator::visit(const ir::operation::Pad &node)
  const auto output_index{node.getOutputs().at(0)};
  assert(_ctx.at(pad_index).data());
  auto rank = _ctx.at(input_index).shape().rank();
  auto pad_base = _ctx.at(pad_index).data()->base();
  auto input_type = _ctx.at(input_index).typeInfo();
  auto quant_info = ::arm_compute::QuantizationInfo(input_type.scale(), input_type.zero_point());
  const auto pixel_value = ::arm_compute::PixelValue(0, data_type, quant_info);
  auto input = _tensor_reg->getAclTensor(input_index)->handle();
  auto output = _tensor_reg->getAclTensor(output_index)->handle();
  ::arm_compute::PaddingList padding_list;
  padding_list.resize(rank);
  for (int32_t n = 0; n < rank; ++n)
    const int32_t *from = reinterpret_cast<const int32_t *>(pad_base) + (n * 2);
    padding_list[axis] = ::arm_compute::PaddingInfo{from[0], from[1]};
  const auto &input_tensor = _tensor_reg->getAclTensor(input_index);
  if (input_tensor->num_dimensions() != input_tensor->info()->num_dimensions())
  acl_common::generateLayer<arm_compute::CLPadLayerEx>(input, output, padding_list, pixel_value);
  if (input_tensor->num_dimensions() < 4 && input_tensor->dimension(0) == 1)
void KernelGenerator::visit(const ir::operation::ConvertFp32ToFp16 &node)
  const auto ofm_index{node.getOutputs().at(0)};
  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  auto fn = acl_common::generateLayer<arm_compute::CLDepthConvertLayer>(
    ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::ConvertPolicy::SATURATE, 0);
void KernelGenerator::visit(const ir::operation::ConvertFp16ToFp32 &node)
  const auto ofm_index{node.getOutputs().at(0)};
  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  auto fn = acl_common::generateLayer<arm_compute::CLDepthConvertLayer>(
    ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::ConvertPolicy::SATURATE, 0);
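// Reverse: if the axis operand is constant and registered with an S32 data type, its ACL tensor
// info is overridden to U32 before configuring CLReverse, which expects unsigned axis values.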
void KernelGenerator::visit(const ir::operation::Reverse &node)
  const auto ofm_index{node.getOutputs().at(0)};
  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  auto axis_tensor = _tensor_reg->getAclTensor(axis_index);
  if (_ctx.at(axis_index).isConstant() &&
      (axis_tensor->handle()->info()->data_type() == arm_compute::DataType::S32))
    axis_tensor->handle()->info()->set_data_type(arm_compute::DataType::U32);
  auto fn = acl_common::generateLayer<arm_compute::CLReverse>(
    ifm_tensor->handle(), ofm_tensor->handle(), axis_tensor->handle(), false);