ONE - On-device Neural Engine
KernelGenerator.cc
1/*
2 * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include "KernelGenerator.h"
18
19#include <arm_compute/runtime/NEON/NEFunctions.h> // Include all ARM Compute NEON functions
20#include <arm_compute/runtime/NEON/NEFunctionsEx.h> // Include all ARM Compute EX NEON functions
 21
 22#include <AclActivationBuilder.h>
23#include <AclFunction.h>
24#include <Convert.h>
25#include <Swizzle.h>
26
27#include "ir/Index.h"
28#include "ir/DataType.h"
29#include "ir/InternalType.h"
30#include "exec/NopFunction.h"
31#include "util/logging.h"
32#include "AclKernelGen.h"
 33
 34namespace onert::backend::acl_neon
35{
36
 37using ::onert::backend::acl_common::asAclFunction;
 38using ActivationBuilder = ::onert::backend::acl_common::AclActivationBuilder<
39 ::arm_compute::ITensor, ::arm_compute::NEActivationLayer, acl_common::AclFunction>;
 40
 41KernelGenerator::KernelGenerator(
42 const ir::Graph &graph, const std::shared_ptr<TensorBuilder> &tensor_builder,
43 const std::shared_ptr<acl_common::AclTensorRegistry<TensorManager>> &tensor_reg)
44 : basic::KernelGeneratorBase{graph}, _ctx(graph.operands()), _operations_ctx(graph.operations()),
45 _tensor_builder(tensor_builder), _tensor_reg(tensor_reg)
46{
47 // DO NOTHING
48}
49
50std::unique_ptr<exec::FunctionSequence> KernelGenerator::generate(ir::OperationIndex ind)
51{
52 auto ret = std::make_unique<exec::FunctionSequence>();
53 ret->enableDynamicShapeInferer(false);
54
55 const auto &op = _graph.operations().at(ind);
56 op.accept(*this);
57 ret->append(releaseFunction());
58 return ret;
59}
60
61void KernelGenerator::visit(const ir::operation::ArgMinMax &node)
62{
63 const auto ofm_index{node.getOutputs().at(0)};
64 const auto ifm_index{node.getInputs().at(ir::operation::ArgMinMax::Input::INPUT)};
65 const auto axis_index{node.getInputs().at(ir::operation::ArgMinMax::Input::AXIS)};
66
67 const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
68
69 auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
70 auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
71
72 int axis_value = _ctx.at(axis_index).asScalar<int32_t>();
73 if (axis_value < 0)
74 {
75 axis_value += ifm_rank;
76 }
77 assert(axis_value >= 0 && axis_value < ifm_rank);
78 const auto fixed_axis = acl_common::ToARMComputeAxis(ifm_rank, axis_value).value();
79 auto reduce_type = node.param().is_arg_max ? ::arm_compute::ReductionOperation::ARG_IDX_MAX
80 : ::arm_compute::ReductionOperation::ARG_IDX_MIN;
81
82 auto fn = acl_common::generateLayer<arm_compute::NEArgMinMaxLayer>(
83 ifm_tensor->handle(), fixed_axis, ofm_tensor->handle(), reduce_type);
84
85 _return_fn = asAclFunction(std::move(fn));
86}
87
88void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node)
89{
90 const auto ofm_index{node.getOutputs().at(0)};
91 const auto ifm_index{node.getInputs().at(ir::operation::BatchToSpaceND::Input::INPUT)};
 92 const auto block_size_index{
 93 node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)};
94
95 const auto NNApiInputs = 2;
96 if (node.getInputs().size() != NNApiInputs)
97 {
98 const auto crops_index{node.getInputs().at(ir::operation::BatchToSpaceND::Input::CROPS_DATA)};
99 if (!_ctx.at(crops_index).isConstant())
100 {
101 throw std::runtime_error("Non-constant crops NYI for acl_neon backend BatchToSpaceND");
102 }
103
104 auto crops = _ctx.at(crops_index).asVector<int32_t>();
105 for (auto &&crop : crops)
106 {
107 if (crop != 0)
108 {
109 throw std::runtime_error("Non-zero crops NYI for acl_neon backend BatchToSpaceND");
110 }
111 }
112 }
113
114 auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
115 auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
116
117 if (!_ctx.at(block_size_index).data())
118 throw std::runtime_error("ACL NEON does not support dynamic block size for BatchToSpaceND");
119
120 auto block = _ctx.at(block_size_index).asVector<int32_t>();
121 int32_t height = block[0];
122 int32_t width = block[1];
123
124 auto fn = acl_common::generateLayer<arm_compute::NEBatchToSpaceLayer>(
125 ifm_tensor->handle(), width, height, ofm_tensor->handle());
126
127 _return_fn = asAclFunction(std::move(fn));
128}
129
130void KernelGenerator::visit(const ir::operation::BinaryArithmetic &node)
131{
132 const auto ofm_index{node.getOutputs().at(0)};
133 const auto lhs_index{node.getInputs().at(ir::operation::BinaryArithmetic::Input::LHS)};
134 const auto rhs_index{node.getInputs().at(ir::operation::BinaryArithmetic::Input::RHS)};
135
136 const auto activation = node.param().activation;
137
138 auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
139 auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index);
140 auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index);
141
142 std::unique_ptr<arm_compute::IFunction> fn;
143 switch (node.param().arithmetic_type)
 144 {
 145 case ir::operation::BinaryArithmetic::ArithmeticType::ADD:
146 {
147 arm_compute::NEArithmeticAddition::validate(lhs_tensor->info(), rhs_tensor->info(),
148 ofm_tensor->info(),
149 arm_compute::ConvertPolicy::SATURATE)
150 .throw_if_error();
151 fn = acl_common::generateLayer<arm_compute::NEArithmeticAddition>(
152 lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(),
153 arm_compute::ConvertPolicy::SATURATE);
154 break;
 155 }
 156 case ir::operation::BinaryArithmetic::ArithmeticType::SUB:
157 {
158 arm_compute::NEArithmeticSubtraction::validate(lhs_tensor->info(), rhs_tensor->info(),
159 ofm_tensor->info(),
160 arm_compute::ConvertPolicy::SATURATE)
161 .throw_if_error();
162 fn = acl_common::generateLayer<arm_compute::NEArithmeticSubtraction>(
163 lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(),
164 arm_compute::ConvertPolicy::SATURATE);
165 break;
 166 }
 167 case ir::operation::BinaryArithmetic::ArithmeticType::MUL:
168 {
169 arm_compute::NEPixelWiseMultiplication::validate(
170 lhs_tensor->info(), rhs_tensor->info(), ofm_tensor->info(), 1.0,
171 arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_ZERO)
172 .throw_if_error();
 173 // Only RoundingPolicy::TO_ZERO is allowed when the scale is 1.0
174 fn = acl_common::generateLayer<arm_compute::NEPixelWiseMultiplication>(
175 lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), 1.0, // scale
176 arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_ZERO);
177 break;
 178 }
 179 case ir::operation::BinaryArithmetic::ArithmeticType::DIV:
180 {
181 arm_compute::NEElementwiseDivision::validate(lhs_tensor->info(), rhs_tensor->info(),
182 ofm_tensor->info())
183 .throw_if_error();
184 fn = acl_common::generateLayer<arm_compute::NEElementwiseDivision>(
185 lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());
186 break;
187 }
188 default:
189 assert(false && "The BinaryArithmetic operation supports only binary arithmetic operations");
190 break;
191 }
192 _return_fn = std::make_unique<exec::FunctionSequence>(
193 asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle()));
194}
195
196void KernelGenerator::visit(const ir::operation::Conv2D &node)
197{
198 using ir::operation::Conv2D;
199
200 const auto ofm_index{node.getOutputs().at(0)};
201 const auto ifm_index{node.getInputs().at(Conv2D::Input::INPUT)};
202 const auto ker_index{node.getInputs().at(Conv2D::Input::KERNEL)};
203 const auto bias_index{node.getInputs().at(Conv2D::Input::BIAS)};
204
205 const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature();
206 const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature();
207 // Kernel format is [depth_out, kernel_height, kernel_width, depth_in].
208 const auto &ker_shape = _ctx.at(ker_index).shape();
209 const auto ker_height = ker_shape.dim(1);
210 const auto ker_width = ker_shape.dim(2);
211
212 const auto stride = node.param().stride;
213 const auto padding =
214 ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, ker_width, ker_height);
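 // Worked example (assuming the TensorFlow-style SAME convention that ir::calculatePadding
 // is expected to follow): ifm width 224, stride 2, kernel width 3 -> ofm width 112,
 // needed input = (112 - 1) * 2 + 3 = 225, total horizontal padding = 225 - 224 = 1,
 // split as left = 0, right = 1.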
215 const auto activation = node.param().activation;
216
217 auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
218 auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
219 auto ker_tensor = _tensor_reg->getAclTensor(ker_index);
220 auto bias_tensor = _tensor_reg->getAclTensor(bias_index);
221
222 const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
223 const auto act_info = acl_common::asActivationLayerInfo(activation);
224
225 auto fn = acl_common::generateLayer<arm_compute::NEConvolutionLayer>(
226 _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), ifm_tensor->handle(),
227 ker_tensor->handle(), bias_tensor->handle(), ofm_tensor->handle(), conv_info,
228 ::arm_compute::WeightsInfo(), ::arm_compute::Size2D(1U, 1U), act_info);
229
230 _return_fn = asAclFunction(std::move(fn));
231}
232
233void KernelGenerator::visit(const ir::operation::DepthToSpace &node)
234{
235 const auto output_index{node.getOutputs().at(0)};
236 const auto input_index{node.getInputs().at(ir::operation::DepthToSpace::Input::INPUT)};
237
238 auto block_size = node.param().block_size;
239 assert(block_size > 0);
240
241 auto output_tensor = _tensor_reg->getAclTensor(output_index);
242 auto input_tensor = _tensor_reg->getAclTensor(input_index);
243
244 auto fn = acl_common::generateLayer<arm_compute::NEDepthToSpaceLayer>(
245 input_tensor->handle(), output_tensor->handle(), block_size);
246
247 _return_fn = asAclFunction(std::move(fn));
248}
249
250void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node)
251{
252 using ir::operation::DepthwiseConv2D;
253
254 const auto ofm_index{node.getOutputs().at(0)};
255 const auto ifm_index{node.getInputs().at(DepthwiseConv2D::Input::INPUT)};
256 const auto ker_index{node.getInputs().at(DepthwiseConv2D::Input::KERNEL)};
257 const auto bias_index{node.getInputs().at(DepthwiseConv2D::Input::BIAS)};
258
259 const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature();
260 const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature();
261 // Kernel format is [1, kernel_height, kernel_width, depth_out].
262 const auto &ker_shape = _ctx.at(ker_index).shape();
263 const auto ker_height = ker_shape.dim(1);
264 const auto ker_width = ker_shape.dim(2);
265
266 const auto stride = node.param().stride;
267 const auto dilation = node.param().dilation;
268 const auto padding =
269 ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, ker_width, ker_height,
270 dilation.width_factor, dilation.height_factor);
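 // With dilation, padding is computed from the effective kernel extent
 // (ker_width - 1) * width_factor + 1 (and likewise for height); e.g. a 3x3 kernel with
 // dilation factor 2 pads as if it were 5x5, assuming the usual dilated-convolution convention.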
271 const auto multiplier = node.param().multiplier;
272 const auto activation = node.param().activation;
273
274 auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
275 auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
276 auto ker_tensor = _tensor_reg->getAclTensor(ker_index);
277 auto bias_tensor = _tensor_reg->getAclTensor(bias_index);
278
279 const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
280 const auto act_info = acl_common::asActivationLayerInfo(activation);
281 const auto dilation_info = acl_common::asDilation(dilation.width_factor, dilation.height_factor);
282
283 auto fn = acl_common::generateLayer<arm_compute::NEDepthwiseConvolutionLayer>(
284 ifm_tensor->handle(), ker_tensor->handle(), bias_tensor->handle(), ofm_tensor->handle(),
285 conv_info, multiplier, act_info, dilation_info);
286
287 _return_fn = asAclFunction(std::move(fn));
288}
289
290void KernelGenerator::visit(const ir::operation::Concat &node)
291{
292 const auto ofm_index{node.getOutputs().at(0)};
293
294 std::vector<ir::OperandIndex> input_indexes;
295 for (const auto &input : node.getInputs())
296 input_indexes.emplace_back(input);
297
298 const auto axis = node.param().axis;
299
300 // Concat elimination check
301 bool eliminated = _tensor_builder->areSubTensorsOf(ofm_index, node.getInputs());
302 if (eliminated)
303 {
304 // If concat eliminated, return a NOP IFunction
305 VERBOSE(acl_neon_KernelGenerator_Concat) << "Concat eliminated" << std::endl;
306 _return_fn = std::make_unique<exec::NopFunction>();
307 return;
308 }
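 // Note: "eliminated" here means every input operand was registered as a sub-tensor that
 // aliases a slice of the concat output, so the producers already write directly into the
 // output buffer and no copy kernel is needed.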
309
310 auto output_tensor = _tensor_reg->getAclTensor(ofm_index);
311 std::vector<const ::arm_compute::ITensor *> input_tensors;
312 for (const auto &ifm_ind : input_indexes)
313 input_tensors.emplace_back(_tensor_reg->getAclTensor(ifm_ind)->handle());
314
315 std::unique_ptr<::arm_compute::IFunction> fn;
316 if (input_indexes.size() < 2)
317 {
 318 ::arm_compute::ITensor *input_tensor = _tensor_reg->getAclTensor(input_indexes.at(0))->handle();
 319 fn = acl_common::generateLayer<arm_compute::NECopy>(input_tensor, output_tensor->handle());
320 }
321 else
322 {
323 const auto rank = _ctx.at(ofm_index).shape().rank();
324 const auto fixed_axis = acl_common::ToARMComputeAxis(rank, axis).value();
325 fn = acl_common::generateLayer<arm_compute::NEConcatenateLayer>(
326 input_tensors, output_tensor->handle(), fixed_axis);
327 }
328
329 _return_fn = asAclFunction(std::move(fn));
330}
331
332void KernelGenerator::visit(const ir::operation::ElementwiseActivation &node)
333{
334 const auto ofm_index{node.getOutputs().at(0)};
335 const auto ifm_index{node.getInputs().at(ir::operation::ElementwiseActivation::Input::INPUT)};
336
337 auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
338 auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
339
340 const ::arm_compute::ActivationLayerInfo act_info =
341 acl_common::asActivationLayerInfo(node.param().op_type, node.param().alpha, node.param().beta);
342
343 std::unique_ptr<arm_compute::IFunction> fn =
344 acl_common::generateLayer<arm_compute::NEActivationLayer>(ifm_tensor->handle(),
345 ofm_tensor->handle(), act_info);
346
347 _return_fn = asAclFunction(std::move(fn));
348}
349
350void KernelGenerator::visit(const ir::operation::ElementwiseBinary &node)
351{
352 const auto output_index{node.getOutputs().at(0)};
353 const auto lhs_index{node.getInputs().at(ir::operation::ElementwiseBinary::Input::LHS)};
354 const auto rhs_index{node.getInputs().at(ir::operation::ElementwiseBinary::Input::RHS)};
355
356 auto output_tensor = _tensor_reg->getAclTensor(output_index);
357 auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index);
358 auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index);
359
360 std::unique_ptr<arm_compute::IFunction> fn;
361 switch (node.param().op_type)
 362 {
 363 case ir::operation::ElementwiseBinary::ElementwiseBinaryType::LOGICAL_AND:
364 {
365 fn = acl_common::generateLayer<arm_compute::NELogicalAnd>(
366 lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
367 break;
 368 }
 369 case ir::operation::ElementwiseBinary::ElementwiseBinaryType::LOGICAL_OR:
370 {
371 fn = acl_common::generateLayer<arm_compute::NELogicalOr>(
372 lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
373 break;
 374 }
 375 case ir::operation::ElementwiseBinary::ElementwiseBinaryType::MAX:
376 {
377 fn = acl_common::generateLayer<arm_compute::NEElementwiseMax>(
378 lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
379 break;
 380 }
 381 case ir::operation::ElementwiseBinary::ElementwiseBinaryType::MIN:
382 {
383 fn = acl_common::generateLayer<arm_compute::NEElementwiseMin>(
384 lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
385 break;
386 }
387 default:
388 {
389 std::string err_msg("acl_neon KernelGenerator : " + node.name() +
390 "is not elementwise-binary operations");
391 assert(false && err_msg.c_str());
392 break;
393 }
394 }
395 _return_fn = asAclFunction(std::move(fn));
396}
397
398void KernelGenerator::visit(const ir::operation::ElementwiseUnary &node)
399{
400 const auto output_index{node.getOutputs().at(0)};
401 const auto input_index{node.getInputs().at(ir::operation::ElementwiseUnary::Input::INPUT)};
402
403 auto output_tensor = _tensor_reg->getAclTensor(output_index);
404 auto input_tensor = _tensor_reg->getAclTensor(input_index);
405
406 std::unique_ptr<arm_compute::IFunction> fn;
407 switch (node.param().op_type)
 408 {
 409 case ir::operation::ElementwiseUnary::Type::ABS:
410 {
411 const ::arm_compute::ActivationLayerInfo act_info{
412 ::arm_compute::ActivationLayerInfo::ActivationFunction::ABS};
413
414 fn = acl_common::generateLayer<arm_compute::NEActivationLayer>(
415 input_tensor->handle(), output_tensor->handle(), act_info);
416 break;
 417 }
 418 case ir::operation::ElementwiseUnary::Type::CAST:
419 {
420 if (input_tensor->data_type() == output_tensor->data_type())
421 {
422 fn = acl_common::generateLayer<arm_compute::NECopy>(input_tensor->handle(),
423 output_tensor->handle());
424 }
425 else if (_ctx.at(input_index).typeInfo().type() == ir::DataType::BOOL8)
426 {
427 fn = acl_common::generateLayer<arm_compute::NECastBool>(input_tensor->handle(),
428 output_tensor->handle());
429 }
430 else
431 {
432 fn = acl_common::generateLayer<arm_compute::NECast>(
433 input_tensor->handle(), output_tensor->handle(), arm_compute::ConvertPolicy::SATURATE);
434 }
435 break;
 436 }
 437 case ir::operation::ElementwiseUnary::Type::DEQUANTIZE:
438 {
439 fn = acl_common::generateLayer<arm_compute::NEDequantizationLayer>(input_tensor->handle(),
440 output_tensor->handle());
441 break;
 442 }
 443 case ir::operation::ElementwiseUnary::Type::EXP:
444 {
445 fn = acl_common::generateLayer<arm_compute::NEExpLayer>(input_tensor->handle(),
446 output_tensor->handle());
447 break;
 448 }
 449 case ir::operation::ElementwiseUnary::Type::FLOOR:
450 {
451 fn = acl_common::generateLayer<arm_compute::NEFloor>(input_tensor->handle(),
452 output_tensor->handle());
453 break;
 454 }
 455 case ir::operation::ElementwiseUnary::Type::LOGICAL_NOT:
456 {
457 fn = acl_common::generateLayer<arm_compute::NEBitwiseNot>(input_tensor->handle(),
458 output_tensor->handle());
459 break;
 460 }
 461 case ir::operation::ElementwiseUnary::Type::NEG:
462 {
463 fn = acl_common::generateLayer<arm_compute::NENegLayer>(input_tensor->handle(),
464 output_tensor->handle());
465 break;
 466 }
 467 case ir::operation::ElementwiseUnary::Type::RSQRT:
468 {
469 fn = acl_common::generateLayer<arm_compute::NERsqrtLayer>(input_tensor->handle(),
470 output_tensor->handle());
471 break;
 472 }
 473 case ir::operation::ElementwiseUnary::Type::SQRT:
474 {
475 const ::arm_compute::ActivationLayerInfo act_info{
476 ::arm_compute::ActivationLayerInfo::ActivationFunction::SQRT};
477
478 fn = acl_common::generateLayer<arm_compute::NEActivationLayer>(
479 input_tensor->handle(), output_tensor->handle(), act_info);
480 break;
481 }
482 default:
483 {
484 throw std::runtime_error("acl_neon KernelGenerator : " + node.name() +
485 "is not supported yet");
486 break;
487 }
488 }
489 _return_fn = asAclFunction(std::move(fn));
490}
491
492void KernelGenerator::visit(const ir::operation::EmbeddingLookup &node)
493{
494 const auto output_index{node.getOutputs().at(0)};
495 const auto lookups_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::LOOKUPS)};
496 const auto values_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::VALUES)};
497
498 auto output_tensor = _tensor_reg->getAclTensor(output_index);
499 auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index);
500 auto values_tensor = _tensor_reg->getAclTensor(values_index);
501
502 auto fn = acl_common::generateLayer<arm_compute::NEEmbeddingLookup>(
503 values_tensor->handle(), output_tensor->handle(), lookups_tensor->handle());
504
505 _return_fn = asAclFunction(std::move(fn));
506}
507
508void KernelGenerator::visit(const ir::operation::FullyConnected &node)
509{
510 const auto output_index{node.getOutputs().at(0)};
511 auto output_tensor = _tensor_reg->getAclTensor(output_index);
512 const auto activation = node.param().activation;
513 if (node.param().weights_format == ir::FullyConnectedWeightsFormat::Shuffled16x1Float32)
514 throw std::runtime_error(
515 "KernelGenerator(acl_neon): FullyConnected 16x1Float32 weights is not supported.");
516
 517 auto fn = acl_common::kernelGenFullyConnected<acl_common::AclFunction, ::arm_compute::ITensor,
 518 ::arm_compute::NEFullyConnectedReshapingLayer>(
519 node, _ctx, _tensor_builder, _tensor_reg);
520 _return_fn = std::make_unique<exec::FunctionSequence>(
521 std::move(fn), ActivationBuilder::generate(activation, output_tensor->handle()));
522}
523
524void KernelGenerator::visit(const ir::operation::HashtableLookup &node)
525{
526 const auto output_index{node.getOutputs().at(ir::operation::HashtableLookup::Output::OUTPUT)};
527 const auto hits_index{node.getOutputs().at(ir::operation::HashtableLookup::Output::HITS)};
528
529 const auto lookups_index{node.getInputs().at(ir::operation::HashtableLookup::Input::LOOKUPS)};
530 const auto keys_index{node.getInputs().at(ir::operation::HashtableLookup::Input::KEYS)};
531 const auto values_index{node.getInputs().at(ir::operation::HashtableLookup::Input::VALUES)};
532
533 auto output_tensor = _tensor_reg->getAclTensor(output_index);
534 auto hits_tensor = _tensor_reg->getAclTensor(hits_index);
535
536 auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index);
537 auto keys_tensor = _tensor_reg->getAclTensor(keys_index);
538 auto values_tensor = _tensor_reg->getAclTensor(values_index);
539
540 auto fn = acl_common::generateLayer<arm_compute::NEHashtableLookup>(
541 lookups_tensor->handle(), keys_tensor->handle(), values_tensor->handle(),
542 output_tensor->handle(), hits_tensor->handle());
543
544 _return_fn = asAclFunction(std::move(fn));
545}
546
547void KernelGenerator::visit(const ir::operation::Gather &node)
548{
549 const auto ofm_index{node.getOutputs().at(0)};
550
551 const auto ifm_index{node.getInputs().at(ir::operation::Gather::Input::INPUT)};
552 const auto indices_index{node.getInputs().at(ir::operation::Gather::Input::INDICES)};
553
554 const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
555 const auto axis_raw = node.param().axis;
556 const auto axis_value = (axis_raw < 0 ? (ifm_rank + axis_raw) : axis_raw);
557 // Converting in reverse order
558 const int axis = ::onert::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value).value();
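 // ACL indexes dimensions in reverse order relative to the IR, so ToARMComputeAxis maps
 // axis a of a rank-r tensor to r - a - 1; for example, ifm_rank == 4 and axis_value == 1
 // yields ACL axis 4 - 1 - 1 = 2.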
559
560 auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
561 auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
562 auto indices_tensor = _tensor_reg->getAclTensor(indices_index);
563
564 // input is n-D, indices k-D, output is (n + k - 1)-D
565 size_t n = ifm_rank;
566 assert(n == ifm_tensor->num_dimensions());
567 size_t k = _ctx.at(indices_index).shape().rank();
568 assert(k == indices_tensor->num_dimensions());
569
570 // Disable applied dim_correction
571 if (n != ifm_tensor->info()->num_dimensions())
572 {
 573 // This means that high dimension's value is 1 and ifm tensor is applied dim_correction
 574 acl_common::disableDimCorrection(ifm_tensor);
575 }
576 if (k != indices_tensor->info()->num_dimensions())
577 {
578 // This means that high dimension's value is 1 and indices tensor is applied dim_correction
579 acl_common::disableDimCorrection(indices_tensor);
580 }
581
582 auto fn = acl_common::generateLayer<arm_compute::NEGatherEx>(
583 ifm_tensor->handle(), indices_tensor->handle(), ofm_tensor->handle(), axis);
584
585 // Revert disabling applied dim_correction
586 if (ifm_tensor->dimension(0) == 1)
 587 {
 588 acl_common::enableDimCorrection(ifm_tensor);
589 }
590 if (indices_tensor->dimension(0) == 1)
591 {
592 acl_common::enableDimCorrection(indices_tensor);
593 }
594
595 _return_fn = asAclFunction(std::move(fn));
596}
597
598void KernelGenerator::visit(const ir::operation::InstanceNorm &node)
599{
600 const auto ofm_index{node.getOutputs().at(0)};
601 const auto ifm_index{node.getInputs().at(ir::operation::InstanceNorm::Input::INPUT)};
602 const auto gamma_index{node.getInputs().at(ir::operation::InstanceNorm::Input::GAMMA)};
603 const auto beta_index{node.getInputs().at(ir::operation::InstanceNorm::Input::BETA)};
604
605 auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
606 auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
607 auto gamma_tensor = _tensor_reg->getAclTensor(gamma_index);
608 auto beta_tensor = _tensor_reg->getAclTensor(beta_index);
609 auto epsilon = node.param().epsilon;
610 auto activation = node.param().activation;
611
612 auto fn = acl_common::generateLayer<arm_compute::NEInstanceNormalizationLayerEx>(
613 ifm_tensor->handle(), ofm_tensor->handle(), gamma_tensor->handle(), beta_tensor->handle(),
614 epsilon);
615
616 _return_fn = std::make_unique<exec::FunctionSequence>(
617 asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle()));
618}
619
620void KernelGenerator::visit(const ir::operation::L2Normalization &node)
621{
622 const auto ofm_index{node.getOutputs().at(0)};
623 const auto ifm_index{node.getInputs().at(ir::operation::L2Normalization::Input::INPUT)};
624
625 // {CL|Neon}L2Normalization performs the reduction only along dimension 0
626 // L2 Normalization always performs the reduction along the depth axis
627 // Thus, we repurpose {CL|Neon}NormalizationLayers to act as depthwise L2 normalizations by
628 // choosing normalization parameters as below
629
630 const auto &ifm_shape = _ctx.at(ifm_index).shape();
631 // TODO Support optional constant dimension that normalization would be performed on
632 const auto normalization_axis = _ctx.at(ifm_index).shape().rank() - 1;
633 int32_t radius =
634 2 * ifm_shape.dim(normalization_axis) + 1; // normSize = depth(last dimension) * 2 + 1
 635 float alpha = 1.0f; // Chosen so that the effective alpha in the implementation becomes 1
636 float beta = 0.5f; // pow(reduction, -0.5) = 1 / sqrt(reduction)
637 float bias = 0.0f; // Don't offset the reduction.
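 // Sketch of why this works (assuming ACL's unscaled CROSS_MAP formula, roughly
 // out = in / (bias + alpha * sum(in^2))^beta over the normalization window): with the
 // radius covering the whole depth, bias = 0, alpha = 1 and beta = 0.5, this reduces to
 // out = in / sqrt(sum(in^2)), i.e. an L2 normalization along the depth axis.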
638
639 auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
640 auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
641
642 const auto norm_info = ::arm_compute::NormalizationLayerInfo(::arm_compute::NormType::CROSS_MAP,
643 radius, alpha, beta, bias, false);
644
645 auto fn = acl_common::generateLayer<arm_compute::NENormalizationLayer>(
646 ifm_tensor->handle(), ofm_tensor->handle(), norm_info);
647
648 _return_fn = asAclFunction(std::move(fn));
649}
650
651void KernelGenerator::visit(const ir::operation::LocalResponseNormalization &node)
652{
653 const auto ofm_index{node.getOutputs().at(0)};
 654 const auto ifm_index{
 655 node.getInputs().at(ir::operation::LocalResponseNormalization::Input::INPUT)};
656
657 auto radius = node.param().radius;
658 auto alpha = node.param().alpha;
659 auto beta = node.param().beta;
660 auto bias = node.param().bias;
661
662 auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
663 auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
664
665 const auto norm_info = ::arm_compute::NormalizationLayerInfo(
666 ::arm_compute::NormType::CROSS_MAP, radius * 2 + 1, alpha, beta, bias, false);
667
668 auto fn = acl_common::generateLayer<arm_compute::NENormalizationLayer>(
669 ifm_tensor->handle(), ofm_tensor->handle(), norm_info);
670
671 _return_fn = asAclFunction(std::move(fn));
672}
673
674void KernelGenerator::visit(const ir::operation::LSTM &node)
675{
676 _return_fn = acl_common::kernelGenLSTM<acl_common::AclFunction, ::arm_compute::ITensor,
677 ::arm_compute::NELSTMLayer>(node, _ctx, _tensor_reg);
678}
679
680void KernelGenerator::visit(const ir::operation::Pack &node)
681{
682 const auto output_index{node.getOutputs().at(0)};
683 auto axis{node.param().axis};
684
685 const auto output_rank = _ctx.at(output_index).shape().rank();
686
687 std::vector<ir::OperandIndex> input_indexes;
688 for (const auto &input_index : node.getInputs())
689 input_indexes.emplace_back(input_index);
690
691 auto output = _tensor_reg->getAclTensor(output_index)->handle();
692 std::vector<arm_compute::ITensor *> inputs;
693 for (const auto &input_index : input_indexes)
694 inputs.emplace_back(_tensor_reg->getAclTensor(input_index)->handle());
695
696 if (axis < 0)
697 axis += output_rank;
698 axis = acl_common::ToARMComputeAxis(output_rank, axis).value();
699
700 // Disable applied dim_correction
701 for (const auto &input_index : input_indexes)
702 {
703 const auto &input_tensor = _tensor_reg->getAclTensor(input_index);
704 if (input_tensor->num_dimensions() != input_tensor->info()->num_dimensions())
705 {
 706 // This means that high dimension's value is 1 and input tensor is applied dim_correction
 707 acl_common::disableDimCorrection(input_tensor);
708 }
709 }
710
711 auto fn = acl_common::generateLayer<arm_compute::NEStackLayer>(inputs, axis, output);
712
713 // Revert disabling applied dim_correction
714 for (const auto &input_index : input_indexes)
715 {
716 const auto &input_tensor = _tensor_reg->getAclTensor(input_index);
717 if (input_tensor->dimension(0) == 1)
 718 {
 719 acl_common::enableDimCorrection(input_tensor);
720 }
721 }
722
723 _return_fn = asAclFunction(std::move(fn));
724}
725
726void KernelGenerator::visit(const ir::operation::Pad &node)
727{
728 const auto input_index{node.getInputs().at(ir::operation::Pad::Input::INPUT)};
729 const auto pad_index{node.getInputs().at(ir::operation::Pad::Input::PAD)};
730 const auto output_index{node.getOutputs().at(0)};
731 assert(_ctx.at(pad_index).data());
732
733 auto rank = _ctx.at(input_index).shape().rank();
734 auto pad_base = _ctx.at(pad_index).data()->base();
735
736 auto input = _tensor_reg->getAclTensor(input_index)->handle();
737 auto output = _tensor_reg->getAclTensor(output_index)->handle();
738
739 ::arm_compute::PaddingList padding_list;
740 padding_list.resize(rank);
741 for (int32_t n = 0; n < rank; ++n)
742 {
743 const int32_t *from = reinterpret_cast<const int32_t *>(pad_base) + (n * 2);
744
745 const auto axis = acl_common::ToARMComputeAxis(rank, n).value();
746 padding_list[axis] = ::arm_compute::PaddingInfo{from[0], from[1]};
747 }
748
749 [[maybe_unused]] const auto input_type = _ctx.at(input_index).typeInfo();
750 assert(input->info()->data_type() == acl_common::asDataType(input_type.type()));
751 assert(input->info()->quantization_info() ==
752 ::arm_compute::QuantizationInfo(input_type.scale(), input_type.zero_point()));
753 const auto pixel_value =
754 ::arm_compute::PixelValue(0, input->info()->data_type(), input->info()->quantization_info());
755
756 auto fn =
757 acl_common::generateLayer<arm_compute::NEPadLayer>(input, output, padding_list, pixel_value);
758
759 _return_fn = asAclFunction(std::move(fn));
760}
761
762void KernelGenerator::visit(const ir::operation::Pool2D &node)
763{
764 auto raw_fn = acl_common::kernelGenPool2D<::arm_compute::NEPoolingLayer>(
765 node, _ctx, _tensor_reg, acl_common::convertPoolType(node.param().op_type));
766
767 const auto ofm_index{node.getOutputs().at(0)};
768 auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
769 const auto activation = node.param().activation;
770 _return_fn = std::make_unique<exec::FunctionSequence>(
771 asAclFunction(std::move(raw_fn)),
772 ActivationBuilder::generate(activation, ofm_tensor->handle()));
773}
774
775void KernelGenerator::visit(const ir::operation::PReLU &node)
776{
777 const auto ofm_index{node.getOutputs().at(0)};
778 const auto ifm_index{node.getInputs().at(ir::operation::PReLU::Input::INPUT)};
779 const auto alpha_index{node.getInputs().at(ir::operation::PReLU::Input::ALPHA)};
780
781 auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
782 auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
783 auto alpha_tensor = _tensor_reg->getAclTensor(alpha_index);
784
785 auto fn = acl_common::generateLayer<arm_compute::NEPReluLayer>(
786 ifm_tensor->handle(), alpha_tensor->handle(), ofm_tensor->handle());
787
788 _return_fn = asAclFunction(std::move(fn));
789}
790
791void KernelGenerator::visit(const ir::operation::Reduce &node)
792{
793 const auto output_index{node.getOutputs().at(0)};
794 const auto input_index{node.getInputs().at(ir::operation::Reduce::Input::INPUT)};
795 const auto axes_index{node.getInputs().at(ir::operation::Reduce::Input::AXES)};
796
797 auto output_tensor = _tensor_reg->getAclTensor(output_index);
798 auto input_tensor = _tensor_reg->getAclTensor(input_index);
799
800 // Convert to ACL axes taking into account negative values and possible duplicates.
801 const auto &axes = _ctx.at(axes_index);
802 const auto input_rank = _ctx.at(input_index).shape().rank();
803 const auto reduce_axes = acl_common::asCoordinates(axes, input_rank);
804 const auto reduce_type = node.param().reduce_type;
805 const auto keep_dims = node.param().keep_dims;
806
 807 std::unique_ptr<::arm_compute::IFunction> fn;
 808 if (reduce_type == ir::operation::Reduce::ReduceType::MEAN)
809 {
810 fn = acl_common::generateLayer<arm_compute::NEReduceMean>(input_tensor->handle(), reduce_axes,
811 keep_dims, output_tensor->handle());
812 }
813 else if (reduce_type == ir::operation::Reduce::ReduceType::SUM)
814 {
815 fn = acl_common::generateLayer<arm_compute::NEReduceSum>(input_tensor->handle(), reduce_axes,
816 keep_dims, output_tensor->handle());
817 }
818 else
819 {
820 fn = acl_common::generateLayer<arm_compute::NEReduceOperation>(
821 input_tensor->handle(), reduce_axes, keep_dims, output_tensor->handle(),
822 acl_common::convertReduceType(reduce_type));
823 }
824 _return_fn = asAclFunction(std::move(fn));
825}
826
827void KernelGenerator::visit(const ir::operation::Reshape &node)
828{
829 const auto output_index{node.getOutputs().at(0)};
830 const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)};
831
832 auto output_tensor = _tensor_reg->getAclTensor(output_index);
833 auto input_tensor = _tensor_reg->getAclTensor(input_index);
834
835 auto fn = acl_common::generateLayer<arm_compute::NEReshapeLayer>(input_tensor->handle(),
836 output_tensor->handle());
837
838 _return_fn = asAclFunction(std::move(fn));
839}
840
841void KernelGenerator::visit(const ir::operation::ResizeBilinear &node)
842{
843 const auto ofm_index{node.getOutputs().at(0)};
844 const auto ifm_index{node.getInputs().at(ir::operation::ResizeBilinear::Input::INPUT)};
845
846 auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
847 auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
848
849 auto fn = acl_common::generateLayer<arm_compute::NEScale>(
850 ifm_tensor->handle(), ofm_tensor->handle(),
851 ::arm_compute::ScaleKernelInfo{::arm_compute::InterpolationPolicy::BILINEAR,
852 ::arm_compute::BorderMode::REPLICATE,
853 ::arm_compute::PixelValue(0.f),
854 ::arm_compute::SamplingPolicy::TOP_LEFT, false /*use padding*/});
855
856 _return_fn = asAclFunction(std::move(fn));
857}
858
859void KernelGenerator::visit(const ir::operation::RNN &node)
860{
861 const auto output_index{node.getOutputs().at(ir::operation::RNN::Output::OUTPUT)};
 862 const auto hidden_state_out_index{
 863 node.getOutputs().at(ir::operation::RNN::Output::HIDDEN_STATE_OUT)};
864
865 const auto input_index{node.getInputs().at(ir::operation::RNN::Input::INPUT)};
866 const auto weights_index{node.getInputs().at(ir::operation::RNN::Input::WEIGHTS)};
 867 const auto recurrent_weights_index{
 868 node.getInputs().at(ir::operation::RNN::Input::RECURRENT_WEIGHTS)};
869 const auto bias_index{node.getInputs().at(ir::operation::RNN::Input::BIAS)};
870 const auto hidden_state_in_index{node.getInputs().at(ir::operation::RNN::Input::HIDDEN_STATE_IN)};
871
872 const auto activation = node.param().activation;
873
874 auto output_tensor = _tensor_reg->getAclTensor(output_index);
875 auto hidden_state_out_tensor = _tensor_reg->getAclTensor(hidden_state_out_index);
876
877 auto input_tensor = _tensor_reg->getAclTensor(input_index);
878 auto weights_tensor = _tensor_reg->getAclTensor(weights_index);
879 auto recurrent_weights_tensor = _tensor_reg->getAclTensor(recurrent_weights_index);
880 auto bias_tensor = _tensor_reg->getAclTensor(bias_index);
881 auto hidden_state_in_tensor = _tensor_reg->getAclTensor(hidden_state_in_index);
882 auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation);
883
884 auto copy_layer = acl_common::generateLayer<arm_compute::NECopy>(
885 hidden_state_in_tensor->handle(), hidden_state_out_tensor->handle());
886 _return_fn = asAclFunction(std::move(copy_layer));
887
888 auto fn = acl_common::generateLayer<arm_compute::NERNNLayer>(
889 _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
890 weights_tensor->handle(), recurrent_weights_tensor->handle(), bias_tensor->handle(),
891 hidden_state_out_tensor->handle(), output_tensor->handle(), act_info);
892 _return_fn = asAclFunction(std::move(fn));
893}
894
895void KernelGenerator::visit(const ir::operation::Squeeze &node)
896{
897 // Squeeze is identical to reshape except that it has an optional dimensions input.
898 // In addition, optional dims_index is ignored since output tensor already has squeezed shape
899 // by freezer and toco
900 const auto output_index{node.getOutputs().at(0)};
901 const auto input_index{node.getInputs().at(ir::operation::Squeeze::Input::INPUT)};
902 const auto dims{node.param().dims};
903 const auto ndim{node.param().ndim};
904 (void)dims;
905 (void)ndim;
906
907 auto output_tensor = _tensor_reg->getAclTensor(output_index);
908 auto input_tensor = _tensor_reg->getAclTensor(input_index);
909 auto fn = acl_common::generateLayer<arm_compute::NEReshapeLayer>(input_tensor->handle(),
910 output_tensor->handle());
911 _return_fn = asAclFunction(std::move(fn));
912}
913
914void KernelGenerator::visit(const ir::operation::Softmax &node)
915{
916 const auto output_index{node.getOutputs().at(0)};
917 const auto input_index{node.getInputs().at(ir::operation::Softmax::Input::INPUT)};
918 const auto beta = node.param().beta;
919
920 auto output_tensor = _tensor_reg->getAclTensor(output_index);
921 auto input_tensor = _tensor_reg->getAclTensor(input_index);
922
923 // NOTE NESoftmaxLayer's default axis is -1
924 auto fn = acl_common::generateLayer<arm_compute::NESoftmaxLayer>(
925 _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
926 output_tensor->handle(), beta);
927
928 _return_fn = asAclFunction(std::move(fn));
929}
930
931void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node)
932{
933 const auto ofm_index{node.getOutputs().at(0)};
934 const auto ifm_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::INPUT)};
 935 const auto block_size_index{
 936 node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)};
937 const auto paddings_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::PADDINGS)};
938
939 auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
940 auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
941 auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index);
942 auto paddings_tensor = _tensor_reg->getAclTensor(paddings_index);
943
944 assert(_ctx.at(block_size_index).data());
945 assert(_ctx.at(paddings_index).data());
946
947 auto fn = acl_common::generateLayer<arm_compute::NESpaceToBatchLayer>(
948 ifm_tensor->handle(), block_size_tensor->handle(), paddings_tensor->handle(),
949 ofm_tensor->handle());
950
951 _return_fn = asAclFunction(std::move(fn));
952}
953
954void KernelGenerator::visit(const ir::operation::SpaceToDepth &node)
955{
956 const auto ofm_index{node.getOutputs().at(0)};
957 const auto ifm_index{node.getInputs().at(ir::operation::SpaceToDepth::Input::INPUT)};
958
959 auto block_size = node.param().block_size;
960
961 auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
962 auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
963
964 auto fn = acl_common::generateLayer<arm_compute::NESpaceToDepthLayer>(
965 ifm_tensor->handle(), ofm_tensor->handle(), block_size);
966
967 _return_fn = asAclFunction(std::move(fn));
968}
969
970void KernelGenerator::visit(const ir::operation::Split &node)
971{
972 // TODO Support this op by SubTensor
973 const auto ifm_index{node.getInputs().at(ir::operation::Split::Input::INPUT)};
974 const auto axis_index{node.getInputs().at(ir::operation::Split::Input::AXIS)};
975
976 assert(node.param().num_splits == static_cast<int>(node.getOutputs().size()));
977 if (!_ctx.at(axis_index).isConstant())
978 {
979 throw std::runtime_error("Non-constant axis_index NYI for acl_neon backend");
980 }
981
982 const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
983 std::vector<ir::OperandIndex> output_indexes;
984 for (const auto &output : node.getOutputs())
985 output_indexes.emplace_back(output);
986
987 auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
988 std::vector<arm_compute::ITensor *> output_tensors;
989 for (const auto &ofm_ind : output_indexes)
990 output_tensors.emplace_back(_tensor_reg->getAclTensor(ofm_ind)->handle());
991
992 auto axis = _ctx.at(axis_index).asScalar<int32_t>();
993 if (axis < 0)
994 axis += ifm_rank;
995 axis = acl_common::ToARMComputeAxis(ifm_rank, axis).value();
996
997 auto fn =
998 acl_common::generateLayer<arm_compute::NESplit>(ifm_tensor->handle(), output_tensors, axis);
999
1000 _return_fn = asAclFunction(std::move(fn));
1001}
1002
1003void KernelGenerator::visit(const ir::operation::SquaredDifference &node)
1004{
1005 const auto ofm_index{node.getOutputs().at(0)};
1006 const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)};
1007 const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)};
1008
1009 auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1010 auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index);
1011 auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index);
1012
1013 auto fn = acl_common::generateLayer<arm_compute::NEElementwiseSquaredDiff>(
1014 lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());
1015
1016 _return_fn = asAclFunction(std::move(fn));
1017}
1018
1019void KernelGenerator::visit(const ir::operation::Slice &node)
1020{
1021 const auto output_index{node.getOutputs().at(0)};
1022 const auto input_index{node.getInputs().at(ir::operation::Slice::Input::INPUT)};
1023 const auto begins_index{node.getInputs().at(ir::operation::Slice::Input::BEGINS)};
1024 const auto sizes_index{node.getInputs().at(ir::operation::Slice::Input::SIZES)};
1025
1026 auto outputData_tensor = _tensor_reg->getAclTensor(output_index);
1027 auto inputData_tensor = _tensor_reg->getAclTensor(input_index);
1028
 1029 // Set up the start/end coordinates, reordered to match the ACL dimension order of inputData
1030 int input_rank = _ctx.at(input_index).shape().rank();
1031 std::vector<int32_t> starts;
1032 std::vector<int32_t> ends;
1033 starts.resize(input_rank, 0);
1034 ends.resize(input_rank, 0);
1035 {
1036 auto beginData_base = _ctx.at(begins_index).data()->base();
1037 auto sizeData_base = _ctx.at(sizes_index).data()->base();
1038 [[maybe_unused]] const int beginData_size = _ctx.at(begins_index).shape().num_elements();
1039 [[maybe_unused]] const int sizeData_size = _ctx.at(sizes_index).shape().num_elements();
1040
1041 using ir::DataType;
1042
1043 assert(_ctx.at(begins_index).typeInfo().type() == DataType::INT32);
1044 assert(_ctx.at(sizes_index).typeInfo().type() == DataType::INT32);
1045 assert(beginData_size == input_rank);
1046 assert(sizeData_size == input_rank);
1047
1048 assert(beginData_base != nullptr);
1049 for (int n = 0; n < input_rank; ++n)
1050 {
1051 auto axis = ::onert::backend::acl_common::ToARMComputeAxis(input_rank, n).value();
1052
1053 int32_t begin_value = *(reinterpret_cast<const int32_t *>(beginData_base) + n);
1054 starts[axis] = begin_value;
1055
1056 int32_t size_value = *(reinterpret_cast<const int32_t *>(sizeData_base) + n);
1057 ends[axis] = begin_value + size_value;
1058 }
1059 }
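 // NESlice consumes absolute start/end coordinates rather than (begin, size) pairs,
 // hence ends[axis] = begin + size above; e.g. begin = 1 with size = 3 gives end = 4.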
1060
1061 ::arm_compute::Coordinates starts_set;
1062 ::arm_compute::Coordinates ends_set;
1063
1064 for (size_t i = 0; i < starts.size(); ++i)
1065 {
1066 starts_set.set(i, starts[i]);
1067 ends_set.set(i, ends[i]);
1068 }
1069
1070 auto fn = acl_common::generateLayer<arm_compute::NESlice>(
1071 inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set);
1072
1073 _return_fn = asAclFunction(std::move(fn));
1074}
1075
1076void KernelGenerator::visit(const ir::operation::StridedSlice &node)
1077{
1078 const auto output_index{node.getOutputs().at(0)};
1079 const auto input_index{node.getInputs().at(ir::operation::StridedSlice::Input::INPUT)};
1080 const auto starts_index{node.getInputs().at(ir::operation::StridedSlice::Input::STARTS)};
1081 const auto ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)};
1082 const auto strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)};
1083
1084 auto outputData_tensor = _tensor_reg->getAclTensor(output_index);
1085 auto inputData_tensor = _tensor_reg->getAclTensor(input_index);
1086
 1087 // Set up the start/end/stride coordinates, reordered to match the ACL dimension order of inputData
1088 int input_rank = _ctx.at(input_index).shape().rank();
1089 std::vector<int32_t> starts;
1090 std::vector<int32_t> ends;
1091 std::vector<int32_t> strides;
1092 starts.resize(input_rank, 0);
1093 ends.resize(input_rank, 0);
1094 strides.resize(input_rank, 0);
1095 {
1096 auto startData_base = _ctx.at(starts_index).data()->base();
1097 auto endData_base = _ctx.at(ends_index).data()->base();
1098 auto stridesData_base = _ctx.at(strides_index).data()->base();
1099 [[maybe_unused]] const int startData_size = _ctx.at(starts_index).shape().num_elements();
1100 [[maybe_unused]] const int endData_size = _ctx.at(ends_index).shape().num_elements();
1101 [[maybe_unused]] const int stridesData_size = _ctx.at(strides_index).shape().num_elements();
1102
1103 using ir::DataType;
1104
1105 assert(_ctx.at(starts_index).typeInfo().type() == DataType::INT32);
1106 assert(_ctx.at(ends_index).typeInfo().type() == DataType::INT32);
1107 assert(_ctx.at(strides_index).typeInfo().type() == DataType::INT32);
1108 assert(startData_size == input_rank);
1109 assert(endData_size == input_rank);
1110 assert(stridesData_size == input_rank);
1111
1112 assert(startData_base != nullptr);
1113 for (int n = 0; n < input_rank; ++n)
1114 {
1115 auto axis = ::onert::backend::acl_common::ToARMComputeAxis(input_rank, n).value();
1116
1117 int32_t start_value = *(reinterpret_cast<const int32_t *>(startData_base) + n);
1118 starts[axis] = start_value;
1119
1120 int32_t end_value = *(reinterpret_cast<const int32_t *>(endData_base) + n);
1121 ends[axis] = end_value;
1122
1123 int32_t strides_value = *(reinterpret_cast<const int32_t *>(stridesData_base) + n);
1124 strides[axis] = strides_value;
1125 }
1126 }
1127
 1128 // Reorder the mask bits to match the ACL dimension order of inputData
1129 // FIXME Take the layouts into account.
1130 const auto begin_mask = acl_common::ReorderBits<int32_t>(node.param().begin_mask, input_rank);
1131 const auto end_mask = acl_common::ReorderBits<int32_t>(node.param().end_mask, input_rank);
1132 const auto shrink_axis_mask =
1133 acl_common::ReorderBits<int32_t>(node.param().shrink_axis_mask, input_rank);
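 // Since the axes are reversed for ACL, the per-axis mask bits are effectively reversed
 // as well; e.g. with input_rank == 3, a begin_mask of 0b001 (IR axis 0) becomes 0b100
 // (ACL axis 2) after ReorderBits.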
1134
1135 ::arm_compute::Coordinates starts_set;
1136 ::arm_compute::Coordinates ends_set;
1137 ::arm_compute::BiStrides strides_set;
1138
1139 for (size_t i = 0; i < starts.size(); ++i)
1140 {
1141 starts_set.set(i, starts[i]);
1142 ends_set.set(i, ends[i]);
1143 strides_set.set(i, strides[i]);
1144 }
1145
1146 // Disable applied dim_correction
1147 if (static_cast<size_t>(inputData_tensor->getShape().rank()) !=
1148 inputData_tensor->info()->num_dimensions())
1149 {
1150 // This means that high dimension's value is 1 and input tensor is applied dim_correction
1151 acl_common::disableDimCorrection(inputData_tensor);
1152 }
1153
1154 auto fn = acl_common::generateLayer<arm_compute::NEStridedSlice>(
1155 inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set, strides_set,
1156 begin_mask, end_mask, shrink_axis_mask);
1157
1158 // Revert disabling applied dim_correction
1159 if (inputData_tensor->getShape().dim(0) == 1)
1160 {
1161 acl_common::enableDimCorrection(inputData_tensor);
1162 }
1163
1164 _return_fn = asAclFunction(std::move(fn));
1165}
1166
1167void KernelGenerator::visit(const ir::operation::TransposeConv &node)
1168{
1169 const auto ofm_index{node.getOutputs().at(0)};
1170 const auto ker_index{node.getInputs().at(ir::operation::TransposeConv::Input::KERNEL)};
1171 const auto ifm_index{node.getInputs().at(ir::operation::TransposeConv::Input::INPUT)};
1172
1173 const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature();
1174 const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature();
1175 const auto ker_shape = _ctx.at(ker_index).shape().asFeature();
1176
1177 const auto stride = node.param().stride;
1178
1179 assert((node.param().padding.type == ir::PaddingType::SAME) ||
1180 (node.param().padding.type == ir::PaddingType::VALID));
1181 auto padding = ir::calculatePadding(node.param().padding, ofm_shape, ifm_shape, stride,
1182 ker_shape.W, ker_shape.H);
1183
1184 uint32_t invalid_horizontal = 0;
1185 uint32_t invalid_vertical = 0;
1186 if (node.param().padding.type == ir::PaddingType::VALID)
1187 {
1188 invalid_horizontal =
1189 ofm_shape.W - (1 + (ifm_shape.W - 1) * stride.horizontal) - (ker_shape.W - 1);
1190 invalid_vertical = ofm_shape.H - (1 + (ifm_shape.H - 1) * stride.vertical) - (ker_shape.H - 1);
1191 }
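 // The natural transposed-convolution width is (ifm_shape.W - 1) * stride.horizontal + ker_shape.W,
 // so invalid_horizontal measures how many extra columns the requested output has beyond that
 // (analogously for rows). Worked example: ifm W = 3, stride 2, kernel W = 3, requested ofm W = 8
 // gives 8 - (1 + 2 * 2) - (3 - 1) = 1 invalid column.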
1192
1193 auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1194 auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1195 auto ker_tensor = _tensor_reg->getAclTensor(ker_index);
1196
1197 const auto tconv_info = acl_common::asPadStrideInfo(padding, stride);
1198
1199 auto fn = acl_common::generateLayer<arm_compute::NETransposeConvLayer>(
1200 ifm_tensor->handle(), ker_tensor->handle(), nullptr, ofm_tensor->handle(), tconv_info,
1201 invalid_horizontal, invalid_vertical);
1202
1203 _return_fn = asAclFunction(std::move(fn));
1204}
1205
1206void KernelGenerator::visit(const ir::operation::Transpose &node)
1207{
1208 const auto ofm_idx{node.getOutputs().at(0)};
1209 const auto ifm_idx{node.getInputs().at(ir::operation::Transpose::Input::INPUT)};
1210 const auto perm_idx{node.getInputs().at(ir::operation::Transpose::Input::PERMUTATION)};
1211
1212 auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx);
1213 const auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx);
1214 const auto rank = _ctx.at(ifm_idx).shape().rank();
1215
1216 const auto &perms = _ctx.at(perm_idx);
1217 std::vector<int32_t> pv;
1218 if (perms.shape() == ir::Shape{0})
1219 {
1220 pv.resize(rank);
1221 std::iota(pv.begin(), pv.end(), 0);
1222 std::reverse(pv.begin(), pv.end());
1223 }
1224 else
1225 {
1226 pv = _ctx.at(perm_idx).asVector<int32_t>();
1227 }
1228
1229 std::unique_ptr<arm_compute::IFunction> fn;
1230 if (rank == 1)
1231 {
1232 fn = acl_common::generateLayer<arm_compute::NECopy>(ifm_tensor->handle(), ofm_tensor->handle());
1233 }
1234 else if (rank == 2)
1235 {
1236 assert(pv.size() == 2 && pv.at(0) == 1 && pv.at(1) == 0);
1237 fn = acl_common::generateLayer<arm_compute::NETranspose>(ifm_tensor->handle(),
1238 ofm_tensor->handle());
1239 }
1240 else
1241 {
1242 auto backend_pv = acl_common::getARMComputePermutationVector(rank, pv);
1243
1244 fn = acl_common::generateLayer<arm_compute::NEPermute>(ifm_tensor->handle(),
1245 ofm_tensor->handle(), backend_pv);
1246 }
1247 _return_fn = asAclFunction(std::move(fn));
1248}
1249
1250void KernelGenerator::visit(const ir::operation::Unpack &node)
1251{
1252 const auto input_index{node.getInputs().at(ir::operation::Unpack::Input::INPUT)};
1253 auto axis{node.param().axis};
1254
1255 const auto input_rank = _ctx.at(input_index).shape().rank();
1256
1257 std::vector<ir::OperandIndex> output_indexes;
1258 for (const auto &output_index : node.getOutputs())
1259 output_indexes.emplace_back(output_index);
1260
1261 auto input_tensor = _tensor_reg->getAclTensor(input_index);
1262 std::vector<arm_compute::ITensor *> outputs;
1263 for (const auto &output_index : output_indexes)
1264 outputs.emplace_back(_tensor_reg->getAclTensor(output_index)->handle());
1265
1266 if (axis < 0)
1267 axis += input_rank;
1268 axis = acl_common::ToARMComputeAxis(input_rank, axis).value();
1269
1270 // Disable applied dim_correction
1271 if (static_cast<size_t>(input_tensor->getShape().rank()) !=
1272 input_tensor->info()->num_dimensions())
1273 {
 1274 // This means that high dimension's value is 1 and input tensor is applied dim_correction
 1275 acl_common::disableDimCorrection(input_tensor);
 1276 }
1276 }
1277
1278 auto fn =
1279 acl_common::generateLayer<arm_compute::NEUnstack>(input_tensor->handle(), outputs, axis);
1280
1281 // Revert disabling applied dim_correction
1282 if (input_tensor->getShape().dim(0) == 1)
1283 {
1284 acl_common::enableDimCorrection(input_tensor);
1285 }
1286
1287 _return_fn = asAclFunction(std::move(fn));
1288}
1289
1290void KernelGenerator::visit(const ir::operation::ExpandDims &node)
1291{
1292 const auto output_index{node.getOutputs().at(0)};
1293 const auto input_index{node.getInputs().at(ir::operation::ExpandDims::Input::INPUT)};
1294
1295 auto output_tensor = _tensor_reg->getAclTensor(output_index);
1296 auto input_tensor = _tensor_reg->getAclTensor(input_index);
1297
1298 auto fn = acl_common::generateLayer<arm_compute::NEReshapeLayer>(input_tensor->handle(),
1299 output_tensor->handle());
1300
1301 _return_fn = asAclFunction(std::move(fn));
1302}
1303
1304void KernelGenerator::visit(const ir::operation::Comparison &node)
1305{
1306 const auto output_index{node.getOutputs().at(0)};
1307 const auto input0_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT0)};
1308 const auto input1_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT1)};
1309
1310 const auto comparison_type = node.param().comparison_type;
1311
1312 auto output_tensor = _tensor_reg->getAclTensor(output_index);
1313 auto input0_tensor = _tensor_reg->getAclTensor(input0_index);
1314 auto input1_tensor = _tensor_reg->getAclTensor(input1_index);
1315
1316 auto fn = acl_common::generateLayer<arm_compute::NEElementwiseComparison>(
1317 input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle(),
1318 (arm_compute::ComparisonOperation)comparison_type);
1319
1320 _return_fn = asAclFunction(std::move(fn));
1321}
1322
1323void KernelGenerator::visit(const ir::operation::OneHot &node)
1324{
1325 const auto out_idx{node.getOutputs().at(0)};
1326 const auto indices_idx{node.getInputs().at(ir::operation::OneHot::Input::INDICES)};
1327 const auto depth_idx{node.getInputs().at(ir::operation::OneHot::Input::DEPTH)};
1328 const auto onvalue_idx{node.getInputs().at(ir::operation::OneHot::Input::ON_VALUE)};
1329 const auto offvalue_idx{node.getInputs().at(ir::operation::OneHot::Input::OFF_VALUE)};
1330
1331 auto output_tensor = _tensor_reg->getAclTensor(out_idx);
1332 auto indices_tensor = _tensor_reg->getAclTensor(indices_idx);
1333 auto depth_tensor = _tensor_reg->getAclTensor(depth_idx);
1334 auto onvalue_tensor = _tensor_reg->getAclTensor(onvalue_idx);
1335 auto offvalue_tensor = _tensor_reg->getAclTensor(offvalue_idx);
1336
1337 const size_t output_rank = _ctx.at(out_idx).shape().rank();
1338 int32_t axis = node.param().axis == -1 ? output_rank - 1 : node.param().axis;
1339 axis = acl_common::ToARMComputeAxis(output_rank, axis).value();
1340
1341 auto fn = acl_common::generateLayer<arm_compute::NEOneHot>(
1342 indices_tensor->handle(), depth_tensor->handle(), onvalue_tensor->handle(),
1343 offvalue_tensor->handle(), output_tensor->handle(), axis);
1344 _return_fn = asAclFunction(std::move(fn));
1345}
1346
1347} // namespace onert::backend::acl_neon