ONE - On-device Neural Engine
KernelGenerator.cc
1/*
2 * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include "KernelGenerator.h"
18
19#include <arm_compute/runtime/NEON/NEFunctions.h> // Include all ARM Compute NEON functions
20#include <arm_compute/runtime/NEON/NEFunctionsEx.h> // Include all ARM Compute EX NEON functions
21
22#include <AclActivationBuilder.h>
23#include <AclFunction.h>
24#include <Convert.h>
25#include <Swizzle.h>
26
27#include "ir/Index.h"
28#include "ir/DataType.h"
29#include "ir/InternalType.h"
30#include "exec/NopFunction.h"
31#include "util/logging.h"
32#include "AclKernelGen.h"
33
34namespace onert
35{
36namespace backend
37{
38namespace acl_neon
39{
40
41using ::onert::backend::acl_common::asAclFunction;
42using ActivationBuilder = ::onert::backend::acl_common::AclActivationBuilder<
43 ::arm_compute::ITensor, ::arm_compute::NEActivationLayer, acl_common::AclFunction>;
44
45KernelGenerator::KernelGenerator(
46 const ir::Graph &graph, const std::shared_ptr<TensorBuilder> &tensor_builder,
47 const std::shared_ptr<acl_common::AclTensorRegistry<TensorManager>> &tensor_reg)
48 : basic::KernelGeneratorBase{graph}, _ctx(graph.operands()), _operations_ctx(graph.operations()),
49 _tensor_builder(tensor_builder), _tensor_reg(tensor_reg)
50{
51 // DO NOTHING
52}
53
54std::unique_ptr<exec::FunctionSequence> KernelGenerator::generate(ir::OperationIndex ind)
55{
56 auto ret = std::make_unique<exec::FunctionSequence>();
57 ret->enableDynamicShapeInferer(false);
58
59 const auto &op = _graph.operations().at(ind);
60 op.accept(*this);
61 ret->append(releaseFunction());
62 return ret;
63}
64
65void KernelGenerator::visit(const ir::operation::ArgMinMax &node)
66{
67 const auto ofm_index{node.getOutputs().at(0)};
68 const auto ifm_index{node.getInputs().at(ir::operation::ArgMinMax::Input::INPUT)};
69 const auto axis_index{node.getInputs().at(ir::operation::ArgMinMax::Input::AXIS)};
70
71 const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
72
73 auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
74 auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
75
76 int axis_value = _ctx.at(axis_index).asScalar<int32_t>();
77 if (axis_value < 0)
78 {
79 axis_value += ifm_rank;
80 }
81 assert(axis_value >= 0 && axis_value < ifm_rank);
82 const auto fixed_axis = acl_common::ToARMComputeAxis(ifm_rank, axis_value).value();
83 auto reduce_type = node.param().is_arg_max ? ::arm_compute::ReductionOperation::ARG_IDX_MAX
84 : ::arm_compute::ReductionOperation::ARG_IDX_MIN;
85
86 auto fn = acl_common::generateLayer<arm_compute::NEArgMinMaxLayer>(
87 ifm_tensor->handle(), fixed_axis, ofm_tensor->handle(), reduce_type);
88
89 _return_fn = asAclFunction(std::move(fn));
90}
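
The axis handling above first wraps a negative axis into [0, rank) and then maps it into ACL's reversed dimension order via acl_common::ToARMComputeAxis. A minimal standalone sketch of that mapping, assuming the plain rank-reversal convention; the helper name toReversedAxis is hypothetical and not part of this backend:

#include <cassert>
#include <cstdint>

// Illustrative sketch only: map a (possibly negative) model axis to ACL's reversed order.
static inline uint32_t toReversedAxis(uint32_t rank, int32_t axis)
{
  if (axis < 0)
    axis += rank;                                  // e.g. axis -1 on a rank-4 tensor becomes 3
  assert(axis >= 0 && static_cast<uint32_t>(axis) < rank);
  return rank - 1 - static_cast<uint32_t>(axis);   // NHWC channel axis 3 maps to ACL dimension 0
}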
91
92void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node)
93{
94 const auto ofm_index{node.getOutputs().at(0)};
95 const auto ifm_index{node.getInputs().at(ir::operation::BatchToSpaceND::Input::INPUT)};
96 const auto block_size_index{
97 node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)};
98
99 const auto NNApiInputs = 2;
100 if (node.getInputs().size() != NNApiInputs)
101 {
102 const auto crops_index{node.getInputs().at(ir::operation::BatchToSpaceND::Input::CROPS_DATA)};
103 if (!_ctx.at(crops_index).isConstant())
104 {
105 throw std::runtime_error("Non-constant crops NYI for acl_neon backend BatchToSpaceND");
106 }
107
108 auto crops = _ctx.at(crops_index).asVector<int32_t>();
109 for (auto &&crop : crops)
110 {
111 if (crop != 0)
112 {
113 throw std::runtime_error("Non-zero crops NYI for acl_neon backend BatchToSpaceND");
114 }
115 }
116 }
117
118 auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
119 auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
120
121 if (!_ctx.at(block_size_index).data())
122 throw std::runtime_error("ACL NEON does not support dynamic block size for BatchToSpaceND");
123
124 auto block = _ctx.at(block_size_index).asVector<int32_t>();
125 int32_t height = block[0];
126 int32_t width = block[1];
127
128 auto fn = acl_common::generateLayer<arm_compute::NEBatchToSpaceLayer>(
129 ifm_tensor->handle(), width, height, ofm_tensor->handle());
130
131 _return_fn = asAclFunction(std::move(fn));
132}
133
134void KernelGenerator::visit(const ir::operation::BinaryArithmetic &node)
135{
136 const auto ofm_index{node.getOutputs().at(0)};
137 const auto lhs_index{node.getInputs().at(ir::operation::BinaryArithmetic::Input::LHS)};
138 const auto rhs_index{node.getInputs().at(ir::operation::BinaryArithmetic::Input::RHS)};
139
140 const auto activation = node.param().activation;
141
142 auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
143 auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index);
144 auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index);
145
146 std::unique_ptr<arm_compute::IFunction> fn;
147 switch (node.param().arithmetic_type)
148 {
149 case ir::operation::BinaryArithmetic::ArithmeticType::ADD:
150 {
151 arm_compute::NEArithmeticAddition::validate(lhs_tensor->info(), rhs_tensor->info(),
152 ofm_tensor->info(),
153 arm_compute::ConvertPolicy::SATURATE)
154 .throw_if_error();
155 fn = acl_common::generateLayer<arm_compute::NEArithmeticAddition>(
156 lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(),
157 arm_compute::ConvertPolicy::SATURATE);
158 break;
159 }
160 case ir::operation::BinaryArithmetic::ArithmeticType::SUB:
161 {
162 arm_compute::NEArithmeticSubtraction::validate(lhs_tensor->info(), rhs_tensor->info(),
163 ofm_tensor->info(),
164 arm_compute::ConvertPolicy::SATURATE)
165 .throw_if_error();
166 fn = acl_common::generateLayer<arm_compute::NEArithmeticSubtraction>(
167 lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(),
168 arm_compute::ConvertPolicy::SATURATE);
169 break;
170 }
171 case ir::operation::BinaryArithmetic::ArithmeticType::MUL:
172 {
173 arm_compute::NEPixelWiseMultiplication::validate(
174 lhs_tensor->info(), rhs_tensor->info(), ofm_tensor->info(), 1.0,
175 arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_ZERO)
176 .throw_if_error();
177 // Only RoundingPolicy::TO_ZERO is allowed when the scale is 1.0
178 fn = acl_common::generateLayer<arm_compute::NEPixelWiseMultiplication>(
179 lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), 1.0, // scale
180 arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_ZERO);
181 break;
182 }
183 case ir::operation::BinaryArithmetic::ArithmeticType::DIV:
184 {
185 arm_compute::NEElementwiseDivision::validate(lhs_tensor->info(), rhs_tensor->info(),
186 ofm_tensor->info())
187 .throw_if_error();
188 fn = acl_common::generateLayer<arm_compute::NEElementwiseDivision>(
189 lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());
190 break;
191 }
192 default:
193 assert(false && "The BinaryArithmetic operation supports only binary arithmetic operations");
194 break;
195 }
196 _return_fn = std::make_unique<exec::FunctionSequence>(
197 asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle()));
198}
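
Each arithmetic case above follows the same validate-then-generate pattern: the ACL function's static validate() checks the tensor infos up front, throw_if_error() turns a failing arm_compute::Status into an exception, and only then is the layer configured. A condensed sketch of that pattern for the ADD case, assuming lhs, rhs and out are ITensor handles already fetched from the tensor registry, that acl_common::generateLayer is essentially make_unique plus configure, and reusing the ACL headers already included by this file (makeAdd is a hypothetical name):

#include <memory>

// Illustrative sketch only, not the backend's actual helper.
static std::unique_ptr<arm_compute::IFunction>
makeAdd(arm_compute::ITensor *lhs, arm_compute::ITensor *rhs, arm_compute::ITensor *out)
{
  // Reject unsupported shape/type combinations before any configuration happens
  arm_compute::NEArithmeticAddition::validate(lhs->info(), rhs->info(), out->info(),
                                              arm_compute::ConvertPolicy::SATURATE)
    .throw_if_error();
  auto add = std::make_unique<arm_compute::NEArithmeticAddition>();
  add->configure(lhs, rhs, out, arm_compute::ConvertPolicy::SATURATE);
  return add;
}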
199
200void KernelGenerator::visit(const ir::operation::Conv2D &node)
201{
202 using ir::operation::Conv2D;
203
204 const auto ofm_index{node.getOutputs().at(0)};
205 const auto ifm_index{node.getInputs().at(Conv2D::Input::INPUT)};
206 const auto ker_index{node.getInputs().at(Conv2D::Input::KERNEL)};
207 const auto bias_index{node.getInputs().at(Conv2D::Input::BIAS)};
208
209 const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature();
210 const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature();
211 // Kernel format is [depth_out, kernel_height, kernel_width, depth_in].
212 const auto &ker_shape = _ctx.at(ker_index).shape();
213 const auto ker_height = ker_shape.dim(1);
214 const auto ker_width = ker_shape.dim(2);
215
216 const auto stride = node.param().stride;
217 const auto padding =
218 ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, ker_width, ker_height);
219 const auto activation = node.param().activation;
220
221 auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
222 auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
223 auto ker_tensor = _tensor_reg->getAclTensor(ker_index);
224 auto bias_tensor = _tensor_reg->getAclTensor(bias_index);
225
226 const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
227 const auto act_info = acl_common::asActivationLayerInfo(activation);
228
229 auto fn = acl_common::generateLayer<arm_compute::NEConvolutionLayer>(
230 _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), ifm_tensor->handle(),
231 ker_tensor->handle(), bias_tensor->handle(), ofm_tensor->handle(), conv_info,
232 ::arm_compute::WeightsInfo(), ::arm_compute::Size2D(1U, 1U), act_info);
233
234 _return_fn = asAclFunction(std::move(fn));
235}
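
ir::calculatePadding turns the model's padding specification into explicit per-side values; for SAME padding the usual arithmetic is total = max(0, (out - 1) * stride + kernel - in), split as evenly as possible. A small sketch of that convention for one spatial dimension (samePadding1D is a hypothetical helper; the real logic lives in Padding.cc):

#include <algorithm>
#include <cstdint>

struct Pad1D { uint32_t front; uint32_t back; };

// Illustrative sketch of SAME-padding arithmetic along one spatial dimension.
static inline Pad1D samePadding1D(uint32_t in, uint32_t out, uint32_t stride, uint32_t kernel)
{
  const int32_t total = std::max(0, static_cast<int32_t>((out - 1) * stride + kernel - in));
  return {static_cast<uint32_t>(total / 2),           // front/top gets the smaller half
          static_cast<uint32_t>(total - total / 2)};  // back/bottom gets the remainder
}
// Example: in = 224, stride = 2, kernel = 3, out = 112  ->  total = 1, pad = {0, 1}.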
236
237void KernelGenerator::visit(const ir::operation::DepthToSpace &node)
238{
239 const auto output_index{node.getOutputs().at(0)};
240 const auto input_index{node.getInputs().at(ir::operation::DepthToSpace::Input::INPUT)};
241
242 auto block_size = node.param().block_size;
243 assert(block_size > 0);
244
245 auto output_tensor = _tensor_reg->getAclTensor(output_index);
246 auto input_tensor = _tensor_reg->getAclTensor(input_index);
247
248 auto fn = acl_common::generateLayer<arm_compute::NEDepthToSpaceLayer>(
249 input_tensor->handle(), output_tensor->handle(), block_size);
250
251 _return_fn = asAclFunction(std::move(fn));
252}
253
254void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node)
255{
256 using ir::operation::DepthwiseConv2D;
257
258 const auto ofm_index{node.getOutputs().at(0)};
259 const auto ifm_index{node.getInputs().at(DepthwiseConv2D::Input::INPUT)};
260 const auto ker_index{node.getInputs().at(DepthwiseConv2D::Input::KERNEL)};
261 const auto bias_index{node.getInputs().at(DepthwiseConv2D::Input::BIAS)};
262
263 const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature();
264 const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature();
265 // Kernel format is [1, kernel_height, kernel_width, depth_out].
266 const auto &ker_shape = _ctx.at(ker_index).shape();
267 const auto ker_height = ker_shape.dim(1);
268 const auto ker_width = ker_shape.dim(2);
269
270 const auto stride = node.param().stride;
271 const auto dilation = node.param().dilation;
272 const auto padding =
273 ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, ker_width, ker_height,
274 dilation.width_factor, dilation.height_factor);
275 const auto multiplier = node.param().multiplier;
276 const auto activation = node.param().activation;
277
278 auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
279 auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
280 auto ker_tensor = _tensor_reg->getAclTensor(ker_index);
281 auto bias_tensor = _tensor_reg->getAclTensor(bias_index);
282
283 const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
284 const auto act_info = acl_common::asActivationLayerInfo(activation);
285 const auto dilation_info = acl_common::asDilation(dilation.width_factor, dilation.height_factor);
286
287 auto fn = acl_common::generateLayer<arm_compute::NEDepthwiseConvolutionLayer>(
288 ifm_tensor->handle(), ker_tensor->handle(), bias_tensor->handle(), ofm_tensor->handle(),
289 conv_info, multiplier, act_info, dilation_info);
290
291 _return_fn = asAclFunction(std::move(fn));
292}
293
294void KernelGenerator::visit(const ir::operation::Concat &node)
295{
296 const auto ofm_index{node.getOutputs().at(0)};
297
298 std::vector<ir::OperandIndex> input_indexes;
299 for (const auto &input : node.getInputs())
300 input_indexes.emplace_back(input);
301
302 const auto axis = node.param().axis;
303
304 // Concat elimination check
305 bool eliminated = _tensor_builder->areSubTensorsOf(ofm_index, node.getInputs());
306 if (eliminated)
307 {
308 // If concat eliminated, return a NOP IFunction
309 VERBOSE(acl_neon_KernelGenerator_Concat) << "Concat eliminated" << std::endl;
310 _return_fn = std::make_unique<exec::NopFunction>();
311 return;
312 }
313
314 auto output_tensor = _tensor_reg->getAclTensor(ofm_index);
315 std::vector<const ::arm_compute::ITensor *> input_tensors;
316 for (const auto &ifm_ind : input_indexes)
317 input_tensors.emplace_back(_tensor_reg->getAclTensor(ifm_ind)->handle());
318
319 std::unique_ptr<::arm_compute::IFunction> fn;
320 if (input_indexes.size() < 2)
321 {
322 ::arm_compute::ITensor *input_tensor = _tensor_reg->getAclTensor(input_indexes.at(0))->handle();
323 fn = acl_common::generateLayer<arm_compute::NECopy>(input_tensor, output_tensor->handle());
324 }
325 else
326 {
327 const auto rank = _ctx.at(ofm_index).shape().rank();
328 const auto fixed_axis = acl_common::ToARMComputeAxis(rank, axis).value();
329 fn = acl_common::generateLayer<arm_compute::NEConcatenateLayer>(
330 input_tensors, output_tensor->handle(), fixed_axis);
331 }
332
333 _return_fn = asAclFunction(std::move(fn));
334}
335
336void KernelGenerator::visit(const ir::operation::ElementwiseActivation &node)
337{
338 const auto ofm_index{node.getOutputs().at(0)};
339 const auto ifm_index{node.getInputs().at(ir::operation::ElementwiseActivation::Input::INPUT)};
340
341 auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
342 auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
343
344 const ::arm_compute::ActivationLayerInfo act_info =
345 acl_common::asActivationLayerInfo(node.param().op_type, node.param().alpha, node.param().beta);
346
347 std::unique_ptr<arm_compute::IFunction> fn =
348 acl_common::generateLayer<arm_compute::NEActivationLayer>(ifm_tensor->handle(),
349 ofm_tensor->handle(), act_info);
350
351 _return_fn = asAclFunction(std::move(fn));
352}
353
354void KernelGenerator::visit(const ir::operation::ElementwiseBinary &node)
355{
356 const auto output_index{node.getOutputs().at(0)};
357 const auto lhs_index{node.getInputs().at(ir::operation::ElementwiseBinary::Input::LHS)};
358 const auto rhs_index{node.getInputs().at(ir::operation::ElementwiseBinary::Input::RHS)};
359
360 auto output_tensor = _tensor_reg->getAclTensor(output_index);
361 auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index);
362 auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index);
363
364 std::unique_ptr<arm_compute::IFunction> fn;
365 switch (node.param().op_type)
366 {
367 case ir::operation::ElementwiseBinary::ElementwiseBinaryType::LOGICAL_AND:
368 {
369 fn = acl_common::generateLayer<arm_compute::NELogicalAnd>(
370 lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
371 break;
372 }
373 case ir::operation::ElementwiseBinary::ElementwiseBinaryType::LOGICAL_OR:
374 {
375 fn = acl_common::generateLayer<arm_compute::NELogicalOr>(
376 lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
377 break;
378 }
379 case ir::operation::ElementwiseBinary::ElementwiseBinaryType::MAX:
380 {
381 fn = acl_common::generateLayer<arm_compute::NEElementwiseMax>(
382 lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
383 break;
384 }
385 case ir::operation::ElementwiseBinary::ElementwiseBinaryType::MIN:
386 {
387 fn = acl_common::generateLayer<arm_compute::NEElementwiseMin>(
388 lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
389 break;
390 }
391 default:
392 {
393 std::string err_msg("acl_neon KernelGenerator : " + node.name() +
394 " is not an elementwise-binary operation");
395 assert(false && err_msg.c_str());
396 break;
397 }
398 }
399 _return_fn = asAclFunction(std::move(fn));
400}
401
402void KernelGenerator::visit(const ir::operation::ElementwiseUnary &node)
403{
404 const auto output_index{node.getOutputs().at(0)};
405 const auto input_index{node.getInputs().at(ir::operation::ElementwiseUnary::Input::INPUT)};
406
407 auto output_tensor = _tensor_reg->getAclTensor(output_index);
408 auto input_tensor = _tensor_reg->getAclTensor(input_index);
409
410 std::unique_ptr<arm_compute::IFunction> fn;
411 switch (node.param().op_type)
412 {
413 case ir::operation::ElementwiseUnary::Type::ABS:
414 {
415 const ::arm_compute::ActivationLayerInfo act_info{
416 ::arm_compute::ActivationLayerInfo::ActivationFunction::ABS};
417
418 fn = acl_common::generateLayer<arm_compute::NEActivationLayer>(
419 input_tensor->handle(), output_tensor->handle(), act_info);
420 break;
421 }
422 case ir::operation::ElementwiseUnary::Type::CAST:
423 {
424 if (input_tensor->data_type() == output_tensor->data_type())
425 {
426 fn = acl_common::generateLayer<arm_compute::NECopy>(input_tensor->handle(),
427 output_tensor->handle());
428 }
429 else if (_ctx.at(input_index).typeInfo().type() == ir::DataType::BOOL8)
430 {
431 fn = acl_common::generateLayer<arm_compute::NECastBool>(input_tensor->handle(),
432 output_tensor->handle());
433 }
434 else
435 {
436 fn = acl_common::generateLayer<arm_compute::NECast>(
437 input_tensor->handle(), output_tensor->handle(), arm_compute::ConvertPolicy::SATURATE);
438 }
439 break;
440 }
441 case ir::operation::ElementwiseUnary::Type::DEQUANTIZE:
442 {
443 fn = acl_common::generateLayer<arm_compute::NEDequantizationLayer>(input_tensor->handle(),
444 output_tensor->handle());
445 break;
446 }
447 case ir::operation::ElementwiseUnary::Type::EXP:
448 {
449 fn = acl_common::generateLayer<arm_compute::NEExpLayer>(input_tensor->handle(),
450 output_tensor->handle());
451 break;
452 }
453 case ir::operation::ElementwiseUnary::Type::FLOOR:
454 {
455 fn = acl_common::generateLayer<arm_compute::NEFloor>(input_tensor->handle(),
456 output_tensor->handle());
457 break;
458 }
459 case ir::operation::ElementwiseUnary::Type::LOGICAL_NOT:
460 {
461 fn = acl_common::generateLayer<arm_compute::NEBitwiseNot>(input_tensor->handle(),
462 output_tensor->handle());
463 break;
464 }
465 case ir::operation::ElementwiseUnary::Type::NEG:
466 {
467 fn = acl_common::generateLayer<arm_compute::NENegLayer>(input_tensor->handle(),
468 output_tensor->handle());
469 break;
470 }
471 case ir::operation::ElementwiseUnary::Type::RSQRT:
472 {
473 fn = acl_common::generateLayer<arm_compute::NERsqrtLayer>(input_tensor->handle(),
474 output_tensor->handle());
475 break;
476 }
477 case ir::operation::ElementwiseUnary::Type::SQRT:
478 {
479 const ::arm_compute::ActivationLayerInfo act_info{
480 ::arm_compute::ActivationLayerInfo::ActivationFunction::SQRT};
481
482 fn = acl_common::generateLayer<arm_compute::NEActivationLayer>(
483 input_tensor->handle(), output_tensor->handle(), act_info);
484 break;
485 }
486 default:
487 {
488 throw std::runtime_error("acl_neon KernelGenerator : " + node.name() +
489 " is not supported yet");
490 break;
491 }
492 }
493 _return_fn = asAclFunction(std::move(fn));
494}
495
496void KernelGenerator::visit(const ir::operation::EmbeddingLookup &node)
497{
498 const auto output_index{node.getOutputs().at(0)};
499 const auto lookups_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::LOOKUPS)};
500 const auto values_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::VALUES)};
501
502 auto output_tensor = _tensor_reg->getAclTensor(output_index);
503 auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index);
504 auto values_tensor = _tensor_reg->getAclTensor(values_index);
505
506 auto fn = acl_common::generateLayer<arm_compute::NEEmbeddingLookup>(
507 values_tensor->handle(), output_tensor->handle(), lookups_tensor->handle());
508
509 _return_fn = asAclFunction(std::move(fn));
510}
511
512void KernelGenerator::visit(const ir::operation::FullyConnected &node)
513{
514 const auto output_index{node.getOutputs().at(0)};
515 auto output_tensor = _tensor_reg->getAclTensor(output_index);
516 const auto activation = node.param().activation;
517 if (node.param().weights_format == ir::FullyConnectedWeightsFormat::Shuffled16x1Float32)
518 throw std::runtime_error(
519 "KernelGenerator(acl_neon): FullyConnected 16x1Float32 weights is not supported.");
520
521 auto fn = acl_common::kernelGenFullyConnected<acl_common::AclFunction, ::arm_compute::ITensor,
522 ::arm_compute::NEFullyConnectedReshapingLayer>(
523 node, _ctx, _tensor_builder, _tensor_reg);
524 _return_fn = std::make_unique<exec::FunctionSequence>(
525 std::move(fn), ActivationBuilder::generate(activation, output_tensor->handle()));
526}
527
528void KernelGenerator::visit(const ir::operation::HashtableLookup &node)
529{
530 const auto output_index{node.getOutputs().at(ir::operation::HashtableLookup::Output::OUTPUT)};
531 const auto hits_index{node.getOutputs().at(ir::operation::HashtableLookup::Output::HITS)};
532
533 const auto lookups_index{node.getInputs().at(ir::operation::HashtableLookup::Input::LOOKUPS)};
534 const auto keys_index{node.getInputs().at(ir::operation::HashtableLookup::Input::KEYS)};
535 const auto values_index{node.getInputs().at(ir::operation::HashtableLookup::Input::VALUES)};
536
537 auto output_tensor = _tensor_reg->getAclTensor(output_index);
538 auto hits_tensor = _tensor_reg->getAclTensor(hits_index);
539
540 auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index);
541 auto keys_tensor = _tensor_reg->getAclTensor(keys_index);
542 auto values_tensor = _tensor_reg->getAclTensor(values_index);
543
544 auto fn = acl_common::generateLayer<arm_compute::NEHashtableLookup>(
545 lookups_tensor->handle(), keys_tensor->handle(), values_tensor->handle(),
546 output_tensor->handle(), hits_tensor->handle());
547
548 _return_fn = asAclFunction(std::move(fn));
549}
550
551void KernelGenerator::visit(const ir::operation::Gather &node)
552{
553 const auto ofm_index{node.getOutputs().at(0)};
554
555 const auto ifm_index{node.getInputs().at(ir::operation::Gather::Input::INPUT)};
556 const auto indices_index{node.getInputs().at(ir::operation::Gather::Input::INDICES)};
557
558 const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
559 const auto axis_raw = node.param().axis;
560 const auto axis_value = (axis_raw < 0 ? (ifm_rank + axis_raw) : axis_raw);
561 // Converting in reverse order
562 const int axis = ::onert::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value).value();
563
564 auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
565 auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
566 auto indices_tensor = _tensor_reg->getAclTensor(indices_index);
567
568 // input is n-D, indices k-D, output is (n + k - 1)-D
569 size_t n = ifm_rank;
570 assert(n == ifm_tensor->num_dimensions());
571 size_t k = _ctx.at(indices_index).shape().rank();
572 assert(k == indices_tensor->num_dimensions());
573
574 // Disable applied dim_correction
575 if (n != ifm_tensor->info()->num_dimensions())
576 {
577 // This means that high dimension's value is 1 and ifm tensor is applied dim_correction
578 acl_common::disableDimCorrection(ifm_tensor);
579 }
580 if (k != indices_tensor->info()->num_dimensions())
581 {
582 // This means that high dimension's value is 1 and indices tensor is applied dim_correction
583 acl_common::disableDimCorrection(indices_tensor);
584 }
585
586 auto fn = acl_common::generateLayer<arm_compute::NEGatherEx>(
587 ifm_tensor->handle(), indices_tensor->handle(), ofm_tensor->handle(), axis);
588
589 // Revert disabling applied dim_correction
590 if (ifm_tensor->dimension(0) == 1)
591 {
592 acl_common::enableDimCorrection(ifm_tensor);
593 }
594 if (indices_tensor->dimension(0) == 1)
595 {
596 acl_common::enableDimCorrection(indices_tensor);
597 }
598
599 _return_fn = asAclFunction(std::move(fn));
600}
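
As the comment in the body notes, Gather on an n-D input with k-D indices produces an (n + k - 1)-D output; a concrete shape example makes the rank bookkeeping above easier to follow (illustrative only):

// Gathering an input of shape [4, 10, 8] (n = 3) with indices of shape [5, 2] (k = 2)
// along axis 1 replaces that dimension with the indices shape: output [4, 5, 2, 8].
static_assert(3 + 2 - 1 == 4, "Gather output rank is n + k - 1");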
601
602void KernelGenerator::visit(const ir::operation::InstanceNorm &node)
603{
604 const auto ofm_index{node.getOutputs().at(0)};
605 const auto ifm_index{node.getInputs().at(ir::operation::InstanceNorm::Input::INPUT)};
606 const auto gamma_index{node.getInputs().at(ir::operation::InstanceNorm::Input::GAMMA)};
607 const auto beta_index{node.getInputs().at(ir::operation::InstanceNorm::Input::BETA)};
608
609 auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
610 auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
611 auto gamma_tensor = _tensor_reg->getAclTensor(gamma_index);
612 auto beta_tensor = _tensor_reg->getAclTensor(beta_index);
613 auto epsilon = node.param().epsilon;
614 auto activation = node.param().activation;
615
616 auto fn = acl_common::generateLayer<arm_compute::NEInstanceNormalizationLayerEx>(
617 ifm_tensor->handle(), ofm_tensor->handle(), gamma_tensor->handle(), beta_tensor->handle(),
618 epsilon);
619
620 _return_fn = std::make_unique<exec::FunctionSequence>(
621 asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle()));
622}
623
624void KernelGenerator::visit(const ir::operation::L2Normalization &node)
625{
626 const auto ofm_index{node.getOutputs().at(0)};
627 const auto ifm_index{node.getInputs().at(ir::operation::L2Normalization::Input::INPUT)};
628
629 // {CL|Neon}L2Normalization performs the reduction only along dimension 0
630 // L2 Normalization always performs the reduction along the depth axis
631 // Thus, we repurpose {CL|Neon}NormalizationLayers to act as depthwise L2 normalizations by
632 // choosing normalization parameters as below
633
634 const auto &ifm_shape = _ctx.at(ifm_index).shape();
635 // TODO Support optional constant dimension that normalization would be performed on
636 const auto normalization_axis = _ctx.at(ifm_index).shape().rank() - 1;
637 int32_t radius =
638 2 * ifm_shape.dim(normalization_axis) + 1; // normSize = depth(last dimension) * 2 + 1
639 float alpha = 1.0f; // In the implementation to make alpha_ become 1
640 float beta = 0.5f; // pow(reduction, -0.5) = 1 / sqrt(reduction)
641 float bias = 0.0f; // Don't offset the reduction.
642
643 auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
644 auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
645
646 const auto norm_info = ::arm_compute::NormalizationLayerInfo(::arm_compute::NormType::CROSS_MAP,
647 radius, alpha, beta, bias, false);
648
649 auto fn = acl_common::generateLayer<arm_compute::NENormalizationLayer>(
650 ifm_tensor->handle(), ofm_tensor->handle(), norm_info);
651
652 _return_fn = asAclFunction(std::move(fn));
653}
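
The parameter choice above (bias 0, alpha 1, beta 0.5, an un-scaled window spanning the whole depth) makes the cross-map normalization x / (bias + alpha * sum(x^2))^beta collapse to a plain L2 normalization along the depth axis. A small reference sketch of that target computation; the eps term is added here only for numerical safety and is not part of the parameters used above:

#include <cmath>
#include <cstddef>
#include <vector>

// Illustrative reference only: L2-normalize one depth vector, i.e. x / sqrt(sum(x^2)).
static std::vector<float> l2NormalizeDepth(const std::vector<float> &x, float eps = 1e-12f)
{
  float sum_sq = 0.0f;
  for (float v : x)
    sum_sq += v * v;                       // the reduction along the depth axis
  const float inv_norm = 1.0f / std::sqrt(sum_sq + eps);
  std::vector<float> out(x.size());
  for (std::size_t i = 0; i < x.size(); ++i)
    out[i] = x[i] * inv_norm;              // equals x / (0 + 1 * sum(x^2))^0.5
  return out;
}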
654
655void KernelGenerator::visit(const ir::operation::LocalResponseNormalization &node)
656{
657 const auto ofm_index{node.getOutputs().at(0)};
658 const auto ifm_index{
659 node.getInputs().at(ir::operation::LocalResponseNormalization::Input::INPUT)};
660
661 auto radius = node.param().radius;
662 auto alpha = node.param().alpha;
663 auto beta = node.param().beta;
664 auto bias = node.param().bias;
665
666 auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
667 auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
668
669 const auto norm_info = ::arm_compute::NormalizationLayerInfo(
670 ::arm_compute::NormType::CROSS_MAP, radius * 2 + 1, alpha, beta, bias, false);
671
672 auto fn = acl_common::generateLayer<arm_compute::NENormalizationLayer>(
673 ifm_tensor->handle(), ofm_tensor->handle(), norm_info);
674
675 _return_fn = asAclFunction(std::move(fn));
676}
677
678void KernelGenerator::visit(const ir::operation::LSTM &node)
679{
680 _return_fn = acl_common::kernelGenLSTM<acl_common::AclFunction, ::arm_compute::ITensor,
681 ::arm_compute::NELSTMLayer>(node, _ctx, _tensor_reg);
682}
683
684void KernelGenerator::visit(const ir::operation::Pack &node)
685{
686 const auto output_index{node.getOutputs().at(0)};
687 auto axis{node.param().axis};
688
689 const auto output_rank = _ctx.at(output_index).shape().rank();
690
691 std::vector<ir::OperandIndex> input_indexes;
692 for (const auto &input_index : node.getInputs())
693 input_indexes.emplace_back(input_index);
694
695 auto output = _tensor_reg->getAclTensor(output_index)->handle();
696 std::vector<arm_compute::ITensor *> inputs;
697 for (const auto &input_index : input_indexes)
698 inputs.emplace_back(_tensor_reg->getAclTensor(input_index)->handle());
699
700 if (axis < 0)
701 axis += output_rank;
702 axis = acl_common::ToARMComputeAxis(output_rank, axis).value();
703
704 // Disable applied dim_correction
705 for (const auto &input_index : input_indexes)
706 {
707 const auto &input_tensor = _tensor_reg->getAclTensor(input_index);
708 if (input_tensor->num_dimensions() != input_tensor->info()->num_dimensions())
709 {
710 // This means that high dimension's value is 1 and input tensor is applied dim_correction
711 acl_common::disableDimCorrection(input_tensor);
712 }
713 }
714
715 auto fn = acl_common::generateLayer<arm_compute::NEStackLayer>(inputs, axis, output);
716
717 // Revert disabling applied dim_correction
718 for (const auto &input_index : input_indexes)
719 {
720 const auto &input_tensor = _tensor_reg->getAclTensor(input_index);
721 if (input_tensor->dimension(0) == 1)
722 {
723 acl_common::enableDimCorrection(input_tensor);
724 }
725 }
726
727 _return_fn = asAclFunction(std::move(fn));
728}
729
730void KernelGenerator::visit(const ir::operation::Pad &node)
731{
732 const auto input_index{node.getInputs().at(ir::operation::Pad::Input::INPUT)};
733 const auto pad_index{node.getInputs().at(ir::operation::Pad::Input::PAD)};
734 const auto output_index{node.getOutputs().at(0)};
735 assert(_ctx.at(pad_index).data());
736
737 auto rank = _ctx.at(input_index).shape().rank();
738 auto pad_base = _ctx.at(pad_index).data()->base();
739
740 auto input = _tensor_reg->getAclTensor(input_index)->handle();
741 auto output = _tensor_reg->getAclTensor(output_index)->handle();
742
743 ::arm_compute::PaddingList padding_list;
744 padding_list.resize(rank);
745 for (int32_t n = 0; n < rank; ++n)
746 {
747 const int32_t *from = reinterpret_cast<const int32_t *>(pad_base) + (n * 2);
748
749 const auto axis = acl_common::ToARMComputeAxis(rank, n).value();
750 padding_list[axis] = ::arm_compute::PaddingInfo{from[0], from[1]};
751 }
752
753 [[maybe_unused]] const auto input_type = _ctx.at(input_index).typeInfo();
754 assert(input->info()->data_type() == acl_common::asDataType(input_type.type()));
755 assert(input->info()->quantization_info() ==
756 ::arm_compute::QuantizationInfo(input_type.scale(), input_type.zero_point()));
757 const auto pixel_value =
758 ::arm_compute::PixelValue(0, input->info()->data_type(), input->info()->quantization_info());
759
760 auto fn =
761 acl_common::generateLayer<arm_compute::NEPadLayer>(input, output, padding_list, pixel_value);
762
763 _return_fn = asAclFunction(std::move(fn));
764}
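
The PAD operand is an int32 tensor of shape [rank, 2] whose row n holds (padding_before, padding_after) for dimension n; the loop above reads each pair and stores it at the swizzled ACL axis. A small sketch of the same decoding with a hypothetical helper name (decodePads), axis swizzling omitted:

#include <cstdint>
#include <utility>
#include <vector>

// Illustrative sketch only: decode a [rank, 2] pad operand into (before, after) pairs.
static std::vector<std::pair<uint32_t, uint32_t>> decodePads(const int32_t *pad_base, int32_t rank)
{
  std::vector<std::pair<uint32_t, uint32_t>> pads(rank);
  for (int32_t n = 0; n < rank; ++n)
  {
    const int32_t *from = pad_base + n * 2;   // row n = {before, after} for dimension n
    pads[n] = {static_cast<uint32_t>(from[0]), static_cast<uint32_t>(from[1])};
  }
  return pads;                                // the real code additionally swizzles the axis
}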
765
766void KernelGenerator::visit(const ir::operation::Pool2D &node)
767{
768 auto raw_fn = acl_common::kernelGenPool2D<::arm_compute::NEPoolingLayer>(
769 node, _ctx, _tensor_reg, acl_common::convertPoolType(node.param().op_type));
770
771 const auto ofm_index{node.getOutputs().at(0)};
772 auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
773 const auto activation = node.param().activation;
774 _return_fn = std::make_unique<exec::FunctionSequence>(
775 asAclFunction(std::move(raw_fn)),
776 ActivationBuilder::generate(activation, ofm_tensor->handle()));
777}
778
779void KernelGenerator::visit(const ir::operation::PReLU &node)
780{
781 const auto ofm_index{node.getOutputs().at(0)};
782 const auto ifm_index{node.getInputs().at(ir::operation::PReLU::Input::INPUT)};
783 const auto alpha_index{node.getInputs().at(ir::operation::PReLU::Input::ALPHA)};
784
785 auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
786 auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
787 auto alpha_tensor = _tensor_reg->getAclTensor(alpha_index);
788
789 auto fn = acl_common::generateLayer<arm_compute::NEPReluLayer>(
790 ifm_tensor->handle(), alpha_tensor->handle(), ofm_tensor->handle());
791
792 _return_fn = asAclFunction(std::move(fn));
793}
794
795void KernelGenerator::visit(const ir::operation::Reduce &node)
796{
797 const auto output_index{node.getOutputs().at(0)};
798 const auto input_index{node.getInputs().at(ir::operation::Reduce::Input::INPUT)};
799 const auto axes_index{node.getInputs().at(ir::operation::Reduce::Input::AXES)};
800
801 auto output_tensor = _tensor_reg->getAclTensor(output_index);
802 auto input_tensor = _tensor_reg->getAclTensor(input_index);
803
804 // Convert to ACL axes taking into account negative values and possible duplicates.
805 const auto &axes = _ctx.at(axes_index);
806 const auto input_rank = _ctx.at(input_index).shape().rank();
807 const auto reduce_axes = acl_common::asCoordinates(axes, input_rank);
808 const auto reduce_type = node.param().reduce_type;
809 const auto keep_dims = node.param().keep_dims;
810
811 std::unique_ptr<::arm_compute::IFunction> fn;
812 if (reduce_type == ir::operation::Reduce::ReduceType::MEAN)
813 {
814 fn = acl_common::generateLayer<arm_compute::NEReduceMean>(input_tensor->handle(), reduce_axes,
815 keep_dims, output_tensor->handle());
816 }
817 else if (reduce_type == ir::operation::Reduce::ReduceType::SUM)
818 {
819 fn = acl_common::generateLayer<arm_compute::NEReduceSum>(input_tensor->handle(), reduce_axes,
820 keep_dims, output_tensor->handle());
821 }
822 else
823 {
824 fn = acl_common::generateLayer<arm_compute::NEReduceOperation>(
825 input_tensor->handle(), reduce_axes, keep_dims, output_tensor->handle(),
826 acl_common::convertReduceType(reduce_type));
827 }
828 _return_fn = asAclFunction(std::move(fn));
829}
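
As the comment above says, asCoordinates has to cope with negative axis values and duplicates before handing a coordinate set to ACL. A minimal sketch of just that normalization step (hypothetical helper; the real conversion also maps each axis into ACL's reversed order and returns arm_compute::Coordinates):

#include <cstdint>
#include <set>
#include <vector>

// Illustrative sketch only: wrap negative reduce axes and drop duplicates.
static std::set<int32_t> normalizeReduceAxes(const std::vector<int32_t> &axes, int32_t rank)
{
  std::set<int32_t> out;                  // a set drops duplicate axes
  for (int32_t a : axes)
    out.insert(a < 0 ? a + rank : a);     // e.g. axis -1 on a rank-4 tensor becomes 3
  return out;
}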
830
831void KernelGenerator::visit(const ir::operation::Reshape &node)
832{
833 const auto output_index{node.getOutputs().at(0)};
834 const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)};
835
836 auto output_tensor = _tensor_reg->getAclTensor(output_index);
837 auto input_tensor = _tensor_reg->getAclTensor(input_index);
838
839 auto fn = acl_common::generateLayer<arm_compute::NEReshapeLayer>(input_tensor->handle(),
840 output_tensor->handle());
841
842 _return_fn = asAclFunction(std::move(fn));
843}
844
845void KernelGenerator::visit(const ir::operation::ResizeBilinear &node)
846{
847 const auto ofm_index{node.getOutputs().at(0)};
848 const auto ifm_index{node.getInputs().at(ir::operation::ResizeBilinear::Input::INPUT)};
849
850 auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
851 auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
852
853 auto fn = acl_common::generateLayer<arm_compute::NEScale>(
854 ifm_tensor->handle(), ofm_tensor->handle(),
855 ::arm_compute::ScaleKernelInfo{::arm_compute::InterpolationPolicy::BILINEAR,
856 ::arm_compute::BorderMode::REPLICATE,
857 ::arm_compute::PixelValue(0.f),
858 ::arm_compute::SamplingPolicy::TOP_LEFT, false /*use padding*/});
859
860 _return_fn = asAclFunction(std::move(fn));
861}
862
863void KernelGenerator::visit(const ir::operation::RNN &node)
864{
865 const auto output_index{node.getOutputs().at(ir::operation::RNN::Output::OUTPUT)};
866 const auto hidden_state_out_index{
867 node.getOutputs().at(ir::operation::RNN::Output::HIDDEN_STATE_OUT)};
868
869 const auto input_index{node.getInputs().at(ir::operation::RNN::Input::INPUT)};
870 const auto weights_index{node.getInputs().at(ir::operation::RNN::Input::WEIGHTS)};
871 const auto recurrent_weights_index{
872 node.getInputs().at(ir::operation::RNN::Input::RECURRENT_WEIGHTS)};
873 const auto bias_index{node.getInputs().at(ir::operation::RNN::Input::BIAS)};
874 const auto hidden_state_in_index{node.getInputs().at(ir::operation::RNN::Input::HIDDEN_STATE_IN)};
875
876 const auto activation = node.param().activation;
877
878 auto output_tensor = _tensor_reg->getAclTensor(output_index);
879 auto hidden_state_out_tensor = _tensor_reg->getAclTensor(hidden_state_out_index);
880
881 auto input_tensor = _tensor_reg->getAclTensor(input_index);
882 auto weights_tensor = _tensor_reg->getAclTensor(weights_index);
883 auto recurrent_weights_tensor = _tensor_reg->getAclTensor(recurrent_weights_index);
884 auto bias_tensor = _tensor_reg->getAclTensor(bias_index);
885 auto hidden_state_in_tensor = _tensor_reg->getAclTensor(hidden_state_in_index);
886 auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation);
887
888 auto copy_layer = acl_common::generateLayer<arm_compute::NECopy>(
889 hidden_state_in_tensor->handle(), hidden_state_out_tensor->handle());
890 _return_fn = asAclFunction(std::move(copy_layer));
891
892 auto fn = acl_common::generateLayer<arm_compute::NERNNLayer>(
893 _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
894 weights_tensor->handle(), recurrent_weights_tensor->handle(), bias_tensor->handle(),
895 hidden_state_out_tensor->handle(), output_tensor->handle(), act_info);
896 _return_fn = asAclFunction(std::move(fn));
897}
898
899void KernelGenerator::visit(const ir::operation::Squeeze &node)
900{
901 // Squeeze is identical to reshape except that it has an optional dimensions input.
902 // In addition, optional dims_index is ignored since output tensor already has squeezed shape
903 // by freezer and toco
904 const auto output_index{node.getOutputs().at(0)};
905 const auto input_index{node.getInputs().at(ir::operation::Squeeze::Input::INPUT)};
906 const auto dims{node.param().dims};
907 const auto ndim{node.param().ndim};
908 (void)dims;
909 (void)ndim;
910
911 auto output_tensor = _tensor_reg->getAclTensor(output_index);
912 auto input_tensor = _tensor_reg->getAclTensor(input_index);
913 auto fn = acl_common::generateLayer<arm_compute::NEReshapeLayer>(input_tensor->handle(),
914 output_tensor->handle());
915 _return_fn = asAclFunction(std::move(fn));
916}
917
918void KernelGenerator::visit(const ir::operation::Softmax &node)
919{
920 const auto output_index{node.getOutputs().at(0)};
921 const auto input_index{node.getInputs().at(ir::operation::Softmax::Input::INPUT)};
922 const auto beta = node.param().beta;
923
924 auto output_tensor = _tensor_reg->getAclTensor(output_index);
925 auto input_tensor = _tensor_reg->getAclTensor(input_index);
926
927 // NOTE NESoftmaxLayer's default axis is -1
928 auto fn = acl_common::generateLayer<arm_compute::NESoftmaxLayer>(
929 _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
930 output_tensor->handle(), beta);
931
932 _return_fn = asAclFunction(std::move(fn));
933}
934
935void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node)
936{
937 const auto ofm_index{node.getOutputs().at(0)};
938 const auto ifm_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::INPUT)};
939 const auto block_size_index{
940 node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)};
941 const auto paddings_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::PADDINGS)};
942
943 auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
944 auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
945 auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index);
946 auto paddings_tensor = _tensor_reg->getAclTensor(paddings_index);
947
948 assert(_ctx.at(block_size_index).data());
949 assert(_ctx.at(paddings_index).data());
950
951 auto fn = acl_common::generateLayer<arm_compute::NESpaceToBatchLayer>(
952 ifm_tensor->handle(), block_size_tensor->handle(), paddings_tensor->handle(),
953 ofm_tensor->handle());
954
955 _return_fn = asAclFunction(std::move(fn));
956}
957
958void KernelGenerator::visit(const ir::operation::SpaceToDepth &node)
959{
960 const auto ofm_index{node.getOutputs().at(0)};
961 const auto ifm_index{node.getInputs().at(ir::operation::SpaceToDepth::Input::INPUT)};
962
963 auto block_size = node.param().block_size;
964
965 auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
966 auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
967
968 auto fn = acl_common::generateLayer<arm_compute::NESpaceToDepthLayer>(
969 ifm_tensor->handle(), ofm_tensor->handle(), block_size);
970
971 _return_fn = asAclFunction(std::move(fn));
972}
973
974void KernelGenerator::visit(const ir::operation::Split &node)
975{
976 // TODO Support this op by SubTensor
977 const auto ifm_index{node.getInputs().at(ir::operation::Split::Input::INPUT)};
978 const auto axis_index{node.getInputs().at(ir::operation::Split::Input::AXIS)};
979
980 assert(node.param().num_splits == static_cast<int>(node.getOutputs().size()));
981 if (!_ctx.at(axis_index).isConstant())
982 {
983 throw std::runtime_error("Non-constant axis_index NYI for acl_neon backend");
984 }
985
986 const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
987 std::vector<ir::OperandIndex> output_indexes;
988 for (const auto &output : node.getOutputs())
989 output_indexes.emplace_back(output);
990
991 auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
992 std::vector<arm_compute::ITensor *> output_tensors;
993 for (const auto &ofm_ind : output_indexes)
994 output_tensors.emplace_back(_tensor_reg->getAclTensor(ofm_ind)->handle());
995
996 auto axis = _ctx.at(axis_index).asScalar<int32_t>();
997 if (axis < 0)
998 axis += ifm_rank;
999 axis = acl_common::ToARMComputeAxis(ifm_rank, axis).value();
1000
1001 auto fn =
1002 acl_common::generateLayer<arm_compute::NESplit>(ifm_tensor->handle(), output_tensors, axis);
1003
1004 _return_fn = asAclFunction(std::move(fn));
1005}
1006
1007void KernelGenerator::visit(const ir::operation::SquaredDifference &node)
1008{
1009 const auto ofm_index{node.getOutputs().at(0)};
1010 const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)};
1011 const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)};
1012
1013 auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1014 auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index);
1015 auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index);
1016
1017 auto fn = acl_common::generateLayer<arm_compute::NEElementwiseSquaredDiff>(
1018 lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());
1019
1020 _return_fn = asAclFunction(std::move(fn));
1021}
1022
1023void KernelGenerator::visit(const ir::operation::Slice &node)
1024{
1025 const auto output_index{node.getOutputs().at(0)};
1026 const auto input_index{node.getInputs().at(ir::operation::Slice::Input::INPUT)};
1027 const auto begins_index{node.getInputs().at(ir::operation::Slice::Input::BEGINS)};
1028 const auto sizes_index{node.getInputs().at(ir::operation::Slice::Input::SIZES)};
1029
1030 auto outputData_tensor = _tensor_reg->getAclTensor(output_index);
1031 auto inputData_tensor = _tensor_reg->getAclTensor(input_index);
1032
1033 // Set initializers for indices data such as order of inputData
1034 int input_rank = _ctx.at(input_index).shape().rank();
1035 std::vector<int32_t> starts;
1036 std::vector<int32_t> ends;
1037 starts.resize(input_rank, 0);
1038 ends.resize(input_rank, 0);
1039 {
1040 auto beginData_base = _ctx.at(begins_index).data()->base();
1041 auto sizeData_base = _ctx.at(sizes_index).data()->base();
1042 [[maybe_unused]] const int beginData_size = _ctx.at(begins_index).shape().num_elements();
1043 [[maybe_unused]] const int sizeData_size = _ctx.at(sizes_index).shape().num_elements();
1044
1045 using ir::DataType;
1046
1047 assert(_ctx.at(begins_index).typeInfo().type() == DataType::INT32);
1048 assert(_ctx.at(sizes_index).typeInfo().type() == DataType::INT32);
1049 assert(beginData_size == input_rank);
1050 assert(sizeData_size == input_rank);
1051
1052 assert(beginData_base != nullptr);
1053 for (int n = 0; n < input_rank; ++n)
1054 {
1055 auto axis = ::onert::backend::acl_common::ToARMComputeAxis(input_rank, n).value();
1056
1057 int32_t begin_value = *(reinterpret_cast<const int32_t *>(beginData_base) + n);
1058 starts[axis] = begin_value;
1059
1060 int32_t size_value = *(reinterpret_cast<const int32_t *>(sizeData_base) + n);
1061 ends[axis] = begin_value + size_value;
1062 }
1063 }
1064
1065 ::arm_compute::Coordinates starts_set;
1066 ::arm_compute::Coordinates ends_set;
1067
1068 for (size_t i = 0; i < starts.size(); ++i)
1069 {
1070 starts_set.set(i, starts[i]);
1071 ends_set.set(i, ends[i]);
1072 }
1073
1074 auto fn = acl_common::generateLayer<arm_compute::NESlice>(
1075 inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set);
1076
1077 _return_fn = asAclFunction(std::move(fn));
1078}
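
Slice carries (begins, sizes) in the model, while NESlice expects (starts, ends), so the loop above computes end = begin + size per axis at the swizzled position. A minimal sketch of just that conversion (hypothetical helper, swizzling omitted):

#include <cstdint>
#include <vector>

// Illustrative sketch only: convert per-axis (begin, size) into (start, end).
static void beginsAndSizesToStartsEnds(const std::vector<int32_t> &begins,
                                       const std::vector<int32_t> &sizes,
                                       std::vector<int32_t> &starts, std::vector<int32_t> &ends)
{
  starts = begins;
  ends.resize(begins.size());
  for (std::size_t i = 0; i < begins.size(); ++i)
    ends[i] = begins[i] + sizes[i];   // e.g. begin 2, size 3 -> slice covers [2, 5)
}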
1079
1080void KernelGenerator::visit(const ir::operation::StridedSlice &node)
1081{
1082 const auto output_index{node.getOutputs().at(0)};
1083 const auto input_index{node.getInputs().at(ir::operation::StridedSlice::Input::INPUT)};
1084 const auto starts_index{node.getInputs().at(ir::operation::StridedSlice::Input::STARTS)};
1085 const auto ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)};
1086 const auto strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)};
1087
1088 auto outputData_tensor = _tensor_reg->getAclTensor(output_index);
1089 auto inputData_tensor = _tensor_reg->getAclTensor(input_index);
1090
1091 // Set initializers for indices data such as order of inputData
1092 int input_rank = _ctx.at(input_index).shape().rank();
1093 std::vector<int32_t> starts;
1094 std::vector<int32_t> ends;
1095 std::vector<int32_t> strides;
1096 starts.resize(input_rank, 0);
1097 ends.resize(input_rank, 0);
1098 strides.resize(input_rank, 0);
1099 {
1100 auto startData_base = _ctx.at(starts_index).data()->base();
1101 auto endData_base = _ctx.at(ends_index).data()->base();
1102 auto stridesData_base = _ctx.at(strides_index).data()->base();
1103 [[maybe_unused]] const int startData_size = _ctx.at(starts_index).shape().num_elements();
1104 [[maybe_unused]] const int endData_size = _ctx.at(ends_index).shape().num_elements();
1105 [[maybe_unused]] const int stridesData_size = _ctx.at(strides_index).shape().num_elements();
1106
1107 using ir::DataType;
1108
1109 assert(_ctx.at(starts_index).typeInfo().type() == DataType::INT32);
1110 assert(_ctx.at(ends_index).typeInfo().type() == DataType::INT32);
1111 assert(_ctx.at(strides_index).typeInfo().type() == DataType::INT32);
1112 assert(startData_size == input_rank);
1113 assert(endData_size == input_rank);
1114 assert(stridesData_size == input_rank);
1115
1116 assert(startData_base != nullptr);
1117 for (int n = 0; n < input_rank; ++n)
1118 {
1119 auto axis = ::onert::backend::acl_common::ToARMComputeAxis(input_rank, n).value();
1120
1121 int32_t start_value = *(reinterpret_cast<const int32_t *>(startData_base) + n);
1122 starts[axis] = start_value;
1123
1124 int32_t end_value = *(reinterpret_cast<const int32_t *>(endData_base) + n);
1125 ends[axis] = end_value;
1126
1127 int32_t strides_value = *(reinterpret_cast<const int32_t *>(stridesData_base) + n);
1128 strides[axis] = strides_value;
1129 }
1130 }
1131
1132 // Set mask bits such as order of inputData
1133 // FIXME Take the layouts into account.
1134 const auto begin_mask = acl_common::ReorderBits<int32_t>(node.param().begin_mask, input_rank);
1135 const auto end_mask = acl_common::ReorderBits<int32_t>(node.param().end_mask, input_rank);
1136 const auto shrink_axis_mask =
1137 acl_common::ReorderBits<int32_t>(node.param().shrink_axis_mask, input_rank);
1138
1139 ::arm_compute::Coordinates starts_set;
1140 ::arm_compute::Coordinates ends_set;
1141 ::arm_compute::BiStrides strides_set;
1142
1143 for (size_t i = 0; i < starts.size(); ++i)
1144 {
1145 starts_set.set(i, starts[i]);
1146 ends_set.set(i, ends[i]);
1147 strides_set.set(i, strides[i]);
1148 }
1149
1150 // Disable applied dim_correction
1151 if (static_cast<size_t>(inputData_tensor->getShape().rank()) !=
1152 inputData_tensor->info()->num_dimensions())
1153 {
1154 // This means that high dimension's value is 1 and input tensor is applied dim_correction
1155 acl_common::disableDimCorrection(inputData_tensor);
1156 }
1157
1158 auto fn = acl_common::generateLayer<arm_compute::NEStridedSlice>(
1159 inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set, strides_set,
1160 begin_mask, end_mask, shrink_axis_mask);
1161
1162 // Revert disabling applied dim_correction
1163 if (inputData_tensor->getShape().dim(0) == 1)
1164 {
1165 acl_common::enableDimCorrection(inputData_tensor);
1166 }
1167
1168 _return_fn = asAclFunction(std::move(fn));
1169}
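
Because the start/end/stride coordinates are reordered into ACL's reversed dimension order, the begin, end and shrink-axis masks have to be reordered bit-by-bit as well, which is what acl_common::ReorderBits does. A hedged sketch assuming the same plain reversal convention as the axis mapping (reorderMaskBits is a hypothetical name):

#include <cstdint>

// Illustrative sketch only: move the bit for axis k to the reversed position rank - 1 - k.
static inline int32_t reorderMaskBits(int32_t mask, uint32_t rank)
{
  int32_t out = 0;
  for (uint32_t axis = 0; axis < rank; ++axis)
    if (mask & (1 << axis))
      out |= 1 << (rank - 1 - axis);
  return out;
}
// Example: rank 4, begin_mask 0b0011 (axes 0 and 1) becomes 0b1100.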
1170
1171void KernelGenerator::visit(const ir::operation::TransposeConv &node)
1172{
1173 const auto ofm_index{node.getOutputs().at(0)};
1174 const auto ker_index{node.getInputs().at(ir::operation::TransposeConv::Input::KERNEL)};
1175 const auto ifm_index{node.getInputs().at(ir::operation::TransposeConv::Input::INPUT)};
1176
1177 const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature();
1178 const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature();
1179 const auto ker_shape = _ctx.at(ker_index).shape().asFeature();
1180
1181 const auto stride = node.param().stride;
1182
1183 assert((node.param().padding.type == ir::PaddingType::SAME) ||
1184 (node.param().padding.type == ir::PaddingType::VALID));
1185 auto padding = ir::calculatePadding(node.param().padding, ofm_shape, ifm_shape, stride,
1186 ker_shape.W, ker_shape.H);
1187
1188 uint32_t invalid_horizontal = 0;
1189 uint32_t invalid_vertical = 0;
1190 if (node.param().padding.type == ir::PaddingType::VALID)
1191 {
1192 invalid_horizontal =
1193 ofm_shape.W - (1 + (ifm_shape.W - 1) * stride.horizontal) - (ker_shape.W - 1);
1194 invalid_vertical = ofm_shape.H - (1 + (ifm_shape.H - 1) * stride.vertical) - (ker_shape.H - 1);
1195 }
1196
1197 auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1198 auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1199 auto ker_tensor = _tensor_reg->getAclTensor(ker_index);
1200
1201 const auto tconv_info = acl_common::asPadStrideInfo(padding, stride);
1202
1203 auto fn = acl_common::generateLayer<arm_compute::NETransposeConvLayer>(
1204 ifm_tensor->handle(), ker_tensor->handle(), nullptr, ofm_tensor->handle(), tconv_info,
1205 invalid_horizontal, invalid_vertical);
1206
1207 _return_fn = asAclFunction(std::move(fn));
1208}
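
For VALID padding the code above works out how much of the requested output the backward convolution cannot actually cover and reports it to NETransposeConvLayer. A worked example of that arithmetic (illustrative only):

// With ifm_W = 5, stride = 2, ker_W = 3, a transpose convolution covers
// 1 + (5 - 1) * 2 + (3 - 1) = 11 output columns, so ofm_W = 11 gives
// invalid_horizontal = 11 - (1 + 4 * 2) - (3 - 1) = 0; any larger ofm_W
// leaves an uncovered remainder that is passed as the invalid region.
static_assert(11 - (1 + (5 - 1) * 2) - (3 - 1) == 0, "output width fully covered");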
1209
1210void KernelGenerator::visit(const ir::operation::Transpose &node)
1211{
1212 const auto ofm_idx{node.getOutputs().at(0)};
1213 const auto ifm_idx{node.getInputs().at(ir::operation::Transpose::Input::INPUT)};
1214 const auto perm_idx{node.getInputs().at(ir::operation::Transpose::Input::PERMUTATION)};
1215
1216 auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx);
1217 const auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx);
1218 const auto rank = _ctx.at(ifm_idx).shape().rank();
1219
1220 const auto &perms = _ctx.at(perm_idx);
1221 std::vector<int32_t> pv;
1222 if (perms.shape() == ir::Shape{0})
1223 {
1224 pv.resize(rank);
1225 std::iota(pv.begin(), pv.end(), 0);
1226 std::reverse(pv.begin(), pv.end());
1227 }
1228 else
1229 {
1230 pv = _ctx.at(perm_idx).asVector<int32_t>();
1231 }
1232
1233 std::unique_ptr<arm_compute::IFunction> fn;
1234 if (rank == 1)
1235 {
1236 fn = acl_common::generateLayer<arm_compute::NECopy>(ifm_tensor->handle(), ofm_tensor->handle());
1237 }
1238 else if (rank == 2)
1239 {
1240 assert(pv.size() == 2 && pv.at(0) == 1 && pv.at(1) == 0);
1241 fn = acl_common::generateLayer<arm_compute::NETranspose>(ifm_tensor->handle(),
1242 ofm_tensor->handle());
1243 }
1244 else
1245 {
1246 auto backend_pv = acl_common::getARMComputePermutationVector(rank, pv);
1247
1248 fn = acl_common::generateLayer<arm_compute::NEPermute>(ifm_tensor->handle(),
1249 ofm_tensor->handle(), backend_pv);
1250 }
1251 _return_fn = asAclFunction(std::move(fn));
1252}
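
When the PERMUTATION operand is empty, the code above builds the default permutation by filling 0..rank-1 and reversing it, i.e. a full transpose. A self-contained sketch of that default (defaultPermutation is a hypothetical name):

#include <algorithm>
#include <cstdint>
#include <numeric>
#include <vector>

// Illustrative sketch only: the reversed-iota permutation used when none is given.
static std::vector<int32_t> defaultPermutation(int32_t rank)
{
  std::vector<int32_t> pv(rank);
  std::iota(pv.begin(), pv.end(), 0);     // {0, 1, ..., rank - 1}
  std::reverse(pv.begin(), pv.end());     // rank 4 -> {3, 2, 1, 0}
  return pv;
}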
1253
1254void KernelGenerator::visit(const ir::operation::Unpack &node)
1255{
1256 const auto input_index{node.getInputs().at(ir::operation::Unpack::Input::INPUT)};
1257 auto axis{node.param().axis};
1258
1259 const auto input_rank = _ctx.at(input_index).shape().rank();
1260
1261 std::vector<ir::OperandIndex> output_indexes;
1262 for (const auto &output_index : node.getOutputs())
1263 output_indexes.emplace_back(output_index);
1264
1265 auto input_tensor = _tensor_reg->getAclTensor(input_index);
1266 std::vector<arm_compute::ITensor *> outputs;
1267 for (const auto &output_index : output_indexes)
1268 outputs.emplace_back(_tensor_reg->getAclTensor(output_index)->handle());
1269
1270 if (axis < 0)
1271 axis += input_rank;
1272 axis = acl_common::ToARMComputeAxis(input_rank, axis).value();
1273
1274 // Disable applied dim_correction
1275 if (static_cast<size_t>(input_tensor->getShape().rank()) !=
1276 input_tensor->info()->num_dimensions())
1277 {
1278 // This means that high dimension's value is 1 and input tensor is applied dim_correction
1279 acl_common::disableDimCorrection(input_tensor);
1280 }
1281
1282 auto fn =
1283 acl_common::generateLayer<arm_compute::NEUnstack>(input_tensor->handle(), outputs, axis);
1284
1285 // Revert disabling applied dim_correction
1286 if (input_tensor->getShape().dim(0) == 1)
1287 {
1288 acl_common::enableDimCorrection(input_tensor);
1289 }
1290
1291 _return_fn = asAclFunction(std::move(fn));
1292}
1293
1294void KernelGenerator::visit(const ir::operation::ExpandDims &node)
1295{
1296 const auto output_index{node.getOutputs().at(0)};
1297 const auto input_index{node.getInputs().at(ir::operation::ExpandDims::Input::INPUT)};
1298
1299 auto output_tensor = _tensor_reg->getAclTensor(output_index);
1300 auto input_tensor = _tensor_reg->getAclTensor(input_index);
1301
1302 auto fn = acl_common::generateLayer<arm_compute::NEReshapeLayer>(input_tensor->handle(),
1303 output_tensor->handle());
1304
1305 _return_fn = asAclFunction(std::move(fn));
1306}
1307
1308void KernelGenerator::visit(const ir::operation::Comparison &node)
1309{
1310 const auto output_index{node.getOutputs().at(0)};
1311 const auto input0_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT0)};
1312 const auto input1_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT1)};
1313
1314 const auto comparison_type = node.param().comparison_type;
1315
1316 auto output_tensor = _tensor_reg->getAclTensor(output_index);
1317 auto input0_tensor = _tensor_reg->getAclTensor(input0_index);
1318 auto input1_tensor = _tensor_reg->getAclTensor(input1_index);
1319
1320 auto fn = acl_common::generateLayer<arm_compute::NEElementwiseComparison>(
1321 input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle(),
1322 (arm_compute::ComparisonOperation)comparison_type);
1323
1324 _return_fn = asAclFunction(std::move(fn));
1325}
1326
1327void KernelGenerator::visit(const ir::operation::OneHot &node)
1328{
1329 const auto out_idx{node.getOutputs().at(0)};
1330 const auto indices_idx{node.getInputs().at(ir::operation::OneHot::Input::INDICES)};
1331 const auto depth_idx{node.getInputs().at(ir::operation::OneHot::Input::DEPTH)};
1332 const auto onvalue_idx{node.getInputs().at(ir::operation::OneHot::Input::ON_VALUE)};
1333 const auto offvalue_idx{node.getInputs().at(ir::operation::OneHot::Input::OFF_VALUE)};
1334
1335 auto output_tensor = _tensor_reg->getAclTensor(out_idx);
1336 auto indices_tensor = _tensor_reg->getAclTensor(indices_idx);
1337 auto depth_tensor = _tensor_reg->getAclTensor(depth_idx);
1338 auto onvalue_tensor = _tensor_reg->getAclTensor(onvalue_idx);
1339 auto offvalue_tensor = _tensor_reg->getAclTensor(offvalue_idx);
1340
1341 const size_t output_rank = _ctx.at(out_idx).shape().rank();
1342 int32_t axis = node.param().axis == -1 ? output_rank - 1 : node.param().axis;
1343 axis = acl_common::ToARMComputeAxis(output_rank, axis).value();
1344
1345 auto fn = acl_common::generateLayer<arm_compute::NEOneHot>(
1346 indices_tensor->handle(), depth_tensor->handle(), onvalue_tensor->handle(),
1347 offvalue_tensor->handle(), output_tensor->handle(), axis);
1348 _return_fn = asAclFunction(std::move(fn));
1349}
1350
1351} // namespace acl_neon
1352} // namespace backend
1353} // namespace onert