ONE - On-device Neural Engine
All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
GRU.cpp
Go to the documentation of this file.
1/*
2 * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
#include <core/OMDataType.h>
#include "OMStatus.h"

#include "core/OMUtils.h"
#include "core/OMKernelData.h"
#include "core/memory/OMMemoryManager.h"

#include "execute/OMUtils.h"
#include "execute/OMRuntimeKernel.h"

#include "PALGRU.h"
29
30using namespace onert_micro;
31using namespace onert_micro::core;
32using namespace onert_micro::execute;
33
namespace
{

// Positions of the circle GRU operator's input tensors.
constexpr uint32_t inputTensorIdx = 0;
constexpr uint32_t hiddenHiddenTensorIdx = 1;     // hidden-to-hidden weights
constexpr uint32_t hiddenHiddenBiasTensorIdx = 2; // hidden-to-hidden bias (may be absent)
constexpr uint32_t hiddenInputTensorIdx = 3;      // input-to-hidden weights
constexpr uint32_t hiddenInputBiasTensorIdx = 4;  // input-to-hidden bias (may be absent)
constexpr uint32_t stateTensorIdx = 5;            // initial hidden state

// Position of the operator's single output tensor.
constexpr uint32_t outputTensorIdx = 0;

} // namespace
47
// NOTE: doesn't currently support dynamic shapes
49namespace onert_micro
50{
51namespace execute
52{
53
55{
56 core::OMRuntimeContext &runtime_context = execute_args.runtime_context;
57 core::OMRuntimeStorage &runtime_storage = execute_args.runtime_storage;
58 uint16_t op_index = execute_args.kernel_index;
59
60 const circle::Tensor *input;
61 const circle::Tensor *hidden_hidden;
62 const circle::Tensor *hidden_hidden_bias;
63 const circle::Tensor *hidden_input;
64 const circle::Tensor *hidden_input_bias;
65 const circle::Tensor *state;
66
67 const circle::Tensor *output;
68
69 uint8_t *input_data;
70 uint8_t *hidden_hidden_data;
71 uint8_t *hidden_hidden_bias_data;
72 uint8_t *hidden_input_data;
73 uint8_t *hidden_input_bias_data;
74 uint8_t *state_data;
75 uint8_t *output_data;
76
77 uint16_t state_tensor_index = 0;
78
79 // Read kernel
80 {
81 execute::OMRuntimeKernel runtime_kernel;
82 runtime_kernel.readKernel(op_index, runtime_context);
83
84 input = runtime_kernel.inputs[inputTensorIdx];
85 hidden_hidden = runtime_kernel.inputs[hiddenHiddenTensorIdx];
86 hidden_hidden_bias = runtime_kernel.inputs[hiddenHiddenBiasTensorIdx];
87 hidden_input = runtime_kernel.inputs[hiddenInputTensorIdx];
88 hidden_input_bias = runtime_kernel.inputs[hiddenInputBiasTensorIdx];
89 state = runtime_kernel.inputs[stateTensorIdx];
90
91 output = runtime_kernel.outputs[outputTensorIdx];
92 assert(input != nullptr);
93 assert(hidden_hidden != nullptr);
94 assert(hidden_input != nullptr);
95 assert(state != nullptr);
96 // Biases can be nullptr
97 assert(output != nullptr);
98
99 runtime_kernel.getDataFromStorage(op_index, runtime_storage, runtime_context);
100
101 input_data = runtime_kernel.inputs_data[inputTensorIdx];
102 hidden_hidden_data = runtime_kernel.inputs_data[hiddenHiddenTensorIdx];
103 hidden_hidden_bias_data = runtime_kernel.inputs_data[hiddenHiddenBiasTensorIdx];
104 hidden_input_data = runtime_kernel.inputs_data[hiddenInputTensorIdx];
105 hidden_input_bias_data = runtime_kernel.inputs_data[hiddenInputBiasTensorIdx];
106 state_data = runtime_kernel.inputs_data[stateTensorIdx];
107
108 output_data = runtime_kernel.outputs_data[outputTensorIdx];
109 assert(input_data != nullptr);
110 assert(hidden_hidden_data != nullptr);
111 assert(hidden_input_data != nullptr);
112 assert(state_data != nullptr);
113 // Bias can be nullptr
114 assert(output_data != nullptr);
115
116 state_tensor_index = runtime_kernel.inputs_index[stateTensorIdx];
117 }
118
119 OMStatus status;
120
121 uint8_t *output_hidden_data;
122 uint8_t *output_input_data;
123
124 status =
126 sizeof(core::OMDataType(hidden_hidden->type())),
127 &output_hidden_data);
128 if (status != Ok)
129 return status;
131 core::OMRuntimeShape(hidden_input).flatSize() * sizeof(core::OMDataType(hidden_input->type())),
132 &output_input_data);
133 if (status != Ok)
134 return status;
135
136 // If train mode need to allocate memory for internal intermediate tensors for calculation
137 // gradients further Number of intermediate tensors
138 const int32_t num_of_intermediate_tensors = 9;
139 // Note: size of the intermediate is equal to output size (should be checked during import phase)
140 const int32_t size_of_intermediate_tensors = core::OMRuntimeShape(output).flatSize();
141 assert(size_of_intermediate_tensors > 0);
142 if (size_of_intermediate_tensors == 0)
143 return UnknownError;
144
145 const int32_t input_size = core::OMRuntimeShape(input).flatSize();
146 const int32_t output_size = size_of_intermediate_tensors;
147
148 // Allocate buffer with following schema:
149 // times * [output_size * sizeof(data_type),
150 // num_of_intermediate_tensors * size_of_intermediate_tensors * sizeof(data_type)]
151 // Note: need to save all necessary intermediate data to calculate gradients
152 // Deallocation should perform train/GRU kernel
153 const size_t data_type_size = sizeof(core::OMDataType(input->type()));
154 const int32_t time = OMRuntimeShape(input).dims(0);
155 size_t intermediate_buffer_size = 0;
156 uint8_t *intermediate_buffer = nullptr;
157 if (execute_args.is_train_mode)
158 {
159 const auto num_operators = runtime_context.getCircleOperators()->size();
160
161 uint32_t num_train_layers =
162 execute_args.num_train_layers == 0 ? num_operators : execute_args.num_train_layers;
163 uint32_t last_node_pos = std::min(num_operators, num_train_layers);
164 uint32_t last_train_op_index = num_operators - last_node_pos;
165
166 if (execute_args.kernel_index >= last_train_op_index)
167 {
168 intermediate_buffer_size = num_of_intermediate_tensors * size_of_intermediate_tensors;
169
171 time * intermediate_buffer_size * data_type_size, &intermediate_buffer);
172 if (status != Ok)
173 return status;
174
175 // Save its buffer to state tensor index
176 runtime_storage.saveDataToTensorIndex(intermediate_buffer, state_tensor_index);
177 }
178 }
179
180 switch (input->type())
181 {
182#ifndef DIS_FLOAT
183 case circle::TensorType_FLOAT32:
184 {
185 status =
186 pal::GRU(core::utils::castInputData<float>(input_data),
187 core::utils::castInputData<float>(hidden_input_data),
188 core::utils::castInputData<float>(hidden_hidden_data),
189 core::utils::castInputData<float>(hidden_input_bias_data),
190 core::utils::castInputData<float>(hidden_hidden_bias_data),
191 core::utils::castInputData<float>(state_data),
192 core::utils::castOutputData<float>(output_data),
193 core::utils::castOutputData<float>(output_input_data),
194 core::utils::castOutputData<float>(output_hidden_data),
196 core::OMRuntimeShape(hidden_input), core::OMRuntimeShape(hidden_hidden),
197 intermediate_buffer_size, core::utils::castOutputData<float>(intermediate_buffer));
198 }
199 break;
200#endif // DIS_FLOAT
201 default:
202 {
203 status = UnsupportedType;
204 assert(false && "Unsupported type.");
205 }
206 }
207
210
211 return status;
212}
213
214} // namespace execute
215} // namespace onert_micro
uoffset_t size() const
const reader::CircleOperators * getCircleOperators()
OMStatus saveDataToTensorIndex(uint8_t *data, uint16_t tensor_index)
uint8_t * outputs_data[maxOutputSize]
OMStatus getDataFromStorage(uint16_t op_index, core::OMRuntimeStorage &storage, core::OMRuntimeContext &context)
OMStatus readKernel(uint16_t op_index, core::OMRuntimeContext &runtime_context)
const circle::Tensor * outputs[maxOutputSize]
const circle::Tensor * inputs[maxInputSize]
constexpr uint32_t outputTensorIdx
OMDataType
"scalar" value type
Definition OMDataType.h:35
OMStatus GRU(const float *input_data, const float *weight_input_data, const float *weight_hidden_data, const float *bias_input_data, const float *bias_hidden_data, const float *hidden_state_data, float *output_data, float *output_input_data, float *output_hidden_data, const core::OMRuntimeShape &input_shape, const core::OMRuntimeShape &output_shape, const core::OMRuntimeShape &weight_input_shape, const core::OMRuntimeShape &weight_hidden_shape, const size_t intermediate_buffer_size, float *intermediate_buffer)
OMStatus execute_kernel_CircleGRU(const OMExecuteArgs &execute_args)
Definition GRU.cpp:54
@ UnsupportedType
Definition OMStatus.h:26
static OMStatus deallocateMemory(uint8_t *data)
static OMStatus allocateMemory(uint32_t size, uint8_t **data)
core::OMRuntimeContext & runtime_context
core::OMRuntimeStorage & runtime_storage