ONE - On-device Neural Engine
FullyConnectedLayer.cc
/*
 * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "FullyConnectedLayer.h"

#include "../Tensor.h"
#include "../KernelGenerator.h"
#include "../Validator.h"

#include <cker/operation/FullyConnected.h>
#include <cker/TensorUtils.h>
#include <misc/polymorphic_downcast.h>

namespace onert::backend::cpu
{
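
// GGML-quantized weights (QUANT_GGML_Q4_0 / QUANT_GGML_Q8_0) are not handled by
// this backend's FullyConnected kernel, so the Validator below reports such
// nodes as unsupported, presumably so they can be assigned to another backend.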
void Validator::visit(const ir::operation::FullyConnected &node)
{
  using ir::operation::FullyConnected;

  const auto weight_index{node.getInputs().at(FullyConnected::Input::WEIGHT)};
  const auto weight_node = &_graph.operands().at(weight_index);

  _supported = false;

  if (weight_node->typeInfo().type() == ir::DataType::QUANT_GGML_Q4_0 ||
      weight_node->typeInfo().type() == ir::DataType::QUANT_GGML_Q8_0)
    return;

  _supported = true;
}

void KernelGenerator::visit(const ir::operation::FullyConnected &node)
{
  using ir::operation::FullyConnected;

  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(FullyConnected::Input::INPUT)};
  const auto weight_index{node.getInputs().at(FullyConnected::Input::WEIGHT)};
  const auto bias_index{node.getInputs().at(FullyConnected::Input::BIAS)};
  const auto activation = node.param().activation;
  const auto weights_format = node.param().weights_format;

  auto output_tensor = _tensor_reg->getPortableTensor(output_index);
  auto input_tensor = _tensor_reg->getPortableTensor(input_index);
  auto weight_tensor = _tensor_reg->getPortableTensor(weight_index);
  auto bias_tensor = bias_index.undefined() ? nullptr : _tensor_reg->getPortableTensor(bias_index);

  auto fn = std::make_unique<ops::FullyConnectedLayer>();

  fn->configure(input_tensor, weight_tensor, bias_tensor, activation, weights_format, output_tensor,
                _external_context);

  _return_fn = std::move(fn);
}
} // namespace onert::backend::cpu

namespace onert::backend::cpu::ops
{

FullyConnectedLayer::FullyConnectedLayer()
  : _input(nullptr), _weights(nullptr), _bias(nullptr), _output(nullptr),
    _activation(ir::Activation::NONE), _temp_arena(new nnfw::cker::FCTempArena()),
    _external_context(nullptr), _is_hybrid(false), _is_shuffled16x1float32(false)
{
  // DO NOTHING
}

FullyConnectedLayer::~FullyConnectedLayer() = default;

void FullyConnectedLayer::fullyConnectedFloat32()
{
  float output_activation_min = 0;
  float output_activation_max = 0;
  CalculateActivationRange(_activation, &output_activation_min, &output_activation_max);

  nnfw::cker::FullyConnectedParams op_params;
  op_params.float_activation_min = output_activation_min;
  op_params.float_activation_max = output_activation_max;
  // TODO Set both cacheables as false when training
  op_params.lhs_cacheable = _weights->is_constant();
  op_params.rhs_cacheable = _input->is_constant();

  nnfw::cker::FullyConnected(op_params, getShape(_input), getBuffer<float>(_input),
                             getShape(_weights), getBuffer<float>(_weights), getShape(_bias),
                             _bias ? getBuffer<float>(_bias) : nullptr, getShape(_output),
                             getBuffer<float>(_output));
}
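
// For reference, CalculateActivationRange maps the fused activation to a clamp
// range that the kernel applies to each output element, e.g. RELU -> [0, +max],
// RELU6 -> [0, 6], and NONE -> the full range of float (effectively no clamping).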

// executionMutex is used to protect concurrent access of non-threadsafe resources
// like gemmlowp::GemmContext.
void FullyConnectedLayer::fullyConnectedQuant8()
{
  double real_multiplier = 0.0;
  int32_t output_multiplier = 0;
  int32_t output_shift = 0;
  int32_t output_activation_min = 0;
  int32_t output_activation_max = 0;
  GetQuantizedConvolutionMultiplier(_input, _weights, _bias, _output, &real_multiplier);
  QuantizeMultiplier(real_multiplier, &output_multiplier, &output_shift);
  CalculateActivationRangeQuantized(_activation, _output, &output_activation_min,
                                    &output_activation_max);

  nnfw::cker::FullyConnectedParams op_params;
  op_params.input_offset = -_input->data_zero_point();
  op_params.weights_offset = -_weights->data_zero_point();
  op_params.output_offset = _output->data_zero_point();
  op_params.output_multiplier = output_multiplier;
  op_params.output_shift = output_shift;
  op_params.quantized_activation_min = output_activation_min;
  op_params.quantized_activation_max = output_activation_max;

  nnfw::cker::FullyConnected(op_params, getShape(_input), getBuffer<uint8_t>(_input),
                             getShape(_weights), getBuffer<uint8_t>(_weights), getShape(_bias),
                             _bias ? getBuffer<int32_t>(_bias) : nullptr, getShape(_output),
                             getBuffer<uint8_t>(_output));
}
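
// Worked example of the requantization math above (illustrative scales, not
// taken from this file): real_multiplier = input_scale * weights_scale /
// output_scale. For real_multiplier = 0.25, QuantizeMultiplier yields
// output_multiplier = 1 << 30 (0.5 in Q0.31 fixed point) and output_shift = -1,
// since 0.25 = 0.5 * 2^-1. The kernel can then rescale each int32 accumulator
// with an integer multiply-and-shift instead of a floating-point multiply.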

void FullyConnectedLayer::fullyConnectedHybrid()
{
  nnfw::cker::FCTempArena &temp_arena = *_temp_arena;
  if (!temp_arena.prepared)
  {
    temp_arena.prepare(getShape(_input), getShape(_weights));
  }

  nnfw::cker::FullyConnectedParams op_params;
  op_params.activation = convertActivationType(_activation);
  op_params.weights_scale = _weights->data_scale();

#ifndef USE_RUY_GEMV
  nnfw::cker::FullyConnectedHybrid(
    op_params, getShape(_input), getBuffer<float>(_input), getShape(_weights),
    getBuffer<int8_t>(_weights), getShape(_bias), _bias ? getBuffer<float>(_bias) : nullptr,
    getShape(_output), getBuffer<float>(_output), temp_arena, _external_context->ruy_context());
#else
  nnfw::cker::FullyConnectedHybrid(
    op_params, getShape(_input), getBuffer<float>(_input), getShape(_weights),
    (_cached_weights) ? reinterpret_cast<const int8_t *>(_cached_weights)
                      : getBuffer<int8_t>(_weights),
    getShape(_bias), _bias ? getBuffer<float>(_bias) : nullptr, getShape(_output),
    getBuffer<float>(_output), temp_arena, _external_context->ruy_context());

  if (_cached_weights == nullptr || _is_weights_freed)
    return;

  // Reaching this point means _cached_weights is non-null and the weights have
  // not been freed yet, i.e. this weight shape satisfies the ruy kernel's
  // prepack-cache condition. We normally get here only once, with one
  // exception: a zero-vector input bypasses the ruy kernel path entirely, so
  // the weights must not be released in that case.
  const int input_size = getShape(_input).FlatSize();
  if (nnfw::cker::IsZeroVector(getBuffer<float>(_input), input_size))
    return;

  auto weight_tensor = nnfw::misc::polymorphic_downcast<const Tensor *>(_weights);

  // This weight tensor could also be another op's constant tensor, so its
  // reference count must be checked before the buffer is released.
  auto tensor = const_cast<Tensor *>(weight_tensor);
  if (tensor->buffer() == nullptr) // ref count already 0?
  {
    _is_weights_freed = true;
    return;
  }

  tensor->decrease_ref();
  if (tensor->buffer() == nullptr) // ref count reached 0?
  {
#if defined(__ANDROID__) && (__ANDROID_API__ >= 26)
    // NOTE This call forces the OS to release any unused memory immediately
    mallopt(M_PURGE, 0);
#endif
    _is_weights_freed = true;
  }
#endif
}
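
// "Hybrid" here means float activations with symmetrically quantized int8
// weights (see _is_hybrid in configure below): the kernel quantizes each input
// row on the fly into temp_arena, accumulates in integer arithmetic, and
// rescales back to float, roughly output ~= accum_i32 * input_scale *
// weights_scale. This note is a summary of the idea, not cker's exact code path.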

void FullyConnectedLayer::fullyConnectedSparseWeight()
{
  nnfw::cker::FullyConnectedParams op_params;
  op_params.activation = convertActivationType(_activation);

  const uint16_t *w1_segments = _weights->sparsity()->w1_segments();
  const uint16_t *w1_indices = _weights->sparsity()->w1_indices();

  auto block_size = _weights->sparsity()->block_size();
  if (block_size.size() == 0)
  {
    nnfw::cker::FullyConnectedSparseWeightRandom(
      op_params, getShape(_input), getBuffer<float>(_input), getShape(_weights),
      getBuffer<float>(_weights), getShape(_bias), _bias ? getBuffer<float>(_bias) : nullptr,
      getShape(_output), getBuffer<float>(_output), w1_segments, w1_indices);
  }
  else if (block_size.size() == 2 && block_size[0] == 16 && block_size[1] == 1)
  {
    nnfw::cker::FullyConnectedSparseWeight16x1(
      op_params, getShape(_input), getBuffer<float>(_input), getShape(_weights),
      getBuffer<float>(_weights), getShape(_bias), _bias ? getBuffer<float>(_bias) : nullptr,
      getShape(_output), getBuffer<float>(_output), w1_segments, w1_indices);
  }
  else
    throw std::runtime_error{"FullyConnected: unsupported sparsity"};
}
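
// The w1_segments / w1_indices pair is a compressed-sparse-row encoding of the
// weight matrix. A small illustrative example (not from this file):
//
//   W = | 1 0 2 0 |   nonzero values, row-major: {1, 2, 3, 4, 5}
//       | 0 0 0 3 |   w1_segments = {0, 2, 3, 5}    (row start offsets)
//       | 4 5 0 0 |   w1_indices  = {0, 2, 3, 0, 1} (column of each value)
//
// Row r's nonzeros live at positions [w1_segments[r], w1_segments[r + 1]).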

void FullyConnectedLayer::fullyConnected16x1Float32()
{
#if defined(__aarch64__) && defined(USE_NEON)
  float output_activation_min = 0, output_activation_max = 0;
  CalculateActivationRange(_activation, &output_activation_min, &output_activation_max);

  nnfw::cker::FullyConnectedParams op_params;
  op_params.float_activation_min = output_activation_min;
  op_params.float_activation_max = output_activation_max;

  nnfw::cker::FullyConnected16x1Float32(op_params, getShape(_input), getBuffer<float>(_input),
                                        getShape(_weights), getBuffer<float>(_weights),
                                        getShape(_bias), _bias ? getBuffer<float>(_bias) : nullptr,
                                        getShape(_output), getBuffer<float>(_output));
#else
  throw std::runtime_error{"FullyConnected: Shuffled16x1Float32 weights_format is not supported."};
#endif
}
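
// NOTE The Shuffled16x1Float32 format appears to reorder the weight matrix into
// 16-row by 1-column blocks so the aarch64 NEON kernel can process 16 output
// rows per iteration; this description is an assumption inferred from the
// 16x1 block_size check above, not taken from the kernel source.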

void FullyConnectedLayer::configure(const IPortableTensor *input, const IPortableTensor *weights,
                                    const IPortableTensor *bias, ir::Activation activation,
                                    ir::FullyConnectedWeightsFormat weights_format,
                                    IPortableTensor *output,
                                    const std::shared_ptr<ExternalContext> &external_context)
{
  _input = input;
  _weights = weights;
  _bias = bias;
  _activation = activation;
  _output = output;
  _is_hybrid = input->data_type() == OperandType::FLOAT32 &&
               weights->data_type() == OperandType::QUANT_INT8_SYMM;
  _is_shuffled16x1float32 = weights_format == ir::FullyConnectedWeightsFormat::Shuffled16x1Float32;
#if !defined(__aarch64__) || !defined(USE_NEON)
  if (_is_shuffled16x1float32)
  {
    throw std::runtime_error{
      "FullyConnected: Shuffled16x1Float32 weights_format is not supported."};
  }
#endif
  _external_context = external_context;
}

void FullyConnectedLayer::run()
{
  if (_is_hybrid)
  {
    fullyConnectedHybrid();
  }
  else if (_weights->sparsity())
  {
    fullyConnectedSparseWeight();
  }
  else if (_input->data_type() == OperandType::FLOAT32)
  {
    _is_shuffled16x1float32 ? fullyConnected16x1Float32() : fullyConnectedFloat32();
  }
  else if (_input->data_type() == OperandType::QUANT_UINT8_ASYMM)
  {
    fullyConnectedQuant8();
  }
  else
  {
    throw std::runtime_error{"FullyConnected: unsupported data type"};
  }
}

void FullyConnectedLayer::prepare()
{
  if (_bias && _bias->is_constant())
  {
    const int bias_size = getShape(_bias).FlatSize();
    if (nnfw::cker::IsZeroVector(getBuffer<float>(_bias), bias_size))
    {
      _bias = nullptr;
    }
  }

#if (defined(__ARM_NEON__) || defined(__ARM_NEON)) && defined(USE_RUY_GEMV)
  // TODO This is a workaround: only the hybrid fc path uses the ruy kernel,
  // so weight caching applies to that case alone.
  if (_input->data_type() != OperandType::FLOAT32 ||
      _weights->data_type() != OperandType::QUANT_INT8_SYMM)
  {
    return;
  }

  // NOTE The condition for enabling caching on the ruy kernel may change with ruy's version.

  // If the input is dynamic, its total size can change between runs.
  // If the weights are not constant, they cannot be cached.
  if (_input->is_dynamic() || !_weights->is_constant())
    return;

  const int rows = getShape(_weights).Dims(0);
  if (rows % 4 == 0)
  {
    // TODO If precaching can be extracted from the ruy kernel,
    // place it here instead of the code below.

    // The buffer is used by the ruy kernel as a cache key.
    _cached_weights = _weights->buffer();
  }
#endif
}
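
// NOTE ruy keys its prepacked-weight cache on the buffer pointer set above; the
// rows % 4 == 0 check presumably mirrors ruy's packing-eligibility condition
// (assumption; see ruy's cache policy for the authoritative rule).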

} // namespace onert::backend::cpu::ops