ONE - On-device Neural Engine
Loading...
Searching...
No Matches
onert::backend::cpu::ops::FullyConnectedLayer Class Reference

#include <FullyConnectedLayer.h>

Collaboration diagram for onert::backend::cpu::ops::FullyConnectedLayer:

Public Member Functions

 FullyConnectedLayer ()
 
 ~FullyConnectedLayer ()
 
void fullyConnectedFloat32 ()
 
void fullyConnectedQuant8 ()
 
void fullyConnectedHybrid ()
 
void fullyConnectedSparseWeight ()
 
void fullyConnectedGGMLWeight ()
 
void fullyConnected16x1Float32 ()
 
void configure (const IPortableTensor *input, const IPortableTensor *weights, const IPortableTensor *bias, ir::Activation activation, ir::FullyConnectedWeightsFormat weights_format, IPortableTensor *output, const std::shared_ptr< ExternalContext > &external_context)
 
void run () override
 
void prepare () override
 
- Public Member Functions inherited from onert::exec::IFunction
virtual ~IFunction ()=default
 

Protected Attributes

const IPortableTensor * _input
 
const IPortableTensor * _weights
 
const IPortableTensor * _bias
 
IPortableTensor * _output
 
ir::Activation _activation
 
std::unique_ptr< nnfw::cker::FCTempArena > _temp_arena
 
std::shared_ptr< ExternalContext > _external_context
 
bool _is_hybrid: 1
 
bool _is_shuffled16x1float32: 1
 

Detailed Description

Definition at line 43 of file FullyConnectedLayer.h.

Constructor & Destructor Documentation

◆ FullyConnectedLayer()

onert::backend::cpu::ops::FullyConnectedLayer::FullyConnectedLayer ( )

Definition at line 35 of file FullyConnectedLayer.cc.

◆ ~FullyConnectedLayer()

onert::backend::cpu::ops::FullyConnectedLayer::~FullyConnectedLayer ( )
default

Member Function Documentation

◆ configure()

void onert::backend::cpu::ops::FullyConnectedLayer::configure ( const IPortableTensor *  input,
const IPortableTensor *  weights,
const IPortableTensor *  bias,
ir::Activation  activation,
ir::FullyConnectedWeightsFormat  weights_format,
IPortableTensor *  output,
const std::shared_ptr< ExternalContext > &  external_context 
)

Definition at line 233 of file FullyConnectedLayer.cc.

238{
239 _input = input;
240 _weights = weights;
241 _bias = bias;
242 _activation = activation;
243 _output = output;
244 _is_hybrid = input->data_type() == OperandType::FLOAT32 &&
245 weights->data_type() == OperandType::QUANT_INT8_SYMM;
247#if !defined(__aarch64__) || !defined(USE_NEON)
249 {
250 throw std::runtime_error{
251 "FullyConnected: Shuffled16x1Float32 weights_format is not supported."};
252 }
253#endif
254 _external_context = external_context;
255
256 if (_weights->data_type() == OperandType::QUANT_GGML_Q4_0 ||
257 _weights->data_type() == OperandType::QUANT_GGML_Q8_0)
258 _external_context->initGgmlContext();
259}
ir::DataType data_type() const override final

References _activation, _bias, _external_context, _input, _is_hybrid, _is_shuffled16x1float32, _output, _weights, onert::backend::IPortableTensor::data_type(), and onert::ir::Shuffled16x1Float32.

◆ fullyConnected16x1Float32()

void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnected16x1Float32 ( )

Definition at line 215 of file FullyConnectedLayer.cc.

216{
217#if defined(__aarch64__) && defined(USE_NEON)
218 float output_activation_min = 0, output_activation_max = 0;
219 CalculateActivationRange(_activation, &output_activation_min, &output_activation_max);
220
223
224 nnfw::cker::FullyConnected16x1Float32(op_params, getShape(_input), getBuffer<float>(_input),
225 getShape(_weights), getBuffer<float>(_weights),
226 getShape(_bias), _bias ? getBuffer<float>(_bias) : nullptr,
228#else
229 throw std::runtime_error{"FullyConnected: Shuffled16x1Float32 weights_format is not supported."};
230#endif
231}
const T * getBuffer(const IPortableTensor *tensor)
nnfw::cker::FusedActivationFunctionType convertActivationType(const ir::Activation activation)
nnfw::cker::Shape getShape(const IPortableTensor *tensor)
void CalculateActivationRange(ir::Activation activation, T *activation_min, T *activation_max)
FusedActivationFunctionType activation
Definition Types.h:257

References _activation, _bias, _input, _output, _weights, nnfw::cker::FullyConnectedParams::activation, onert::util::CalculateActivationRange(), onert::backend::cpu::ops::convertActivationType(), and onert::backend::cpu::ops::getShape().

Referenced by run().

◆ fullyConnectedFloat32()

void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedFloat32 ( )

Definition at line 45 of file FullyConnectedLayer.cc.

46{
48 float output_activation_min = 0;
49 float output_activation_max = 0;
50 CalculateActivationRange(_activation, &output_activation_min, &output_activation_max);
51
53 op_params.float_activation_min = output_activation_min;
54 op_params.float_activation_max = output_activation_max;
55 // TODO Set both cachables as false when training
56 op_params.lhs_cacheable = _weights->is_constant();
57 op_params.rhs_cacheable = _input->is_constant();
58
59 nnfw::cker::FullyConnected(op_params, getShape(_input), getBuffer<float>(_input),
60 getShape(_weights), getBuffer<float>(_weights), getShape(_bias),
61 _bias ? getBuffer<float>(_bias) : nullptr, getShape(_output),
62 getBuffer<float>(_output));
63}
bool is_constant() const override final
Return true if the tensor is constant.
void FullyConnected(const FullyConnectedParams &params, const Shape &input_shape, const float *input_data, const Shape &weights_shape, const float *weights_data, const Shape &, const float *bias_data, const Shape &, float *output_data)

References _activation, _bias, _input, _output, _weights, nnfw::cker::FullyConnectedParams::activation, onert::util::CalculateActivationRange(), onert::backend::cpu::ops::convertActivationType(), nnfw::cker::FullyConnectedParams::float_activation_max, nnfw::cker::FullyConnectedParams::float_activation_min, nnfw::cker::FullyConnected(), onert::backend::cpu::ops::getShape(), onert::backend::IPortableTensor::is_constant(), nnfw::cker::FullyConnectedParams::lhs_cacheable, and nnfw::cker::FullyConnectedParams::rhs_cacheable.

Referenced by run().

◆ fullyConnectedGGMLWeight()

void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedGGMLWeight ( )

Definition at line 182 of file FullyConnectedLayer.cc.

183{
184 if (_bias)
185 throw std::runtime_error{"FullyConnected: GGML weights format does not support bias yet."};
186
187 // convert tensor
188 auto input = getGGMLTensor(_input);
189 auto weights = getGGMLTensor(_weights);
191 {
192 output.op = GGML_OP_MUL_MAT;
193 output.src[0] = &weights;
194 output.src[1] = &input;
195 }
196 auto *nodes = &output;
197
198 // create graph
199 struct ggml_cgraph graph;
200 {
201 memset(&graph, 0, sizeof(graph));
202 graph.n_nodes = 1;
203 graph.nodes = &nodes;
204 }
205
206 // get cplan
207 auto cplan = ggml_graph_plan(&graph, _external_context->maxNumThreads());
208 std::vector<uint8_t> buf(cplan.work_size);
209 cplan.work_data = buf.data();
210
211 // compute
212 ggml_graph_compute(&graph, &cplan);
213}
struct ggml_tensor getGGMLTensor(const IPortableTensor *tensor)
Definition GGMLHelper.cc:47

References _bias, _external_context, _input, _output, _weights, and onert::backend::cpu::ops::getGGMLTensor().

Referenced by run().

◆ fullyConnectedHybrid()

void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedHybrid ( )

Definition at line 94 of file FullyConnectedLayer.cc.

95{
97 if (!temp_arena.prepared)
98 {
100 }
101
104 op_params.weights_scale = _weights->data_scale();
105
106#ifndef USE_RUY_GEMV
108 op_params, getShape(_input), getBuffer<float>(_input), getShape(_weights),
109 getBuffer<int8_t>(_weights), getShape(_bias), _bias ? getBuffer<float>(_bias) : nullptr,
110 getShape(_output), getBuffer<float>(_output), temp_arena, _external_context->ruy_context());
111#else
113 op_params, getShape(_input), getBuffer<float>(_input), getShape(_weights),
114 (_cached_weights) ? reinterpret_cast<const int8_t *>(_cached_weights)
115 : getBuffer<int8_t>(_weights),
116 getShape(_bias), _bias ? getBuffer<float>(_bias) : nullptr, getShape(_output),
117 getBuffer<float>(_output), temp_arena, _external_context->ruy_context());
118
119 if (_cached_weights == nullptr || _is_weights_freed)
120 return;
121
122 // '_cached_weights is not nullptr and _is_weights_freed is false' means
123 // this weight shape is satisfied with the ruy kernel's prepack cache's condition.
124 // After entering here, it will not enter again except below the case - input is zero-vector
125
126 // if input's elements are filled with zero, it by-passes(does not enter ruy-kernel path)
127 // so that handle this case
128 const int input_size = getShape(_input).FlatSize();
129 if (nnfw::cker::IsZeroVector(getBuffer<float>(_input), input_size))
130 return;
131
132 auto weight_tensor = nnfw::misc::polymorphic_downcast<const Tensor *>(_weights);
133
134 // This weight tensor could be other ops' const tensor.
135 // Therefore, below reference should be checked like following
136 auto tensor = const_cast<Tensor *>(weight_tensor);
137 if (tensor->buffer() == nullptr) // ref is already 0?
138 {
139 _is_weights_freed = true;
140 return;
141 }
142
143 tensor->decrease_ref();
144 if (tensor->buffer() == nullptr) // ref == 0?
145 {
146#if defined(__ANDROID__) && (__ANDROID_API__ >= 26)
147 // NOTE This line forces OS to release any unused memory immediately
148 mallopt(M_PURGE, 0);
149#endif
150 _is_weights_freed = true;
151 }
152#endif
153}
void prepare(const Shape &input_shape, const Shape &weights_shape)
int FlatSize() const
Definition Shape.h:181
float data_scale() const override final
void FullyConnectedHybrid(const FullyConnectedParams &params, const Shape &input_shape, const float *input_data, const Shape &filter_shape, const int8_t *filter_data, const Shape &, const float *bias_data, const Shape &output_shape, float *output_data, FCTempArena &temp_arena, ruy::Context *ruy_context)
bool IsZeroVector(const float *vector, int v_size)

References _activation, _bias, _external_context, _input, _output, _temp_arena, _weights, nnfw::cker::FullyConnectedParams::activation, onert::backend::cpu::ops::convertActivationType(), onert::backend::IPortableTensor::data_scale(), nnfw::cker::Shape::FlatSize(), nnfw::cker::FullyConnectedHybrid(), onert::backend::cpu::ops::getShape(), nnfw::cker::IsZeroVector(), nnfw::cker::FCTempArena::prepare(), nnfw::cker::FCTempArena::prepared, and nnfw::cker::FullyConnectedParams::weights_scale.

Referenced by run().

◆ fullyConnectedQuant8()

void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedQuant8 ( )

Definition at line 67 of file FullyConnectedLayer.cc.

68{
69 double real_multiplier = 0.0;
70 int32_t output_multiplier = 0;
71 int32_t output_shift = 0;
72 int32_t output_activation_min = 0;
73 int32_t output_activation_max = 0;
75 QuantizeMultiplier(real_multiplier, &output_multiplier, &output_shift);
77 &output_activation_max);
78
80 op_params.input_offset = -_input->data_zero_point();
83 op_params.output_multiplier = output_multiplier;
84 op_params.output_shift = output_shift;
85 op_params.quantized_activation_min = output_activation_min;
86 op_params.quantized_activation_max = output_activation_max;
87
88 nnfw::cker::FullyConnected(op_params, getShape(_input), getBuffer<uint8_t>(_input),
89 getShape(_weights), getBuffer<uint8_t>(_weights), getShape(_bias),
90 _bias ? getBuffer<int32_t>(_bias) : nullptr, getShape(_output),
91 getBuffer<uint8_t>(_output));
92}
int32_t data_zero_point() const override final
void QuantizeMultiplier(double double_multiplier, int32_t *quantized_multiplier, int *shift)
void CalculateActivationRangeQuantized(ir::Activation activation, const IPortableTensor *output, int32_t *act_min, int32_t *act_max)
void GetQuantizedConvolutionMultiplier(const IPortableTensor *input, const IPortableTensor *filter, const IPortableTensor *bias, const IPortableTensor *output, double *multiplier)

References _activation, _bias, _input, _output, _weights, onert::backend::cpu::ops::CalculateActivationRangeQuantized(), onert::backend::IPortableTensor::data_zero_point(), nnfw::cker::FullyConnected(), onert::backend::cpu::ops::GetQuantizedConvolutionMultiplier(), onert::backend::cpu::ops::getShape(), nnfw::cker::FullyConnectedParams::input_offset, nnfw::cker::FullyConnectedParams::output_multiplier, nnfw::cker::FullyConnectedParams::output_offset, nnfw::cker::FullyConnectedParams::output_shift, nnfw::cker::FullyConnectedParams::quantized_activation_max, nnfw::cker::FullyConnectedParams::quantized_activation_min, onert::backend::cpu::ops::QuantizeMultiplier(), and nnfw::cker::FullyConnectedParams::weights_offset.

Referenced by run().

◆ fullyConnectedSparseWeight()

void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedSparseWeight ( )

Definition at line 155 of file FullyConnectedLayer.cc.

156{
159
160 const uint16_t *w1_segments = _weights->sparsity()->w1_segments();
161 const uint16_t *w1_indices = _weights->sparsity()->w1_indices();
162
163 auto block_size = _weights->sparsity()->block_size();
164 if (block_size.size() == 0)
165 {
167 op_params, getShape(_input), getBuffer<float>(_input), getShape(_weights),
168 getBuffer<float>(_weights), getShape(_bias), _bias ? getBuffer<float>(_bias) : nullptr,
169 getShape(_output), getBuffer<float>(_output), w1_segments, w1_indices);
170 }
171 else if (block_size.size() == 2 && block_size[0] == 16 && block_size[1] == 1)
172 {
174 op_params, getShape(_input), getBuffer<float>(_input), getShape(_weights),
175 getBuffer<float>(_weights), getShape(_bias), _bias ? getBuffer<float>(_bias) : nullptr,
176 getShape(_output), getBuffer<float>(_output), w1_segments, w1_indices);
177 }
178 else
179 throw std::runtime_error{"FullyConnected: unsupported sparsity"};
180}
const ir::Sparsity * sparsity() const
void FullyConnectedSparseWeightRandom(const FullyConnectedParams &params, const Shape &input_shape, const float *input_data, const Shape &weights_shape, const float *weights_data, const Shape &bias_shape, const float *bias_data, const Shape &output_shape, float *output_data, const uint16_t *w1_segments, const uint16_t *w1_indices)
void FullyConnectedSparseWeight16x1(const FullyConnectedParams &params, const Shape &input_shape, const float *input_data, const Shape &weights_shape, const float *weights_data, const Shape &bias_shape, const float *bias_data, const Shape &output_shape, float *output_data, const uint16_t *w1_segments, const uint16_t *w1_indices)
const std::vector< int32_t > & block_size() const
Returns block size which is used for block sparsity.
Definition Sparsity.h:53
const uint16_t * w1_segments() const
Returns segments array. See compressed sparse row format.
Definition Sparsity.h:45
const uint16_t * w1_indices() const
Returns indices array. See compressed sparse row format.
Definition Sparsity.h:49

References _activation, _bias, _input, _output, _weights, nnfw::cker::FullyConnectedParams::activation, onert::ir::Sparsity::block_size(), onert::backend::cpu::ops::convertActivationType(), nnfw::cker::FullyConnectedSparseWeight16x1(), nnfw::cker::FullyConnectedSparseWeightRandom(), onert::backend::cpu::ops::getShape(), onert::backend::IPortableTensor::sparsity(), onert::ir::Sparsity::w1_indices(), and onert::ir::Sparsity::w1_segments().

Referenced by run().

◆ prepare()

void onert::backend::cpu::ops::FullyConnectedLayer::prepare ( )
overridevirtual

Reimplemented from onert::exec::IFunction.

Definition at line 290 of file FullyConnectedLayer.cc.

291{
292 if (_bias && _bias->is_constant())
293 {
294 const int bias_size = getShape(_bias).FlatSize();
295 if (nnfw::cker::IsZeroVector(getBuffer<float>(_bias), bias_size))
296 {
297 _bias = nullptr;
298 }
299 }
300
301#if (defined(__ARM_NEON__) || defined(__ARM_NEON)) && defined(USE_RUY_GEMV)
302 // TODO This is workaround
303 // The only fc hybrid will use ruy kernel
304 if (_input->data_type() != OperandType::FLOAT32 ||
305 _weights->data_type() != OperandType::QUANT_INT8_SYMM)
306 {
307 return;
308 }
309
310 // NOTE. The condition to enable caching on ruy kernel can be changed according to ruy's version
311
312 // If input is dynamic, it changes total size of input
313 // If weights is not constant, weights cannot be cached
315 return;
316
317 const int rows = getShape(_weights).Dims(0);
318 if (rows % 4 == 0)
319 {
320 // TODO If it's possible to extract precaching from ruy kernel,
321 // place this instead of below code
322
323 // buffer will be used by ruy kernel as a cache key
324 _cached_weights = _weights->buffer();
325 }
326#endif
327}
int32_t Dims(int i) const
Definition Shape.h:92
bool is_dynamic() const override final
Return true if the tensor needs dynamic allocation, meaning that during compile-time the output shape...
virtual uint8_t * buffer() const =0

References _bias, _input, _weights, onert::backend::ITensor::buffer(), onert::backend::IPortableTensor::data_type(), nnfw::cker::Shape::Dims(), nnfw::cker::Shape::FlatSize(), onert::backend::cpu::ops::getShape(), onert::backend::IPortableTensor::is_constant(), onert::backend::IPortableTensor::is_dynamic(), and nnfw::cker::IsZeroVector().

◆ run()

void onert::backend::cpu::ops::FullyConnectedLayer::run ( )
overridevirtual

Implements onert::exec::IFunction.

Definition at line 261 of file FullyConnectedLayer.cc.

262{
263 if (_is_hybrid)
264 {
266 }
267 else if (_weights->sparsity())
268 {
270 }
271 else if (_weights->data_type() == OperandType::QUANT_GGML_Q4_0 ||
272 _weights->data_type() == OperandType::QUANT_GGML_Q8_0)
273 {
275 }
276 else if (_input->data_type() == OperandType::FLOAT32)
277 {
279 }
280 else if (_input->data_type() == OperandType::QUANT_UINT8_ASYMM)
281 {
283 }
284 else
285 {
286 throw std::runtime_error{"FullyConnected: unsupported data type"};
287 }
288}

References _input, _is_hybrid, _is_shuffled16x1float32, _weights, onert::backend::IPortableTensor::data_type(), fullyConnected16x1Float32(), fullyConnectedFloat32(), fullyConnectedGGMLWeight(), fullyConnectedHybrid(), fullyConnectedQuant8(), fullyConnectedSparseWeight(), and onert::backend::IPortableTensor::sparsity().

Referenced by onert::backend::train::ops::FullyConnectedLayer::forward(), and package.infer.session::inference().

Field Documentation

◆ _activation

ir::Activation onert::backend::cpu::ops::FullyConnectedLayer::_activation
protected

◆ _bias

◆ _external_context

std::shared_ptr<ExternalContext> onert::backend::cpu::ops::FullyConnectedLayer::_external_context
protected

Definition at line 80 of file FullyConnectedLayer.h.

Referenced by configure(), fullyConnectedGGMLWeight(), and fullyConnectedHybrid().

◆ _input

◆ _is_hybrid

bool onert::backend::cpu::ops::FullyConnectedLayer::_is_hybrid
protected

Definition at line 82 of file FullyConnectedLayer.h.

Referenced by configure(), and run().

◆ _is_shuffled16x1float32

bool onert::backend::cpu::ops::FullyConnectedLayer::_is_shuffled16x1float32
protected

Definition at line 83 of file FullyConnectedLayer.h.

Referenced by configure(), and run().

◆ _output

◆ _temp_arena

std::unique_ptr<nnfw::cker::FCTempArena> onert::backend::cpu::ops::FullyConnectedLayer::_temp_arena
protected

Definition at line 78 of file FullyConnectedLayer.h.

Referenced by fullyConnectedHybrid().

◆ _weights

const IPortableTensor* onert::backend::cpu::ops::FullyConnectedLayer::_weights
protected

The documentation for this class was generated from the following files: