ONE - On-device Neural Engine
Loading...
Searching...
No Matches
onert::backend::cpu::ops::FullyConnectedLayer Class Reference

#include <FullyConnectedLayer.h>

Collaboration diagram for onert::backend::cpu::ops::FullyConnectedLayer:

Public Member Functions

 FullyConnectedLayer ()
 
 ~FullyConnectedLayer ()
 
void fullyConnectedFloat32 ()
 
void fullyConnectedQuant8 ()
 
void fullyConnectedHybrid ()
 
void fullyConnectedSparseWeight ()
 
void fullyConnectedGGMLWeight ()
 
void fullyConnected16x1Float32 ()
 
void configure (const IPortableTensor *input, const IPortableTensor *weights, const IPortableTensor *bias, ir::Activation activation, ir::FullyConnectedWeightsFormat weights_format, IPortableTensor *output, const std::shared_ptr< ExternalContext > &external_context)
 
void run () override
 
void prepare () override
 
- Public Member Functions inherited from onert::exec::IFunction
virtual ~IFunction ()=default
 

Protected Attributes

const IPortableTensor * _input
 
const IPortableTensor * _weights
 
const IPortableTensor * _bias
 
IPortableTensor * _output
 
ir::Activation _activation
 
std::unique_ptr< nnfw::cker::FCTempArena > _temp_arena
 
std::shared_ptr< ExternalContext > _external_context
 
bool _is_hybrid: 1
 
bool _is_shuffled16x1float32: 1
 

Detailed Description

Definition at line 43 of file FullyConnectedLayer.h.

Constructor & Destructor Documentation

◆ FullyConnectedLayer()

onert::backend::cpu::ops::FullyConnectedLayer::FullyConnectedLayer ( )

Definition at line 35 of file FullyConnectedLayer.cc.

◆ ~FullyConnectedLayer()

onert::backend::cpu::ops::FullyConnectedLayer::~FullyConnectedLayer ( )
default

Member Function Documentation

◆ configure()

void onert::backend::cpu::ops::FullyConnectedLayer::configure ( const IPortableTensor *  input,
const IPortableTensor *  weights,
const IPortableTensor *  bias,
ir::Activation  activation,
ir::FullyConnectedWeightsFormat  weights_format,
IPortableTensor *  output,
const std::shared_ptr< ExternalContext > &  external_context 
)

Definition at line 233 of file FullyConnectedLayer.cc.

238{
239 _input = input;
240 _weights = weights;
241 _bias = bias;
242 _activation = activation;
243 _output = output;
244 _is_hybrid = input->data_type() == OperandType::FLOAT32 &&
245 weights->data_type() == OperandType::QUANT_INT8_SYMM;
247#if !defined(__aarch64__) || !defined(USE_NEON)
249 {
250 throw std::runtime_error{
251 "FullyConnected: Shuffled16x1Float32 weights_format is not supported."};
252 }
253#endif
254 _external_context = external_context;
255
256 if (_weights->data_type() == OperandType::QUANT_GGML_Q4_0 ||
257 _weights->data_type() == OperandType::QUANT_GGML_Q8_0)
258 _external_context->initGgmlContext();
259}
ir::DataType data_type() const override final

References _activation, _bias, _external_context, _input, _is_hybrid, _is_shuffled16x1float32, _output, _weights, onert::backend::IPortableTensor::data_type(), and onert::ir::Shuffled16x1Float32.

◆ fullyConnected16x1Float32()

void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnected16x1Float32 ( )

Definition at line 215 of file FullyConnectedLayer.cc.

216{
217#if defined(__aarch64__) && defined(USE_NEON)
218 float output_activation_min = 0, output_activation_max = 0;
219 CalculateActivationRange(_activation, &output_activation_min, &output_activation_max);
220
223
224 nnfw::cker::FullyConnected16x1Float32(op_params, getShape(_input), getBuffer<float>(_input),
225 getShape(_weights), getBuffer<float>(_weights),
226 getShape(_bias), _bias ? getBuffer<float>(_bias) : nullptr,
228#else
229 throw std::runtime_error{"FullyConnected: Shuffled16x1Float32 weights_format is not supported."};
230#endif
231}
const T * getBuffer(const IPortableTensor *tensor)
nnfw::cker::FusedActivationFunctionType convertActivationType(const ir::Activation activation)
nnfw::cker::Shape getShape(const IPortableTensor *tensor)
void CalculateActivationRange(ir::Activation activation, T *activation_min, T *activation_max)
FusedActivationFunctionType activation
Definition Types.h:257

References _activation, _bias, _input, _output, _weights, nnfw::cker::FullyConnectedParams::activation, onert::util::CalculateActivationRange(), onert::backend::cpu::ops::convertActivationType(), and onert::backend::cpu::ops::getShape().

Referenced by run().

◆ fullyConnectedFloat32()

void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedFloat32 ( )

Definition at line 45 of file FullyConnectedLayer.cc.

46{
48 float output_activation_min = 0;
49 float output_activation_max = 0;
50 CalculateActivationRange(_activation, &output_activation_min, &output_activation_max);
51
53 op_params.float_activation_min = output_activation_min;
54 op_params.float_activation_max = output_activation_max;
55 // TODO Set both cachables as false when training
56 op_params.lhs_cacheable = _weights->is_constant();
57 op_params.rhs_cacheable = _input->is_constant();
58
59 nnfw::cker::FullyConnected(op_params, getShape(_input), getBuffer<float>(_input),
60 getShape(_weights), getBuffer<float>(_weights), getShape(_bias),
61 _bias ? getBuffer<float>(_bias) : nullptr, getShape(_output),
62 getBuffer<float>(_output));
63}
bool is_constant() const override final
Return true if the tensor is constant.
void FullyConnected(const FullyConnectedParams &params, const Shape &input_shape, const float *input_data, const Shape &weights_shape, const float *weights_data, const Shape &, const float *bias_data, const Shape &, float *output_data)

References _activation, _bias, _input, _output, _weights, nnfw::cker::FullyConnectedParams::activation, onert::util::CalculateActivationRange(), onert::backend::cpu::ops::convertActivationType(), nnfw::cker::FullyConnectedParams::float_activation_max, nnfw::cker::FullyConnectedParams::float_activation_min, nnfw::cker::FullyConnected(), onert::backend::cpu::ops::getShape(), onert::backend::IPortableTensor::is_constant(), nnfw::cker::FullyConnectedParams::lhs_cacheable, and nnfw::cker::FullyConnectedParams::rhs_cacheable.

Referenced by run().

◆ fullyConnectedGGMLWeight()

void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedGGMLWeight ( )

Definition at line 182 of file FullyConnectedLayer.cc.

183{
184 if (_bias)
185 throw std::runtime_error{"FullyConnected: GGML weights format does not support bias yet."};
186
187 // convert tensor
188 auto input = getGGMLTensor(_input);
189 auto weights = getGGMLTensor(_weights);
191 {
192 output.op = GGML_OP_MUL_MAT;
193 output.src[0] = &weights;
194 output.src[1] = &input;
195 }
196 auto *nodes = &output;
197
198 // create graph
199 struct ggml_cgraph graph;
200 {
201 memset(&graph, 0, sizeof(graph));
202 graph.n_nodes = 1;
203 graph.nodes = &nodes;
204 }
205
206 // get cplan
207 auto cplan = ggml_graph_plan(&graph, _external_context->maxNumThreads());
208 std::vector<uint8_t> buf(cplan.work_size);
209 cplan.work_data = buf.data();
210
211 // compute
212 ggml_graph_compute(&graph, &cplan);
213}
struct ggml_tensor getGGMLTensor(const IPortableTensor *tensor)
Definition GGMLHelper.cc:47

References _bias, _external_context, _input, _output, _weights, and onert::backend::cpu::ops::getGGMLTensor().

Referenced by run().

◆ fullyConnectedHybrid()

void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedHybrid ( )

Definition at line 94 of file FullyConnectedLayer.cc.

95{
97 if (!temp_arena.prepared)
98 {
100 }
101
104 op_params.weights_scale = _weights->data_scale();
105
106#ifndef USE_RUY_GEMV
108 op_params, getShape(_input), getBuffer<float>(_input), getShape(_weights),
109 getBuffer<int8_t>(_weights), getShape(_bias), _bias ? getBuffer<float>(_bias) : nullptr,
110 getShape(_output), getBuffer<float>(_output), temp_arena, _external_context->ruy_context());
111#else
113 op_params, getShape(_input), getBuffer<float>(_input), getShape(_weights),
114 (_cached_weights) ? reinterpret_cast<const int8_t *>(_cached_weights)
115 : getBuffer<int8_t>(_weights),
116 getShape(_bias), _bias ? getBuffer<float>(_bias) : nullptr, getShape(_output),
117 getBuffer<float>(_output), temp_arena, _external_context->ruy_context());
118
119 if (_cached_weights == nullptr || _is_weights_freed)
120 return;
121
122 // '_cached_weights is not nullptr and _is_weights_freed is false' means
123 // this weight shape is satisfied with the ruy kernel's prepack cache's condition.
124 // After entering here, it will not enter again except below the case - input is zero-vector
125
126 // if input's elements are filled with zero, it by-passes(does not enter ruy-kernel path)
127 // so that handle this case
128 const int input_size = getShape(_input).FlatSize();
129 if (nnfw::cker::IsZeroVector(getBuffer<float>(_input), input_size))
130 return;
131
132 auto weight_tensor = nnfw::misc::polymorphic_downcast<const Tensor *>(_weights);
133
134 // This weight tensor could be other ops' const tensor.
135 // Therefore, below reference should be checked like following
136 auto tensor = const_cast<Tensor *>(weight_tensor);
137 if (tensor->buffer() == nullptr) // ref is already 0?
138 {
139 _is_weights_freed = true;
140 return;
141 }
142
143 tensor->decrease_ref();
144 if (tensor->buffer() == nullptr) // ref == 0?
145 {
146#if defined(__ANDROID__) && (__ANDROID_API__ >= 26)
147 // NOTE This line forces OS to release any unused memory immediately
148 mallopt(M_PURGE, 0);
149#endif
150 _is_weights_freed = true;
151 }
152#endif
153}
void prepare(const Shape &input_shape, const Shape &weights_shape)
int FlatSize() const
Definition Shape.h:181
float data_scale() const override final
void FullyConnectedHybrid(const FullyConnectedParams &params, const Shape &input_shape, const float *input_data, const Shape &filter_shape, const int8_t *filter_data, const Shape &, const float *bias_data, const Shape &output_shape, float *output_data, FCTempArena &temp_arena, ruy::Context *ruy_context)
bool IsZeroVector(const float *vector, int v_size)

References _activation, _bias, _external_context, _input, _output, _temp_arena, _weights, nnfw::cker::FullyConnectedParams::activation, onert::backend::cpu::ops::convertActivationType(), onert::backend::IPortableTensor::data_scale(), nnfw::cker::Shape::FlatSize(), nnfw::cker::FullyConnectedHybrid(), onert::backend::cpu::ops::getShape(), nnfw::cker::IsZeroVector(), nnfw::cker::FCTempArena::prepare(), nnfw::cker::FCTempArena::prepared, and nnfw::cker::FullyConnectedParams::weights_scale.

Referenced by run().

◆ fullyConnectedQuant8()

void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedQuant8 ( )

Definition at line 67 of file FullyConnectedLayer.cc.

68{
69 double real_multiplier = 0.0;
70 int32_t output_multiplier = 0;
71 int32_t output_shift = 0;
72 int32_t output_activation_min = 0;
73 int32_t output_activation_max = 0;
75 QuantizeMultiplier(real_multiplier, &output_multiplier, &output_shift);
77 &output_activation_max);
78
80 op_params.input_offset = -_input->data_zero_point();
83 op_params.output_multiplier = output_multiplier;
84 op_params.output_shift = output_shift;
85 op_params.quantized_activation_min = output_activation_min;
86 op_params.quantized_activation_max = output_activation_max;
87
88 nnfw::cker::FullyConnected(op_params, getShape(_input), getBuffer<uint8_t>(_input),
89 getShape(_weights), getBuffer<uint8_t>(_weights), getShape(_bias),
90 _bias ? getBuffer<int32_t>(_bias) : nullptr, getShape(_output),
91 getBuffer<uint8_t>(_output));
92}
int32_t data_zero_point() const override final
void QuantizeMultiplier(double double_multiplier, int32_t *quantized_multiplier, int *shift)
void CalculateActivationRangeQuantized(ir::Activation activation, const IPortableTensor *output, int32_t *act_min, int32_t *act_max)
void GetQuantizedConvolutionMultiplier(const IPortableTensor *input, const IPortableTensor *filter, const IPortableTensor *bias, const IPortableTensor *output, double *multiplier)

References _activation, _bias, _input, _output, _weights, onert::backend::cpu::ops::CalculateActivationRangeQuantized(), onert::backend::IPortableTensor::data_zero_point(), nnfw::cker::FullyConnected(), onert::backend::cpu::ops::GetQuantizedConvolutionMultiplier(), onert::backend::cpu::ops::getShape(), nnfw::cker::FullyConnectedParams::input_offset, nnfw::cker::FullyConnectedParams::output_multiplier, nnfw::cker::FullyConnectedParams::output_offset, nnfw::cker::FullyConnectedParams::output_shift, nnfw::cker::FullyConnectedParams::quantized_activation_max, nnfw::cker::FullyConnectedParams::quantized_activation_min, onert::backend::cpu::ops::QuantizeMultiplier(), and nnfw::cker::FullyConnectedParams::weights_offset.

Referenced by run().

◆ fullyConnectedSparseWeight()

void onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedSparseWeight ( )

Definition at line 155 of file FullyConnectedLayer.cc.

156{
159
160 const uint16_t *w1_segments = _weights->sparsity()->w1_segments();
161 const uint16_t *w1_indices = _weights->sparsity()->w1_indices();
162
163 auto block_size = _weights->sparsity()->block_size();
164 if (block_size.size() == 0)
165 {
167 op_params, getShape(_input), getBuffer<float>(_input), getShape(_weights),
168 getBuffer<float>(_weights), getShape(_bias), _bias ? getBuffer<float>(_bias) : nullptr,
169 getShape(_output), getBuffer<float>(_output), w1_segments, w1_indices);
170 }
171 else if (block_size.size() == 2 && block_size[0] == 16 && block_size[1] == 1)
172 {
174 op_params, getShape(_input), getBuffer<float>(_input), getShape(_weights),
175 getBuffer<float>(_weights), getShape(_bias), _bias ? getBuffer<float>(_bias) : nullptr,
176 getShape(_output), getBuffer<float>(_output), w1_segments, w1_indices);
177 }
178 else
179 throw std::runtime_error{"FullyConnected: unsupported sparsity"};
180}
const ir::Sparsity * sparsity() const
void FullyConnectedSparseWeightRandom(const FullyConnectedParams &params, const Shape &input_shape, const float *input_data, const Shape &weights_shape, const float *weights_data, const Shape &bias_shape, const float *bias_data, const Shape &output_shape, float *output_data, const uint16_t *w1_segments, const uint16_t *w1_indices)
void FullyConnectedSparseWeight16x1(const FullyConnectedParams &params, const Shape &input_shape, const float *input_data, const Shape &weights_shape, const float *weights_data, const Shape &bias_shape, const float *bias_data, const Shape &output_shape, float *output_data, const uint16_t *w1_segments, const uint16_t *w1_indices)
const std::vector< int32_t > & block_size() const
Returns block size which is used for block sparsity.
Definition Sparsity.h:53
const uint16_t * w1_segments() const
Returns segments array. See compressed sparse row format.
Definition Sparsity.h:45
const uint16_t * w1_indices() const
Returns indices array. See compressed sparse row format.
Definition Sparsity.h:49

References _activation, _bias, _input, _output, _weights, nnfw::cker::FullyConnectedParams::activation, onert::ir::Sparsity::block_size(), onert::backend::cpu::ops::convertActivationType(), nnfw::cker::FullyConnectedSparseWeight16x1(), nnfw::cker::FullyConnectedSparseWeightRandom(), onert::backend::cpu::ops::getShape(), onert::backend::IPortableTensor::sparsity(), onert::ir::Sparsity::w1_indices(), and onert::ir::Sparsity::w1_segments().

Referenced by run().

◆ prepare()

void onert::backend::cpu::ops::FullyConnectedLayer::prepare ( )
overridevirtual

Reimplemented from onert::exec::IFunction.

Definition at line 290 of file FullyConnectedLayer.cc.

291{
292 if (_bias && _bias->is_constant())
293 {
294 const int bias_size = getShape(_bias).FlatSize();
295 if (nnfw::cker::IsZeroVector(getBuffer<float>(_bias), bias_size))
296 {
297 _bias = nullptr;
298 }
299 }
300
301#if (defined(__ARM_NEON__) || defined(__ARM_NEON)) && defined(USE_RUY_GEMV)
302 // TODO This is workaround
303 // The only fc hybrid will use ruy kernel
304 if (_input->data_type() != OperandType::FLOAT32 ||
305 _weights->data_type() != OperandType::QUANT_INT8_SYMM)
306 {
307 return;
308 }
309
310 // NOTE. The condition to enable caching on ruy kernel can be changed according to ruy's version
311
312 // If input is dynamic, it changes total size of input
313 // If weights is not constant, weights cannot be cached
315 return;
316
317 const int rows = getShape(_weights).Dims(0);
318 if (rows % 4 == 0)
319 {
320 // TODO If it's possible to extract precaching from ruy kernel,
321 // place this instead of below code
322
323 // buffer will be used by ruy kernel as a cache key
324 _cached_weights = _weights->buffer();
325 }
326#endif
327}
int32_t Dims(int i) const
Definition Shape.h:92
bool is_dynamic() const override final
Return true if the tensor needs dynamic allocation, meaning that during compile-time the output shape...
virtual uint8_t * buffer() const =0

References _bias, _input, _weights, onert::backend::ITensor::buffer(), onert::backend::IPortableTensor::data_type(), nnfw::cker::Shape::Dims(), nnfw::cker::Shape::FlatSize(), onert::backend::cpu::ops::getShape(), onert::backend::IPortableTensor::is_constant(), onert::backend::IPortableTensor::is_dynamic(), and nnfw::cker::IsZeroVector().

◆ run()

void onert::backend::cpu::ops::FullyConnectedLayer::run ( )
overridevirtual

Implements onert::exec::IFunction.

Definition at line 261 of file FullyConnectedLayer.cc.

262{
263 if (_is_hybrid)
264 {
266 }
267 else if (_weights->sparsity())
268 {
270 }
271 else if (_weights->data_type() == OperandType::QUANT_GGML_Q4_0 ||
272 _weights->data_type() == OperandType::QUANT_GGML_Q8_0)
273 {
275 }
276 else if (_input->data_type() == OperandType::FLOAT32)
277 {
279 }
280 else if (_input->data_type() == OperandType::QUANT_UINT8_ASYMM)
281 {
283 }
284 else
285 {
286 throw std::runtime_error{"FullyConnected: unsupported data type"};
287 }
288}

References _input, _is_hybrid, _is_shuffled16x1float32, _weights, onert::backend::IPortableTensor::data_type(), fullyConnected16x1Float32(), fullyConnectedFloat32(), fullyConnectedGGMLWeight(), fullyConnectedHybrid(), fullyConnectedQuant8(), fullyConnectedSparseWeight(), and onert::backend::IPortableTensor::sparsity().

Referenced by onert::backend::train::ops::FullyConnectedLayer::forward(), and package.infer.session::inference().

Field Documentation

◆ _activation

ir::Activation onert::backend::cpu::ops::FullyConnectedLayer::_activation
protected

◆ _bias

◆ _external_context

std::shared_ptr<ExternalContext> onert::backend::cpu::ops::FullyConnectedLayer::_external_context
protected

Definition at line 80 of file FullyConnectedLayer.h.

Referenced by configure(), fullyConnectedGGMLWeight(), and fullyConnectedHybrid().

◆ _input

◆ _is_hybrid

bool onert::backend::cpu::ops::FullyConnectedLayer::_is_hybrid
protected

Definition at line 82 of file FullyConnectedLayer.h.

Referenced by configure(), and run().

◆ _is_shuffled16x1float32

bool onert::backend::cpu::ops::FullyConnectedLayer::_is_shuffled16x1float32
protected

Definition at line 83 of file FullyConnectedLayer.h.

Referenced by configure(), and run().

◆ _output

◆ _temp_arena

std::unique_ptr<nnfw::cker::FCTempArena> onert::backend::cpu::ops::FullyConnectedLayer::_temp_arena
protected

Definition at line 78 of file FullyConnectedLayer.h.

Referenced by fullyConnectedHybrid().

◆ _weights

const IPortableTensor* onert::backend::cpu::ops::FullyConnectedLayer::_weights
protected

The documentation for this class was generated from the following files: