ONE/cpu_2ops_2_fully_connected_layer_8cc_source.html

/*

 * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved

 *

 * Licensed under the Apache License, Version 2.0 (the "License");

 * you may not use this file except in compliance with the License.

 * You may obtain a copy of the License at

 *

 *      http://www.apache.org/licenses/LICENSE-2.0

 *

 * Unless required by applicable law or agreed to in writing, software

 * distributed under the License is distributed on an "AS IS" BASIS,

 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 * See the License for the specific language governing permissions and

 * limitations under the License.

 */


#include "FullyConnectedLayer.h"


#include "GGMLHelper.h"


#include "../Tensor.h"

#include <cker/operation/FullyConnected.h>

#include <cker/TensorUtils.h>

#include <misc/polymorphic_downcast.h>


namespace onert::backend::cpu::ops

{


FullyConnectedLayer::FullyConnectedLayer()

  : _input(nullptr), _weights(nullptr), _bias(nullptr), _output(nullptr),

    _activation(ir::Activation::NONE), _temp_arena(new nnfw::cker::FCTempArena()),

    _external_context(nullptr), _is_hybrid(false), _is_shuffled16x1float32(false)

{

  // DO NOTHING

}

FullyConnectedLayer::FullyConnectedLayer() {…}


FullyConnectedLayer::~FullyConnectedLayer() = default;


/**
 * @brief Run the float32 fully-connected kernel on the configured tensors.
 *
 * Fills the kernel parameters with the fused activation and its clamp range,
 * marks each operand as cacheable when it is constant, and forwards input,
 * weights, optional bias and output to nnfw::cker::FullyConnected.
 * A null bias is passed to the kernel as a null data pointer.
 */
void FullyConnectedLayer::fullyConnectedFloat32()
{
  float act_min = 0;
  float act_max = 0;
  CalculateActivationRange(_activation, &act_min, &act_max);

  nnfw::cker::FullyConnectedParams op_params;
  op_params.activation = convertActivationType(_activation);
  op_params.float_activation_min = act_min;
  op_params.float_activation_max = act_max;
  // TODO Set both cachables as false when training
  op_params.lhs_cacheable = _weights->is_constant();
  op_params.rhs_cacheable = _input->is_constant();

  const float *bias_data = _bias ? getBuffer<float>(_bias) : nullptr;

  nnfw::cker::FullyConnected(op_params, getShape(_input), getBuffer<float>(_input),
                             getShape(_weights), getBuffer<float>(_weights), getShape(_bias),
                             bias_data, getShape(_output), getBuffer<float>(_output));
}

void FullyConnectedLayer::fullyConnectedFloat32() {…}


// executionMutex is used to protect concurrent access of non-threadsafe resources

// like gemmlowp::GemmContext.


/**
 * @brief Run the uint8 quantized fully-connected kernel.
 *
 * Derives the fixed-point output multiplier/shift from the tensors'
 * quantization parameters, computes the quantized activation clamp range, and
 * hands everything to nnfw::cker::FullyConnected. Input and weights offsets are
 * the negated zero points; the output offset is the output zero point as-is.
 * The bias, when present, is read as int32.
 */
void FullyConnectedLayer::fullyConnectedQuant8()
{
  // Real-valued rescale factor, then its fixed-point (multiplier, shift) form.
  double real_multiplier = 0.0;
  GetQuantizedConvolutionMultiplier(_input, _weights, _bias, _output, &real_multiplier);

  int32_t output_multiplier = 0;
  int32_t output_shift = 0;
  QuantizeMultiplier(real_multiplier, &output_multiplier, &output_shift);

  int32_t act_min = 0;
  int32_t act_max = 0;
  CalculateActivationRangeQuantized(_activation, _output, &act_min, &act_max);

  nnfw::cker::FullyConnectedParams op_params;
  op_params.input_offset = -_input->data_zero_point();
  op_params.weights_offset = -_weights->data_zero_point();
  op_params.output_offset = _output->data_zero_point();
  op_params.output_multiplier = output_multiplier;
  op_params.output_shift = output_shift;
  op_params.quantized_activation_min = act_min;
  op_params.quantized_activation_max = act_max;

  const int32_t *bias_data = _bias ? getBuffer<int32_t>(_bias) : nullptr;

  nnfw::cker::FullyConnected(op_params, getShape(_input), getBuffer<uint8_t>(_input),
                             getShape(_weights), getBuffer<uint8_t>(_weights), getShape(_bias),
                             bias_data, getShape(_output), getBuffer<uint8_t>(_output));
}

void FullyConnectedLayer::fullyConnectedQuant8() {…}


void FullyConnectedLayer::fullyConnectedHybrid()

{

  nnfw::cker::FCTempArena &temp_arena = *_temp_arena;

  if (!temp_arena.prepared)

  {

    temp_arena.prepare(getShape(_input), getShape(_weights));

  }


  nnfw::cker::FullyConnectedParams op_params;

  op_params.activation = convertActivationType(_activation);

  op_params.weights_scale = _weights->data_scale();


#ifndef USE_RUY_GEMV

  nnfw::cker::FullyConnectedHybrid(

    op_params, getShape(_input), getBuffer<float>(_input), getShape(_weights),

    getBuffer<int8_t>(_weights), getShape(_bias), _bias ? getBuffer<float>(_bias) : nullptr,

    getShape(_output), getBuffer<float>(_output), temp_arena, _external_context->ruy_context());

#else

  nnfw::cker::FullyConnectedHybrid(

    op_params, getShape(_input), getBuffer<float>(_input), getShape(_weights),

    (_cached_weights) ? reinterpret_cast<const int8_t *>(_cached_weights)

                      : getBuffer<int8_t>(_weights),

    getShape(_bias), _bias ? getBuffer<float>(_bias) : nullptr, getShape(_output),

    getBuffer<float>(_output), temp_arena, _external_context->ruy_context());


  if (_cached_weights == nullptr || _is_weights_freed)

    return;


  // '_cached_weights is not nullptr and _is_weights_freed is false' means

  // this weight shape is satisfied with the ruy kernel's prepack cache's condition.

  // After entering here, it will not enter again except below the case - input is zero-vector


  // if input's elements are filled with zero, it by-passes(does not enter ruy-kernel path)

  // so that handle this case

  const int input_size = getShape(_input).FlatSize();

  if (nnfw::cker::IsZeroVector(getBuffer<float>(_input), input_size))

    return;


  auto weight_tensor = nnfw::misc::polymorphic_downcast<const Tensor *>(_weights);


  // This weight tensor could be other ops' const tensor.

  // Therefore, below reference should be checked like following

  auto tensor = const_cast<Tensor *>(weight_tensor);

  if (tensor->buffer() == nullptr) // ref is already 0?

  {

    _is_weights_freed = true;

    return;

  }


  tensor->decrease_ref();

  if (tensor->buffer() == nullptr) // ref == 0?

  {

#if defined(__ANDROID__) && (__ANDROID_API__ >= 26)

    // NOTE This line forces OS to release any unused memory immediately

    mallopt(M_PURGE, 0);

#endif

    _is_weights_freed = true;

  }

#endif

}

void FullyConnectedLayer::fullyConnectedHybrid() {…}


/**
 * @brief Run a float32 fully-connected kernel over sparse weights.
 *
 * Two layouts are handled: random sparsity (no block size recorded) and
 * 16x1 block sparsity. The segment and index arrays of the weights' sparsity
 * descriptor are forwarded to the chosen kernel.
 *
 * The weights are expected to carry a sparsity descriptor; run() only
 * dispatches here when `_weights->sparsity()` is non-null.
 *
 * @throws std::runtime_error if the block size is neither empty nor {16, 1}.
 */
void FullyConnectedLayer::fullyConnectedSparseWeight()
{
  nnfw::cker::FullyConnectedParams op_params;
  op_params.activation = convertActivationType(_activation);

  const auto *sparsity = _weights->sparsity();
  const uint16_t *w1_segments = sparsity->w1_segments();
  const uint16_t *w1_indices = sparsity->w1_indices();

  // block_size() hands back a const reference; binding to it avoids copying
  // the vector on every inference.
  const auto &block_size = sparsity->block_size();

  if (block_size.empty())
  {
    nnfw::cker::FullyConnectedSparseWeightRandom(
      op_params, getShape(_input), getBuffer<float>(_input), getShape(_weights),
      getBuffer<float>(_weights), getShape(_bias), _bias ? getBuffer<float>(_bias) : nullptr,
      getShape(_output), getBuffer<float>(_output), w1_segments, w1_indices);
  }
  else if (block_size.size() == 2 && block_size[0] == 16 && block_size[1] == 1)
  {
    nnfw::cker::FullyConnectedSparseWeight16x1(
      op_params, getShape(_input), getBuffer<float>(_input), getShape(_weights),
      getBuffer<float>(_weights), getShape(_bias), _bias ? getBuffer<float>(_bias) : nullptr,
      getShape(_output), getBuffer<float>(_output), w1_segments, w1_indices);
  }
  else
  {
    throw std::runtime_error{"FullyConnected: unsupported sparsity"};
  }
}

void FullyConnectedLayer::fullyConnectedSparseWeight() {…}


void FullyConnectedLayer::fullyConnectedGGMLWeight()

{

  if (_bias)

    throw std::runtime_error{"FullyConnected: GGML weights format does not support bias yet."};


  // convert tensor

  auto input = getGGMLTensor(_input);

  auto weights = getGGMLTensor(_weights);

  auto output = getGGMLTensor(_output);

  {

    output.op = GGML_OP_MUL_MAT;

    output.src[0] = &weights;

    output.src[1] = &input;

  }

  auto *nodes = &output;


  // create graph

  struct ggml_cgraph graph;

  {

    memset(&graph, 0, sizeof(graph));

    graph.n_nodes = 1;

    graph.nodes = &nodes;

  }


  // get cplan

  auto cplan = ggml_graph_plan(&graph, _external_context->maxNumThreads());

  std::vector<uint8_t> buf(cplan.work_size);

  cplan.work_data = buf.data();


  // compute

  ggml_graph_compute(&graph, &cplan);

}

void FullyConnectedLayer::fullyConnectedGGMLWeight() {…}


/**
 * @brief Run the float32 kernel for weights stored in the Shuffled16x1Float32 format.
 *
 * Only available when built for aarch64 with USE_NEON; on any other build the
 * function throws.
 *
 * @throws std::runtime_error on builds without aarch64 NEON support.
 */
void FullyConnectedLayer::fullyConnected16x1Float32()
{
#if defined(__aarch64__) && defined(USE_NEON)
  float output_activation_min = 0;
  float output_activation_max = 0;
  CalculateActivationRange(_activation, &output_activation_min, &output_activation_max);

  nnfw::cker::FullyConnectedParams op_params;
  op_params.activation = convertActivationType(_activation);
  // Hand the clamp range to the kernel parameters, as fullyConnectedFloat32()
  // does, instead of computing it and leaving the fields unset.
  op_params.float_activation_min = output_activation_min;
  op_params.float_activation_max = output_activation_max;

  nnfw::cker::FullyConnected16x1Float32(op_params, getShape(_input), getBuffer<float>(_input),
                                        getShape(_weights), getBuffer<float>(_weights),
                                        getShape(_bias), _bias ? getBuffer<float>(_bias) : nullptr,
                                        getShape(_output), getBuffer<float>(_output));
#else
  throw std::runtime_error{"FullyConnected: Shuffled16x1Float32 weights_format is not supported."};
#endif
}

void FullyConnectedLayer::fullyConnected16x1Float32() {…}


/**
 * @brief Bind the operands and options the layer will run with.
 *
 * @param input            Input tensor.
 * @param weights          Weights tensor.
 * @param bias             Bias tensor; the kernels treat null as "no bias".
 * @param activation       Fused activation applied by the kernel.
 * @param weights_format   Storage format of the weights.
 * @param output           Output tensor.
 * @param external_context Shared context providing the ruy/GGML resources.
 *
 * The hybrid path is selected when the input is FLOAT32 and the weights are
 * QUANT_INT8_SYMM. For GGML-quantized weights (Q4_0 / Q8_0) the GGML context
 * of @p external_context is initialized here.
 *
 * @throws std::runtime_error if the Shuffled16x1Float32 weights format is
 *         requested on a build without aarch64 NEON support.
 */
void FullyConnectedLayer::configure(const IPortableTensor *input, const IPortableTensor *weights,
                                    const IPortableTensor *bias, ir::Activation activation,
                                    ir::FullyConnectedWeightsFormat weights_format,
                                    IPortableTensor *output,
                                    const std::shared_ptr<ExternalContext> &external_context)
{
  _input = input;
  _weights = weights;
  _bias = bias;
  _activation = activation;
  _output = output;

  _is_hybrid = (input->data_type() == OperandType::FLOAT32) &&
               (weights->data_type() == OperandType::QUANT_INT8_SYMM);
  _is_shuffled16x1float32 =
    (weights_format == ir::FullyConnectedWeightsFormat::Shuffled16x1Float32);

#if !(defined(__aarch64__) && defined(USE_NEON))
  // Reject the shuffled format up front on builds that lack its kernel.
  if (_is_shuffled16x1float32)
    throw std::runtime_error{
      "FullyConnected: Shuffled16x1Float32 weights_format is not supported."};
#endif

  _external_context = external_context;

  const auto weights_type = _weights->data_type();
  const bool has_ggml_weights = weights_type == OperandType::QUANT_GGML_Q4_0 ||
                                weights_type == OperandType::QUANT_GGML_Q8_0;
  if (has_ggml_weights)
    _external_context->initGgmlContext();
}

void FullyConnectedLayer::configure(const IPortableTensor *input, const IPortableTensor *weights, {…}


/**
 * @brief Execute the layer with the kernel matching the configured tensors.
 *
 * Checked in this order: hybrid, sparse weights, GGML-quantized weights,
 * float32 input (shuffled 16x1 or plain), uint8 asymmetric-quantized input.
 *
 * @throws std::runtime_error if none of the supported combinations applies.
 */
void FullyConnectedLayer::run()
{
  if (_is_hybrid)
  {
    fullyConnectedHybrid();
    return;
  }

  if (_weights->sparsity())
  {
    fullyConnectedSparseWeight();
    return;
  }

  const auto weights_type = _weights->data_type();
  if (weights_type == OperandType::QUANT_GGML_Q4_0 || weights_type == OperandType::QUANT_GGML_Q8_0)
  {
    fullyConnectedGGMLWeight();
    return;
  }

  // From here on the input element type alone decides the kernel.
  switch (_input->data_type())
  {
    case OperandType::FLOAT32:
      if (_is_shuffled16x1float32)
        fullyConnected16x1Float32();
      else
        fullyConnectedFloat32();
      break;

    case OperandType::QUANT_UINT8_ASYMM:
      fullyConnectedQuant8();
      break;

    default:
      throw std::runtime_error{"FullyConnected: unsupported data type"};
  }
}

void FullyConnectedLayer::run() {…}


void FullyConnectedLayer::prepare()

{

  if (_bias && _bias->is_constant())

  {

    const int bias_size = getShape(_bias).FlatSize();

    if (nnfw::cker::IsZeroVector(getBuffer<float>(_bias), bias_size))

    {

      _bias = nullptr;

    }

  }


#if (defined(__ARM_NEON__) || defined(__ARM_NEON)) && defined(USE_RUY_GEMV)

  // TODO This is workaround

  // The only fc hybrid will use ruy kernel

  if (_input->data_type() != OperandType::FLOAT32 ||

      _weights->data_type() != OperandType::QUANT_INT8_SYMM)

  {

    return;

  }


  // NOTE. The condition to enable caching on ruy kernel can be changed according to ruy's version


  // If input is dynamic, it changes total size of input

  // If weights is not constant, weights cannot be cached

  if (_input->is_dynamic() || !_weights->is_constant())

    return;


  const int rows = getShape(_weights).Dims(0);

  if (rows % 4 == 0)

  {

    // TODO If it's possible to extract precaching from ruy kernel,

    // place this instead of below code


    // buffer will be used by ruy kernel as a cache key

    _cached_weights = _weights->buffer();

  }

#endif

}

void FullyConnectedLayer::prepare() {…}


} // namespace onert::backend::cpu::ops

FusedActivationFunc::NONE
@ NONE

GGMLHelper.h

nnfw::cker::FCTempArena
Definition FullyConnected.h:37

nnfw::cker::FCTempArena::prepared
bool prepared
Definition FullyConnected.h:56

nnfw::cker::FCTempArena::prepare
void prepare(const Shape &input_shape, const Shape &weights_shape)
Definition FullyConnected.h:44

nnfw::cker::Shape::Dims
int32_t Dims(int i) const
Definition Shape.h:110

nnfw::cker::Shape::FlatSize
int FlatSize() const
Definition Shape.h:249

onert::backend::IPortableTensor
A tensor class that is portable for other backends.
Definition IPortableTensor.h:37

onert::backend::IPortableTensor::sparsity
const ir::Sparsity * sparsity() const
Definition IPortableTensor.h:49

onert::backend::IPortableTensor::data_scale
float data_scale() const override final
Definition IPortableTensor.h:55

onert::backend::IPortableTensor::data_zero_point
int32_t data_zero_point() const override final
Definition IPortableTensor.h:56

onert::backend::IPortableTensor::data_type
ir::DataType data_type() const override final
Definition IPortableTensor.h:54

onert::backend::IPortableTensor::is_dynamic
bool is_dynamic() const override final
Return true if the tensor needs dynamic allocation, meaning that during compile-time the output shape...
Definition IPortableTensor.h:63

onert::backend::IPortableTensor::is_constant
bool is_constant() const override final
Return true if the tensor is constant.
Definition IPortableTensor.h:62

onert::backend::ITensor::buffer
virtual uint8_t * buffer() const =0

onert::backend::basic::Tensor
Definition Tensor.h:33

onert::backend::cpu::ops::FullyConnectedLayer::_activation
ir::Activation _activation
Definition FullyConnectedLayer.h:68

onert::backend::cpu::ops::FullyConnectedLayer::fullyConnected16x1Float32
void fullyConnected16x1Float32()
Definition FullyConnectedLayer.cc:209

onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedSparseWeight
void fullyConnectedSparseWeight()
Definition FullyConnectedLayer.cc:149

onert::backend::cpu::ops::FullyConnectedLayer::_weights
const IPortableTensor * _weights
Definition FullyConnectedLayer.h:64

onert::backend::cpu::ops::FullyConnectedLayer::_bias
const IPortableTensor * _bias
Definition FullyConnectedLayer.h:65

onert::backend::cpu::ops::FullyConnectedLayer::_is_shuffled16x1float32
bool _is_shuffled16x1float32
Definition FullyConnectedLayer.h:74

onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedFloat32
void fullyConnectedFloat32()
Definition FullyConnectedLayer.cc:39

onert::backend::cpu::ops::FullyConnectedLayer::_output
IPortableTensor * _output
Definition FullyConnectedLayer.h:66

onert::backend::cpu::ops::FullyConnectedLayer::~FullyConnectedLayer
~FullyConnectedLayer()

onert::backend::cpu::ops::FullyConnectedLayer::_temp_arena
std::unique_ptr< nnfw::cker::FCTempArena > _temp_arena
Definition FullyConnectedLayer.h:69

onert::backend::cpu::ops::FullyConnectedLayer::_is_hybrid
bool _is_hybrid
Definition FullyConnectedLayer.h:73

onert::backend::cpu::ops::FullyConnectedLayer::_external_context
std::shared_ptr< ExternalContext > _external_context
Definition FullyConnectedLayer.h:71

onert::backend::cpu::ops::FullyConnectedLayer::prepare
void prepare() override
Definition FullyConnectedLayer.cc:284

onert::backend::cpu::ops::FullyConnectedLayer::FullyConnectedLayer
FullyConnectedLayer()
Definition FullyConnectedLayer.cc:29

onert::backend::cpu::ops::FullyConnectedLayer::_input
const IPortableTensor * _input
Definition FullyConnectedLayer.h:63

onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedHybrid
void fullyConnectedHybrid()
Definition FullyConnectedLayer.cc:88

onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedGGMLWeight
void fullyConnectedGGMLWeight()
Definition FullyConnectedLayer.cc:176

onert::backend::cpu::ops::FullyConnectedLayer::configure
void configure(const IPortableTensor *input, const IPortableTensor *weights, const IPortableTensor *bias, ir::Activation activation, ir::FullyConnectedWeightsFormat weights_format, IPortableTensor *output, const std::shared_ptr< ExternalContext > &external_context)
Definition FullyConnectedLayer.cc:227

onert::backend::cpu::ops::FullyConnectedLayer::fullyConnectedQuant8
void fullyConnectedQuant8()
Definition FullyConnectedLayer.cc:61

onert::backend::cpu::ops::FullyConnectedLayer::run
void run() override
Definition FullyConnectedLayer.cc:255

nnfw::cker::FullyConnectedSparseWeightRandom
void FullyConnectedSparseWeightRandom(const FullyConnectedParams &params, const Shape &input_shape, const float *input_data, const Shape &weights_shape, const float *weights_data, const Shape &bias_shape, const float *bias_data, const Shape &output_shape, float *output_data, const uint16_t *w1_segments, const uint16_t *w1_indices)
Definition FullyConnected.h:250

nnfw::cker::FullyConnectedSparseWeight16x1
void FullyConnectedSparseWeight16x1(const FullyConnectedParams &params, const Shape &input_shape, const float *input_data, const Shape &weights_shape, const float *weights_data, const Shape &bias_shape, const float *bias_data, const Shape &output_shape, float *output_data, const uint16_t *w1_segments, const uint16_t *w1_indices)
Definition FullyConnectedSparse16x1.h:57

nnfw::cker::FullyConnectedHybrid
void FullyConnectedHybrid(const FullyConnectedParams &params, const Shape &input_shape, const float *input_data, const Shape &filter_shape, const int8_t *filter_data, const Shape &, const float *bias_data, const Shape &output_shape, float *output_data, FCTempArena &temp_arena, ruy::Context *ruy_context)
Definition FullyConnected.h:183

nnfw::cker::FullyConnected
void FullyConnected(const FullyConnectedParams &params, const Shape &input_shape, const float *input_data, const Shape &weights_shape, const float *weights_data, const Shape &, const float *bias_data, const Shape &, float *output_data)
Definition FullyConnected.h:98

nnfw::cker::IsZeroVector
bool IsZeroVector(const float *vector, int v_size)
Definition TensorUtils.h:104

nnfw
Definition topk_v2.h:30

onert::backend::cpu::ops
Definition AddNLayer.cc:25

onert::backend::cpu::ops::convertActivationType
nnfw::cker::FusedActivationFunctionType convertActivationType(const ir::Activation activation)
Definition OperationUtils.h:106

onert::backend::cpu::ops::getShape
nnfw::cker::Shape getShape(const IPortableTensor *tensor)
Definition OperationUtils.h:89

onert::backend::cpu::ops::getGGMLTensor
struct ggml_tensor getGGMLTensor(const IPortableTensor *tensor)
Definition GGMLHelper.cc:41

onert::backend::cpu::ops::QuantizeMultiplier
void QuantizeMultiplier(double double_multiplier, int32_t *quantized_multiplier, int *shift)
Definition OperationUtils.cc:56

onert::backend::cpu::ops::CalculateActivationRangeQuantized
void CalculateActivationRangeQuantized(ir::Activation activation, const IPortableTensor *output, int32_t *act_min, int32_t *act_max)
Definition OperationUtils.cc:138

onert::backend::cpu::ops::GetQuantizedConvolutionMultiplier
void GetQuantizedConvolutionMultiplier(const IPortableTensor *input, const IPortableTensor *filter, const IPortableTensor *bias, const IPortableTensor *output, double *multiplier)
Definition OperationUtils.cc:77

onert::ir::FullyConnectedWeightsFormat
FullyConnectedWeightsFormat
Definition InternalType.h:48

onert::ir::FullyConnectedWeightsFormat::Shuffled16x1Float32
@ Shuffled16x1Float32

onert::ir::Activation
Activation
Definition InternalType.h:26

onert::util::CalculateActivationRange
void CalculateActivationRange(ir::Activation activation, T *activation_min, T *activation_max)
Definition CalculateActivationRange.h:28

polymorphic_downcast.h

TensorUtils.h

FullyConnected.h

nnfw::cker::FullyConnectedParams
Definition Types.h:256

nnfw::cker::FullyConnectedParams::output_multiplier
int32_t output_multiplier
Definition Types.h:264

nnfw::cker::FullyConnectedParams::rhs_cacheable
bool rhs_cacheable
Definition Types.h:274

nnfw::cker::FullyConnectedParams::lhs_cacheable
bool lhs_cacheable
Definition Types.h:273

nnfw::cker::FullyConnectedParams::activation
FusedActivationFunctionType activation
Definition Types.h:257

nnfw::cker::FullyConnectedParams::float_activation_max
float float_activation_max
Definition Types.h:271

nnfw::cker::FullyConnectedParams::weights_offset
int32_t weights_offset
Definition Types.h:261

nnfw::cker::FullyConnectedParams::quantized_activation_min
int32_t quantized_activation_min
Definition Types.h:267

nnfw::cker::FullyConnectedParams::float_activation_min
float float_activation_min
Definition Types.h:270

nnfw::cker::FullyConnectedParams::quantized_activation_max
int32_t quantized_activation_max
Definition Types.h:268

nnfw::cker::FullyConnectedParams::input_offset
int32_t input_offset
Definition Types.h:260

nnfw::cker::FullyConnectedParams::output_shift
int output_shift
Definition Types.h:265

nnfw::cker::FullyConnectedParams::output_offset
int32_t output_offset
Definition Types.h:263

nnfw::cker::FullyConnectedParams::weights_scale
float weights_scale
Definition Types.h:262

onert::ir::Sparsity::block_size
const std::vector< int32_t > & block_size() const
Returns block size which is used for block sparsity.
Definition Sparsity.h:51

onert::ir::Sparsity::w1_segments
const uint16_t * w1_segments() const
Returns segments array. See compressed sparse row format.
Definition Sparsity.h:43

onert::ir::Sparsity::w1_indices
const uint16_t * w1_indices() const
Returns indices array. See compressed sparse row format.
Definition Sparsity.h:47

FullyConnectedLayer.h