#ifndef __NNFW_CKER_FULLY_CONNECTED_H__
#define __NNFW_CKER_FULLY_CONNECTED_H__

#include <ruy/context.h>
// Inside FCTempArena::prepare(): derive scratch sizes from the input and weights shapes.
  auto input_size = input_shape.FlatSize();

  int batch_size = input_size / weights_shape.Dims(1);
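// A minimal standalone sketch of what these sizes imply for the hybrid path's scratch
// buffers (see the FCTempArena members listed at the end of this header): one int8 slot
// per input element and one scaling factor per batch row. Plain ints stand in for the
// cker Shape arguments; the struct and names are illustrative, not part of the library.
#include <cstdint>
#include <vector>

struct FCScratchSketch
{
  std::vector<int8_t> input_quantized;
  std::vector<float> scaling_factors;

  void prepare(int input_flat_size, int weights_cols /* weights_shape.Dims(1) */)
  {
    input_quantized.resize(input_flat_size);                // one int8 per input element
    scaling_factors.resize(input_flat_size / weights_cols); // one scale per batch row
  }
};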
#if defined(CKER_X86_PLATFORM)

// FullyConnected (float, x86 path): map the weights, input and output onto GEMM operands
// and hand off to the optimized GEMM with the bias fused in.
inline void FullyConnected(/* ... */ const float *input_data, const Shape &weights_shape,
                           const float *weights_data, const Shape &, /* ... */)

  const int input_rows = weights_shape.Dims(dims_count - 1);

  rhs_params.rows = input_rows;

  lhs_params.cols = weights_shape.Dims(dims_count - 1);

  gemm_params.bias = optional_bias_data;

  optimized::Gemm(lhs_params, weights_data, rhs_params, input_data, dst_params, output_data,
                  gemm_params);
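// What the optimized::Gemm call above computes, written as plain loops (a sketch, not the
// cker implementation): the weights act as the left-hand matrix with one row per output
// unit, each input batch row is a right-hand column, and the bias is fused into the
// epilogue. The function and parameter names here are illustrative.
inline void GemmFullyConnectedSketch(const float *weights, // num_units * input_depth, row-major
                                     const float *input,   // batches * input_depth
                                     const float *bias,    // num_units, may be nullptr
                                     int num_units, int input_depth, int batches,
                                     float *output)        // batches * num_units
{
  for (int b = 0; b < batches; ++b)
  {
    for (int u = 0; u < num_units; ++u)
    {
      float acc = bias ? bias[u] : 0.0f;
      for (int d = 0; d < input_depth; ++d)
        acc += weights[u * input_depth + d] * input[b * input_depth + d];
      output[b * num_units + u] = acc;
    }
  }
}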
// FullyConnected (float, reference path).
inline void FullyConnected(const FullyConnectedParams &params, const Shape &input_shape,
                           const float *input_data, const Shape &weights_shape,
                           const float *weights_data, const Shape &, const float *bias_data,
                           const Shape &, float *output_data)

  int total_input_size = input_shape.FlatSize();
  int input_size = weights_shape.Dims(1);
  const int batch_size = total_input_size / input_size;
  const int num_units = weights_shape.Dims(0);

  ZeroVector(output_data, batch_size * num_units);
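// Judging from the helper declarations listed at the end of this header, the rest of this
// reference body plausibly continues as sketched below (hedged; these statements are not
// part of the excerpt, and a float overload of MatrixBatchVectorMultiplyAccumulate
// analogous to the int8 one declared below is assumed):
//
//   if (bias_data)
//     VectorBatchVectorAssign(bias_data, num_units, batch_size, output_data); // output = bias
//   MatrixBatchVectorMultiplyAccumulate(weights_data, num_units, input_size, input_data,
//                                       batch_size, output_data, /*result_stride=*/1);
//   ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data);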
// FullyConnected (quantized uint8): accumulate in int32 with zero-point offsets applied,
// requantize, add the output offset and clamp to the quantized activation range.
inline void FullyConnected(/* ... */ [[maybe_unused]] const Shape &input_shape,
                           const uint8_t *input_data, const Shape &filter_shape,
                           const uint8_t *filter_data, [[maybe_unused]] const Shape &bias_shape,
                           const int32_t *bias_data, /* ... */)

  assert(output_activation_min <= output_activation_max);

  const int output_dim_count = output_shape.DimensionsCount();
  const int filter_dim_count = filter_shape.DimensionsCount();
  const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
  const int output_depth =
    MatchingDim(filter_shape, filter_dim_count - 2, output_shape, output_dim_count - 1);
  const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
  for (int b = 0; b < batches; ++b)
  {
    for (int out_c = 0; out_c < output_depth; ++out_c)
    {
      int32_t acc = 0;
      for (int d = 0; d < accum_depth; ++d)
      {
        int32_t input_val = input_data[b * accum_depth + d];
        int32_t filter_val = filter_data[out_c * accum_depth + d];
        acc += (filter_val + filter_offset) * (input_val + input_offset);
      }
      if (bias_data)
      {
        acc += bias_data[out_c];
      }
      acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
      acc += output_offset;
      acc = std::max(acc, output_activation_min);
      acc = std::min(acc, output_activation_max);
      output_data[out_c + output_depth * b] = static_cast<uint8_t>(acc);
    }
  }
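// A standalone numeric walk-through of the requantization step above (illustrative only;
// the scales, offsets and accumulator value are made up, and plain float math stands in
// for the fixed-point output_multiplier/output_shift pair used by the kernel).
#include <algorithm>
#include <cmath>
#include <cstdint>

inline bool RequantizeWalkthroughSketch()
{
  const float input_scale = 0.5f, filter_scale = 0.25f, output_scale = 1.0f;
  const int32_t output_offset = 3;
  const int32_t output_activation_min = 0, output_activation_max = 255;

  const int32_t acc = 40; // int32 accumulation from the inner loop, bias already added
  const float real_multiplier = input_scale * filter_scale / output_scale;   // 0.125
  int32_t scaled = static_cast<int32_t>(std::lround(acc * real_multiplier)); // 5
  scaled += output_offset;                                                   // 8
  scaled = std::max(scaled, output_activation_min);
  scaled = std::min(scaled, output_activation_max);
  return static_cast<uint8_t>(scaled) == 8; // quantized output value
}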
// FullyConnectedHybrid: float activations with symmetrically quantized int8 weights; each
// input row is quantized on the fly, accumulated in integers and scaled back to float.
inline void FullyConnectedHybrid(const FullyConnectedParams &params, const Shape &input_shape,
                                 const float *input_data, const Shape &filter_shape,
                                 const int8_t *filter_data, const Shape &, const float *bias_data,
                                 const Shape &output_shape, float *output_data,
                                 FCTempArena &temp_arena, [[maybe_unused]] ruy::Context *ruy_context)

  int total_input_size = input_shape.FlatSize();
  const int input_size = filter_shape.Dims(1);
  const int batch_size = total_input_size / input_size;
  const int num_units = filter_shape.Dims(0);

  ZeroVector(output_data, batch_size * num_units);

  // Quantize each input batch row to int8, recording one scaling factor per row.
  float unused_min, unused_max;
  float *scaling_factors_ptr = temp_arena.scaling_factors.data();
  int8_t *quant_data = temp_arena.input_quantized.data();
  for (int b = 0; b < batch_size; ++b)
  {
    const int offset = b * input_size;
    SymmetricQuantizeFloats(input_data + offset, input_size, quant_data + offset, &unused_min,
                            &unused_max, &scaling_factors_ptr[b]);
  }

  // Compute output += filter * quantized_input; one build variant accumulates through the
  // temp arena's int32 scratch buffer, the other accumulates directly into output_data.
  int32_t *scratch = temp_arena.accum_scratch.data();
  MatrixBatchVectorMultiplyAccumulate(filter_data, num_units, input_size, quant_data,
                                      scaling_factors_ptr, batch_size, scratch, output_data,
                                      /* ... */);
  MatrixBatchVectorMultiplyAccumulate(filter_data, num_units, input_size, quant_data,
                                      scaling_factors_ptr, batch_size, output_data,
                                      /*result_stride=*/1);
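// A standalone sketch of the hybrid scheme used above (not the cker kernels): one input
// row is symmetrically quantized to int8 with its own scaling factor, the dot product is
// done in integers, and the result is scaled back to float; all values are made up.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

inline bool HybridDotProductSketch()
{
  const std::vector<float> input = {0.5f, -1.0f, 2.0f};
  const std::vector<int8_t> weights = {10, -20, 30}; // already symmetric int8
  const float weights_scale = 0.05f;                 // made-up filter scale

  // Symmetric quantization of the input row: scale = max(|x|) / 127.
  float max_abs = 0.0f;
  for (float v : input)
    max_abs = std::max(max_abs, std::fabs(v));
  const float input_scale = max_abs / 127.0f;
  std::vector<int8_t> quantized(input.size());
  for (size_t i = 0; i < input.size(); ++i)
    quantized[i] = static_cast<int8_t>(std::lround(input[i] / input_scale));

  // Integer dot product, then scale back to float; the kernel above carries one such
  // per-row factor in scaling_factors_ptr.
  int32_t acc = 0;
  for (size_t i = 0; i < quantized.size(); ++i)
    acc += weights[i] * quantized[i];
  const float result = acc * input_scale * weights_scale;

  // Compare against the plain float dot product.
  float reference = 0.0f;
  for (size_t i = 0; i < input.size(); ++i)
    reference += input[i] * weights[i] * weights_scale;
  return std::fabs(result - reference) < 0.05f;
}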
// FullyConnectedSparseWeightRandom: the weight matrix is stored sparsely; w1_segments gives
// the range of stored non-zeros for each output row and w1_indices their input columns.
inline void FullyConnectedSparseWeightRandom(const FullyConnectedParams &params,
                                             const Shape &input_shape, const float *input_data,
                                             const Shape &weights_shape, const float *weights_data,
                                             const Shape &bias_shape, const float *bias_data,
                                             const Shape &output_shape, float *output_data,
                                             const uint16_t *w1_segments,
                                             const uint16_t *w1_indices)

  const int output_dims_count = output_shape.DimensionsCount();
  const int weights_dims_count = weights_shape.DimensionsCount();
  const int batches = FlatSizeSkipDim(output_shape, output_dims_count - 1);
  const int output_depth =
    MatchingDim(weights_shape, weights_dims_count - 2, output_shape, output_dims_count - 1);
  const int accum_depth = weights_shape.Dims(weights_dims_count - 1);

  ZeroVector(output_data, batches * output_depth);

  for (int b = 0; b < batches; ++b)
  {
    for (int idx_0 = 0; idx_0 < output_depth; ++idx_0)
    {
      for (int pw1 = w1_segments[idx_0]; pw1 < w1_segments[idx_0 + 1]; ++pw1)
      {
        int idx_1 = w1_indices[pw1];
        output_data[b * output_depth + idx_0] +=
          weights_data[pw1] * input_data[b * accum_depth + idx_1];
      }
    }
  }
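// A standalone sketch of the segment/index layout consumed above: w1_segments holds one
// start offset per output row plus an end sentinel, and w1_indices holds the input column
// of each stored non-zero weight; the 2x4 matrix below is made up for illustration.
#include <cstdint>
#include <vector>

inline bool SparseLayoutSketch()
{
  // Dense view of the weights:  row 0: {0, 1.5, 0, -2.0}   row 1: {3.0, 0, 0, 0}
  const std::vector<float> weights_data = {1.5f, -2.0f, 3.0f};
  const std::vector<uint16_t> w1_segments = {0, 2, 3}; // row 0 -> [0, 2), row 1 -> [2, 3)
  const std::vector<uint16_t> w1_indices = {1, 3, 0};  // input column of each non-zero

  const std::vector<float> input = {1.0f, 2.0f, 3.0f, 4.0f}; // one batch, accum_depth = 4
  std::vector<float> output(2, 0.0f);
  for (int row = 0; row < 2; ++row)
    for (int p = w1_segments[row]; p < w1_segments[row + 1]; ++p)
      output[row] += weights_data[p] * input[w1_indices[p]];

  // Expected: output[0] = 1.5 * 2 + (-2) * 4 = -5,  output[1] = 3 * 1 = 3.
  return output[0] == -5.0f && output[1] == 3.0f;
}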
// Referenced members and helper declarations:
std::vector<int8_t> input_quantized
std::vector<float> scaling_factors
std::vector<int32_t> accum_scratch
void prepare(const Shape &input_shape, const Shape &weights_shape)
int32_t DimensionsCount() const
int32_t Dims(int i) const
void ZeroVector(float *vector, int v_size)
int MatchingDim(const Shape &shape1, int index1, const Shape &shape2, int index2)
void FullyConnectedSparseWeightRandom(const FullyConnectedParams &params, const Shape &input_shape, const float *input_data, const Shape &weights_shape, const float *weights_data, const Shape &bias_shape, const float *bias_data, const Shape &output_shape, float *output_data, const uint16_t *w1_segments, const uint16_t *w1_indices)
void MatrixBatchVectorMultiplyAccumulate(const int8_t *matrix, const int m_rows, const int m_cols, const int8_t *vector, const float *scaling_factors, int n_batch, float *result, int result_stride)
int FlatSizeSkipDim(const Shape &shape, int skip_dim)
void ApplyActivationToVector(const float *vector, int v_size, FusedActivationFunctionType activation, float *result)
void FullyConnectedHybrid(const FullyConnectedParams &params, const Shape &input_shape, const float *input_data, const Shape &filter_shape, const int8_t *filter_data, const Shape &, const float *bias_data, const Shape &output_shape, float *output_data, FCTempArena &temp_arena, ruy::Context *ruy_context)
void FullyConnected(const FullyConnectedParams &params, const Shape &input_shape, const float *input_data, const Shape &weights_shape, const float *weights_data, const Shape &, const float *bias_data, const Shape &, float *output_data)
void SymmetricQuantizeFloats(const float *values, const int size, int8_t *quantized_values, float *min, float *max, float *scaling_factor)
void VectorBatchVectorAssign(const float *vector, int v_size, int n_batch, float *batch_vector)
int32_t MultiplyByQuantizedMultiplier(int32_t x, int32_t quantized_multiplier, int shift)
bool IsZeroVector(const float *vector, int v_size)
int32_t output_multiplier
FusedActivationFunctionType activation
float float_activation_max
int32_t quantized_activation_min
float float_activation_min
int32_t quantized_activation_max