ONE - On-device Neural Engine
FullyConnected.h
/*
 * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
 * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef __NNFW_CKER_FULLY_CONNECTED_H__
#define __NNFW_CKER_FULLY_CONNECTED_H__

#include <ruy/context.h>

#include "cker/Shape.h"
#include "cker/Types.h"
#include "cker/Utils.h"
#include "cker/TensorUtils.h"

namespace nnfw
{
namespace cker
{

class FCTempArena
{
public:
  FCTempArena(void) : prepared(false), input_quantized(), scaling_factors(), accum_scratch()
  {
    // DO NOTHING
  }

  void prepare(const Shape &input_shape, const Shape &weights_shape)
  {
    auto input_size = input_shape.FlatSize();
    input_quantized.resize(input_size);

    assert(weights_shape.DimensionsCount() == 2);
    int batch_size = input_size / weights_shape.Dims(1);
    scaling_factors.resize(batch_size);
    prepared = true;
  }

public:
  bool prepared;
  std::vector<int8_t> input_quantized;
  std::vector<float> scaling_factors;
  std::vector<int32_t> accum_scratch;
};
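
// Usage sketch (illustrative addition, not part of the original header): a typical caller
// prepares the arena once from the input/filter shapes and then reuses it across calls to
// FullyConnectedHybrid declared below. The shape and data variables are assumed to be
// provided by the caller.
//
//   FCTempArena arena;
//   if (!arena.prepared)
//   {
//     arena.prepare(input_shape, filter_shape);
//   }
//   FullyConnectedHybrid(params, input_shape, input_data, filter_shape, filter_data,
//                        bias_shape, bias_data, output_shape, output_data, arena,
//                        ruy_context);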

#if defined(CKER_X86_PLATFORM)

// From tensorflow/tensorflow/lite/kernels/internal/optimized/optimized_ops.h
inline void FullyConnected(const FullyConnectedParams &params, const Shape &input_shape,
                           const float *input_data, const Shape &weights_shape,
                           const float *weights_data, const Shape &,
                           const float *optional_bias_data, const Shape &output_shape,
                           float *output_data)
{
  const int dims_count = weights_shape.DimensionsCount();
  const int input_rows = weights_shape.Dims(dims_count - 1);
  MatrixParams<float> rhs_params;
  rhs_params.order = Order::kColMajor;
  rhs_params.rows = input_rows;
  rhs_params.cols = input_shape.FlatSize() / input_rows;
  rhs_params.cache_policy = optimized::DefaultCachePolicy(params.rhs_cacheable);

  MatrixParams<float> lhs_params;
  lhs_params.order = Order::kRowMajor;
  lhs_params.cols = weights_shape.Dims(dims_count - 1);
  lhs_params.rows = FlatSizeSkipDim(weights_shape, dims_count - 1);
  lhs_params.cache_policy = optimized::DefaultCachePolicy(params.lhs_cacheable);
  MatrixParams<float> dst_params;
  dst_params.order = Order::kColMajor;
  dst_params.rows = output_shape.Dims(output_shape.DimensionsCount() - 1);
  dst_params.cols = FlatSizeSkipDim(output_shape, output_shape.DimensionsCount() - 1);
  GemmParams<float, float> gemm_params;
  gemm_params.bias = optional_bias_data;
  gemm_params.clamp_min = params.float_activation_min;
  gemm_params.clamp_max = params.float_activation_max;
  optimized::Gemm(lhs_params, weights_data, rhs_params, input_data, dst_params, output_data,
                  gemm_params);
}
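
// Illustrative note (not part of the original source): the call above maps the fully
// connected layer onto a single GEMM. The weights act as the row-major LHS of shape
// [num_units x input_size], the flattened input acts as the column-major RHS of shape
// [input_size x batch], and the column-major destination holds [num_units x batch];
// bias addition and the float activation clamp are folded into gemm_params.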

#else // CKER_X86_PLATFORM

inline void FullyConnected(const FullyConnectedParams &params, const Shape &input_shape,
                           const float *input_data, const Shape &weights_shape,
                           const float *weights_data, const Shape &, const float *bias_data,
                           const Shape &, float *output_data)
{
  int total_input_size = input_shape.FlatSize();
  int input_size = weights_shape.Dims(1);
  const int batch_size = total_input_size / input_size;
  const int num_units = weights_shape.Dims(0);

  // Output = bias if bias tensor exists.
  if (bias_data)
  {
    VectorBatchVectorAssign(bias_data, num_units, batch_size, output_data);
  }
  else
  {
    ZeroVector(output_data, batch_size * num_units);
  }

  // Compute output += weight * input
  MatrixBatchVectorMultiplyAccumulate(weights_data, num_units, input_size, input_data, batch_size,
                                      output_data, /*result_stride=*/1);

  if (params.activation != FusedActivationFunctionType::kNone)
  {
    // Apply activation function
    ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data);
  }
}

#endif // CKER_X86_PLATFORM

inline void FullyConnected(const FullyConnectedParams &params,
                           [[maybe_unused]] const Shape &input_shape, const uint8_t *input_data,
                           const Shape &filter_shape, const uint8_t *filter_data,
                           [[maybe_unused]] const Shape &bias_shape, const int32_t *bias_data,
                           const Shape &output_shape, uint8_t *output_data)
{
  const int32_t input_offset = params.input_offset;
  const int32_t filter_offset = params.weights_offset;
  const int32_t output_offset = params.output_offset;
  const int32_t output_multiplier = params.output_multiplier;
  const int output_shift = params.output_shift;
  const int32_t output_activation_min = params.quantized_activation_min;
  const int32_t output_activation_max = params.quantized_activation_max;
  assert(filter_shape.DimensionsCount() >= 2);
  assert(output_shape.DimensionsCount() >= 1);

  assert(output_activation_min <= output_activation_max);
  // TODO(benoitjacob): This really should be:
  //     const int batches = ArraySize(output_dims, 1);
  // but the current --variable_batch hack consists in overwriting the 3rd
  // dimension with the runtime batch size, as we don't keep track for each
  // array of which dimension is the batch dimension in it.
  const int output_dim_count = output_shape.DimensionsCount();
  const int filter_dim_count = filter_shape.DimensionsCount();
  const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
  const int output_depth =
    MatchingDim(filter_shape, filter_dim_count - 2, output_shape, output_dim_count - 1);
  const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
  for (int b = 0; b < batches; ++b)
  {
    for (int out_c = 0; out_c < output_depth; ++out_c)
    {
      int32_t acc = 0;
      for (int d = 0; d < accum_depth; ++d)
      {
        int32_t input_val = input_data[b * accum_depth + d];
        int32_t filter_val = filter_data[out_c * accum_depth + d];
        acc += (filter_val + filter_offset) * (input_val + input_offset);
      }
      if (bias_data)
      {
        acc += bias_data[out_c];
      }
      acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
      acc += output_offset;
      acc = std::max(acc, output_activation_min);
      acc = std::min(acc, output_activation_max);
      output_data[out_c + output_depth * b] = static_cast<uint8_t>(acc);
    }
  }
}
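
// Illustrative note (not part of the original source): for each (batch b, output channel
// out_c) the loops above compute
//   acc = sum_d (filter[out_c, d] + filter_offset) * (input[b, d] + input_offset) + bias[out_c]
// and then requantize it with MultiplyByQuantizedMultiplier(acc, output_multiplier,
// output_shift), add output_offset, and clamp to the quantized activation range before
// storing the result as uint8_t.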

inline void FullyConnectedHybrid(const FullyConnectedParams &params, const Shape &input_shape,
                                 const float *input_data, const Shape &filter_shape,
                                 const int8_t *filter_data, const Shape &, const float *bias_data,
                                 [[maybe_unused]] const Shape &output_shape, float *output_data,
                                 FCTempArena &temp_arena,
                                 [[maybe_unused]] ruy::Context *ruy_context)
{
  int total_input_size = input_shape.FlatSize();
  const int input_size = filter_shape.Dims(1);
  const int batch_size = total_input_size / input_size;
  const int num_units = filter_shape.Dims(0);

  // Output = bias if bias tensor exists.
  if (bias_data)
  {
    VectorBatchVectorAssign(bias_data, num_units, batch_size, output_data);
  }
  else
  {
    ZeroVector(output_data, batch_size * num_units);
  }

  // Save matrix multiplication computation for all zero input.
  if (IsZeroVector(input_data, total_input_size))
  {
    ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data);
    return;
  }

  // Quantize input from float to int8 + quantization params (scaling factor).
  float unused_min, unused_max;
  float *scaling_factors_ptr = temp_arena.scaling_factors.data();
  int8_t *quant_data = temp_arena.input_quantized.data();

  // Quantize each batch independently.
  for (int b = 0; b < batch_size; ++b)
  {
    const int offset = b * input_size;
    SymmetricQuantizeFloats(input_data + offset, input_size, quant_data + offset, &unused_min,
                            &unused_max, &scaling_factors_ptr[b]);
    // Incorporate scaling of the filter.
    scaling_factors_ptr[b] *= params.weights_scale;
  }

// Compute output += weight * quantized_input
#ifdef USE_RUY_GEMV
  auto output_size = output_shape.FlatSize();
  temp_arena.accum_scratch.resize(output_size);
  int32_t *scratch = temp_arena.accum_scratch.data();
  MatrixBatchVectorMultiplyAccumulate(filter_data, num_units, input_size, quant_data,
                                      scaling_factors_ptr, batch_size, scratch, output_data,
                                      /*result_stride=*/1, ruy_context);
#else
  MatrixBatchVectorMultiplyAccumulate(filter_data, num_units, input_size, quant_data,
                                      scaling_factors_ptr, batch_size, output_data,
                                      /*result_stride=*/1);
#endif

  // Apply activation function to floats.
  if (params.activation != FusedActivationFunctionType::kNone)
  {
    // Apply activation function
    ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data);
  }
  return;
}
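
// Illustrative note (not part of the original source): the hybrid path quantizes each
// input row symmetrically to int8, folds params.weights_scale into the per-batch scaling
// factor, and lets MatrixBatchVectorMultiplyAccumulate rescale the int8 x int8
// accumulations back to float while adding them onto the bias-initialized output.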

inline void FullyConnectedSparseWeightRandom(
  const FullyConnectedParams &params, [[maybe_unused]] const Shape &input_shape,
  const float *input_data, const Shape &weights_shape, const float *weights_data,
  [[maybe_unused]] const Shape &bias_shape, const float *bias_data, const Shape &output_shape,
  float *output_data, const uint16_t *w1_segments, const uint16_t *w1_indices)
{

  assert(weights_shape.DimensionsCount() == 2);
  assert(output_shape.DimensionsCount() == 2);

  const int output_dims_count = output_shape.DimensionsCount();
  const int weights_dims_count = weights_shape.DimensionsCount();
  const int batches = FlatSizeSkipDim(output_shape, output_dims_count - 1);
  const int output_depth =
    MatchingDim(weights_shape, weights_dims_count - 2, output_shape, output_dims_count - 1);
  const int accum_depth = weights_shape.Dims(weights_dims_count - 1);

  if (bias_data)
  {
    VectorBatchVectorAssign(bias_data, output_depth, batches, output_data);
  }
  else
  {
    ZeroVector(output_data, batches * output_depth);
  }
  for (int b = 0; b < batches; ++b)
  {
    for (int idx_0 = 0; idx_0 < output_depth; ++idx_0)
    {
      for (int pw1 = w1_segments[idx_0]; pw1 < w1_segments[idx_0 + 1]; ++pw1)
      {
        int idx_1 = w1_indices[pw1];
        output_data[b * output_depth + idx_0] +=
          weights_data[pw1] * input_data[b * accum_depth + idx_1];
      }
    }
  }
  if (params.activation != FusedActivationFunctionType::kNone)
  {
    // Apply activation function
    ApplyActivationToVector(output_data, batches * output_depth, params.activation, output_data);
  }
}
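
// Illustrative note (not part of the original source): w1_segments/w1_indices follow a
// CSR-like layout over the weight rows. For output row idx_0, the nonzero weights are
// weights_data[w1_segments[idx_0] .. w1_segments[idx_0 + 1] - 1], and w1_indices[pw1]
// gives the input column each of those weights multiplies. For example, with
//   w1_segments = {0, 2, 3} and w1_indices = {1, 4, 0},
// output row 0 accumulates weights_data[0] * input[1] + weights_data[1] * input[4],
// and output row 1 accumulates weights_data[2] * input[0].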

} // namespace cker
} // namespace nnfw

#endif // __NNFW_CKER_FULLY_CONNECTED_H__