// NOTE(review): visible fragment of a float -> int8 quantization kernel
// (Arm NEON path). The function name/header, opening brace, the loop-index
// initialization (`i`), the rounding float->int32 conversion that defines
// casted_val_0/casted_val_1, and several braces are elided from this chunk
// -- confirm against the full file before relying on them.
51 int8_t *output_data,
const float scale,
const int32_t zero_point)
// Saturation bounds: full int8 range, widened to int32 for vector math.
54 static constexpr int32_t min_val = std::numeric_limits<int8_t>::min();
55 static constexpr int32_t max_val = std::numeric_limits<int8_t>::max();
// Hoist loop invariants into vector registers: 1/scale (per-element
// multiply instead of divide), the zero point, and the clamp bounds.
59 const float32x4_t reverse_scale_dup = vdupq_n_f32(1.0f / scale);
60 const int32x4_t zero_point_dup = vdupq_n_s32(zero_point);
61 const int32x4_t min_val_dup = vdupq_n_s32(min_val);
62 const int32x4_t max_val_dup = vdupq_n_s32(max_val);
// Main vector loop: 8 floats per iteration as two 4-lane registers.
64 for (; i <= flat_size - 8; i += 8)
66 const float *src_data_ptr = input_data + i;
67 float32x4_t input_val_0 = vld1q_f32(src_data_ptr);
68 float32x4_t input_val_1 = vld1q_f32(src_data_ptr + 4);
// value / scale computed as value * (1/scale).
70 input_val_0 = vmulq_f32(input_val_0, reverse_scale_dup);
71 input_val_1 = vmulq_f32(input_val_1, reverse_scale_dup);
// casted_val_0/1 come from a float->int32 rounding step on lines elided
// from this chunk; here the quantization zero point is added...
76 casted_val_0 = vaddq_s32(casted_val_0, zero_point_dup);
77 casted_val_1 = vaddq_s32(casted_val_1, zero_point_dup);
// ...then the values are clamped to [min_val, max_val] so the
// non-saturating narrows below cannot wrap.
80 casted_val_0 = vmaxq_s32(casted_val_0, min_val_dup);
81 casted_val_1 = vmaxq_s32(casted_val_1, min_val_dup);
82 casted_val_0 = vminq_s32(casted_val_0, max_val_dup);
83 casted_val_1 = vminq_s32(casted_val_1, max_val_dup);
// Narrow int32 -> int16 -> int8. Plain vmovn (non-saturating) is safe
// only because the values were already clamped to int8 range above.
85 const int16x4_t narrowed_val_0 = vmovn_s32(casted_val_0);
86 const int16x4_t narrowed_val_1 = vmovn_s32(casted_val_1);
87 const int16x8_t combined_val = vcombine_s16(narrowed_val_0, narrowed_val_1);
88 const int8x8_t combined_val_narrowed = vmovn_s16(combined_val);
89 vst1_s8(output_data + i, combined_val_narrowed);
// Scalar tail for the remaining (< 8) elements; mirrors the vector path:
// round(value / scale) + zero_point, clamped to int8 range.
93 for (; i < flat_size; ++i)
95 const float val = input_data[i];
96 const int32_t unclamped =
static_cast<int32_t
>(round(val / scale)) + zero_point;
97 const int32_t clamped = std::min(std::max(unclamped, min_val), max_val);
98 output_data[i] = clamped;
// NOTE(review): visible fragment of a float -> uint8 quantization kernel
// (Arm NEON path). Same structure as the int8 variant above; the function
// header, loop-index initialization, the rounding float->int32 conversion
// defining casted_val_0/1, and braces are elided from this chunk.
104 uint8_t *output_data,
const float scale,
const int32_t zero_point)
// Saturation bounds: full uint8 range [0, 255], held as int32.
107 static constexpr int32_t min_val = std::numeric_limits<uint8_t>::min();
108 static constexpr int32_t max_val = std::numeric_limits<uint8_t>::max();
// Loop invariants broadcast once: reciprocal scale, zero point, bounds.
112 const float32x4_t reverse_scale_dup = vdupq_n_f32(1.0f / scale);
113 const int32x4_t zero_point_dup = vdupq_n_s32(zero_point);
114 const int32x4_t min_val_dup = vdupq_n_s32(min_val);
115 const int32x4_t max_val_dup = vdupq_n_s32(max_val);
// Vector loop: 8 floats per iteration.
117 for (; i <= flat_size - 8; i += 8)
119 const float *src_data_ptr = input_data + i;
120 float32x4_t input_val_0 = vld1q_f32(src_data_ptr);
121 float32x4_t input_val_1 = vld1q_f32(src_data_ptr + 4);
// value / scale computed as value * (1/scale).
123 input_val_0 = vmulq_f32(input_val_0, reverse_scale_dup);
124 input_val_1 = vmulq_f32(input_val_1, reverse_scale_dup);
// casted_val_0/1 defined by an elided rounding conversion; add zero point.
129 casted_val_0 = vaddq_s32(casted_val_0, zero_point_dup);
130 casted_val_1 = vaddq_s32(casted_val_1, zero_point_dup);
// Clamp to [0, 255] while still in signed 32-bit lanes.
133 casted_val_0 = vmaxq_s32(casted_val_0, min_val_dup);
134 casted_val_1 = vmaxq_s32(casted_val_1, min_val_dup);
135 casted_val_0 = vminq_s32(casted_val_0, max_val_dup);
136 casted_val_1 = vminq_s32(casted_val_1, max_val_dup);
// vqmovun: saturating signed->unsigned narrow (int32 -> uint16); the
// final uint16 -> uint8 narrow is non-saturating, which is safe because
// the values are already within [0, 255].
138 const uint16x4_t narrowed_val_0 = vqmovun_s32(casted_val_0);
139 const uint16x4_t narrowed_val_1 = vqmovun_s32(casted_val_1);
140 const uint16x8_t combined_val = vcombine_u16(narrowed_val_0, narrowed_val_1);
141 const uint8x8_t combined_val_narrowed = vmovn_u16(combined_val);
142 vst1_u8(output_data + i, combined_val_narrowed);
// Scalar tail: round(value / scale) + zero_point, clamped to uint8 range.
146 for (; i < flat_size; ++i)
148 const float val = input_data[i];
149 const int32_t unclamped =
static_cast<int32_t
>(round(val / scale)) + zero_point;
150 const int32_t clamped = std::min(std::max(unclamped, min_val), max_val);
151 output_data[i] = clamped;
// NOTE(review): visible fragment of a float -> int16 quantization kernel
// (Arm NEON path). Same structure as the int8/uint8 variants; the header,
// loop-index initialization, the rounding float->int32 conversion that
// defines casted_val_0/1, and braces are elided from this chunk.
157 int16_t *output_data,
const float scale,
const int32_t zero_point)
// Saturation bounds: full int16 range, widened to int32.
160 static constexpr int32_t min_val = std::numeric_limits<int16_t>::min();
161 static constexpr int32_t max_val = std::numeric_limits<int16_t>::max();
// Broadcast loop invariants once.
165 const float32x4_t reverse_scale_dup = vdupq_n_f32(1.0f / scale);
166 const int32x4_t zero_point_dup = vdupq_n_s32(zero_point);
167 const int32x4_t min_val_dup = vdupq_n_s32(min_val);
168 const int32x4_t max_val_dup = vdupq_n_s32(max_val);
// Vector loop: 8 floats per iteration.
170 for (; i <= flat_size - 8; i += 8)
172 const float *src_data_ptr = input_data + i;
173 float32x4_t input_val_0 = vld1q_f32(src_data_ptr);
174 float32x4_t input_val_1 = vld1q_f32(src_data_ptr + 4);
// value / scale computed as value * (1/scale).
176 input_val_0 = vmulq_f32(input_val_0, reverse_scale_dup);
177 input_val_1 = vmulq_f32(input_val_1, reverse_scale_dup);
// casted_val_0/1 defined by an elided rounding conversion; add zero point.
182 casted_val_0 = vaddq_s32(casted_val_0, zero_point_dup);
183 casted_val_1 = vaddq_s32(casted_val_1, zero_point_dup);
// Clamp to int16 range before the non-saturating narrow below.
186 casted_val_0 = vmaxq_s32(casted_val_0, min_val_dup);
187 casted_val_1 = vmaxq_s32(casted_val_1, min_val_dup);
188 casted_val_0 = vminq_s32(casted_val_0, max_val_dup);
189 casted_val_1 = vminq_s32(casted_val_1, max_val_dup);
// Single narrowing stage (int32 -> int16) and two 4-lane stores; safe
// because values were clamped to int16 range above.
191 const int16x4_t narrowed_val_0 = vmovn_s32(casted_val_0);
192 const int16x4_t narrowed_val_1 = vmovn_s32(casted_val_1);
193 vst1_s16(output_data + i, narrowed_val_0);
194 vst1_s16(output_data + i + 4, narrowed_val_1);
// Scalar tail: round(value / scale) + zero_point, clamped to int16 range.
198 for (; i < flat_size; ++i)
200 const float val = input_data[i];
201 const int32_t unclamped =
static_cast<int32_t
>(round(val / scale)) + zero_point;
202 const int32_t clamped = std::min(std::max(unclamped, min_val), max_val);
203 output_data[i] = clamped;
// NOTE(review): per-channel requantization of int32 accumulators
// (`scratch`, laid out rows x channel_size) to int8 `output`, using a
// per-channel fixed-point `multiplier` and power-of-two `shift`. Several
// interior lines (loop-index init for `c`, braces, the multiplier
// application in the scalar fallback) are elided from this chunk --
// confirm against the full file.
207inline void Quantize(
const int32_t *multiplier,
const int32_t *shift, int32_t channel_size,
208 int32_t total_size, int32_t output_zp, int32_t output_min, int32_t output_max,
209 int32_t *scratch, int8_t *output)
// Broadcast loop invariants: output zero point and activation clamp range.
222 const int32x4_t output_offset_vec = vdupq_n_s32(output_zp)
223 const int32x4_t output_activation_min_vec = vdupq_n_s32(output_min);
224 const int32x4_t output_activation_max_vec = vdupq_n_s32(output_max);
225 const int32x4_t zeros = vdupq_n_s32(0);
// scratch must be an exact rows x channel_size grid.
228 assert(total_size % channel_size == 0);
229 const int32_t rows = total_size / channel_size;
234 using gemmlowp::RoundingDivideByPOT;
// Vector loop over channels, 8 at a time; per-channel constants are
// loaded once and reused across all rows.
235 for (; c <= channel_size - 8; c += 8)
237 int32x4_t out_shift_1 = vld1q_s32(shift + c);
238 int32x4_t out_shift_2 = vld1q_s32(shift + c + 4);
// Split signed shift into a non-negative left shift (shift > 0) and a
// non-positive right shift (shift < 0); exactly one is nonzero per lane.
239 int32x4_t left_shift_1 = vmaxq_s32(out_shift_1, zeros);
240 int32x4_t left_shift_2 = vmaxq_s32(out_shift_2, zeros);
243 int32x4_t right_shift_1 = vminq_s32(out_shift_1, zeros);
244 int32x4_t right_shift_2 = vminq_s32(out_shift_2, zeros);
246 int32x4_t out_mul_1 = vld1q_s32(multiplier + c);
247 int32x4_t out_mul_2 = vld1q_s32(multiplier + c + 4);
248 for (
int n = 0; n < rows; ++n)
250 int loc = n * channel_size + c;
251 int32x4_t acc_1 = vld1q_s32(scratch + loc);
252 int32x4_t acc_2 = vld1q_s32(scratch + loc + 4);
// Apply the left shift, then the saturating rounding doubling high
// multiply (fixed-point multiply by the per-channel multiplier).
255 acc_1 = vshlq_s32(acc_1, left_shift_1);
256 acc_1 = vqrdmulhq_s32(acc_1, out_mul_1);
257 acc_2 = vshlq_s32(acc_2, left_shift_2);
258 acc_2 = vqrdmulhq_s32(acc_2, out_mul_2);
// Rounding shift right (vrshlq with a negative shift count rounds).
261 acc_1 = vrshlq_s32(acc_1, right_shift_1);
262 acc_2 = vrshlq_s32(acc_2, right_shift_2);
// Add the output zero point...
265 acc_1 = vaddq_s32(acc_1, output_offset_vec);
266 acc_2 = vaddq_s32(acc_2, output_offset_vec);
// ...and clamp to the activation range.
269 acc_1 = vmaxq_s32(acc_1, output_activation_min_vec);
270 acc_1 = vminq_s32(acc_1, output_activation_max_vec);
271 acc_2 = vmaxq_s32(acc_2, output_activation_min_vec);
272 acc_2 = vminq_s32(acc_2, output_activation_max_vec);
// Saturating narrow int32 -> int16 -> int8, then store 8 outputs.
275 const int16x4_t acc_s16_1 = vqmovn_s32(acc_1);
276 const int16x4_t acc_s16_2 = vqmovn_s32(acc_2);
277 const int16x8_t res_s16 = vcombine_s16(acc_s16_1, acc_s16_2);
278 const int8x8_t res_s8 = vqmovn_s16(res_s16);
279 vst1_s8(output + loc, res_s8);
// Scalar fallback for the remaining (< 8) channels. NOTE(review): the
// multiplier/shift application for `acc` sits on lines elided from this
// chunk (between the load and the clamp) -- verify in the full file.
285 for (; c < channel_size; c++)
287 for (
int n = 0; n < rows; ++n)
289 int loc = n * channel_size + c;
290 int32_t acc = scratch[loc];
293 acc = std::max(acc, output_min);
294 acc = std::min(acc, output_max);
295 output[loc] =
static_cast<int8_t
>(acc);
// NOTE(review): visible fragment of a uint8 -> int8 requantization kernel
// (NEON path). The function name/header, loop-index init, and the
// declarations of the int32x4x4_t `input`/`result` registers are elided
// from this chunk -- confirm against the full file.
312 int32_t effective_scale_multiplier,
313 int32_t effective_scale_shift, int32_t input_zeropoint,
314 int32_t output_zeropoint, int8_t *output_data)
// Output saturation bounds: full int8 range as int32.
316 static constexpr int32_t kMinOutput = std::numeric_limits<int8_t>::min();
317 static constexpr int32_t kMaxOutput = std::numeric_limits<int8_t>::max();
// Broadcast the NEGATED input zero point so it can be applied with a
// single vaddq below; also the output zero point and clamp bounds.
322 const int32x4_t input_zero_point_dup = vdupq_n_s32(-input_zeropoint);
323 const int32x4_t output_zero_point_dup = vdupq_n_s32(output_zeropoint);
324 const int32x4_t min_val_dup = vdupq_n_s32(kMinOutput);
325 const int32x4_t max_val_dup = vdupq_n_s32(kMaxOutput);
// Vector loop: 16 uint8 elements per iteration.
327 for (; i <=
size - 16; i += 16)
329 const uint8x16_t input_vec = vld1q_u8(input_data + i);
// Widen uint8 -> uint16 -> (reinterpreted) int32, 4 lanes of 4. The
// reinterpret is value-preserving here since uint8 values fit in int32.
330 const uint16x8_t first_half = vmovl_u8(vget_low_u8(input_vec));
331 const uint16x8_t second_half = vmovl_u8(vget_high_u8(input_vec));
333 input.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(first_half)));
334 input.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(first_half)));
335 input.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(second_half)));
336 input.val[3] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(second_half)));
// Subtract the input zero point (via the negated dup).
337 input.val[0] = vaddq_s32(input.val[0], input_zero_point_dup);
338 input.val[1] = vaddq_s32(input.val[1], input_zero_point_dup);
339 input.val[2] = vaddq_s32(input.val[2], input_zero_point_dup);
340 input.val[3] = vaddq_s32(input.val[3], input_zero_point_dup);
// Rescale all 4 rows by the effective (multiplier, shift) pair; the
// `result` receiving this call is declared on an elided line.
343 MultiplyByQuantizedMultiplier4Rows(input, effective_scale_multiplier, effective_scale_shift);
// Add the output zero point, then clamp to int8 range.
345 result.val[0] = vaddq_s32(result.val[0], output_zero_point_dup);
346 result.val[1] = vaddq_s32(result.val[1], output_zero_point_dup);
347 result.val[2] = vaddq_s32(result.val[2], output_zero_point_dup);
348 result.val[3] = vaddq_s32(result.val[3], output_zero_point_dup);
349 result.val[0] = vmaxq_s32(vminq_s32(result.val[0], max_val_dup), min_val_dup);
350 result.val[1] = vmaxq_s32(vminq_s32(result.val[1], max_val_dup), min_val_dup);
351 result.val[2] = vmaxq_s32(vminq_s32(result.val[2], max_val_dup), min_val_dup);
352 result.val[3] = vmaxq_s32(vminq_s32(result.val[3], max_val_dup), min_val_dup);
// Saturating narrow int32 -> int16 -> int8 and store 16 outputs.
354 const int16x4_t narrowed_val_1 = vqmovn_s32(result.val[0]);
355 const int16x4_t narrowed_val_2 = vqmovn_s32(result.val[1]);
356 const int16x4_t narrowed_val_3 = vqmovn_s32(result.val[2]);
357 const int16x4_t narrowed_val_4 = vqmovn_s32(result.val[3]);
358 const int16x8_t output_first_half = vcombine_s16(narrowed_val_1, narrowed_val_2);
359 const int16x8_t output_second_half = vcombine_s16(narrowed_val_3, narrowed_val_4);
360 const int8x8_t narrowed_first_half = vqmovn_s16(output_first_half);
361 const int8x8_t narrowed_second_half = vqmovn_s16(output_second_half);
362 const int8x16_t narrowed_result = vcombine_s8(narrowed_first_half, narrowed_second_half);
363 vst1q_s8(output_data + i, narrowed_result);
// Scalar tail. NOTE(review): the right-hand side computing `output`
// (presumably the scalar multiplier/shift rescale plus output zero
// point) is on lines elided from this chunk -- verify in the full file.
367 for (; i <
size; ++i)
369 const int32_t input = input_data[i] - input_zeropoint;
370 const int32_t output =
373 const int32_t clamped_output = std::max(std::min(output, kMaxOutput), kMinOutput);
374 output_data[i] =
static_cast<int8_t
>(clamped_output);
// NOTE(review): visible fragment of an int8 -> uint8 requantization
// kernel (NEON path), mirror of the uint8 -> int8 variant above. The
// function name/header, loop-index init, the int32x4x4_t
// `input`/`result` declarations, and the chunk's trailing lines are
// elided -- confirm against the full file.
380 int32_t effective_scale_multiplier,
381 int32_t effective_scale_shift, int32_t input_zeropoint,
382 int32_t output_zeropoint, uint8_t *output_data)
// Output saturation bounds: full uint8 range [0, 255] as int32.
384 static constexpr int32_t kMinOutput = std::numeric_limits<uint8_t>::min();
385 static constexpr int32_t kMaxOutput = std::numeric_limits<uint8_t>::max();
// Negated input zero point so subtraction becomes a single vaddq.
390 const int32x4_t input_zero_point_dup = vdupq_n_s32(-input_zeropoint);
391 const int32x4_t output_zero_point_dup = vdupq_n_s32(output_zeropoint);
392 const int32x4_t min_val_dup = vdupq_n_s32(kMinOutput);
393 const int32x4_t max_val_dup = vdupq_n_s32(kMaxOutput);
// Vector loop: 16 int8 elements per iteration.
395 for (; i <=
size - 16; i += 16)
397 const int8x16_t input_vec = vld1q_s8(input_data + i);
// Sign-extend int8 -> int16 -> int32, 4 lanes of 4.
398 const int16x8_t first_half = vmovl_s8(vget_low_s8(input_vec));
399 const int16x8_t second_half = vmovl_s8(vget_high_s8(input_vec));
401 input.val[0] = vmovl_s16(vget_low_s16(first_half));
402 input.val[1] = vmovl_s16(vget_high_s16(first_half));
403 input.val[2] = vmovl_s16(vget_low_s16(second_half));
404 input.val[3] = vmovl_s16(vget_high_s16(second_half));
// Subtract the input zero point (via the negated dup).
405 input.val[0] = vaddq_s32(input.val[0], input_zero_point_dup);
406 input.val[1] = vaddq_s32(input.val[1], input_zero_point_dup);
407 input.val[2] = vaddq_s32(input.val[2], input_zero_point_dup);
408 input.val[3] = vaddq_s32(input.val[3], input_zero_point_dup);
// Rescale all 4 rows; the `result` receiving this call is declared on
// an elided line.
411 MultiplyByQuantizedMultiplier4Rows(input, effective_scale_multiplier, effective_scale_shift);
// Add the output zero point, then clamp to [0, 255].
413 result.val[0] = vaddq_s32(result.val[0], output_zero_point_dup);
414 result.val[1] = vaddq_s32(result.val[1], output_zero_point_dup);
415 result.val[2] = vaddq_s32(result.val[2], output_zero_point_dup);
416 result.val[3] = vaddq_s32(result.val[3], output_zero_point_dup);
417 result.val[0] = vmaxq_s32(vminq_s32(result.val[0], max_val_dup), min_val_dup);
418 result.val[1] = vmaxq_s32(vminq_s32(result.val[1], max_val_dup), min_val_dup);
419 result.val[2] = vmaxq_s32(vminq_s32(result.val[2], max_val_dup), min_val_dup);
420 result.val[3] = vmaxq_s32(vminq_s32(result.val[3], max_val_dup), min_val_dup);
// Values are already clamped non-negative, so the signed->unsigned
// reinterpret below is value-preserving.
422 const uint32x4_t result_val_1_unsigned = vreinterpretq_u32_s32(result.val[0]);
423 const uint32x4_t result_val_2_unsigned = vreinterpretq_u32_s32(result.val[1]);
424 const uint32x4_t result_val_3_unsigned = vreinterpretq_u32_s32(result.val[2]);
425 const uint32x4_t result_val_4_unsigned = vreinterpretq_u32_s32(result.val[3]);
// Saturating narrow uint32 -> uint16 -> uint8 and store 16 outputs.
427 const uint16x4_t narrowed_val_1 = vqmovn_u32(result_val_1_unsigned);
428 const uint16x4_t narrowed_val_2 = vqmovn_u32(result_val_2_unsigned);
429 const uint16x4_t narrowed_val_3 = vqmovn_u32(result_val_3_unsigned);
430 const uint16x4_t narrowed_val_4 = vqmovn_u32(result_val_4_unsigned);
431 const uint16x8_t output_first_half = vcombine_u16(narrowed_val_1, narrowed_val_2);
432 const uint16x8_t output_second_half = vcombine_u16(narrowed_val_3, narrowed_val_4);
433 const uint8x8_t narrowed_first_half = vqmovn_u16(output_first_half);
434 const uint8x8_t narrowed_second_half = vqmovn_u16(output_second_half);
435 const uint8x16_t narrowed_result = vcombine_u8(narrowed_first_half, narrowed_second_half);
436 vst1q_u8(output_data + i, narrowed_result);
// Scalar tail. NOTE(review): the right-hand side computing `output`
// (scalar rescale plus output zero point) is on lines elided from this
// chunk -- verify in the full file.
440 for (; i <
size; ++i)
442 const int32_t input = input_data[i] - input_zeropoint;
443 const int32_t output =
446 const int32_t clamped_output = std::max(std::min(output, kMaxOutput), kMinOutput);
447 output_data[i] =
static_cast<uint8_t
>(clamped_output);