65 Tensor tensor(DT, shape, {{scale}, {zero_point}},
"");
66 std::vector<NativeT> quantized_data =
67 quantize<NativeT>(
data.data(),
data.size(), scale, zero_point);
69 tensor.writeData(quantized_data.data(), quantized_data.size() *
sizeof(NativeT));
// Fragment of a builder for a per-channel quantized tensor. The start of the
// signature, the braces and a few statements are not present in this view, and
// the integer at the head of some lines is line-number residue rather than code.
// NOTE(review): `DT`, `NativeT`, `shape`, `scales` and `data` are declared
// outside the visible lines - confirm their types against the complete file.
86 const std::vector<int32_t> &zero_points,
int quantized_dimension,
// The quantized axis must be one of the dimensions of `shape`.
90 assert(quantized_dimension < shape.
num_dims());
// The tensor receives the per-channel scales, zero points and the quantized
// axis; the last constructor argument is an empty string.
91 Tensor tensor(DT, shape, {scales, zero_points, quantized_dimension},
"");
// The shape is viewed as [outer, quant_dim, inner]: `outer` is the product of
// the dimensions before the quantized axis, `inner` the product of those after.
96 size_t outer_dims_size = 1;
97 int32_t quant_dim_size = shape.
dim(quantized_dimension);
98 size_t inner_dims_size = 1;
// Exactly one scale and one zero point per index along the quantized axis.
99 assert(quant_dim_size == scales.size());
100 assert(quant_dim_size == zero_points.size());
// Accumulate the extent of the dimensions preceding the quantized axis.
102 for (
int i = 0; i < quantized_dimension; ++i)
103 outer_dims_size *= shape.
dim(i);
// Accumulate the extent of the dimensions following the quantized axis.
104 for (
int i = quantized_dimension + 1; i < shape.
num_dims(); ++i)
105 inner_dims_size *= shape.
dim(i);
// The three factors together must account for every element of the shape.
107 assert(shape.
num_elements() == outer_dims_size * quant_dim_size * inner_dims_size);
// Quantized values are appended here in the same flat order as `data`.
109 std::vector<NativeT> quantized_data;
// Visit every (outer, channel) pair; each pair owns one contiguous run of
// `inner_dims_size` input values.
111 for (
size_t outer_it = 0; outer_it < outer_dims_size; ++outer_it)
112 for (int32_t channel = 0; channel < quant_dim_size; ++channel)
// Quantization parameters belonging to the current channel.
114 int32_t zero_point = zero_points[channel];
115 float scale = scales[channel];
// Flat index of the first element of this (outer_it, channel) run.
116 size_t offset = inner_dims_size * (quant_dim_size * outer_it + channel);
// Quantize just this run with the channel's own scale and zero point.
117 std::vector<NativeT> part_quantized_data =
118 quantize<NativeT>(
data.data() +
offset, inner_dims_size, scale, zero_point);
// Append the run to the output buffer.
119 quantized_data.insert(quantized_data.end(), part_quantized_data.begin(),
120 part_quantized_data.end());
// Hand the quantized buffer to the tensor; the size is given in bytes.
124 tensor.writeData(quantized_data.data(), quantized_data.size() *
sizeof(NativeT));
// Chain of compile-time type comparisons that maps the template parameter T to
// a DataType enumerator; DataType::Unknown is the fall-through result.
// The function header is not present in this view, and the integer at the head
// of each line is line-number residue rather than code.
136 if (std::is_same<T, float>::value)
137 return DataType::FLOAT32;
138 if (std::is_same<T, double>::value)
139 return DataType::FLOAT64;
// NOTE(review): no return statement is visible for the uint8_t branch (the
// residual numbering jumps from 140 to 142). Read literally, this check would
// guard the uint16_t check below - confirm against the complete file.
140 if (std::is_same<T, uint8_t>::value)
142 if (std::is_same<T, uint16_t>::value)
143 return DataType::U16;
144 if (std::is_same<T, uint32_t>::value)
145 return DataType::U32;
146 if (std::is_same<T, uint64_t>::value)
147 return DataType::U64;
// NOTE(review): likewise no return is visible for the int8_t branch (the
// residual numbering jumps from 148 to 150) - confirm against the complete file.
148 if (std::is_same<T, int8_t>::value)
150 if (std::is_same<T, int16_t>::value)
151 return DataType::S16;
152 if (std::is_same<T, int32_t>::value)
153 return DataType::S32;
154 if (std::is_same<T, int64_t>::value)
155 return DataType::S64;
156 if (std::is_same<T, bool>::value)
157 return DataType::BOOL;
// None of the listed types matched.
158 return DataType::Unknown;
// Quantizes `num_elements` floats starting at `data` into the integral type T.
//
// Each value is mapped to round(zero_point + value / scale) and then clamped to
// the representable range before being narrowed to T:
//   - signed T:   [-max(T), max(T)]  (symmetric; the lowest value of T is unused)
//   - unsigned T: [0, max(T)]
//
// @param data          pointer to the first input value; not dereferenced when
//                      num_elements is 0
// @param num_elements  number of values to read from `data`
// @param scale         quantization step; every input is divided by it
// @param zero_point    offset added to the scaled value before rounding
// @return              vector of `num_elements` quantized values, in input order
template <typename T>
std::vector<T> quantize(const float *data, size_t num_elements, float scale, int32_t zero_point)
{
  static_assert(std::is_integral<T>::value, "Integral type expected.");

  float q_min{}, q_max{};
  if (std::is_signed<T>::value)
  {
    // Symmetric range for signed types.
    q_min = -std::numeric_limits<T>::max();
    q_max = std::numeric_limits<T>::max();
  }
  else
  {
    // q_min keeps its zero initialisation for unsigned types.
    q_max = std::numeric_limits<T>::max();
  }

  std::vector<T> q;
  // The output size is known up front, so allocate once.
  q.reserve(num_elements);
  for (size_t i = 0; i < num_elements; ++i)
  {
    const auto &f = data[i];
    // Round to nearest (halves away from zero), then clamp before narrowing so
    // the cast to T never sees an out-of-range value.
    q.push_back(static_cast<T>(
      std::max<float>(q_min, std::min<float>(q_max, std::round(zero_point + (f / scale))))));
  }
  return q;
}
// Fragment that derives a {scale, zero_point} pair for the integral type T.
// The function header, the braces and several statements are not present in
// this view, and the integer at the head of each line is line-number residue.
// NOTE(review): `f_min`, `f_max` and `scale` are declared outside the visible
// lines - presumably the float range being quantized; confirm against the
// complete file.
216 static_assert(std::is_integral<T>::value,
"Integral type expected.");
217 int32_t zero_point = 0;
// Full representable range of T. Despite the `_double` suffix, the two
// floating-point copies below are declared as float.
219 const T qmin = std::numeric_limits<T>::lowest();
220 const T qmax = std::numeric_limits<T>::max();
221 const float qmin_double = qmin;
222 const float qmax_double = qmax;
// NOTE(review): early exit; the condition guarding it is not visible here.
232 return {scale, zero_point};
// Scale is the width of the float range divided by the width of the quantized range.
238 scale = (f_max - f_min) / (qmax_double - qmin_double);
// Two candidate zero points, one anchored at each end of the range.
248 const float zero_point_from_min = qmin_double - f_min / scale;
249 const float zero_point_from_max = qmax_double - f_max / scale;
// Each candidate is a difference of two terms; the sum of the magnitudes of
// those terms is used to rank the candidates.
251 const float zero_point_from_min_error = std::abs(qmin_double) + std::abs(f_min / scale);
253 const float zero_point_from_max_error = std::abs(qmax_double) + std::abs(f_max / scale);
// Keep the candidate with the smaller magnitude sum.
255 const float zero_point_double = zero_point_from_min_error < zero_point_from_max_error
256 ? zero_point_from_min
257 : zero_point_from_max;
// Nudge the real-valued zero point onto a value of T: clamp it to qmin or qmax
// when it lies outside the range, otherwise round it to the nearest integer.
265 T nudged_zero_point = 0;
266 if (zero_point_double < qmin_double)
268 nudged_zero_point = qmin;
270 else if (zero_point_double > qmax_double)
272 nudged_zero_point = qmax;
276 nudged_zero_point =
static_cast<T
>(std::round(zero_point_double));
// The nudged value must lie within [qmin, qmax].
281 assert(qmax >= nudged_zero_point);
282 assert(qmin <= nudged_zero_point);
283 zero_point = nudged_zero_point;
285 return {scale, zero_point};