ONE - On-device Neural Engine
NeonTensorUtils.h
1/*
2 * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
3 * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18#ifndef __NNFW_CKER_NEON_TENSOR_UTILS_H__
19#define __NNFW_CKER_NEON_TENSOR_UTILS_H__
20
21#include <ruy/path.h>
22#include <ruy/ruy.h>
23#include "cker/Types.h"
24#include "cker/neon/neon_check.h"
25#include "cker/ruy/RuySupport.h"
26#include "util/logging.h"
27#if defined __linux__ && defined __aarch64__
28#include <sys/auxv.h>
29#endif
30
31#include <cassert>
32#include <cmath>
33
34#ifdef USE_NEON
35
36#define kFloatWeightsPerNeonLane 4
37
38namespace nnfw
39{
40namespace cker
41{
42
43namespace
44{
45
46constexpr int kFloatValuesPerNeonVector = 4;
47
48// TODO(ahentz): Clean up.
49using int8 = std::int8_t;
50using uint8 = std::uint8_t;
51using int16 = std::int16_t;
52using uint16 = std::uint16_t;
53using int32 = std::int32_t;
54using uint32 = std::uint32_t;
55
56template <int PerNeonSize> inline int RoundDownVectors(int size)
57{
58 return size & ~(PerNeonSize - 1);
59}
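// Illustrative example (a sketch, not part of the original code): because
// PerNeonSize is a power of two, this rounds size down to a multiple of the
// NEON vector width, e.g. RoundDownVectors<4>(10) == 8 and
// RoundDownVectors<4>(3) == 0.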
60
61// Allocates, at least, size bytes of uninitialized storage whose alignment is
62// specified by alignment. The size parameter must be an integral multiple of
63// alignment.
64// The caller is responsible for freeing the allocated memory by calling
65// free() on the pointer returned through freeing_buffer.
66void *aligned_alloc(size_t alignment, size_t size, void **freeing_buffer)
67{
68 *freeing_buffer = malloc(size + alignment);
69 const size_t offset = ((uintptr_t)*freeing_buffer) % alignment; // NOLINT
70 return offset == 0 ? *freeing_buffer : ((char *)*freeing_buffer + (alignment - offset)); // NOLINT
71}
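// Usage sketch (illustrative only, not part of the original code): the pointer
// stored in *freeing_buffer, not the returned aligned pointer, is what must be
// passed to free() later.
//
//   void *to_free = nullptr;
//   int8_t *buf = reinterpret_cast<int8_t *>(aligned_alloc(4, 64, &to_free));
//   // ... use buf, which is aligned to a 4-byte boundary ...
//   free(to_free);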
72
73inline int32_t AccumulateNeonLane(const int32x4_t lane)
74{
75#ifdef __aarch64__
76 return vaddvq_s32(lane);
77#else
78 int64x2_t pairwiseAdded = vpaddlq_s32(lane);
79 return vgetq_lane_s64(pairwiseAdded, 0) + vgetq_lane_s64(pairwiseAdded, 1);
80#endif
81}
82
83} // namespace
84
85// The implementation of dotprod detection is copied from ruy's internal
86// function DetectDotprod().
87// At the moment it's only implemented on Linux ARM64. Consider syncing again
88// with ruy in the future to share improvements.
89#if defined __linux__ && defined __aarch64__
90inline bool DetectDotprodByLinuxAuxvMethod()
91{
92 // This is the value of HWCAP_ASIMDDP in sufficiently recent Linux headers;
93 // however, we need to support building against older headers for the time
94 // being.
95 const int kLocalHwcapAsimddp = 1 << 20;
96 return getauxval(AT_HWCAP) & kLocalHwcapAsimddp;
97}
98#endif
99
100inline bool DetectArmNeonDotprod()
101{
102#if defined __linux__ && defined __aarch64__
103 return DetectDotprodByLinuxAuxvMethod();
104#endif
105
106 return false;
107}
108
109inline bool HasSdotInstruction()
110{
111 static const bool has_dotprod = DetectArmNeonDotprod();
112 return has_dotprod;
113}
114
115#ifdef __aarch64__
116// We interleave vector data to make the dot product logic more efficient.
117// Suppose that vectors is:
118// a0 a1 a2 a3 a4 a5 ...
119// b0 b1 b2 b3 b4 b5 ...
120// c0 c1 c2 c3 c4 c5 ...
121// d0 d1 d2 d3 d4 d5 ...
122// e0 e1 e2 e3 e4 e5 ...
123// This code interleaves them like this:
124// a0 a1 a2 a3 b0 b1 b2 b3 c0 c1 c2 c3 d0 d1 d2 d3 a4 a5 a6 a7 b4 ...
125// e0 e1 e2 e3 f0 f1 f2 f3 ...
126// Once the data is interleaved, each 16-byte read from the vectors pointer
127// contains 4 bytes from each of 4 vectors.
128inline const int8_t *ShuffleVectors(const int8_t *vectors, const int n_batch, const int m_cols,
129 void **shuffled_vectors_free)
130{
131 const int kWeightsPerUint32 = 4;
132
133 int8 *shuffled_vectors = reinterpret_cast<int8 *>(
134 aligned_alloc(kWeightsPerUint32, n_batch * m_cols, shuffled_vectors_free));
135
136 for (int i = 0; i < n_batch; i += 4)
137 {
138 int8 *shuffled_vectors_ptr = shuffled_vectors + (i * m_cols);
139 const int8 *unshuffled_vec0_ptr = reinterpret_cast<const int8 *>(vectors) + (i * m_cols);
140 const int8 *unshuffled_vec1_ptr = reinterpret_cast<const int8 *>(vectors) + ((i + 1) * m_cols);
141 const int8 *unshuffled_vec2_ptr = reinterpret_cast<const int8 *>(vectors) + ((i + 2) * m_cols);
142 const int8 *unshuffled_vec3_ptr = reinterpret_cast<const int8 *>(vectors) + ((i + 3) * m_cols);
143 const int8 *const end_vec0_ptr = unshuffled_vec1_ptr;
144
145 while (unshuffled_vec0_ptr != end_vec0_ptr)
146 {
147 asm volatile(
148 // This code path requires that (m_cols % 16) == 0 so we can safely
149 // read in 16-byte chunks from each row.
150 "ld1 {v0.16b}, [%[unshuffled_vec0_ptr]], #16\n"
151 "ld1 {v1.16b}, [%[unshuffled_vec1_ptr]], #16\n"
152 "ld1 {v2.16b}, [%[unshuffled_vec2_ptr]], #16\n"
153 "ld1 {v3.16b}, [%[unshuffled_vec3_ptr]], #16\n"
154
155 "st4 {v0.s, v1.s, v2.s, v3.s}[0], [%[shuffled_vectors_ptr]], #16\n"
156 "st4 {v0.s, v1.s, v2.s, v3.s}[1], [%[shuffled_vectors_ptr]], #16\n"
157 "st4 {v0.s, v1.s, v2.s, v3.s}[2], [%[shuffled_vectors_ptr]], #16\n"
158 "st4 {v0.s, v1.s, v2.s, v3.s}[3], [%[shuffled_vectors_ptr]], #16\n"
159
160 : [unshuffled_vec0_ptr] "+r"(unshuffled_vec0_ptr),
161 [unshuffled_vec1_ptr] "+r"(unshuffled_vec1_ptr),
162 [unshuffled_vec2_ptr] "+r"(unshuffled_vec2_ptr),
163 [unshuffled_vec3_ptr] "+r"(unshuffled_vec3_ptr),
164 [shuffled_vectors_ptr] "+r"(shuffled_vectors_ptr)
165 :
166 : "v0", "v1", "v2", "v3", "cc", "memory");
167 }
168 }
169
170 return reinterpret_cast<const int8_t *>(shuffled_vectors);
171}
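// Scalar reference for the interleaving above (an illustrative sketch under the
// same assumptions the assembly makes: n_batch % 4 == 0 and m_cols % 16 == 0):
//
//   for (int i = 0; i < n_batch; i += 4)       // group of 4 vectors
//     for (int c = 0; c < m_cols; c += 4)      // 4-byte chunk within a vector
//       for (int v = 0; v < 4; ++v)            // which vector of the group
//         for (int b = 0; b < 4; ++b)          // byte within the chunk
//           shuffled_vectors[i * m_cols + c * 4 + v * 4 + b] =
//               vectors[(i + v) * m_cols + c + b];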
172
173// Notes about the speed of this version vs. the baseline (from memory):
174// - With 256K of L1, we can keep a lot of vectors in cache.
175// I recall a reasonable speedup just by rearranging the loop to have
176// row on the outside and batch on the inside.
177// - I also recall getting a nice speedup from sdot.
178// - I tried many times to do better than the current implementation, using
179// loop unrolling and instruction reordering to avoid stalls, etc.
180// but I was not able to do significantly better. This code is, however,
181// much worse than what the processor spec sheet suggests is possible.
182static void DotprodMatrixBatchFourVectorMultiplyAccumulate(const int8_t *__restrict__ matrix,
183 const int m_rows, const int m_cols,
184 const int8_t *vectors,
185 const float *scaling_factors,
186 int n_batch, float *__restrict__ result)
187{
188 void *shuffled_vectors_free;
189
190 const int8_t *shuffled_vectors = ShuffleVectors(vectors, n_batch, m_cols, &shuffled_vectors_free);
191
192 for (int row = 0; row < m_rows; row += 2)
193 {
194 for (int batch = 0; batch < n_batch; batch += 4)
195 {
196 float *result_ptr = result + (batch * m_rows) + row;
197 const int8 *mat_ptr0 = matrix + (row * m_cols);
198 const int8 *mat_ptr1 = matrix + ((row + 1) * m_cols);
199 const int8 *mat_ptr0_end = mat_ptr1;
200 const int8 *vec_ptr = shuffled_vectors + (batch * m_cols);
201 const float *scaling_factors_ptr = scaling_factors + batch;
202 const uint64_t wide_rows = m_rows * sizeof(float);
203 const int8 *mat_ptr2 = matrix + ((row + 2) * m_cols);
204 const int8 *mat_ptr3 = matrix + ((row + 3) * m_cols);
205
206 asm volatile(
207 // Zero out the accumulator registers.
208 "dup v0.4s, wzr\n"
209 "dup v1.4s, wzr\n"
210 "dup v2.4s, wzr\n"
211 "dup v3.4s, wzr\n"
212
213 "1:\n" // batch_cols_loop
214
215 // Read 16 more bytes from a pair of matrix rows.
216 "ld1 {v12.16b}, [%[mat_ptr0]], #16\n"
217
218 // Prefetch two rows ahead.
219 "prfm pldl1strm, [%[mat_ptr2]]\n"
220 "prfm pldl1strm, [%[mat_ptr3]]\n"
221
222 // Read from input vectors 4 times; 64 bytes total.
223 // Each 16-byte register contains parts of 4 vectors; see the
224 // shuffle logic above.
225
226 // From Benoit, places to look in the future:
227 // - Move load instructions further from sdot
228 // - Switch loop use-then-reload
229 // - Do partial unrolling to use register space better
230 "ld1 {v8.16b}, [%[vec_ptr]], #16\n"
231 ".word 0x4f8ce100 // sdot v0.4s, v8.16b, v12.4b[0]\n"
232 "ld1 {v9.16b}, [%[vec_ptr]], #16\n"
233 ".word 0x4face121 // sdot v1.4s, v9.16b, v12.4b[1]\n"
234 "ld1 {v10.16b}, [%[vec_ptr]], #16\n"
235 ".word 0x4f8ce940 // sdot v0.4s, v10.16b, v12.4b[2]\n"
236 "ld1 {v11.16b}, [%[vec_ptr]], #16\n"
237 ".word 0x4face961 // sdot v1.4s, v11.16b, v12.4b[3]\n"
238
239 // Update prefetch pointers.
240 "add %[mat_ptr2], %[mat_ptr2], #16\n"
241 "add %[mat_ptr3], %[mat_ptr3], #16\n"
242
243 // Re-use those vectors for the next row as well.
244 "ld1 {v13.16b}, [%[mat_ptr1]], #16\n"
245 ".word 0x4f8de102 // sdot v2.4s, v8.16b, v13.4b[0]\n"
246 ".word 0x4fade123 // sdot v3.4s, v9.16b, v13.4b[1]\n"
247 ".word 0x4f8de942 // sdot v2.4s, v10.16b, v13.4b[2]\n"
248 ".word 0x4fade963 // sdot v3.4s, v11.16b, v13.4b[3]\n"
249
250 // If we're not done with these rows, continue.
251 "cmp %[mat_ptr0], %[mat_ptr0_end]\n"
252 "bne 1b\n" // batch_cols_loop
253
254 // Done with the rows, sum the results.
255 "add v0.4s, v0.4s, v1.4s\n"
256 "add v2.4s, v2.4s, v3.4s\n"
257
258 // Convert the per-vector sums to floating point.
259 "scvtf v0.4s, v0.4s\n"
260 "scvtf v1.4s, v2.4s\n"
261
262 // Fetch scale factors.
263 "ld1 {v4.4s}, [%[scaling_factors_ptr]]\n"
264
265 // Multiply scale factors times sums.
266 "fmul v0.4s, v4.4s, v0.4s\n"
267 "fmul v1.4s, v4.4s, v1.4s\n"
268
269 // Load previous result values.
270 // The result position is:
271 // result[batch * m_rows + row]
272 // Here that is factored into:
273 // result_ptr = result + row
274 // *result_ptr = res[0]
275 // (uint8*)result_ptr += (m_rows * sizeof(float))
276 // *result_ptr = res[1]
277 // ...
278 // Since we're reading two rows at a time, though, we read both
279 // result[batch * m_rows + row]
280 // and
281 // result[batch * m_rows + row + 1]
282 "ld2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n"
283 "ld2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n"
284 "ld2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n"
285 "ld2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n"
286
287 // Go back to the starting position (subtract wide_rows * 4).
288 "sub %[result_ptr], %[result_ptr], %[wide_rows], lsl #2\n"
289
290 // Add previous result values.
291 "fadd v9.4s, v9.4s, v0.4s\n"
292 "fadd v10.4s, v10.4s, v1.4s\n"
293
294 // Store results.
295 "st2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n"
296 "st2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n"
297 "st2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n"
298 "st2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n"
299 : [mat_ptr0] "+r"(mat_ptr0), [mat_ptr1] "+r"(mat_ptr1), [vec_ptr] "+r"(vec_ptr),
300 [result_ptr] "+r"(result_ptr), [mat_ptr2] "+r"(mat_ptr2), [mat_ptr3] "+r"(mat_ptr3)
301 : [mat_ptr0_end] "r"(mat_ptr0_end), [scaling_factors_ptr] "r"(scaling_factors_ptr),
302 [wide_rows] "r"(wide_rows)
303 : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
304 "v13", "cc", "memory");
305 }
306 }
307
308 free(shuffled_vectors_free);
309}
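// For reference, the kernel above is numerically equivalent to the following
// scalar loop (an illustrative sketch; it omits the vector shuffling and the
// shape/alignment preconditions the assembly relies on):
//
//   for (int batch = 0; batch < n_batch; ++batch)
//     for (int row = 0; row < m_rows; ++row)
//     {
//       int32_t acc = 0;
//       for (int col = 0; col < m_cols; ++col)
//         acc += matrix[row * m_cols + col] * vectors[batch * m_cols + col];
//       result[batch * m_rows + row] += scaling_factors[batch] * acc;
//     }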
310
311static void DotprodMatrixBatchFourVectorMultiplyAccumulate(
312 const int8_t *__restrict__ matrix, const int m_rows, const int m_cols, const int8_t *vectors,
313 const float *scaling_factors, int n_batch, float *__restrict__ result,
314 const float *per_channel_scale, const int32_t *input_offset, int32_t *row_sums)
315{
316 void *shuffled_vectors_free;
317 const int8_t *shuffled_vectors = ShuffleVectors(vectors, n_batch, m_cols, &shuffled_vectors_free);
318
319 for (int row = 0; row < m_rows; row += 2)
320 {
321 const float *channel_scales_ptr = per_channel_scale + row;
322 int32_t *row_sums_ptr = row_sums ? row_sums + row : nullptr;
323 for (int batch = 0; batch < n_batch; batch += 4)
324 {
325 float *result_ptr = result + (batch * m_rows) + row;
326 const int8 *mat_ptr0 = matrix + (row * m_cols);
327 const int8 *mat_ptr1 = matrix + ((row + 1) * m_cols);
328 const int8 *mat_ptr0_end = mat_ptr1;
329 const int8 *vec_ptr = shuffled_vectors + (batch * m_cols);
330 const float *scaling_factors_ptr = scaling_factors + batch;
331 const uint64_t wide_rows = m_rows * sizeof(float);
332 const int32_t *batch_offsets_ptr = input_offset + batch;
333 const int32_t is_channel_scale_nullptr = per_channel_scale == nullptr;
334 const int32_t is_row_sums_nullptr = row_sums_ptr == nullptr;
335 asm volatile(
336 "dup v0.4s, wzr\n"
337 "dup v1.4s, wzr\n"
338 "dup v2.4s, wzr\n"
339 "dup v3.4s, wzr\n"
340 // Load zero points.
341 "ld1 {v7.4s}, [%[batch_offsets_ptr]]\n"
342 "ld1 {v4.4s}, [%[scaling_factors_ptr]]\n"
343 // Zero out zero point accumulators.
344 "dup v14.4s, wzr\n"
345 "dup v15.4s, wzr\n"
346
347 // Load per channel scales if not null.
348 "cmp %w[is_channel_scale_nullptr], #0\n"
349 "bne 1f\n"
350 "ld1r {v16.4s}, [%[channel_scales_ptr]], #4\n"
351 "ld1r {v17.4s}, [%[channel_scales_ptr]]\n"
352 "fmul v16.4s, v16.4s, v4.4s\n"
353 "fmul v17.4s, v17.4s, v4.4s\n"
354 "b 2f\n"
355 "1:\n"
356 "mov v16.16b, v4.16b\n"
357 "mov v17.16b, v4.16b\n"
358 "2:\n"
359 "ld1 {v12.16b}, [%[mat_ptr0]], #16\n"
360 "ld1 {v8.16b}, [%[vec_ptr]], #16\n"
361 ".word 0x4f8ce100 // sdot v0.4s, v8.16b, v12.4b[0]\n"
362 "ld1 {v9.16b}, [%[vec_ptr]], #16\n"
363 ".word 0x4face121 // sdot v1.4s, v9.16b, v12.4b[1]\n"
364 "ld1 {v10.16b}, [%[vec_ptr]], #16\n"
365 ".word 0x4f8ce940 // sdot v0.4s, v10.16b, v12.4b[2]\n"
366 "ld1 {v11.16b}, [%[vec_ptr]], #16\n"
367 ".word 0x4face961 // sdot v1.4s, v11.16b, v12.4b[3]\n"
368 "ld1 {v13.16b}, [%[mat_ptr1]], #16\n"
369 ".word 0x4f8de102 // sdot v2.4s, v8.16b, v13.4b[0]\n"
370 ".word 0x4fade123 // sdot v3.4s, v9.16b, v13.4b[1]\n"
371 ".word 0x4f8de942 // sdot v2.4s, v10.16b, v13.4b[2]\n"
372 ".word 0x4fade963 // sdot v3.4s, v11.16b, v13.4b[3]\n"
373 "cmp %w[is_row_sums_nullptr], #1\n"
374 "bne 3f\n"
375 // Accumulate row_sums for zero point calculations.
376 "saddlp v12.8h, v12.16b\n"
377 "saddlp v13.8h, v13.16b\n"
378 "sadalp v14.4s, v12.8h\n"
379 "sadalp v15.4s, v13.8h\n"
380 "3:\n"
381 "cmp %[mat_ptr0], %[mat_ptr0_end]\n"
382 "bne 2b\n"
383 "add v0.4s, v0.4s, v1.4s\n"
384 "add v2.4s, v2.4s, v3.4s\n"
385
386 "cmp %w[is_row_sums_nullptr], #1\n"
387 "bne 4f\n"
388 // Calculate zero point offsets.
389 "addv s14, v14.4s\n"
390 "addv s15, v15.4s\n"
391 "dup v14.4s, v14.s[0]\n"
392 "dup v15.4s, v15.s[0]\n"
393 "b 5f\n"
394 "4:\n"
395 "ld1r {v14.4s}, [%[row_sums_ptr]], #4\n"
396 "ld1r {v15.4s}, [%[row_sums_ptr]]\n"
397 "5:\n"
398
399 "mul v14.4s, v14.4s, v7.4s\n"
400 "mul v15.4s, v15.4s, v7.4s\n"
401 "sub v0.4s, v0.4s, v14.4s\n"
402 "sub v2.4s, v2.4s, v15.4s\n"
403
404 "scvtf v0.4s, v0.4s\n"
405 "scvtf v1.4s, v2.4s\n"
406
407 // Multiply scale.
408 "fmul v0.4s, v16.4s, v0.4s\n"
409 "fmul v1.4s, v17.4s, v1.4s\n"
410
411 "ld2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n"
412 "ld2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n"
413 "ld2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n"
414 "ld2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n"
415 "sub %[result_ptr], %[result_ptr], %[wide_rows], lsl #2\n"
416 "fadd v9.4s, v9.4s, v0.4s\n"
417 "fadd v10.4s, v10.4s, v1.4s\n"
418 "st2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n"
419 "st2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n"
420 "st2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n"
421 "st2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n"
422 : [mat_ptr0] "+r"(mat_ptr0), [mat_ptr1] "+r"(mat_ptr1), [vec_ptr] "+r"(vec_ptr),
423 [result_ptr] "+r"(result_ptr), [row_sums_ptr] "+r"(row_sums_ptr)
424 : [mat_ptr0_end] "r"(mat_ptr0_end), [scaling_factors_ptr] "r"(scaling_factors_ptr),
425 [wide_rows] "r"(wide_rows), [channel_scales_ptr] "r"(channel_scales_ptr),
426 [batch_offsets_ptr] "r"(batch_offsets_ptr),
427 [is_channel_scale_nullptr] "r"(is_channel_scale_nullptr),
428 [is_row_sums_nullptr] "r"(is_row_sums_nullptr)
429 : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
430 "v13", "v14", "v15", "v16", "v17", "w0", "w1", "cc", "memory");
431 }
432 }
433
434 free(shuffled_vectors_free);
435}
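// For reference, the variant above additionally applies an input zero-point
// correction and optional per-channel scales. A scalar sketch of the update for
// one (batch, row) pair (illustrative only; row sums are either taken from
// row_sums or recomputed on the fly, mirroring the assembly):
//
//   int32_t acc = 0, row_sum = 0;
//   for (int col = 0; col < m_cols; ++col)
//   {
//     acc += matrix[row * m_cols + col] * vectors[batch * m_cols + col];
//     row_sum += matrix[row * m_cols + col];
//   }
//   const float scale = per_channel_scale
//                         ? scaling_factors[batch] * per_channel_scale[row]
//                         : scaling_factors[batch];
//   result[batch * m_rows + row] += scale * (acc - row_sum * input_offset[batch]);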
436
437// The DotprodMatrixBatchFourVectorMultiplyAccumulate kernel processes 4
438// vectors in the same time as the baseline processes 1 vector. However, it
439// requires 4 vectors of input.
440//
441// To take advantage of this speed difference, we add some zero-valued
442// vectors to the batch so that n_batch is a multiple of 4. Then we execute
443// DotprodMatrixBatchFourVectorMultiplyAccumulate on that padded batch,
444// then extract just the results we want at the end (ignoring the extra padding
445// outputs).
446//
447// The relative cost of the padding is large when the matrix is smaller than
448// 128x128, so we don't use this code path on small matrices. On larger
449// matrices, the computation cost dwarfs the padding cost, making this code
450// viable.
451//
452// If we ignore the cost of padding, this kernel is:
453// 1x the speed of NeonMatrixBatchVectorMultiplyImpl for n_batch = 1
454// 2x the speed of NeonMatrixBatchVectorMultiplyImpl for n_batch = 2
455// 3x the speed of NeonMatrixBatchVectorMultiplyImpl for n_batch = 3
456// ...
457//
458// We don't use this kernel when n_batch = 1 because the baseline kernel
459// is fine for that case.
460inline void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate(
461 const int8_t *__restrict__ matrix, const int m_rows, const int m_cols, const int8_t *vectors,
462 const float *scaling_factors, int n_batch, float *__restrict__ result,
463 const float *per_channel_scale, const int32_t *input_offset, int32_t *row_sums)
464{
465 const int kWeightsPerUint32 = 4;
466
467 // Round n_batch up to the next multiple of 4.
468 int batch_round_up = n_batch;
469 if (n_batch % 4 != 0)
470 {
471 batch_round_up += (4 - n_batch % 4);
472 }
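  // For example (illustrative): n_batch = 6 is padded up to batch_round_up = 8,
  // while n_batch = 8 is left unchanged.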
473 assert(n_batch <= batch_round_up);
474
475 void *padded_vectors_free;
476 const int padded_vectors_size = batch_round_up * m_cols;
477 int8_t *padded_vectors = reinterpret_cast<int8_t *>(
478 aligned_alloc(kWeightsPerUint32, padded_vectors_size, &padded_vectors_free));
479 memset(padded_vectors, 0, padded_vectors_size);
480
481 void *padded_result_free;
482 const int result_size = n_batch * m_rows * sizeof(float);
483 const int padded_result_size = batch_round_up * m_rows * sizeof(float);
484 float *padded_result = reinterpret_cast<float *>(
485 aligned_alloc(kWeightsPerUint32, padded_result_size, &padded_result_free));
486 memcpy(padded_result, result, result_size);
487 memset(reinterpret_cast<char *>(padded_result) + result_size, 0,
488 padded_result_size - result_size);
489
490 // Copy the input into the padded data structure.
491 assert(n_batch * m_cols <= padded_vectors_size);
492 memcpy(padded_vectors, vectors, n_batch * m_cols);
493
494 void *padded_scaling_factors_free;
495 const int padded_scaling_factors_size = batch_round_up * sizeof(float);
496 float *padded_scaling_factors = reinterpret_cast<float *>(
497 aligned_alloc(kWeightsPerUint32, padded_scaling_factors_size, &padded_scaling_factors_free));
498 assert(static_cast<int>(n_batch * sizeof(float)) <= padded_scaling_factors_size);
499 assert(static_cast<int>(batch_round_up * sizeof(float)) <= padded_scaling_factors_size);
500 memset(padded_scaling_factors, 0, batch_round_up * sizeof(float));
501 memcpy(padded_scaling_factors, scaling_factors, n_batch * sizeof(float));
502
503 if (input_offset != nullptr)
504 {
505 void *padded_input_offset_free;
506 const int padded_input_offset_size = batch_round_up * sizeof(int32_t);
507 int32_t *padded_input_offset = reinterpret_cast<int32_t *>(
508 aligned_alloc(kWeightsPerUint32, padded_input_offset_size, &padded_input_offset_free));
509 assert(static_cast<int>(n_batch * sizeof(int32_t)) <= padded_input_offset_size);
510 assert(static_cast<int>(batch_round_up * sizeof(int32_t)) <= padded_input_offset_size);
511 memset(padded_input_offset, 0, batch_round_up * sizeof(int32_t));
512 memcpy(padded_input_offset, input_offset, n_batch * sizeof(int32_t));
513
514 // Call the main kernel.
515 DotprodMatrixBatchFourVectorMultiplyAccumulate(
516 matrix, m_rows, m_cols, padded_vectors, padded_scaling_factors, batch_round_up, padded_result,
517 per_channel_scale, padded_input_offset, row_sums);
518
519 free(padded_input_offset_free);
520 }
521 else
522 {
523 // Call the main kernel.
524 DotprodMatrixBatchFourVectorMultiplyAccumulate(matrix, m_rows, m_cols, padded_vectors,
525 padded_scaling_factors, batch_round_up,
526 padded_result);
527 }
528 memcpy(result, padded_result, result_size);
529
530 free(padded_result_free);
531 free(padded_vectors_free);
532 free(padded_scaling_factors_free);
533}
534
535inline void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate(
536 const int8_t *__restrict__ matrix, const int m_rows, const int m_cols, const int8_t *vectors,
537 const float *scaling_factors, int n_batch, float *__restrict__ result)
538{
539 DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate(
540 matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result,
541 /*per_channel_scale=*/nullptr, /*input_offset=*/nullptr,
542 /*row_sums=*/nullptr);
543}
544#endif // __aarch64__
545
546inline void NeonCwiseClipping(float *vector, const int v_size, const float clipping_value)
547{
548 const float32x4_t clipping_value_f32x4 = vmovq_n_f32(clipping_value);
549 const float32x4_t neg_clipping_value_f32x4 = vmovq_n_f32(-clipping_value);
550
551 int i = 0;
552 for (; i <= v_size - kFloatValuesPerNeonVector; i += kFloatValuesPerNeonVector)
553 {
554 // Load from memory to vector.
555 float32x4_t v_f32x4 = vld1q_f32(vector + i);
556 // Clip between clipping_value and -clipping_value.
557 v_f32x4 = vminq_f32(clipping_value_f32x4, v_f32x4);
558 v_f32x4 = vmaxq_f32(neg_clipping_value_f32x4, v_f32x4);
559 // Save to output.
560 vst1q_f32(vector + i, v_f32x4);
561 }
562 for (; i < v_size; i++)
563 {
564 vector[i] = std::max(std::min(clipping_value, vector[i]), -clipping_value);
565 }
566}
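// Usage sketch (illustrative, not part of the original code): clamp every
// element of a buffer to [-1.0f, 1.0f] in place.
//
//   float gate[6] = {-3.f, -0.5f, 0.f, 0.25f, 2.f, 10.f};
//   NeonCwiseClipping(gate, 6, 1.0f);
//   // gate is now {-1.f, -0.5f, 0.f, 0.25f, 1.f, 1.f}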
567
568inline bool NeonIsZeroVector(const float *vector, int v_size)
569{
570 // If v_size is not divisible by kFloatWeightsPerNeonLane, we cannot
571 // use the main vectorized loop, and we need to process sequentially.
572 // postamble_start shows the start index where this should happen.
573 const int postamble_start = v_size - (v_size & (kFloatWeightsPerNeonLane - 1));
574
575 const float32x4_t zero_x4_float = vmovq_n_f32(0.0f);
576 for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane)
577 {
578 const float32x4_t i_x4_float = vld1q_f32(vector + v);
579 uint32x4_t cmp_result = vceqq_f32(i_x4_float, zero_x4_float);
580 if (vgetq_lane_u32(cmp_result, 0) == 0)
581 return false;
582 if (vgetq_lane_u32(cmp_result, 1) == 0)
583 return false;
584 if (vgetq_lane_u32(cmp_result, 2) == 0)
585 return false;
586 if (vgetq_lane_u32(cmp_result, 3) == 0)
587 return false;
588 }
589
590 // Postamble loop
591 for (int v = postamble_start; v < v_size; ++v)
592 {
593 if (vector[v] != 0.0)
594 return false;
595 }
596 return true;
597}
598
599inline void NeonCpuBackendGemm(const int8_t *input, const int32_t *bias,
600 const int8_t *input_to_gate_weights, int32_t n_batch,
601 int32_t n_input, int32_t n_output, int32_t, int32_t *scratch,
602 ruy::Context *ruy_context)
603{
604 MatrixParams<int8_t> lhs_params;
605 lhs_params.order = Order::kRowMajor;
606 lhs_params.rows = n_output;
607 lhs_params.cols = n_input;
608 lhs_params.cache_policy = CachePolicy::kAlwaysCache;
609
610 MatrixParams<int8_t> rhs_params;
611 rhs_params.order = Order::kColMajor;
612 rhs_params.rows = n_input;
613 rhs_params.cols = n_batch;
614
615 MatrixParams<int32_t> dst_params;
616 dst_params.order = Order::kColMajor;
617 dst_params.rows = n_output;
618 dst_params.cols = n_batch;
619
620 GemmParams<int32_t, int32_t> gemm_params;
621 if (bias)
622 {
623 gemm_params.bias = bias;
624 }
625
626 // Below code is from tflite::cpu_backend_gemm::detail::GemmImplUsingRuy
627 ruy::Matrix<int8_t> ruy_lhs;
628 ruy::Matrix<int8_t> ruy_rhs;
629 ruy::Matrix<int32_t> ruy_dst;
630 // Note that cache is always enabled for input and weight tensors
631 ruy_support::MakeRuyMatrix(lhs_params, input_to_gate_weights, &ruy_lhs, true);
632 ruy_support::MakeRuyMatrix(rhs_params, input, &ruy_rhs, true);
633 ruy_support::MakeRuyMatrix(dst_params, scratch, &ruy_dst);
634
635 ruy::MulParams<int32_t, int32_t> ruy_mul_params;
636 ruy_support::MakeRuyMulParams(gemm_params, &ruy_mul_params);
637
638 ruy::Mul(ruy_lhs, ruy_rhs, ruy_mul_params, ruy_context, &ruy_dst);
639}
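// Shape note for the call above (illustrative, not part of the original code):
// with int8 inputs and int32 accumulation, ruy computes
//   scratch[n_output x n_batch] =
//       input_to_gate_weights[n_output x n_input] * input[n_input x n_batch]
// into a column-major destination, adding bias per output row when bias is
// non-null.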
640
641inline void NeonSub1Vector(const float *vector, int v_size, float *result)
642{
643 // If v_size is not divisible by the vector size, then we need to process the
644 // final few elements sequentially. postamble_start shows the start index
645 // where this should happen.
646 const int postamble_start = RoundDownVectors<kFloatValuesPerNeonVector>(v_size);
647
648 float32x4_t one_f32x4 = vmovq_n_f32(1.0);
649 int v = 0;
650 for (; v < postamble_start; v += kFloatValuesPerNeonVector)
651 {
652 // Load 4 float values from the current position in the input vector and
653 // subtract them from 1.
654 float32x4_t v_f32x4 = vld1q_f32(vector + v);
655 float32x4_t result_f32x4 = vsubq_f32(one_f32x4, v_f32x4);
656 // Save to output.
657 vst1q_f32(result + v, result_f32x4);
658 }
659 for (; v < v_size; v++)
660 {
661 result[v] = 1.0f - vector[v];
662 }
663}
664
665inline void NeonSymmetricQuantizeFloats(const float *values, const int size,
666 int8_t *quantized_values, float *min, float *max,
667 float *scaling_factor)
668{
669 // TODO(raziel): vectorize min/max calculation.
670 auto minmax = std::minmax_element(values, values + size);
671 *min = *minmax.first;
672 *max = *minmax.second;
673 const int kScale = 127;
674 const float range = std::max(std::abs(*min), std::abs(*max));
675 if (range == 0)
676 {
677 memset(quantized_values, 0, size * sizeof(int8_t));
678 *scaling_factor = 1;
679 return;
680 }
681 *scaling_factor = range / kScale;
682 const float scaling_factor_inv = kScale / range;
683
684 const int postamble_start = size - (size & (2 * kFloatWeightsPerNeonLane - 1));
685
686 // Vectorized constants.
687 const float32x4_t q_factor_f32x4 = vmovq_n_f32(scaling_factor_inv);
688 const float32x4_t point5_f32x4 = vmovq_n_f32(0.5);
689 const float32x4_t zero_f32x4 = vmovq_n_f32(0.0);
690 const int32x4_t scale_i32x4 = vmovq_n_s32(kScale);
691 const int32x4_t neg_scale_i32x4 = vmovq_n_s32(-kScale);
692
693 for (int i = 0; i < postamble_start; i += 2 * kFloatWeightsPerNeonLane)
694 {
695 // Implements the vectorized version of the following:
696 // const int32_t quantized_value = static_cast<int32_t>(
697 // std::round(scaling_factor_inv * values[i]));
698 // Since the vectorized rounding intrinsic (vrndaq_f32) is not supported on
699 // all NEON flavors, we round as follows:
700 // if (x < 0) use (int)(x - 0.5); if (x >= 0) use (int)(x + 0.5).
701 float32x4_t value0_f32x4 = vld1q_f32(&values[i]);
702 float32x4_t value1_f32x4 = vld1q_f32(&values[i + kFloatWeightsPerNeonLane]);
703 float32x4_t mul0_f32x4 = vmulq_f32(value0_f32x4, q_factor_f32x4);
704 float32x4_t mul1_f32x4 = vmulq_f32(value1_f32x4, q_factor_f32x4);
705
706 int32x4_t cmp_with_zero0_ui32x4 = (int32x4_t)vcltq_f32(mul0_f32x4, zero_f32x4); // NOLINT
707 int32x4_t cmp_with_zero1_ui32x4 = (int32x4_t)vcltq_f32(mul1_f32x4, zero_f32x4); // NOLINT
708
709 float32x4_t cmp_with_zero0_f32x4 = vcvtq_f32_s32(cmp_with_zero0_ui32x4);
710 float32x4_t cmp_with_zero1_f32x4 = vcvtq_f32_s32(cmp_with_zero1_ui32x4);
711 cmp_with_zero0_f32x4 = vaddq_f32(cmp_with_zero0_f32x4, point5_f32x4);
712 cmp_with_zero1_f32x4 = vaddq_f32(cmp_with_zero1_f32x4, point5_f32x4);
713
714 mul0_f32x4 = vaddq_f32(mul0_f32x4, cmp_with_zero0_f32x4);
715 mul1_f32x4 = vaddq_f32(mul1_f32x4, cmp_with_zero1_f32x4);
716
717 int32x4_t f2i0_i32x4 = vcvtq_s32_f32(mul0_f32x4);
718 int32x4_t f2i1_i32x4 = vcvtq_s32_f32(mul1_f32x4);
719
720 // Implements the vectorized version of the following block:
721 // quantized_values[i] = std::min(kScale, std::max(-kScale,
722 // quantized_value));
723 int32x4_t max0_i32x4 = vmaxq_s32(f2i0_i32x4, neg_scale_i32x4);
724 int32x4_t max1_i32x4 = vmaxq_s32(f2i1_i32x4, neg_scale_i32x4);
725 int32x4_t min0_i32x4 = vminq_s32(max0_i32x4, scale_i32x4);
726 int32x4_t min1_i32x4 = vminq_s32(max1_i32x4, scale_i32x4);
727
728 int16x4_t min0_16x4 = vmovn_s32(min0_i32x4);
729 int16x4_t min1_16x4 = vmovn_s32(min1_i32x4);
730
731 int16x8_t min_16x8 = vcombine_s16(min0_16x4, min1_16x4);
732 int8x8_t min_s8x8 = vqmovn_s16(min_16x8);
733 vst1_s8(&quantized_values[i], min_s8x8);
734 }
735
736 for (int i = postamble_start; i < size; ++i)
737 {
738 const int32_t quantized_value =
739 static_cast<int32_t>(std::round(scaling_factor_inv * values[i]));
740 quantized_values[i] = std::min(kScale, std::max(-kScale, quantized_value));
741 }
742}
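// Worked example (illustrative, not part of the original code): for
// values = {-2.0f, 0.5f, 1.0f}, the range is 2.0, so *scaling_factor becomes
// 2.0 / 127 and scaling_factor_inv is 63.5. The quantized values are
// round(-2.0 * 63.5) = -127, round(0.5 * 63.5) = 32, and
// round(1.0 * 63.5) = 64, each clamped to [-127, 127].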
743
744inline void NeonMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ matrix,
745 const int m_rows, const int m_cols,
746 const int8_t *__restrict__ vectors,
747 const float *scaling_factors, int n_batch,
748 float *__restrict__ result, int result_stride)
749{
750#ifdef __aarch64__
751 if (HasSdotInstruction() && m_cols % 16 == 0 && m_rows % 2 == 0 && m_rows >= n_batch)
752 {
753 if (n_batch % 4 == 0 && result_stride == 1)
754 {
755 // Benchmarks suggest that it's always better to use the batch code
756 // when we can, even on small matrices.
757 DotprodMatrixBatchFourVectorMultiplyAccumulate(matrix, m_rows, m_cols, vectors,
758 scaling_factors, n_batch, result);
759 return;
760 }
761 else if (result_stride == 1 && n_batch >= 2 && m_rows * m_cols >= 128 * 128)
762 {
763 DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate(matrix, m_rows, m_cols, vectors,
764 scaling_factors, n_batch, result);
765 return;
766 }
767 }
768#endif // __aarch64__
769
770 static const int kWeightsPerUint32 = 4;
771 static const int kWeightsPerNeonLane = 16;
772 // Assuming *matrix is kWeightsPerUint32-byte aligned,
773 // every row of the matrix is also
774 // kWeightsPerUint32-byte aligned as long as cols is
775 // a multiple of kWeightsPerUint32. The assumption
776 // is currently satisfied by TFLite's 16-byte memory
777 // alignment scheme.
778 //
779 // Otherwise, we allocate an aligned memory block and set
780 // a flag to later copy rows from matrix to the block
781 // for aligned multiplication.
782 bool unaligned = false;
783 int8_t *aligned_row = nullptr;
784 void *aligned_row_free = nullptr;
785 if ((m_cols & (kWeightsPerUint32 - 1)) != 0)
786 {
787 unaligned = true;
788 aligned_row = (int8_t *)aligned_alloc(kWeightsPerUint32, m_cols, // NOLINT
789 &aligned_row_free);
790 }
791 void *aligned_vec_free = nullptr;
792 int8_t *aligned_vec = (int8_t *)aligned_alloc(kWeightsPerUint32, m_cols, // NOLINT
793 &aligned_vec_free);
794
795 // If m_cols is not a multiple of kWeightsPerNeonLane, we cannot process all
796 // of it in the main vectorized loop, and the remainder is handled
797 // sequentially. postamble_half_start is the index where the main loop stops.
798 // Between postamble_half_start and postamble_start we can still process
799 // (kWeightsPerNeonLane >> 1) elements at a time in vectorized form.
800 const int postamble_half_start = m_cols & ~(kWeightsPerNeonLane - 1);
801 const int postamble_start = m_cols & ~((kWeightsPerNeonLane >> 1) - 1);
802
803 for (int batch = 0; batch < n_batch; ++batch)
804 {
805 const float batch_scaling_factor = scaling_factors[batch];
806 // Copy the vector data to an aligned vector.
807 memcpy(aligned_vec, vectors + batch * m_cols, sizeof(int8_t) * m_cols);
808 // Compute dot-product for every column.
809 for (int row = 0; row < m_rows; ++row, result += result_stride)
810 {
811 // Get the address of the first element of the row.
812 int8_t *row_ptr = (int8_t *)matrix + row * m_cols; // NOLINT
813 if (unaligned)
814 {
815 memcpy(aligned_row, row_ptr, sizeof(int8_t) * m_cols);
816 row_ptr = aligned_row;
817 }
818
819 // Initialize the dot product sum for the row to 0.
820 int32x4_t dotprod_32x4 = vmovq_n_s32(0);
821
822 // Prefetch the row to cache.
823 __builtin_prefetch(row_ptr, 0 /* prefetch for read */, 3 /* temporal locality */);
824
825 // For every block of 16 8-bit elements.
826 int col = 0;
827 for (; col < postamble_half_start; col += kWeightsPerNeonLane)
828 {
829 // Load 16 8-bit values from the row and vector, each, to operate on.
830 // Here the assumption is that each buffer is 4-byte aligned. Otherwise,
831 // performance may suffer significantly.
832 assert( // NOLINT
833 ((uintptr_t)(&row_ptr[col]) & (kWeightsPerUint32 - 1)) == 0);
834 const int8x16_t s1_8x16 = vld1q_s8((const int8_t *)(aligned_vec + col));
835 const int8x16_t s2_8x16 = vld1q_s8((const int8_t *)(row_ptr + col));
836 // Multiply the low bits (i.e. the lower 8 8bit numbers in the
837 // registers).
838 int16x8_t prod_16x8 = vmull_s8(vget_low_s8(s1_8x16), vget_low_s8(s2_8x16));
839 // Multiply the high bits (i.e. the higher 8 8bit numbers in the
840 // registers), and accumulate with the result of the low bits product.
841 // The assumption here is that overflow will not happen because we quantize
842 // our values to the range [-127, 127]. As such, the sum of the two products
843 // is at most 2 * 127 * 127 = 32258 in absolute value, which fits in an
844 // int16 (limit 32767).
845 prod_16x8 = vmlal_s8(prod_16x8, vget_high_s8(s1_8x16), vget_high_s8(s2_8x16));
846
847 dotprod_32x4 = vpadalq_s16(dotprod_32x4, prod_16x8);
848 } // for col
849
850 // Half iteration dealing with only 8 elements
851 // TODO(raziel): if (ABSL_PREDICT_FALSE(col < postamble_start))
852 if (col < postamble_start)
853 {
854 // Load 8 8-bit values each from the row and the vector to operate on.
855 // Here the assumption is that each buffer is 4-byte aligned.
856 // Otherwise, performance may suffer significantly.
857 assert( // NOLINT
858 ((uintptr_t)(&row_ptr[col]) & (kWeightsPerUint32 - 1)) == 0);
859 const int8x8_t s1_8x8 = vld1_s8((const int8_t *)(aligned_vec + col));
860 const int8x8_t s2_8x8 = vld1_s8((const int8_t *)(row_ptr + col));
861 const int16x8_t prod_16x8 = vmull_s8(s1_8x8, s2_8x8);
862 dotprod_32x4 = vpadalq_s16(dotprod_32x4, prod_16x8);
863 col += (kWeightsPerNeonLane >> 1);
864 }
865 // Add the 4 intermediate sum values to get the final dot-prod value for
866 // this row.
867 int32_t dotprod = AccumulateNeonLane(dotprod_32x4);
868 // Postamble loop.
869 // TODO(raziel): if (ABSL_PREDICT_FALSE(col < m_cols))
870 for (; col < m_cols; ++col)
871 {
872 dotprod += row_ptr[col] * aligned_vec[col];
873 } // for col
874
875 *result += dotprod * batch_scaling_factor;
876 } // for row
877 } // for batch
878
879 if (unaligned)
880 {
881 free(aligned_row_free);
882 }
883 free(aligned_vec_free);
884}
885
886inline void NeonMatrixBatchVectorMultiplyAccumulate(const float *matrix, int m_rows, int m_cols,
887 const float *vector, int n_batch, float *result,
888 int result_stride)
889{
890 // If m_cols is not divisible by kFloatWeightsPerNeonLane, we cannot use the
891 // main vectorized loop, and we need to process the tail sequentially.
892 // postamble_start is the index where this happens.
893 const int postamble_start = m_cols - (m_cols & (kFloatWeightsPerNeonLane - 1));
894
895 for (int b = 0; b < n_batch; b++)
896 {
897 float *result_in_batch = result + b * m_rows * result_stride;
898 const float *vector_in_batch = vector + b * m_cols;
899 const float *matrix_row = matrix;
900
901 // Main matrix by vector multiplication loop
902 for (int r = 0; r < m_rows; r++)
903 {
904 float32x4_t acc_32x4 = vmovq_n_f32(0.0);
905 for (int c = 0; c < postamble_start; c += kFloatWeightsPerNeonLane)
906 {
907 // Load 4 float values from vector and matrix row.
908 float32x4_t vector_f32x4 = vld1q_f32(vector_in_batch + c);
909 float32x4_t matrix_f32x4 = vld1q_f32(matrix_row + c);
910 // Multiply the vector and matrix row and add to accumulator.
911 acc_32x4 = vmlaq_f32(acc_32x4, matrix_f32x4, vector_f32x4);
912 }
913 // Add the 4 intermediate sum values to get the final dot-prod value for
914 // this row.
915 *result_in_batch += (vgetq_lane_f32(acc_32x4, 0) + vgetq_lane_f32(acc_32x4, 1) +
916 vgetq_lane_f32(acc_32x4, 2) + vgetq_lane_f32(acc_32x4, 3));
917 for (int c = postamble_start; c < m_cols; c++)
918 {
919 *result_in_batch += matrix_row[c] * vector_in_batch[c];
920 }
921 matrix_row += m_cols;
922 result_in_batch += result_stride;
923 }
924 }
925}
926
927inline void NeonMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ matrix,
928 const int m_rows, const int m_cols,
929 const int8_t *__restrict__ vectors,
930 const float *scaling_factors, int n_batch,
931 int32_t *scratch, float *__restrict__ result,
932 int result_stride, ruy::Context *ruy_context)
933{
934 if (m_rows % 4 == 0 && result_stride == 1)
935 {
936 const int32_t *bias = static_cast<const int32_t *>(nullptr);
937 NeonCpuBackendGemm(vectors, bias, matrix, n_batch, m_cols, m_rows,
938 /*output_zp =*/0, scratch, ruy_context);
939
940 // Multiply by float scaling factors and write to result
941 const int total_size = n_batch * m_rows;
942 int i = 0;
943 for (; i <= total_size - 8; i += 8, result += 8 * result_stride)
944 {
945 const float batch_scaling_factor0 = scaling_factors[i / m_rows];
946 const float batch_scaling_factor1 = scaling_factors[(i + 4) / m_rows];
947 const float32x4_t scaling_factor0 = vdupq_n_f32(batch_scaling_factor0);
948 const float32x4_t scaling_factor1 = vdupq_n_f32(batch_scaling_factor1);
949 const int32x4_t scratch_val0 = vld1q_s32(scratch + i);
950 const int32x4_t scratch_val1 = vld1q_s32(scratch + i + 4);
951 const float32x4_t float_val0 = vcvtq_f32_s32(scratch_val0);
952 const float32x4_t float_val1 = vcvtq_f32_s32(scratch_val1);
953 const float32x4_t result0 = vmlaq_f32(vld1q_f32(result), float_val0, scaling_factor0);
954 const float32x4_t result1 =
955 vmlaq_f32(vld1q_f32(result + 4 * result_stride), float_val1, scaling_factor1);
956 vst1q_f32(result, result0);
957 vst1q_f32(result + 4 * result_stride, result1);
958 }
959 scratch += i;
960 for (; i < total_size; i++, result += result_stride)
961 {
962 const float batch_scaling_factor = scaling_factors[i / m_rows];
963 int32_t x = *(scratch++);
964 *result += x * batch_scaling_factor;
965 }
966 return;
967 }
968 NeonMatrixBatchVectorMultiplyAccumulate(matrix, m_rows, m_cols, vectors, scaling_factors, n_batch,
969 result, result_stride);
970}
971
972} // namespace cker
973} // namespace nnfw
974
975#endif // USE_NEON
976
977#endif // __NNFW_CKER_NEON_TENSOR_UTILS_H__