ONE - On-device Neural Engine
Transpose.h
/*
 * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
 * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef __NNFW_CKER_TRANSPOSE_H__
#define __NNFW_CKER_TRANSPOSE_H__

#include "cker/Shape.h"
#include "cker/Types.h"
#include "cker/Utils.h"

#include <cassert> // assert()
#include <cstring> // memcpy()

namespace nnfw
{
namespace cker
{
namespace reference
{

template <typename T>
void TransposeImpl(const TransposeParams &params, const Shape &unextended_input_shape,
                   const T *input_data, const Shape &unextended_output_shape, T *output_data)
{
  const int unextended_output_size = unextended_output_shape.DimensionsCount();
  assert(unextended_input_shape.DimensionsCount() <= 6);
  assert(unextended_output_size <= 6);
  assert(unextended_output_size == params.perm_count);
  const Shape input_shape = Shape::ExtendedShape(6, unextended_input_shape);
  const Shape output_shape = Shape::ExtendedShape(6, unextended_output_shape);
  const int input_ext_size = 6 - unextended_input_shape.DimensionsCount();
  const int output_ext_size = 6 - unextended_output_size;

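  // Build the rank-6 permutation: the dimensions prepended to the output map
  // to themselves, and the original permutation indices are shifted by the
  // number of dimensions prepended to the input.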
  int extended_perm[6];
  for (int i = 0; i < output_ext_size; ++i)
  {
    extended_perm[i] = i;
  }
  for (int i = 0; i < unextended_output_size; ++i)
  {
    extended_perm[i + output_ext_size] = params.perm[i] + input_ext_size;
  }

  int out_sizes[6];
  for (int k = 0; k < 6; k++)
  {
    out_sizes[k] = MatchingDim(input_shape, extended_perm[k], output_shape, k);
  }

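  // Visit every output coordinate with six nested loops; each output index
  // o[k] is routed back to input dimension extended_perm[k] to locate the
  // element to copy.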
  int o[6];
  int i[6];
  for (o[5] = 0; o[5] < out_sizes[5]; o[5]++)
  {
    i[extended_perm[5]] = o[5];
    for (o[4] = 0; o[4] < out_sizes[4]; o[4]++)
    {
      i[extended_perm[4]] = o[4];
      for (o[3] = 0; o[3] < out_sizes[3]; o[3]++)
      {
        i[extended_perm[3]] = o[3];
        for (o[2] = 0; o[2] < out_sizes[2]; o[2]++)
        {
          i[extended_perm[2]] = o[2];
          for (o[1] = 0; o[1] < out_sizes[1]; o[1]++)
          {
            i[extended_perm[1]] = o[1];
            for (o[0] = 0; o[0] < out_sizes[0]; o[0]++)
            {
              i[extended_perm[0]] = o[0];
              output_data[Offset(output_shape, o)] = input_data[Offset(input_shape, i)];
            }
          }
        }
      }
    }
  }
}

template <typename T>
void Transpose(const TransposeParams &params, const Shape &unextended_input_shape,
               const T *input_data, const Shape &unextended_output_shape, T *output_data)
{
  // The transpose kernel only rearranges values; it performs no numeric
  // evaluation on each cell. It is therefore safe to implement it once per
  // scalar size, a trick that keeps the total code size in a reasonable range.
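  // For example, float is dispatched to the 4-byte (int32_t) case below and
  // double to the 8-byte (int64_t) case.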
  switch (sizeof(T))
  {
    case 1:
      TransposeImpl<int8_t>(params, unextended_input_shape,
                            reinterpret_cast<const int8_t *>(input_data), unextended_output_shape,
                            reinterpret_cast<int8_t *>(output_data));
      break;
    case 2:
      TransposeImpl<int16_t>(params, unextended_input_shape,
                             reinterpret_cast<const int16_t *>(input_data), unextended_output_shape,
                             reinterpret_cast<int16_t *>(output_data));
      break;
    case 4:
      TransposeImpl<int32_t>(params, unextended_input_shape,
                             reinterpret_cast<const int32_t *>(input_data), unextended_output_shape,
                             reinterpret_cast<int32_t *>(output_data));
      break;
    case 8:
      TransposeImpl<int64_t>(params, unextended_input_shape,
                             reinterpret_cast<const int64_t *>(input_data), unextended_output_shape,
                             reinterpret_cast<int64_t *>(output_data));
      break;
  }
}
} // namespace reference

namespace
{

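// Returns true when the permutation is a pure rotation of the dimensions
// (perm[i] == (perm[0] + i) % rank). Such a transpose moves a leading block of
// dimensions behind the remaining ones, which is equivalent to transposing a
// 2-D matrix of shape [dim0, dim1], where dim0 is the product of the
// dimensions whose index is smaller than perm[0] and dim1 the product of the
// rest.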
bool IsTranspose2DApplicable(const TransposeParams &params, const Shape &input_shape, int *dim0,
                             int *dim1)
{
  const int dims_cnt = input_shape.DimensionsCount();

  if (dims_cnt == 2)
  {
    *dim0 = input_shape.Dims(0);
    *dim1 = input_shape.Dims(1);
    return true;
  }

  const int first_perm = params.perm[0];
  for (int i = 1; i < dims_cnt; ++i)
  {
    int rebased = params.perm[i] - first_perm;
    if (rebased < 0)
    {
      rebased += dims_cnt;
    }
    if (rebased != i)
    {
      return false;
    }
  }
  *dim0 = 1;
  *dim1 = 1;
  for (int i = 0; i < dims_cnt; ++i)
  {
    if (i < first_perm)
    {
      *dim0 *= input_shape.Dims(i);
    }
    else
    {
      *dim1 *= input_shape.Dims(i);
    }
  }
  return true;
}

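// Squeezes out all size-1 dimensions from both shapes and renumbers the
// permutation accordingly. Transposing fewer dimensions generally gives a
// better memory access pattern.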
void RemoveOneSizeDimensions(Shape *input_shape, Shape *output_shape, TransposeParams *params)
{
  const int dims_cnt = input_shape->DimensionsCount();
  assert(params->perm_count == dims_cnt);

  bool foundOneSizeDim = false;
  for (int i = 0; i < dims_cnt; ++i)
  {
    if (input_shape->Dims(i) == 1)
    {
      foundOneSizeDim = true;
      break;
    }
  }

  // Return here if there is no size-1 dimension.
  if (!foundOneSizeDim)
    return;

  // Handle the case where every dimension has size one.
  if (input_shape->FlatSize() == 1)
  {
    input_shape->Resize(1);
    input_shape->SetDim(0, 1);
    output_shape->Resize(1);
    output_shape->SetDim(0, 1);
    params->perm_count = 1;
    params->perm[0] = 0;
    return;
  }

  // Resize the input shape.
  int new_dims_cnt = 0;
  for (int i = 0; i < dims_cnt; ++i)
  {
    if (input_shape->Dims(i) == 1)
    {
      continue;
    }
    input_shape->SetDim(new_dims_cnt, input_shape->Dims(i));
    ++new_dims_cnt;
  }
  input_shape->Resize(new_dims_cnt);

  // Resize the output shape and re-calculate the perm parameter.
  TransposeParams new_params;
  new_dims_cnt = 0;
  for (int i = 0; i < dims_cnt; ++i)
  {
    if (output_shape->Dims(i) == 1)
    {
      continue;
    }
    new_params.perm[new_dims_cnt] = params->perm[i];
    output_shape->SetDim(new_dims_cnt, output_shape->Dims(i));
    ++new_dims_cnt;
  }
  output_shape->Resize(new_dims_cnt);
  new_params.perm_count = new_dims_cnt;

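  // Renumber the surviving permutation entries into the range
  // [0, new_dims_cnt) while preserving their relative order: pass i replaces
  // the smallest remaining value that is >= i with i.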
  for (int i = 0; i < new_dims_cnt; ++i)
  {
    int min_val_idx = -1;
    for (int j = 0; j < new_dims_cnt; ++j)
    {
      if (new_params.perm[j] >= i &&
          (min_val_idx == -1 || new_params.perm[min_val_idx] > new_params.perm[j]))
      {
        min_val_idx = j;
      }
    }
    new_params.perm[min_val_idx] = i;
  }
  *params = new_params;
}

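// Splits off the leading dimensions that are not permuted (perm[i] == i).
// Those dimensions act as a batch: only the trailing block needs an actual
// transpose. The shrunk shapes and perm describe that block, and the returned
// value is its element count (the stride between consecutive batches).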
size_t Flatten(const Shape &input_shape, const Shape &output_shape, const TransposeParams &params,
               Shape *non_flatten_input_shape, Shape *non_flatten_output_shape,
               TransposeParams *non_flatten_params)
{
  // Calculate the total size of the non-flattened dimensions.
  int skip_dims_cnt = 0;
  size_t flat_size = input_shape.FlatSize();
  for (int i = 0; i < params.perm_count; ++i)
  {
    if (params.perm[i] == i)
    {
      flat_size /= input_shape.Dims(i);
      ++skip_dims_cnt;
    }
    else
    {
      break;
    }
  }

  // Shrink the shapes and re-calculate the perm parameter.
  const int new_dims_cnt = params.perm_count - skip_dims_cnt;
  non_flatten_input_shape->Resize(new_dims_cnt);
  non_flatten_output_shape->Resize(new_dims_cnt);
  non_flatten_params->perm_count = new_dims_cnt;

  for (int i = skip_dims_cnt; i < params.perm_count; ++i)
  {
    non_flatten_input_shape->SetDim(i - skip_dims_cnt, input_shape.Dims(i));
    non_flatten_output_shape->SetDim(i - skip_dims_cnt, output_shape.Dims(i));
    non_flatten_params->perm[i - skip_dims_cnt] = params.perm[i];
  }
  for (int i = 0; i < new_dims_cnt; ++i)
  {
    int min_val_idx = -1;
    for (int j = 0; j < new_dims_cnt; ++j)
    {
      if (non_flatten_params->perm[j] >= i &&
          (min_val_idx == -1 ||
           non_flatten_params->perm[min_val_idx] > non_flatten_params->perm[j]))
      {
        min_val_idx = j;
      }
    }
    non_flatten_params->perm[min_val_idx] = i;
  }

  return flat_size;
}

} // namespace

// Transpose2D only deals with typical 2D matrix transpose ops.
// Perform transpose by transposing 4x4 blocks of the input, proceeding from
// left to right (down the rows) of the input, and then from top to bottom.
template <typename T>
inline void Transpose2D(const Shape &input_shape, const T *input_data,
                        [[maybe_unused]] const Shape &output_shape, T *output_data)
{
  assert(input_shape.DimensionsCount() == 2);
  assert(output_shape.DimensionsCount() == 2);

  const int d0 = input_shape.DimsData()[0];
  const int d1 = input_shape.DimsData()[1];
  const int kLines = 4;
  const int kSkipSize = (kLines - 1) * d1;

  const T *input = input_data;

  int i = 0;
  for (; i <= d0 - kLines; i += kLines)
  {
    T *output = output_data + i;

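    // Prefetch the four input rows that the 4x4 block loop below is about to
    // read, so they are resident in cache.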
    const T *input_ptr = input;
    optimized_ops_preload_l1_keep(input_ptr);
    input_ptr += d1;
    optimized_ops_preload_l1_keep(input_ptr);
    input_ptr += d1;
    optimized_ops_preload_l1_keep(input_ptr);
    input_ptr += d1;
    optimized_ops_preload_l1_keep(input_ptr);

    int j = 0;
    for (; j <= d1 - kLines; j += kLines)
    {
      input_ptr = input;
      const T a00 = input_ptr[0];
      const T a01 = input_ptr[1];
      const T a02 = input_ptr[2];
      const T a03 = input_ptr[3];
      input_ptr += d1;
      const T a10 = input_ptr[0];
      const T a11 = input_ptr[1];
      const T a12 = input_ptr[2];
      const T a13 = input_ptr[3];
      input_ptr += d1;
      const T a20 = input_ptr[0];
      const T a21 = input_ptr[1];
      const T a22 = input_ptr[2];
      const T a23 = input_ptr[3];
      input_ptr += d1;
      const T a30 = input_ptr[0];
      const T a31 = input_ptr[1];
      const T a32 = input_ptr[2];
      const T a33 = input_ptr[3];

      output[0] = a00;
      output[1] = a10;
      output[2] = a20;
      output[3] = a30;
      output += d0;

      output[0] = a01;
      output[1] = a11;
      output[2] = a21;
      output[3] = a31;
      output += d0;

      output[0] = a02;
      output[1] = a12;
      output[2] = a22;
      output[3] = a32;
      output += d0;

      output[0] = a03;
      output[1] = a13;
      output[2] = a23;
      output[3] = a33;
      output += d0;

      input += kLines;
    }
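    // If d1 is not a multiple of kLines, copy the remaining columns of this
    // four-row block element by element; in both branches, input ends up at
    // the start of the next four-row block.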
    if (j == d1)
    {
      input += kSkipSize;
    }
    else
    {
      for (int p = 0; p < kLines; ++p)
      {
        for (int q = 0; q < d1 - j; ++q)
        {
          *(output + q * d0 + p) = *(input + p * d1 + q);
        }
      }
      input += (d1 - j) + kSkipSize;
    }
  }
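  // Copy any remaining rows (fewer than kLines) one element at a time.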
  for (; i < d0; ++i)
  {
    T *output = output_data + i;
    for (int j = 0; j < d1; ++j)
    {
      *output = *input;
      output += d0;
      ++input;
    }
  }
}

// TODO(alanchiao): see if we can reduce the number
// of lines of code in branching without affecting latency.
template <typename T>
inline void Transpose3D(const TransposeParams &params, const Shape &input_shape,
                        const T *input_data, const Shape &, T *output_data)
{
  int s2, s3;
  s2 = input_shape.Dims(1);
  s3 = input_shape.Dims(2);

  int p1 = 0;
  int p2 = 0;
  int p3 = 0;

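  // The input strides of dimensions 0, 1 and 2 are s2 * s3, s3 and 1. The
  // three if/else chains below assign those strides so that p1, p2 and p3 hold
  // the input strides of the dimensions that become output dimensions 0, 1
  // and 2.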
  if (params.perm[0] == 2)
  {
    p1 = 1;
  }
  else if (params.perm[1] == 2)
  {
    p2 = 1;
  }
  else
  {
    p3 = 1;
  }

  if (params.perm[0] == 1)
  {
    p1 = s3;
  }
  else if (params.perm[1] == 1)
  {
    p2 = s3;
  }
  else
  {
    p3 = s3;
  }

  if (params.perm[0] == 0)
  {
    p1 = s2 * s3;
  }
  else if (params.perm[1] == 0)
  {
    p2 = s2 * s3;
  }
  else
  {
    p3 = s2 * s3;
  }

  int o_s[3];
  o_s[0] = input_shape.Dims(params.perm[0]);
  o_s[1] = input_shape.Dims(params.perm[1]);
  o_s[2] = input_shape.Dims(params.perm[2]);

  for (int i1 = 0; i1 < o_s[0]; ++i1)
  {
    for (int i2 = 0; i2 < o_s[1]; ++i2)
    {
      for (int i3 = 0; i3 < o_s[2]; ++i3)
      {
        const int i = i1 * p1 + i2 * p2 + i3 * p3;
        const int o = i1 * o_s[1] * o_s[2] + i2 * o_s[2] + i3;
        output_data[o] = input_data[i];
      }
    }
  }
}

template <typename T>
void TransposeImpl(const TransposeParams &params, const Shape &input_shape, const T *input_data,
                   const Shape &output_shape, T *output_data)
{
  const int dims_cnt = input_shape.DimensionsCount();

  int dim0, dim1;
  if (IsTranspose2DApplicable(params, input_shape, &dim0, &dim1))
  {
    Transpose2D(Shape({dim0, dim1}), input_data, Shape({dim1, dim0}), output_data);
    return;
  }

  // TODO(b/141217325): notably Eigen is better suited for
  // larger inputs whereas Transpose3D is generally
  // better for smaller ones.
  //
  // E.g. on Nexus 5, Eigen is better for size 96^3 and up
  // and Transpose3D is better for 72^3 and down.
  //
  // 96^3 is not mobile-friendly for certain usecases
  // (e.g. model used in beam search for seq2seq) but is in others.
  // Consider tradeoffs.
  if (dims_cnt == 3)
  {
    Transpose3D(params, input_shape, input_data, output_shape, output_data);
    return;
  }

  // Reroute to the reference version if an optimized method for the given data
  // is not available.
  reference::Transpose(params, input_shape, input_data, output_shape, output_data);
}

template <typename T>
void Transpose(const TransposeParams &unshrunk_params, const Shape &unshrunk_input_shape,
               const T *input_data, const Shape &unshrunk_output_shape, T *output_data)
{
  const int output_size = unshrunk_output_shape.DimensionsCount();
  assert(unshrunk_input_shape.DimensionsCount() <= 6);
  assert(output_size <= 6);
  assert(output_size == unshrunk_params.perm_count);

  Shape shrunk_input_shape = Shape(unshrunk_input_shape);
  Shape shrunk_output_shape = Shape(unshrunk_output_shape);
  TransposeParams shrunk_params = unshrunk_params;

  // Remove any size-1 dimensions. A lower-rank transpose usually performs
  // better since memory access patterns are improved.
  RemoveOneSizeDimensions(&shrunk_input_shape, &shrunk_output_shape, &shrunk_params);

  // Handle identity cases.
  // TODO(b/140779653): Add an optimization pass in the conversion process to
  // remove transpose op nodes that do nothing, like the one handled below.
  bool identical = true;
  for (int i = 0; i < shrunk_params.perm_count; ++i)
  {
    if (shrunk_params.perm[i] != i)
    {
      identical = false;
      break;
    }
  }
  if (identical)
  {
    memcpy(output_data, input_data, unshrunk_input_shape.FlatSize() * sizeof(T));
    return;
  }

  // Reduce dimensions by flattening.
  if (shrunk_params.perm[0] == 0 && output_size >= 3)
  {
    Shape non_flatten_input_shape;
    Shape non_flatten_output_shape;
    TransposeParams non_flatten_params;
    const int total_size = shrunk_input_shape.FlatSize();
    const int non_flatten_size =
      Flatten(shrunk_input_shape, shrunk_output_shape, shrunk_params,
              &non_flatten_input_shape, &non_flatten_output_shape, &non_flatten_params);
    assert(non_flatten_params.perm[0] != 0);

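    // Transpose each batch block independently; non_flatten_size is the number
    // of elements in one block, so the input and output pointers advance by
    // whole blocks.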
    for (int i = 0; i < total_size; i += non_flatten_size)
    {
      TransposeImpl(non_flatten_params, non_flatten_input_shape, input_data + i,
                    non_flatten_output_shape, output_data + i);
    }
    return;
  }

  // Call the non-flattened case.
  TransposeImpl(shrunk_params, shrunk_input_shape, input_data, shrunk_output_shape,
                output_data);
}

} // namespace cker
} // namespace nnfw

#endif // __NNFW_CKER_TRANSPOSE_H__
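
A minimal usage sketch, not part of the header above: it relies only on the TransposeParams fields and the Shape initializer-list constructor already used in this file; the include path and the concrete shapes and values are illustrative assumptions.

// Example: transpose a row-major 2x3 float matrix into a 3x2 one.
#include "cker/operation/Transpose.h" // assumed install path of this header

int main()
{
  const float input[6] = {1, 2, 3, 4, 5, 6}; // [[1, 2, 3], [4, 5, 6]]
  float output[6] = {};                      // receives [[1, 4], [2, 5], [3, 6]]

  nnfw::cker::TransposeParams params;
  params.perm_count = 2;
  params.perm[0] = 1; // output dimension 0 comes from input dimension 1
  params.perm[1] = 0; // output dimension 1 comes from input dimension 0

  nnfw::cker::Transpose(params, nnfw::cker::Shape({2, 3}), input,
                        nnfw::cker::Shape({3, 2}), output);
  return 0;
}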