ONE - On-device Neural Engine
redux_functor.h
/*
 * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef __NNFW_CKER_EIGEN_REDUX_FUNCTOR_H__
#define __NNFW_CKER_EIGEN_REDUX_FUNCTOR_H__

#include "cker/operation/Helper/Tensor.h"

// From tensorflow/core/kernels/redux_functor.h
namespace nnfw
{
namespace cker
{
namespace functor
{

// Compute reduction over outer dimensions.
// Example:
//   input: [D1, D2, ... , DN]
//   ->
//   output: [Di, ... , DN] where i belongs to set [1,N]
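//
// For instance, with input [2, 3, 4] and output [4], the 2 * 3 = 6 outer
// slices of length 4 are combined element-wise with BinaryFunctor.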
template <typename Device, typename InputT, typename AccumT, typename OutputT,
          typename BinaryFunctor>
struct ReduceOuterDimensions
{

  template <int num_dims>
  void operator()(const Device &device, const Eigen::DSizes<Eigen::Index, num_dims> &input_dims,
                  const Tensor &input, Tensor *output) const
  {
    // Compute inner and outer dim after reshaping into 2d tensor.
    const int num_output_dims = output->shape.DimensionsCount();
    auto output_dims = output->template flat<OutputT>().dimensions();

    Eigen::Index inner_dim = 1, outer_dim = 1;
    for (int i = 0; i < num_dims - num_output_dims; ++i)
      outer_dim *= input_dims[i];
    for (int i = num_dims - num_output_dims; i < num_dims; ++i)
      inner_dim *= input_dims[i];
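    // e.g., num_dims = 3, input_dims = [2, 3, 4], num_output_dims = 1:
    // outer_dim = 2 * 3 = 6, inner_dim = 4.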

    if (1 == outer_dim)
    {
      // Nothing to do but pass the input through to the output.
      output->template flat<OutputT>() =
        input.template flat<InputT>().template cast<OutputT>().reshape(output_dims);
      return;
    }

    // Get device thread num.
    const Eigen::Index num_threads = device.numThreads();

    // If the inner dim is large enough, parallelize along the inner dimension.
    // TODO(ezhulenev): There seems to be no benefits in going this route. Check
    // if this can be improved, or use better heuristic?
    if (inner_dim > num_threads * 32)
    {
      // Do not create more blocks than there are threads in a pool.
      const Eigen::Index num_blocks = num_threads;

      // Block size along the inner dimension.
      const Eigen::Index inner_block_size = Eigen::divup(inner_dim, num_blocks);
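      // e.g., inner_dim = 100000 with 4 threads: threshold = 4 * 32 = 128,
      // so num_blocks = 4 and inner_block_size = divup(100000, 4) = 25000.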
      const InputT *input_data = input.template flat<InputT>().data();

      // Allocate temporary buffer for partial reductions.
      Eigen::Tensor<AccumT, 1, Eigen::RowMajor, Eigen::Index> buffer({inner_dim});
      buffer.setZero();
      AccumT *buffer_data = buffer.data();

      using Buffer =
        Eigen::TensorMap<Eigen::Tensor<AccumT, 1, Eigen::RowMajor, Eigen::Index>, Eigen::Unaligned>;

      using Input = Eigen::TensorMap<Eigen::Tensor<const InputT, 1, Eigen::RowMajor, Eigen::Index>,
                                     Eigen::Unaligned>;

      const auto compute = [inner_dim, outer_dim, inner_block_size, input_data,
                            buffer_data](Eigen::Index start, Eigen::Index limit) -> void {
        Eigen::Index inner_dim_start = start * inner_block_size;
        Eigen::Index inner_dim_limit = limit * inner_block_size;
        inner_dim_limit = std::min(inner_dim, inner_dim_limit);
        Eigen::Index my_job_len = inner_dim_limit - inner_dim_start;

        const InputT *my_job_start = input_data + inner_dim_start;
        Buffer buf(buffer_data + inner_dim_start, my_job_len);

        for (Eigen::Index i = 0; i < outer_dim; ++i)
        {
          auto in = Input(my_job_start + i * inner_dim, my_job_len);
          auto cast = in.template cast<AccumT>();
          buf =
            Eigen::TensorCwiseBinaryOp<BinaryFunctor, const decltype(buf), const decltype(cast)>(
              buf, cast);
        }
      };
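      // Each task accumulates all outer_dim rows into its own disjoint
      // [inner_dim_start, inner_dim_limit) slice of buffer, so the tasks
      // need no synchronization.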

      // Compute cost of reducing a single block.
      const Eigen::Index compute_size = outer_dim * inner_block_size;
      const Eigen::Index compute_input_bytes = compute_size * sizeof(InputT);
      const Eigen::TensorOpCost cost(compute_input_bytes,
                                     0, // We'll be mostly writing to L1, assume store cost is 0
                                     compute_size *
                                       Eigen::internal::functor_traits<BinaryFunctor>::Cost);
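      // TensorOpCost(bytes_loaded, bytes_stored, compute_cycles); parallelFor
      // uses it to decide how finely to split the num_blocks range over threads.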

      device.parallelFor(num_blocks, cost, compute);

      // Write final result to the output.
      output->template flat<OutputT>() = buffer.template cast<OutputT>().reshape(output_dims);
    }
    else
    {
      // Compute block size along the outer dimension for efficiency.
      const Eigen::Index parallel_cell_size = inner_dim;
      const Eigen::Index total_workload = outer_dim * inner_dim;
      const Eigen::Index max_parallelism = total_workload / parallel_cell_size;

      const Eigen::Index min_block_workload = 2000;
      const Eigen::Index min_block_size = Eigen::divup(min_block_workload, parallel_cell_size);
      const Eigen::Index max_num_blocks =
        std::min(max_parallelism, Eigen::divup(total_workload, min_block_size));

      // Do not create more blocks than there are threads in a pool.
      const Eigen::Index num_blocks = std::min(max_num_blocks, num_threads);

      // Block size along the outer dimension.
      const Eigen::Index outer_block_size = Eigen::divup(outer_dim, num_blocks);
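      // e.g., inner_dim = 64, outer_dim = 10000, 4 threads:
      // min_block_size = divup(2000, 64) = 32, max_num_blocks = 10000,
      // num_blocks = 4, outer_block_size = divup(10000, 4) = 2500.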

      const InputT *input_data = input.template flat<InputT>().data();

      // Allocate temporary buffer for partial reductions.
      std::vector<AccumT> buffer(num_blocks * inner_dim);
      AccumT *buffer_data = buffer.data();

      using Buffer =
        Eigen::TensorMap<Eigen::Tensor<AccumT, 1, Eigen::RowMajor, Eigen::Index>, Eigen::Unaligned>;

      using Input = Eigen::TensorMap<Eigen::Tensor<const InputT, 1, Eigen::RowMajor, Eigen::Index>,
                                     Eigen::Unaligned>;

      const auto compute = [inner_dim, outer_block_size, buffer_data, input_data,
                            outer_dim](Eigen::Index start, Eigen::Index limit) -> void {
        Eigen::Index outer_dim_start = start * outer_block_size;
        Eigen::Index outer_dim_limit = limit * outer_block_size;
        outer_dim_limit = std::min(outer_dim, outer_dim_limit);

        Buffer buf(buffer_data + start * inner_dim, inner_dim);
        for (Eigen::Index i = outer_dim_start; i < outer_dim_limit; ++i)
        {
          auto in = Input(input_data + i * inner_dim, inner_dim);
          auto cast = in.template cast<AccumT>();
          buf =
            Eigen::TensorCwiseBinaryOp<BinaryFunctor, const decltype(buf), const decltype(cast)>(
              buf, cast);
        }
      };
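      // Each task reduces its contiguous range of outer rows into a private
      // inner_dim-sized slot of buffer; the slots are merged below.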

      // Compute cost of reducing a single block.
      const Eigen::Index compute_size = outer_block_size * inner_dim;
      const Eigen::Index compute_input_bytes = compute_size * sizeof(InputT);
      const Eigen::TensorOpCost cost(compute_input_bytes,
                                     0, // We'll be mostly writing to L1, assume store cost is 0
                                     compute_size *
                                       Eigen::internal::functor_traits<BinaryFunctor>::Cost);

      device.parallelFor(num_blocks, cost, compute);

      // Aggregate partial results from temporary buffer into first block.
      auto buf0 = Buffer(buffer_data, inner_dim);
      // Just sum the buffers up, as the inner dimension is not large in this case.
      for (int i = 1; i < num_blocks; ++i)
      {
        auto buf = Buffer(buffer_data + i * inner_dim, inner_dim);
        buf0 = Eigen::TensorCwiseBinaryOp<BinaryFunctor, const decltype(buf0), const decltype(buf)>(
          buf0, buf);
      }
      // Write final result to the output.
      output->template flat<OutputT>() = buf0.template cast<OutputT>().reshape(output_dims);
    }
  }
};

void biasReductionHelper(float *input_backprop_buffer, const Shape &input_backprop_shape,
                         float *bias_grad_buffer, const Shape &bias_grad_shape)
{
  assert(input_backprop_buffer);
  assert(bias_grad_buffer);

  const nnfw::cker::functor::ReduceOuterDimensions<Eigen::ThreadPoolDevice, float, float, float,
                                                   Eigen::internal::scalar_sum_op<float>>
    redux;

  const Tensor input_backprop_t{input_backprop_shape, static_cast<void *>(input_backprop_buffer)};

  Tensor bias_grad_t{bias_grad_shape, bias_grad_buffer};

  int outer = 1;
  for (int i = 0; i < input_backprop_shape.DimensionsCount() - 1; ++i)
    outer *= input_backprop_shape.Dims(i);
  int inner = input_backprop_shape.Dims(input_backprop_shape.DimensionsCount() - 1);

  redux(*eigen_support::GetThreadPoolDevice(), Eigen::DSizes<Eigen::Index, 2>{outer, inner},
        input_backprop_t, &bias_grad_t);
}

} // namespace functor
} // namespace cker
} // namespace nnfw

#endif // __NNFW_CKER_EIGEN_REDUX_FUNCTOR_H__
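A minimal usage sketch, not part of the header above: assuming the header is reachable as "cker/eigen/redux_functor.h" and that cker's Shape accepts an initializer list of dimensions, the call below sums a hypothetical [2, 3, 4] gradient tensor over its two outer dimensions into a length-4 bias gradient. The function name, shapes, and buffer values are invented for illustration.

#include <vector>

#include "cker/eigen/redux_functor.h" // assumed include path

void bias_grad_example()
{
  // Hypothetical shapes: input gradient [2, 3, 4], bias gradient [4].
  const nnfw::cker::Shape input_backprop_shape{2, 3, 4};
  const nnfw::cker::Shape bias_grad_shape{4};

  std::vector<float> input_backprop(2 * 3 * 4, 1.0f); // dummy gradient values
  std::vector<float> bias_grad(4, 0.0f);

  // Sums the 2 * 3 = 6 outer slices of length 4; with all-ones input every
  // bias_grad element comes out as 6.0f.
  nnfw::cker::functor::biasReductionHelper(input_backprop.data(), input_backprop_shape,
                                           bias_grad.data(), bias_grad_shape);
}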