ONE - On-device Neural Engine
FusedBatchNorm.h
/*
 * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
 * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef __NNFW_CKER_FUSEDBATCHNORM_H__
#define __NNFW_CKER_FUSEDBATCHNORM_H__

#include "cker/Types.h"
#include "cker/Shape.h"
#include "cker/Utils.h"

#include "cker/eigen/EigenSupport.h"

#include "Transpose.h"
#include "BatchMatMul.h"

#include <string>
#include <vector>
#include <map>
#include <numeric>
#include <algorithm>

namespace nnfw
{
namespace cker
{

class FusedBatchNorm
{
public:
  FusedBatchNorm() : _prepared(false)
  {
    // DO NOTHING
  }

  void prepare() { _prepared = true; }

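  // Training-mode fused batch normalization over an NHWC input: batch mean and
  // variance are computed from the input itself, then used to normalize it and
  // apply the per-channel scale and offset.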
  void operator()(const std::vector<Shape> &input_shapes,
                  const std::vector<const float *> &input_data, const Shape &output_shape,
                  float *output_data, FusedBatchNormParams param)
  {
    // TODO: support fused_batch_norm if is_training is false
    assert(param.is_training == true);

    // TODO: support case where dim[1] != 1 or dim[3] != 1.
    // Here we only support an input tensor of [B, 1, X, 1] shape
    assert(input_shapes[0].Dims(1) == 1 && input_shapes[0].Dims(3) == 1);

    if (!_prepared)
    {
      prepare();
    }

    Tensor transformed_input[5];
    Tensor transformed_output;

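    // Copy each input (x at index 0, scale at 1, offset at 2, ...) into an
    // Eigen-backed temporary so it can be viewed through an Eigen TensorMap.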
    const int num_inputs = input_shapes.size();
    std::vector<InputTensor<float>> inputs(num_inputs);
    for (int i = 0; i < num_inputs; i++)
    {
      inputs[i].shape.ReplaceWith(input_shapes[i].DimensionsCount(), input_shapes[i].DimsData());
      inputs[i].buffer = input_data[i];
      copyFrom<float>(inputs[i], inputs[i].shape, &transformed_input[i]);
    }

    InputTensor<float> output;
    output.shape.ReplaceWith(output_shape.DimensionsCount(), output_shape.DimsData());
    output.buffer = output_data;
    copyFrom<float>(output, output.shape, &transformed_output);

    // TODO: support transpose if data_format is NCHW
    // Here, Eigen uses a RowMajor kernel (NHWC)

    typename TTypes<float, 4>::Tensor x(transformed_input[0].shaped<float, 4>());
    typename TTypes<float, 4>::Tensor y(transformed_output.shaped<float, 4>());
    typename TTypes<float, 1>::Tensor scale(transformed_input[1].shaped<float, 1>());
    typename TTypes<float, 1>::Tensor offset(transformed_input[2].shaped<float, 1>());

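    // Collapse [B, H, W, C] into [B*H*W, C] so that reducing over dimension 0
    // yields per-channel statistics.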
    const int depth = x.dimension(3);
    const int size = x.size();
    const int rest_size = size / depth;
    Eigen::DSizes<Eigen::Index, 2> rest_by_depth(rest_size, depth);

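    // [1, C] shape and broadcast spec used to expand per-channel values back
    // over all B*H*W rows.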
    Eigen::DSizes<Eigen::Index, 2> one_by_depth(1, depth);
    Eigen::array<int, 1> reduce_dims({0});
    Eigen::array<int, 2> bcast_spec({rest_size, 1});

    auto x_rest_by_depth = x.reshape(rest_by_depth).template cast<float>();
    const int rest_size_minus_one = (rest_size > 1) ? (rest_size - 1) : 1;
    float rest_size_inv = static_cast<float>(1.0f / static_cast<float>(rest_size));
    // This adjustment is for Bessel's correction
    [[maybe_unused]] float rest_size_adjust =
      static_cast<float>(rest_size) / static_cast<float>(rest_size_minus_one);

    Eigen::Tensor<float, 1, Eigen::RowMajor> batch_mean(depth);
    Eigen::Tensor<float, 1, Eigen::RowMajor> batch_variance(depth);

    const Eigen::ThreadPoolDevice &d = *eigen_support::GetThreadPoolDevice();

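    // mean = E[x] per channel; variance = E[(x - mean)^2] per channel.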
    batch_mean.device(d) = (x_rest_by_depth.sum(reduce_dims) * rest_size_inv);
    auto x_centered = x_rest_by_depth - batch_mean.reshape(one_by_depth).broadcast(bcast_spec);

    batch_variance.device(d) = x_centered.square().sum(reduce_dims) * rest_size_inv;
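    // Fold normalization and scaling into one factor: scale / sqrt(var + epsilon).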
    auto scaling_factor = ((batch_variance + param.epsilon).rsqrt() * scale)
                            .eval()
                            .reshape(one_by_depth)
                            .broadcast(bcast_spec);
    auto x_scaled = x_centered * scaling_factor;
    auto x_shifted =
      (x_scaled + offset.reshape(one_by_depth).broadcast(bcast_spec)).template cast<float>();

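    // y = (x - mean) * scaling_factor + offset, evaluated through the output
    // map and then copied into the caller's buffer.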
    y.reshape(rest_by_depth).device(d) = x_shifted;

    memcpy(output_data, y.data(), output_shape.FlatSize() * sizeof(float));
  }

  template <typename T>
  void copyFrom(const InputTensor<T> &input, const Shape &shape, Tensor *output)
  {
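    // The scratch storage is always float[]; in this file the template is only
    // instantiated with T = float.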
    Tensor temp_tensor;
    temp_tensor.shape.ReplaceWith(input.shape.DimensionsCount(), input.shape.DimsData());
    temp_operand.emplace_back(std::make_unique<float[]>(input.shape.FlatSize()));
    temp_tensor.buffer = temp_operand.back().get();
    memcpy(temp_tensor.buffer, input.buffer, input.shape.FlatSize() * sizeof(float));

    copyFrom(temp_tensor, shape, output);
  }

  void copyFrom(const Tensor &input, const Shape &shape, Tensor *output)
  {
    if (output->copyFrom(input, shape))
      return;

    throw std::runtime_error{"FusedBatchNorm: Encountered error while reshaping a Tensor"};
  }

private:
  bool _prepared;
  std::vector<std::unique_ptr<float[]>> temp_operand;
};

} // namespace cker
} // namespace nnfw

#endif // __NNFW_CKER_FUSEDBATCHNORM_H__
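
A minimal usage sketch, not part of the original header: it exercises the training path above with a [B, 1, X, 1] input, as the asserts require, and passes just x, scale, and offset, which is all operator() reads. The is_training and epsilon fields come from the header's own use of FusedBatchNormParams; the include path, the initializer-list Shape construction, and the concrete shape values are assumptions for illustration.

#include <vector>
#include "cker/operation/FusedBatchNorm.h"

int main()
{
  using namespace nnfw::cker;

  // x has shape [B, 1, X, 1] = [1, 1, 4, 1]; depth (the last dim) is 1, so
  // scale and offset each hold a single per-channel value.
  std::vector<float> x{0.f, 1.f, 2.f, 3.f};
  std::vector<float> scale{1.f};  // gamma
  std::vector<float> offset{0.f}; // beta
  std::vector<float> y(x.size());

  FusedBatchNormParams param{};
  param.is_training = true; // only the training path is implemented
  param.epsilon = 1e-5f;

  FusedBatchNorm fbn;
  fbn({Shape{1, 1, 4, 1}, Shape{1}, Shape{1}},
      {x.data(), scale.data(), offset.data()}, Shape{1, 1, 4, 1}, y.data(), param);
  // y now holds (x - batch_mean) / sqrt(batch_var + epsilon) * scale + offset.
  return 0;
}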