ONE - On-device Neural Engine
Loading...
Searching...
No Matches
training_ops.h
Go to the documentation of this file.
1/*
2 * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved
3 * Copyright 2016 The TensorFlow Authors. All Rights Reserved.
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18#ifndef __NNFW_CKER_EIGEN_TRAINING_OPS_H__
19#define __NNFW_CKER_EIGEN_TRAINING_OPS_H__
20
21// From tensorflow/core/kernels/training_ops.cc
22#define EIGEN_USE_THREADS
23
24#include "unsupported/Eigen/CXX11/Tensor"
26
27// From tensorflow/core/kernels/training_ops.h
28namespace nnfw
29{
30namespace cker
31{
32namespace training_ops
33{
34namespace functor
35{
36
37template <typename Device, typename T> struct ApplyAdam
38{
39 void operator()(const Device &d, typename TTypes<T>::Flat var, typename TTypes<T>::Flat m,
40 typename TTypes<T>::Flat v, typename TTypes<T>::ConstScalar beta1_power,
41 typename TTypes<T>::ConstScalar beta2_power, typename TTypes<T>::ConstScalar lr,
42 typename TTypes<T>::ConstScalar beta1, typename TTypes<T>::ConstScalar beta2,
43 typename TTypes<T>::ConstScalar epsilon, typename TTypes<T>::ConstFlat grad,
44 bool use_nesterov);
45};
46
47// Each training algorithm has a ApplyXYZ functor struct declared in
48// this header file. They are specialized for different devices
49// (CPUDevice in training_ops.cc or GPUDevice in training_ops_gpu.cc).
50template <typename Device, typename T> struct ApplyGradientDescent
51{
52 void operator()(const Device &d, typename TTypes<T>::Flat var,
53 typename TTypes<T>::ConstScalar alpha, typename TTypes<T>::ConstFlat delta);
54};
55
56} // namespace functor
57} // namespace training_ops
58} // namespace cker
59} // namespace nnfw
60
61// From tensorflow/core/kernels/training_ops.cc
62namespace nnfw
63{
64namespace cker
65{
66namespace training_ops
67{
68
69// Enable CPUDevice only for training_ops
// CPUDevice names Eigen's thread-pool device; the functor specializations
// defined below in this header are keyed on it.
70using CPUDevice = Eigen::ThreadPoolDevice;
// Index is the integer type used below for the tensor length and packet size.
71using Index = Eigen::Index;
72
73namespace functor
74{
75
76template <typename Device, typename T> struct ApplyAdamNonCuda
77{
78 void operator()(const Device &d, typename TTypes<T>::Flat var, typename TTypes<T>::Flat m,
79 typename TTypes<T>::Flat v, typename TTypes<T>::ConstScalar beta1_power,
80 typename TTypes<T>::ConstScalar beta2_power, typename TTypes<T>::ConstScalar lr,
81 typename TTypes<T>::ConstScalar beta1, typename TTypes<T>::ConstScalar beta2,
82 typename TTypes<T>::ConstScalar epsilon, typename TTypes<T>::ConstFlat grad,
83 bool use_nesterov)
84 {
85 // Get params length and check if they can be vectorized by packet size.
86 Index length = var.size();
87 Index packet_size = Eigen::internal::packet_traits<T>::size;
88 if (length % packet_size == 0)
89 {
90 length = length / packet_size;
91 }
92 else
93 {
94 packet_size = 1;
95 }
96
97 T *var_ptr = var.data();
98 T *m_ptr = m.data();
99 T *v_ptr = v.data();
100 const T *g_ptr = grad.data();
101 const T alpha = lr() * Eigen::numext::sqrt(T(1) - beta2_power()) / (T(1) - beta1_power());
102 // beta1 == μ
103 // beta2 == ν
104 // v == n
105 // var == θ
106
107 auto shard = [var_ptr, m_ptr, v_ptr, g_ptr, alpha, beta1, beta2, epsilon, use_nesterov,
108 packet_size](int begin, int end) {
109 int t_size = (end - begin) * packet_size;
110 begin = begin * packet_size;
111 auto var = typename TTypes<T>::UnalignedTensor(var_ptr + begin, t_size);
112 auto m = typename TTypes<T>::UnalignedTensor(m_ptr + begin, t_size);
113 auto v = typename TTypes<T>::UnalignedTensor(v_ptr + begin, t_size);
114 auto g = typename TTypes<T>::UnalignedConstTensor(g_ptr + begin, t_size);
115
116 if (use_nesterov)
117 {
118 m += (g - m) * (T(1) - beta1());
119 v += (g.square() - v) * (T(1) - beta2());
120 var -= ((g * (T(1) - beta1()) + beta1() * m) * alpha) / (v.sqrt() + epsilon());
121 }
122 else
123 {
124 m += (g - m) * (T(1) - beta1());
125 v += (g.square() - v) * (T(1) - beta2());
126 var -= (m * alpha) / (v.sqrt() + epsilon());
127 }
128 };
129
130 // Input data: var, v, m, grad.
131 // Output data: var, v, m.
132 const int input_bytes = length * packet_size * sizeof(T) * 4;
133 const int output_bytes = length * packet_size * sizeof(T) * 3;
134 const int compute_cycles =
135 // Consider Sub as Add
136 (Eigen::TensorOpCost::AddCost<int>() * 5 + Eigen::TensorOpCost::MulCost<int>() * 2 +
137 Eigen::TensorOpCost::AddCost<T>() * 10 + Eigen::TensorOpCost::MulCost<T>() * 6 +
138 Eigen::TensorOpCost::DivCost<T>()) *
139 length;
140 const Eigen::TensorOpCost cost(input_bytes, output_bytes, compute_cycles);
141
142 // Eigen device must update 3 variables with 3 different expressions,
143 // which is bad for cache locality on CPU. Here use ParallelFor instead of
144 // "regular" tensor expressions to get better performance.
145 d.parallelFor(length, cost, shard);
146 }
147};
148
149template <typename T> struct ApplyAdam<CPUDevice, T> : ApplyAdamNonCuda<CPUDevice, T>
150{
151};
152
153template <typename T> struct ApplyGradientDescent<CPUDevice, T>
154{
155 void operator()(const CPUDevice &d, typename TTypes<T>::Flat var,
156 typename TTypes<T>::ConstScalar lr, typename TTypes<T>::ConstFlat grad)
157 {
158 var.device(d) -= grad * lr();
159 }
160};
161
162} // namespace functor
163} // namespace training_ops
164} // namespace cker
165} // namespace nnfw
166
167#endif // __NNFW_CKER_EIGEN_TRAINING_OPS_H__
Eigen::ThreadPoolDevice CPUDevice
ShapeIterator end(const Shape &s)
Definition topk_v2.h:30
int32_t begin[5]
Definition Slice.cpp:33
Eigen::TensorMap< Eigen::TensorFixedSize< const T, Eigen::Sizes<>, Eigen::RowMajor, IndexType >, Eigen::Aligned > ConstScalar
Definition Tensor.h:51
Eigen::TensorMap< Eigen::Tensor< const T, NDIMS, Eigen::RowMajor, IndexType > > UnalignedConstTensor
Definition Tensor.h:40
Eigen::TensorMap< Eigen::Tensor< T, NDIMS, Eigen::RowMajor, IndexType > > UnalignedTensor
Definition Tensor.h:38
Eigen::TensorMap< Eigen::Tensor< const T, 1, Eigen::RowMajor, IndexType >, Eigen::Aligned > ConstFlat
Definition Tensor.h:63
Eigen::TensorMap< Eigen::Tensor< T, 1, Eigen::RowMajor, IndexType >, Eigen::Aligned > Flat
Definition Tensor.h:61
void operator()(const Device &d, typename TTypes< T >::Flat var, typename TTypes< T >::Flat m, typename TTypes< T >::Flat v, typename TTypes< T >::ConstScalar beta1_power, typename TTypes< T >::ConstScalar beta2_power, typename TTypes< T >::ConstScalar lr, typename TTypes< T >::ConstScalar beta1, typename TTypes< T >::ConstScalar beta2, typename TTypes< T >::ConstScalar epsilon, typename TTypes< T >::ConstFlat grad, bool use_nesterov)
void operator()(const Device &d, typename TTypes< T >::Flat var, typename TTypes< T >::Flat m, typename TTypes< T >::Flat v, typename TTypes< T >::ConstScalar beta1_power, typename TTypes< T >::ConstScalar beta2_power, typename TTypes< T >::ConstScalar lr, typename TTypes< T >::ConstScalar beta1, typename TTypes< T >::ConstScalar beta2, typename TTypes< T >::ConstScalar epsilon, typename TTypes< T >::ConstFlat grad, bool use_nesterov)
void operator()(const CPUDevice &d, typename TTypes< T >::Flat var, typename TTypes< T >::ConstScalar lr, typename TTypes< T >::ConstFlat grad)
void operator()(const Device &d, typename TTypes< T >::Flat var, typename TTypes< T >::ConstScalar alpha, typename TTypes< T >::ConstFlat delta)