21#include <nonius/nonius.h++>
23#include <arm_compute/core/Types.h>
24#include <arm_compute/runtime/CL/CLScheduler.h>
25#include <arm_compute/runtime/CL/CLFunctions.h>
47 Initializer() { CLScheduler::get().default_init(); }
50Initializer initializer;
52TensorInfo make_info(uint32_t N)
55 return TensorInfo{shape, 1, DataType::F32};
58template <enum Layout> TensorInfo make_info(uint32_t N, uint32_t C, uint32_t H, uint32_t W);
60template <> TensorInfo make_info<NCHW>(uint32_t N, uint32_t C, uint32_t H, uint32_t W)
63 TensorInfo
info{shape, 1, DataType::F32};
64 info.set_data_layout(DataLayout::NCHW);
68template <> TensorInfo make_info<NHWC>(uint32_t N, uint32_t C, uint32_t H, uint32_t W)
71 TensorInfo
info{shape, 1, DataType::F32};
72 info.set_data_layout(DataLayout::NHWC);
76inline void check(
const Status &status)
80 std::cerr << status.error_description() << std::endl;
81 throw std::runtime_error{
"ERROR"};
/**
 * @brief Report whether an unsigned 32-bit value is odd.
 *
 * @param n Value to test.
 * @return true when @p n leaves a remainder when divided by two, false otherwise.
 */
inline bool is_odd(uint32_t n) { return (n % 2) != 0; }
134 uint32_t vertical_stride;
135 uint32_t horizontal_stride;
138 std::string fused_act;
140 uint32_t top_padding;
141 uint32_t bottom_padding;
142 uint32_t left_padding;
143 uint32_t right_padding;
145 Configuration(nonius::chronometer
meter)
147 ifm_N =
meter.param<BATCH>();
148 ifm_C =
meter.param<IFM_C>();
149 ifm_H =
meter.param<IFM_H>();
150 ifm_W =
meter.param<IFM_W>();
152 ofm_N =
meter.param<BATCH>();
153 ofm_C =
meter.param<OFM_C>();
154 ofm_H =
meter.param<OFM_H>();
155 ofm_W =
meter.param<OFM_W>();
157 ker_N =
meter.param<OFM_C>();
158 ker_C =
meter.param<IFM_C>();
159 ker_H =
meter.param<KER_H>();
160 ker_W =
meter.param<KER_W>();
162 vertical_stride =
meter.param<STRIDE_H>();
163 horizontal_stride =
meter.param<STRIDE_W>();
165 padding =
meter.param<PADDING>();
166 fused_act =
meter.param<FUSED_ACT>();
168 assert((ifm_H - ker_H) % vertical_stride == 0);
169 assert((ifm_W - ker_H) % horizontal_stride == 0);
171 uint32_t
const effective_ofm_H = (ifm_H - ker_H) / vertical_stride + 1;
172 uint32_t
const effective_ofm_W = (ifm_W - ker_H) / horizontal_stride + 1;
174 assert(ofm_H >= effective_ofm_H);
175 assert(ofm_W >= effective_ofm_W);
177 uint32_t
const pad_H = ofm_H - effective_ofm_H;
178 uint32_t
const pad_W = ofm_W - effective_ofm_W;
180 top_padding = pad_H / 2;
181 bottom_padding = pad_H / 2;
182 left_padding = pad_W / 2;
183 right_padding = pad_W / 2;
191 template <Layout L> TensorInfo src_info()
const
193 return make_info<L>(ifm_N, ifm_C, ifm_H, ifm_W);
195 template <Layout L> TensorInfo dst_info()
const
197 return make_info<L>(ofm_N, ofm_C, ofm_H, ofm_W);
199 template <Layout L> TensorInfo ker_info()
const
201 return make_info<L>(ker_N, ker_C, ker_H, ker_W);
203 TensorInfo bias_info(
void)
const {
return make_info(ker_N); }
205 PadStrideInfo pad_stride_info(
void)
const
207 return PadStrideInfo{horizontal_stride,
213 DimensionRoundingType::FLOOR};
225inline nonius::benchmark_registry &local_benchmark_registry()
227 static nonius::benchmark_registry registry;
233#define NONIUS_LOCAL_BENCHMARK(name, ...) \
236 static ::nonius::benchmark_registrar \
237 NONIUS_DETAIL_UNIQUE_NAME(benchmark_registrar)(local_benchmark_registry(), name, __VA_ARGS__); \
233#define NONIUS_LOCAL_BENCHMARK(name, ...) \ …
241 CLDirectConvolutionLayer conv;
259 p.pad_stride_info()));
263 meter.measure([&](
int) {
279 meter.measure([&](
int) {
281 CLScheduler::get().sync();
286 CLDirectConvolutionLayer conv;
304 p.pad_stride_info()));
308 meter.measure([&](
int) {
326 CLScheduler::get().sync();
331 CLGEMMConvolutionLayer conv;
349 p.pad_stride_info()));
353 meter.measure([&](
int) {
369 meter.measure([&](
int) {
371 CLScheduler::get().sync();
376 CLGEMMConvolutionLayer
conv;
394 p.pad_stride_info()));
398 meter.measure([&](
int) {
414 meter.measure([&](
int) {
416 CLScheduler::get().sync();
421 CLWinogradConvolutionLayer conv;
439 p.pad_stride_info()));
443 meter.measure([&](
int) {
459 meter.measure([&](
int) {
461 CLScheduler::get().sync();
466 CLWinogradConvolutionLayer
conv;
484 p.pad_stride_info()));
488 meter.measure([&](
int) {
504 meter.measure([&](
int) {
506 CLScheduler::get().sync();
512 return local_benchmark_registry();
volatile const char info[]
::nncc::core::ADT::tensor::Shape TensorShape
void conv(const nncc::core::ADT::feature::Shape &out_shape, nncc::core::ADT::feature::Accessor< OutputDType > &out_data, const nncc::core::ADT::feature::Shape &in_shape, const nncc::core::ADT::feature::Reader< InputDType > &in_data, const nncc::core::ADT::kernel::Shape &ker_shape, const nncc::core::ADT::kernel::Reader< KernelDType > &ker_data, const PadInfo &pad_info, const StrideInfo &stride_info)