ONE - On-device Neural Engine
Loading...
Searching...
No Matches
NeonTensorUtils.h
Go to the documentation of this file.
1/*
2 * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
3 * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18#ifndef __NNFW_RUY_NEON_TENSOR_UTILS_H__
19#define __NNFW_RUY_NEON_TENSOR_UTILS_H__
20
21#include "ruy/neon/neon_check.h"
22
23#ifdef USE_NEON
24
25#define kFloatWeightsPerNeonLane 4
26
27namespace nnfw
28{
29namespace ruy
30{
31
32inline bool NeonIsZeroVector(const float *vector, int v_size)
33{
34 // If v_size is not divisible by kFloatWeightsPerNeonLane, we cannot
35 // use the main vectorized loop, and we need to process sequentially.
36 // postamble_start shows the start index where this should happen.
37 const int postamble_start = v_size - (v_size & (kFloatWeightsPerNeonLane - 1));
38
39 const float32x4_t zero_x4_float = vmovq_n_f32(0.0f);
40 for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane)
41 {
42 const float32x4_t i_x4_float = vld1q_f32(vector + v);
43 uint32x4_t cmp_result = vceqq_f32(i_x4_float, zero_x4_float);
44 if (vgetq_lane_u32(cmp_result, 0) == 0)
45 return false;
46 if (vgetq_lane_u32(cmp_result, 1) == 0)
47 return false;
48 if (vgetq_lane_u32(cmp_result, 2) == 0)
49 return false;
50 if (vgetq_lane_u32(cmp_result, 3) == 0)
51 return false;
52 }
53
54 // Postamble loop
55 for (int v = postamble_start; v < v_size; ++v)
56 {
57 if (vector[v] != 0.0)
58 return false;
59 }
60 return true;
61}
62
63} // namespace ruy
64} // namespace nnfw
65
66#endif // USE_NEON
67
68#endif // __NNFW_RUY_NEON_TENSOR_UTILS_H__
Definition topk_v2.h:30