// BiasAndClamp: treats array_data as consecutive segments of bias_size floats
// (array_size floats in total). For every segment it adds bias_data
// element-wise, limits each sum to [clamp_min, clamp_max], and writes the
// result back into array_data in place.
//
// Parameters:
//   clamp_min, clamp_max  lower / upper bound applied to every biased value.
//   bias_size             number of floats in bias_data (= segment length).
//   bias_data             bias values; only read.
//   array_size            total number of floats in array_data.
//   array_data            input/output buffer, overwritten with the results.
//
// NOTE(review): this listing looks like an extraction with the original file's
// line numbers fused onto some lines and with lines missing (braces, the
// declaration of `i`, the body of the scalar tail loop, the start of the last
// statement, and possibly preprocessor guards). Comments below describe only
// what is visible; confirm against the complete file.
29inline void BiasAndClamp(
float clamp_min,
float clamp_max,
int bias_size,
const float *bias_data,
30 int array_size,
float *array_data)
// array_size must be a whole multiple of bias_size. The segment loop below
// terminates on pointer equality (`!=`) while stepping by bias_size, so it
// depends on this holding.
37 assert((array_size % bias_size) == 0);
39 float *array_ptr = array_data;
40 float *array_end_ptr = array_ptr + array_size;
// Broadcast each clamp bound into every lane of a 4 x float NEON vector so the
// bounds can be applied to four values at once.
41 const auto clamp_min_vec = vdupq_n_f32(clamp_min);
42 const auto clamp_max_vec = vdupq_n_f32(clamp_max);
// Outer loop: one iteration per bias_size-long segment of the array.
43 for (; array_ptr != array_end_ptr; array_ptr += bias_size)
// Main vector loop: 16 floats per iteration, handled as four 4-lane vectors.
// NOTE(review): the declaration/initialisation of `i` is not visible in this
// listing — presumably it starts at 0 for each segment; confirm.
46 for (; i <= bias_size - 16; i += 16)
// Load 16 bias values and the 16 matching array values.
48 auto b0 = vld1q_f32(bias_data + i);
49 auto b1 = vld1q_f32(bias_data + i + 4);
50 auto b2 = vld1q_f32(bias_data + i + 8);
51 auto b3 = vld1q_f32(bias_data + i + 12);
52 auto a0 = vld1q_f32(array_ptr + i);
53 auto a1 = vld1q_f32(array_ptr + i + 4);
54 auto a2 = vld1q_f32(array_ptr + i + 8);
55 auto a3 = vld1q_f32(array_ptr + i + 12);
// Add the bias.
56 auto x0 = vaddq_f32(a0, b0);
57 auto x1 = vaddq_f32(a1, b1);
58 auto x2 = vaddq_f32(a2, b2);
59 auto x3 = vaddq_f32(a3, b3);
// Apply the lower bound: lane-wise max with clamp_min.
60 x0 = vmaxq_f32(clamp_min_vec, x0);
61 x1 = vmaxq_f32(clamp_min_vec, x1);
62 x2 = vmaxq_f32(clamp_min_vec, x2);
63 x3 = vmaxq_f32(clamp_min_vec, x3);
// Apply the upper bound: lane-wise min with clamp_max.
64 x0 = vminq_f32(clamp_max_vec, x0);
65 x1 = vminq_f32(clamp_max_vec, x1);
66 x2 = vminq_f32(clamp_max_vec, x2);
67 x3 = vminq_f32(clamp_max_vec, x3);
// Store the results over the original array values.
68 vst1q_f32(array_ptr + i, x0);
69 vst1q_f32(array_ptr + i + 4, x1);
70 vst1q_f32(array_ptr + i + 8, x2);
71 vst1q_f32(array_ptr + i + 12, x3);
// Secondary vector loop: continues from the current `i`, 4 floats at a time,
// with the same load / add / max / min / store sequence.
73 for (; i <= bias_size - 4; i += 4)
75 auto b = vld1q_f32(bias_data + i);
76 auto a = vld1q_f32(array_ptr + i);
77 auto x = vaddq_f32(a, b);
78 x = vmaxq_f32(clamp_min_vec, x);
79 x = vminq_f32(clamp_max_vec, x);
80 vst1q_f32(array_ptr + i, x);
// Scalar tail: visits the remaining elements of the segment one at a time.
// NOTE(review): the loop body is not visible in this listing.
82 for (; i < bias_size; i++)
// Second loop nest over the same data, indexed by array offset rather than by
// pointer, one element at a time.
// NOTE(review): presumably the non-NEON alternative to the code above, chosen
// by preprocessor guards that are not visible here — confirm.
89 for (
int array_offset = 0; array_offset < array_size; array_offset += bias_size)
91 for (
int i = 0; i < bias_size; i++)
// Tail of a statement whose beginning is not visible: its arguments are the
// biased value (array element + bias element) and the two clamp bounds.
94 array_data[array_offset + i] + bias_data[i], clamp_min, clamp_max);