110{
111 ARM_COMPUTE_UNUSED(
info);
112 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
113 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
114 ARM_COMPUTE_ERROR_ON_NULLPTR(_input, _output);
115 ARM_COMPUTE_ERROR_ON(_input == _output);
116
117 const auto window_start_x = static_cast<int>(window.x().start());
118 const auto window_end_x = static_cast<int>(window.x().end());
119 const int window_step_x = 16;
120
121 Window win{window};
122 win.set(Window::DimX, Window::Dimension(0, 1, 1));
123
124 Iterator
input(_input, win);
125 Iterator
output(_output, win);
126
127 const uint8_t true_val = 1;
128 const uint8x8_t mask_bool = vdup_n_u8(true_val);
129
130 switch (_output->info()->data_type())
131 {
132 case DataType::S8:
133 {
134
135 execute_window_loop(
136 win,
137 [&](const Coordinates &) {
138 const auto input_ptr =
reinterpret_cast<const uint8_t *
>(
input.ptr());
139 const auto output_ptr =
reinterpret_cast<int8_t *
>(
output.ptr());
140
141 int x = window_start_x;
142 for (; x <= (window_end_x - window_step_x); x += window_step_x)
143 {
144 const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
145
146 vst1q_s8(output_ptr + x,
147 vreinterpretq_s8_u8(vandq_u8(texels_u8, vdupq_n_u8(true_val))));
148 }
149
150
151 for (; x < window_end_x; ++x)
152 {
153 *(output_ptr + x) = static_cast<int8_t>(*(input_ptr + x) & true_val);
154 }
155 },
157 break;
158 }
159 case DataType::S16:
160 {
161
162 execute_window_loop(
163 win,
164 [&](const Coordinates &) {
165 const auto input_ptr =
reinterpret_cast<const uint8_t *
>(
input.ptr());
166 const auto output_ptr =
reinterpret_cast<int16_t *
>(
output.ptr());
167
168 int x = window_start_x;
169 for (; x <= (window_end_x - window_step_x); x += window_step_x)
170 {
171 const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
172
173 const int16x8x2_t texels = {
174 {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))),
175 vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}};
176
177 vst1q_s16(output_ptr + x, texels.val[0]);
178 vst1q_s16(output_ptr + x + 8, texels.val[1]);
179 }
180
181
182 for (; x < window_end_x; ++x)
183 {
184 *(output_ptr + x) = static_cast<int32_t>(*(input_ptr + x) & true_val);
185 }
186 },
188 break;
189 }
190 case DataType::S32:
191 {
192
193 execute_window_loop(
194 win,
195 [&](const Coordinates &) {
196 const auto input_ptr =
reinterpret_cast<const uint8_t *
>(
input.ptr());
197 const auto output_ptr =
reinterpret_cast<int32_t *
>(
output.ptr());
198
199 int x = window_start_x;
200 for (; x <= (window_end_x - window_step_x); x += window_step_x)
201 {
202 const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
203
204 const int16x8x2_t texels = {
205 {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))),
206 vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}};
207
208 vst1q_s32(output_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
209 vst1q_s32(output_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
210 vst1q_s32(output_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
211 vst1q_s32(output_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
212 }
213
214
215 for (; x < window_end_x; ++x)
216 {
217 *(output_ptr + x) = static_cast<uint32_t>(*(input_ptr + x) & true_val);
218 }
219 },
221 break;
222 }
223 case DataType::F32:
224 {
225
226 execute_window_loop(
227 win,
228 [&](const Coordinates &) {
229 const auto input_ptr =
reinterpret_cast<const uint8_t *
>(
input.ptr());
230 const auto output_ptr =
reinterpret_cast<float *
>(
output.ptr());
231
232 int x = window_start_x;
233 for (; x <= (window_end_x - window_step_x); x += window_step_x)
234 {
235 const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
236
237 const int16x8x2_t texels = {
238 {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))),
239 vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}};
240 vst1q_f32(output_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
241 vst1q_f32(output_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
242 vst1q_f32(output_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
243 vst1q_f32(output_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
244 }
245
246
247 for (; x < window_end_x; ++x)
248 {
249 auto in = static_cast<uint32_t>(*(input_ptr + x) & true_val);
250 *(output_ptr + x) = static_cast<float>(in);
251 }
252 },
254 break;
255 }
256#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
257 case DataType::F16:
258 {
259
260 execute_window_loop(
261 win,
262 [&](const Coordinates &) {
263 const auto input_ptr =
reinterpret_cast<const uint8_t *
>(
input.ptr());
264 const auto output_ptr =
reinterpret_cast<float16_t *
>(
output.ptr());
265
266 int x = window_start_x;
267 for (; x <= (window_end_x - window_step_x); x += window_step_x)
268 {
269 const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
270
271 const int16x8x2_t texels = {
272 {vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool))),
273 vreinterpretq_s16_u16(vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool)))}};
274 vst1q_f16(output_ptr + x, vcvtq_f16_s16(texels.val[0]));
275 vst1q_f16(output_ptr + x + 8, vcvtq_f16_s16(texels.val[1]));
276 }
277
278
279 for (; x < window_end_x; ++x)
280 {
281 *(output_ptr + x) = static_cast<float16_t>(*(input_ptr + x) & true_val);
282 }
283 },
285 break;
286 }
287#endif
288 case DataType::U8:
289 {
290
291 execute_window_loop(
292 win,
293 [&](const Coordinates &) {
294 const auto input_ptr =
reinterpret_cast<const uint8_t *
>(
input.ptr());
295 const auto output_ptr =
reinterpret_cast<uint8_t *
>(
output.ptr());
296
297 int x = window_start_x;
298 for (; x <= (window_end_x - window_step_x); x += window_step_x)
299 {
300 const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
301
302 vst1q_u8(output_ptr + x, vandq_u8(texels_u8, vdupq_n_u8(true_val)));
303 }
304
305
306 for (; x < window_end_x; ++x)
307 {
308 *(output_ptr + x) = static_cast<uint8_t>(*(input_ptr + x) & true_val);
309 }
310 },
312 break;
313 }
314 case DataType::U16:
315 {
316
317 execute_window_loop(
318 win,
319 [&](const Coordinates &) {
320 const auto input_ptr =
reinterpret_cast<const uint8_t *
>(
input.ptr());
321 const auto output_ptr =
reinterpret_cast<uint16_t *
>(
output.ptr());
322
323 int x = window_start_x;
324 for (; x <= (window_end_x - window_step_x); x += window_step_x)
325 {
326 const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
327
328 const uint16x8x2_t texels = {{vmovl_u8(vand_u8(vget_low_u8(texels_u8), mask_bool)),
329 vmovl_u8(vand_u8(vget_high_u8(texels_u8), mask_bool))}};
330
331 vst1q_u16(output_ptr + x, texels.val[0]);
332 vst1q_u16(output_ptr + x + 8, texels.val[1]);
333 }
334
335
336 for (; x < window_end_x; ++x)
337 {
338 *(output_ptr + x) = static_cast<uint16_t>(*(input_ptr + x) & true_val);
339 }
340 },
342 break;
343 }
344 default:
345 ARM_COMPUTE_ERROR("Output data type not supported");
346 }
347}
volatile const char info[]