367 {
368 typedef typename Eigen::internal::packet_traits<T>::type Packet;
369 static const int64_t
kPacketSize = (
sizeof(Packet) /
sizeof(T));
370
371 const int64_t filter_spatial_size = static_cast<int64_t>(filter_rows) * filter_cols;
372 const int64_t output_scalar_size = out_depth %
kPacketSize;
373 const int64_t output_vectorized_size = (out_depth /
kPacketSize) * kPacketSize;
374 const int64_t base_output_index = (out_r * out_cols + out_c) * out_depth;
375
376 for (
int i = 0; i < output_vectorized_size; i +=
kPacketSize)
377 {
378
379 auto vaccum = Eigen::internal::pset1<Packet>(static_cast<T>(0));
380 for (int j = 0; j < filter_spatial_size; ++j)
381 {
382
383 const int64_t
index = i + j * padded_filter_inner_dim_size;
384
385
386
387 const auto filter_block = Eigen::internal::ploadu<Packet>(filter + index);
388
389 const auto data_block = Eigen::internal::ploadu<Packet>(input_buffer + index);
390
391 vaccum = Eigen::internal::pmadd<Packet>(filter_block, data_block, vaccum);
392 }
393
394 Eigen::internal::pstoreu<T>(output + base_output_index + i, vaccum);
395 }
396
397 if (output_scalar_size > 0)
398 {
399 auto vaccum = Eigen::internal::pset1<Packet>(static_cast<T>(0));
400 for (int j = 0; j < filter_spatial_size; ++j)
401 {
402 const int64_t
index = output_vectorized_size + j * padded_filter_inner_dim_size;
403 const auto filter_block = Eigen::internal::ploadu<Packet>(filter + index);
404 const auto data_block = Eigen::internal::ploadu<Packet>(input_buffer + index);
405 vaccum = Eigen::internal::pmadd<Packet>(filter_block, data_block, vaccum);
406 }
407
409 Eigen::internal::pstoreu<T>(out_buf, vaccum);
410 const int64_t last_output_index = base_output_index + output_vectorized_size;
411 for (int j = 0; j < output_scalar_size; ++j)
412 {
413 output[last_output_index + j] = out_buf[j];
414 }
415 }
416 }
loco::GraphInputIndex index(const TFPlaceholder *node)