641{
642 typedef typename Eigen::internal::packet_traits<T>::type Packet;
643 static const int64_t kPacketSize = (sizeof(Packet) / sizeof(T));
644
645 const int64_t in_depth = in_depth_;
646 const int64_t depth_multiplier = depth_multiplier_;
647 const int64_t out_depth = out_depth_;
648 const int64_t filter_spatial_size = filter_rows * filter_cols;
649
650
651 const int64_t output_vectorized_size = (out_depth / kPacketSize) * kPacketSize;
652 const int64_t output_scalar_size = out_depth % kPacketSize;
653
654
655 const int64_t base_output_index = (in_r * in_cols + in_c) * in_depth;
656
657
658
659 const int64_t dm_vectorized_size = (depth_multiplier / kPacketSize) * kPacketSize;
660 const int64_t dm_scalar_size = depth_multiplier % kPacketSize;
661
662 for (int i = 0; i < output_vectorized_size; i += kPacketSize)
663 {
664
665 auto vaccum = Eigen::internal::pset1<Packet>(static_cast<T>(0));
666 for (int j = 0; j < filter_spatial_size; ++j)
667 {
668
669 const int64_t index = i + j * padded_filter_inner_dim_size;
670
671 const auto filter_block = Eigen::internal::ploadu<Packet>(filter + index);
672
673 const auto data_block = Eigen::internal::ploadu<Packet>(buffer + index);
674
675 vaccum = Eigen::internal::pmadd<Packet>(filter_block, data_block, vaccum);
676 }
677 if (depth_multiplier == 1)
678 {
679
680 Eigen::internal::pstoreu<T>(output + base_output_index + i, vaccum);
681 }
682 else
683 {
684
685 Eigen::internal::pstoreu<T>(out_buffer + i, vaccum);
686 }
687 }
688
689 if (output_scalar_size > 0)
690 {
691 auto vaccum = Eigen::internal::pset1<Packet>(static_cast<T>(0));
692 for (int j = 0; j < filter_spatial_size; ++j)
693 {
694 const int64_t
index = output_vectorized_size + j * padded_filter_inner_dim_size;
695 const auto filter_block = Eigen::internal::ploadu<Packet>(filter + index);
696 const auto data_block = Eigen::internal::ploadu<Packet>(buffer + index);
697 vaccum = Eigen::internal::pmadd<Packet>(filter_block, data_block, vaccum);
698 }
699
701 Eigen::internal::pstoreu<T>(out_buf, vaccum);
702 if (depth_multiplier == 1)
703 {
704
705 for (int j = 0; j < output_scalar_size; ++j)
706 {
707 output[base_output_index + output_vectorized_size + j] = out_buf[j];
708 }
709 }
710 else
711 {
712
713 for (int j = 0; j < output_scalar_size; ++j)
714 {
715 out_buffer[output_vectorized_size + j] = out_buf[j];
716 }
717 }
718 }
719
720
721 if (depth_multiplier > 1)
722 {
723 for (int64_t d = 0; d < in_depth; ++d)
724 {
725 const int64_t
index = d * depth_multiplier;
726 T accum = static_cast<T>(0);
727 for (int64_t dm = 0; dm < dm_vectorized_size; dm +=
kPacketSize)
728 {
729 const auto v = Eigen::internal::ploadu<Packet>(out_buffer + index + dm);
730 accum += Eigen::internal::predux(v);
731 }
732
733 for (int64_t dm = 0; dm < dm_scalar_size; ++dm)
734 {
735 accum += out_buffer[
index + dm_vectorized_size + dm];
736 }
737
738 output[base_output_index + d] = accum;
739 }
740 }
741}