48 const int input_height = input_shape.
Dims(1);
49 const int input_width = input_shape.
Dims(2);
58 out_mat.setConstant(std::numeric_limits<float>::lowest());
59 for (
int b = 0; b < batches; ++b)
61 for (
int h = 0; h < input_height; ++h)
63 for (
int w = 0; w < input_width; ++w)
71 int h_end = std::min(hpad / stride_height + 1, output_height);
74 int w_end = std::min(wpad / stride_width + 1, output_width);
76 for (
int ph = h_start; ph < h_end; ++ph)
78 for (
int pw = w_start; pw < w_end; ++pw)
80 int out_offset =
NodeOffset(b, ph, pw, output_height, output_width);
81 out_mat.col(out_offset) =
82 out_mat.col(out_offset)
83 .cwiseMax(in_mat.col(
NodeOffset(b, h, w, input_height, input_width)));
90 for (
int i = 0; i < flat_size; ++i)
// Capacity, in channels, of the `acc` scratch buffer declared below. The depth
// loop advances by this many channels per step and clamps each step's channel
// count (`tranche_depth`) to it, keeping accesses to `acc` within bounds.
107 static constexpr int kPoolingAccTrancheSize = 256;
114 const int input_height = input_shape.
Dims(1);
115 const int input_width = input_shape.
Dims(2);
121 uint8_t acc[kPoolingAccTrancheSize];
122 for (
int batch = 0; batch < batches; ++batch)
127 for (
int depth_base = 0; depth_base < depth; depth_base += kPoolingAccTrancheSize)
129 const int tranche_depth = std::min(depth - depth_base, kPoolingAccTrancheSize);
130 for (
int out_y = 0; out_y < output_height; ++out_y)
132 for (
int out_x = 0; out_x < output_width; ++out_x)
136 const int filter_x_start = std::max(0, -in_x_origin);
137 const int filter_x_end = std::min(params.
filter_width, input_width - in_x_origin);
138 const int filter_y_start = std::max(0, -in_y_origin);
139 const int filter_y_end = std::min(params.
filter_height, input_height - in_y_origin);
140 memset(acc, 0, tranche_depth *
sizeof(acc[0]));
141 const uint8_t *input_ptr =
142 input_data + depth_base +
143 depth * (in_x_origin + input_width * (in_y_origin + input_height * batch));
144 for (
int fy = filter_y_start; fy < filter_y_end; fy++)
146 const uint8_t *input_row_ptr = input_ptr + depth * (fy * input_width + filter_x_start);
147 for (
int fx = filter_x_start; fx < filter_x_end; fx++)
149 const uint8_t *input_channel_ptr = input_row_ptr;
152 for (; channel <= tranche_depth - 16; channel += 16)
154 uint8x16_t acc_reg = vld1q_u8(acc + channel);
155 uint8x16_t input_reg = vld1q_u8(input_channel_ptr);
156 input_channel_ptr += 16;
157 acc_reg = vmaxq_u8(acc_reg, input_reg);
158 vst1q_u8(acc + channel, acc_reg);
161 for (; channel <= tranche_depth - 8; channel += 8)
163 uint8x8_t acc_reg = vld1_u8(acc + channel);
164 uint8x8_t input_reg = vld1_u8(input_channel_ptr);
165 input_channel_ptr += 8;
166 acc_reg = vmax_u8(acc_reg, input_reg);
167 vst1_u8(acc + channel, acc_reg);
170 for (; channel < tranche_depth; ++channel)
172 acc[channel] = std::max(acc[channel], *input_channel_ptr++);
174 input_row_ptr += depth;
177 uint8_t *output_ptr = output_data +
Offset(
output_shape, batch, out_y, out_x, depth_base);
180 for (; channel <= tranche_depth - 16; channel += 16)
182 uint8x16_t a = vld1q_u8(acc + channel);
185 vst1q_u8(output_ptr + channel, a);
187 for (; channel <= tranche_depth - 8; channel += 8)
189 uint8x8_t a = vld1_u8(acc + channel);
192 vst1_u8(output_ptr + channel, a);
195 for (; channel < tranche_depth; ++channel)
197 uint8_t a = acc[channel];
200 output_ptr[channel] =
static_cast<uint8_t
>(a);