2 #include "caffe2/operators/pool_op.h" 3 #include "caffe2/utils/cpu_neon.h" 14 bool isNeon4x4p0s0Eligible(
  // Use this kernel only if:
  //   - the kernel is 4x4 with stride 4x4, no padding and unit dilation;
  //   - the output evenly divides the input and the input is a multiple of 4;
  //   - both pointers are aligned to float32x4_t.
  bool kernelOk = (kH == 4) && (kW == 4);
  bool strideOk = (strideH == 4) && (strideW == 4);
  bool padOk = (padT == 0) && (padL == 0) && (padB == 0) && (padR == 0);
  bool dilationOk = (dilationH == 1) && (dilationW == 1);

  bool outputOk = ((inputH % outputH) == 0) && ((inputW % outputW) == 0);
  bool inputOk = (inputW % 4 == 0) && (inputH % 4 == 0);
  bool alignOk = isPointerAligned(input, sizeof(float32x4_t)) &&
      isPointerAligned(output, sizeof(float32x4_t));

  return kernelOk && strideOk && padOk && dilationOk && outputOk && inputOk &&
      alignOk;
}
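// Example (not part of the source): parameters that satisfy the check above.
//   isNeon4x4p0s0Eligible(
//       /*inputH=*/64, /*inputW=*/64, /*outputH=*/16, /*outputW=*/16,
//       /*kH=*/4, /*kW=*/4, /*strideH=*/4, /*strideW=*/4,
//       /*padT=*/0, /*padL=*/0, /*padB=*/0, /*padR=*/0,
//       /*dilationH=*/1, /*dilationW=*/1, input, output)
//   returns true whenever input and output are 16-byte aligned.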
// Vectorizes 4x4p0s0 average pooling for ARM NEON over a single input plane.
void avgPoolNeon4x4p0s0Plane(
    int inputH,
    int inputW,
    const float* input,
    float* output) {
  constexpr int kKernelHeight = 4;
  constexpr int kKernelWidth = 4;
  constexpr float kDiv = (1.0f / ((float)kKernelHeight * (float)kKernelWidth));

  // Handle the portion that can be unrolled by 4.
  constexpr int kUnroll = 4;
  constexpr int kLoadSizeFloat = (sizeof(float32x4_t) / sizeof(float));
  constexpr int kLoadCols = kUnroll * kLoadSizeFloat;
  if (inputW % kLoadCols == 0) {
    //
    // Manually unrolled by 4 (kUnroll)
    //
    for (int h = 0; h < inputH; h += kKernelHeight) {
      float* outputRow = output + (h / kKernelHeight) * (inputW / kKernelWidth);
      const float* curInput = input + h * inputW;

      for (int w = 0; w < inputW; w += kLoadCols) {
        // Each of the four lanes of `out` is written below.
        float32x4_t out = vdupq_n_f32(0.0f);

        {
          float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
          float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
          float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
          float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
          float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3);
          out = vsetq_lane_f32(v0, out, 0);
        }
        curInput += kLoadSizeFloat;
        {
          float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
          float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
          float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
          float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
          float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3);
          out = vsetq_lane_f32(v0, out, 1);
        }
        curInput += kLoadSizeFloat;
        {
          float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
          float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
          float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
          float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
          float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3);
          out = vsetq_lane_f32(v0, out, 2);
        }
        curInput += kLoadSizeFloat;
        {
          float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
          float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
          float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
          float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
          float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3);
          out = vsetq_lane_f32(v0, out, 3);
        }
        curInput += kLoadSizeFloat;

        out = vmulq_f32(out, vdupq_n_f32(kDiv));
        vst1q_f32_aligned(&outputRow[w / kKernelWidth], out);
      }
    }
  } else {
    //
    // Not unrolled
    //
    for (int h = 0; h < inputH; h += kKernelHeight) {
      const float* inputRow = input + h * inputW;
      float* outputRow = output + (h / kKernelHeight) * (inputW / kKernelWidth);

      for (int w = 0; w < inputW; w += kKernelWidth) {
        const float* curInput = inputRow + w;

        float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
        float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
        float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
        float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
        float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3) * kDiv;
        outputRow[w / kKernelWidth] = v0;
      }
    }
  }
}
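// For reference (assumption, not part of this file): horizontal_sum_f32 is
// provided by caffe2/utils/cpu_neon.h and is assumed to reduce the 16 loaded
// floats to their scalar sum, roughly:
//
//   inline float horizontal_sum_f32(
//       float32x4_t v0, float32x4_t v1, float32x4_t v2, float32x4_t v3) {
//     v0 = vaddq_f32(v0, v1);
//     v2 = vaddq_f32(v2, v3);
//     v0 = vaddq_f32(v0, v2);
//     float32x2_t v = vadd_f32(vget_low_f32(v0), vget_high_f32(v0));
//     return vget_lane_f32(vpadd_f32(v, v), 0);
//   }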
void runNeonAveragePool4x4p0s0NCHW(
    int N,
    int C,
    int inputH,
    int inputW,
    const float* input,
    float* output) {
  // Each (n, c) plane is pooled independently into a plane a quarter the size
  // in each spatial dimension.
  int outputH = inputH / 4;
  int outputW = inputW / 4;

  for (int n = 0; n < N; ++n) {
    for (int c = 0; c < C; ++c) {
      const float* curInput = input + (n * C + c) * inputH * inputW;
      float* curOutput = output + (n * C + c) * outputH * outputW;

      avgPoolNeon4x4p0s0Plane(inputH, inputW, curInput, curOutput);
    }
  }
}
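// A scalar reference (not part of the source) that one could use to check the
// NEON plane kernel above on a single inputH x inputW plane:
//
//   void avgPool4x4p0s0PlaneRef(
//       int inputH, int inputW, const float* input, float* output) {
//     for (int oh = 0; oh < inputH / 4; ++oh) {
//       for (int ow = 0; ow < inputW / 4; ++ow) {
//         float sum = 0.0f;
//         for (int kh = 0; kh < 4; ++kh) {
//           for (int kw = 0; kw < 4; ++kw) {
//             sum += input[(oh * 4 + kh) * inputW + (ow * 4 + kw)];
//           }
//         }
//         output[oh * (inputW / 4) + ow] = sum / 16.0f;
//       }
//     }
//   }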
bool isNeon2x2p0s0Eligible(
    int inputH, int inputW, int outputH, int outputW,
    int kH, int kW, int strideH, int strideW,
    int padT, int padL, int padB, int padR,
    int dilationH, int dilationW,
    const float* input, float* output) {
  // Use this kernel only if:
  //   - the kernel is 2x2 with stride 2x2, no padding and unit dilation;
  //   - the output evenly divides the input and the input is a multiple of 4;
  //   - both pointers are aligned to float32x4_t.
  bool kernelOk = (kH == 2) && (kW == 2);
  bool strideOk = (strideH == 2) && (strideW == 2);
  bool padOk = (padT == 0) && (padL == 0) && (padB == 0) && (padR == 0);
  bool dilationOk = (dilationH == 1) && (dilationW == 1);

  bool outputOk = ((inputH % outputH) == 0) && ((inputW % outputW) == 0);
  bool inputOk = (inputW % 4 == 0) && (inputH % 4 == 0);
  bool alignOk = isPointerAligned(input, sizeof(float32x4_t)) &&
      isPointerAligned(output, sizeof(float32x4_t));

  return kernelOk && strideOk && padOk && dilationOk && outputOk && inputOk &&
      alignOk;
}
// Vectorizes 2x2p0s0 max pooling for ARM NEON over a single input plane.
void maxPoolNeon2x2p0s0Plane(
    int inputH,
    int inputW,
    const float* input,
    float* output) {
  constexpr int kKernelHeight = 2;
  constexpr int kKernelWidth = 2;

  // Handle the portion that can be unrolled by 4.
  constexpr int kUnroll = 4;
  constexpr int kLoadSizeFloat = (sizeof(float32x4_t) / sizeof(float));
  constexpr int kLoadCols = kUnroll * kLoadSizeFloat;
  if (inputW % kLoadCols == 0) {
    //
    // Manually unrolled by 4 (kUnroll)
    //
    for (int h = 0; h < inputH; h += kKernelHeight) {
      float* outputRow = output + (h / kKernelHeight) * (inputW / kKernelWidth);
      const float* curInput = input + h * inputW;

      for (int w = 0; w < inputW; w += kLoadCols) {
        float32x2_t hmax_0, hmax_1, hmax_2, hmax_3;

        {
          float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
          float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
          float32x4_t vmax = vmaxq_f32(v0_0, v0_1);
          hmax_0 = vpmax_f32(vget_low_f32(vmax), vget_high_f32(vmax));
        }
        curInput += kLoadSizeFloat;
        {
          float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
          float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
          float32x4_t vmax = vmaxq_f32(v0_0, v0_1);
          hmax_1 = vpmax_f32(vget_low_f32(vmax), vget_high_f32(vmax));
        }
        curInput += kLoadSizeFloat;
        {
          float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
          float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
          float32x4_t vmax = vmaxq_f32(v0_0, v0_1);
          hmax_2 = vpmax_f32(vget_low_f32(vmax), vget_high_f32(vmax));
        }
        curInput += kLoadSizeFloat;
        {
          float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
          float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
          float32x4_t vmax = vmaxq_f32(v0_0, v0_1);
          hmax_3 = vpmax_f32(vget_low_f32(vmax), vget_high_f32(vmax));
        }
        curInput += kLoadSizeFloat;

        float32x4_t out_0 = vcombine_f32(hmax_0, hmax_1);
        float32x4_t out_1 = vcombine_f32(hmax_2, hmax_3);
        vst1q_f32_aligned(&outputRow[w / kKernelWidth + 0], out_0);
        vst1q_f32_aligned(&outputRow[w / kKernelWidth + 4], out_1);
      }
    }
  } else {
    //
    // Not unrolled
    //
    for (int h = 0; h < inputH; h += kKernelHeight) {
      const float* inputRow = input + h * inputW;
      float* outputRow = output + (h / kKernelHeight) * (inputW / kKernelWidth);

      // Each float32x4_t load covers two 2x2 windows, so step by 2 * kKernelWidth.
      for (int w = 0; w < inputW; w += kKernelWidth * 2) {
        const float* curInput = inputRow + w;
        float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
        float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
        float32x4_t vmax = vmaxq_f32(v0_0, v0_1);
        float32x2_t hmax = vpmax_f32(vget_low_f32(vmax), vget_high_f32(vmax));
        vst1_f32(&outputRow[w / kKernelWidth], hmax);
      }
    }
  }
}
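// Lane bookkeeping for the 2x2 kernel above (illustration, not in the source):
//   row0 = {a0, a1, a2, a3},  row1 = {b0, b1, b2, b3}
//   vmax = vmaxq_f32(row0, row1)
//        = {max(a0,b0), max(a1,b1), max(a2,b2), max(a3,b3)}
//   vpmax_f32(low(vmax), high(vmax))
//        = {max of lanes 0 and 1, max of lanes 2 and 3}
// i.e. exactly the two 2x2 pooling outputs covered by one 4-wide load.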
void runNeonMaxPool2x2p0s0NCHW(
    int N,
    int C,
    int inputH,
    int inputW,
    const float* input,
    float* output) {
  int outputH = inputH / 2;
  int outputW = inputW / 2;

  for (int n = 0; n < N; ++n) {
    for (int c = 0; c < C; ++c) {
      const float* curInput = input + (n * C + c) * inputH * inputW;
      float* curOutput = output + (n * C + c) * outputH * outputW;
      maxPoolNeon2x2p0s0Plane(inputH, inputW, curInput, curOutput);
    }
  }
}
#endif // __ARM_NEON__

template <typename T>
class AveragePool {
 public:
  static float initialize() {
    return 0.0;
  }

  static void process(
      const int x_col,
      const int y_col,
      ConstEigenMatrixMap<float>& x_mat,
      EigenMatrixMap<float>& y_mat) {
    y_mat.col(y_col) += x_mat.col(x_col);
  }

  static void process(const T& x_data, T& y_data) {
    y_data += x_data;
  }

  static void finalize(const int size, T& y_data) {
    y_data /= size;
  }

  static void finalize(
      const int size, const int col, EigenMatrixMap<float>& y_mat) {
    y_mat.col(col) /= size;
  }

  static bool runSpecialized(
      int N, int C, int inputH, int inputW, int outputH, int outputW,
      int kH, int kW, int strideH, int strideW,
      int padT, int padL, int padB, int padR,
      int dilationH, int dilationW,
      const float* input, float* output) {
#ifdef __ARM_NEON__
    if (isNeon4x4p0s0Eligible(
            inputH, inputW, outputH, outputW, kH, kW, strideH, strideW,
            padT, padL, padB, padR, dilationH, dilationW, input, output)) {
      runNeonAveragePool4x4p0s0NCHW(N, C, inputH, inputW, input, output);
      return true;
    }
#endif
    return false;
  }
};
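// The PoolOp implementations below use their PoolType parameter as a small
// static trait: initialize() yields the identity element, process() folds in
// one element (or one Eigen column), and finalize() turns the accumulator
// into the pooled value. A hypothetical minimal trait (not part of the
// source) satisfying the same protocol is an unnormalized sum pool:
//
//   template <typename T>
//   class SumPool {
//    public:
//     static float initialize() { return 0.0; }
//     static void process(const T& x_data, T& y_data) { y_data += x_data; }
//     static void finalize(const int /*size*/, T& /*y_data*/) {}
//     // ...plus the Eigen-column overloads and runSpecialized(), as above.
//   };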
template <typename T>
class MaxPool {
 public:
  static float initialize() {
    return std::numeric_limits<float>::lowest();
  }

  static void process(
      const int x_col,
      const int y_col,
      ConstEigenMatrixMap<float>& x_mat,
      EigenMatrixMap<float>& y_mat) {
    y_mat.col(y_col) = y_mat.col(y_col).cwiseMax(x_mat.col(x_col));
  }

  static void process(const T& x_data, T& y_data) {
    if (x_data > y_data) {
      y_data = x_data;
    }
  }

  static void finalize(const int /*size*/, T& /*y_data*/) {}

  static void finalize(
      const int /*size*/,
      const int /*col*/,
      EigenMatrixMap<float>& /*y_mat*/) {}

  static bool runSpecialized(
      int N, int C, int inputH, int inputW, int outputH, int outputW,
      int kH, int kW, int strideH, int strideW,
      int padT, int padL, int padB, int padR,
      int dilationH, int dilationW,
      const float* input, float* output) {
#ifdef __ARM_NEON__
    if (isNeon2x2p0s0Eligible(
            inputH, inputW, outputH, outputW, kH, kW, strideH, strideW,
            padT, padL, padB, padR, dilationH, dilationW, input, output)) {
      runNeonMaxPool2x2p0s0NCHW(N, C, inputH, inputW, input, output);
      return true;
    }
#endif
    return false;
  }
};
template <typename T, class Context, typename PoolType>
bool PoolOp<T, Context, PoolType>::RunOnDeviceWithOrderNCHW() {
  auto& X = Input(0);
  auto* Y = Output(0);
  ConvPoolOpBase<Context>::SetOutputSize(X, Y, X.dim32(1));

  const float* Xdata = X.template data<float>();
  float* Ydata = Y->template mutable_data<float>();
  int channels = X.dim32(1);
  int height = X.dim32(2);
  int width = kernel_.size() > 1 ? X.dim32(3) : 1;
  int depth = kernel_.size() > 2 ? X.dim32(4) : 1;
  int pooled_height = Y->dim32(2);
  int pooled_width = kernel_.size() > 1 ? Y->dim32(3) : 1;
  int pooled_depth = kernel_.size() > 2 ? Y->dim32(4) : 1;

  // Give the vectorized 2D specializations (runSpecialized above) a chance first.
  if (kernel_.size() == 2 &&
      PoolType::runSpecialized(
          X.dim32(0), channels, height, width, pooled_height, pooled_width,
          kernel_h(), kernel_w(), stride_h(), stride_w(),
          pad_t(), pad_l(), pad_b(), pad_r(),
          dilation_h(), dilation_w(), Xdata, Ydata)) {
    return true;
  }

  switch (kernel_.size()) {
    case 1:
      for (int n = 0; n < X.dim32(0); ++n) {
        for (int c = 0; c < channels; ++c) {
          for (int ph = 0; ph < pooled_height; ++ph) {
            int hstart = ph * stride_h() - pad_t();
            int hend = min(hstart + kernel_h(), height);
            hstart = max(hstart, 0);
            T Yh = PoolType::initialize();
            for (int h = hstart; h < hend; ++h) {
              PoolType::process(Xdata[h], Yh);
            }
            PoolType::finalize(hend - hstart, Yh);
            Ydata[ph] = Yh;
          }
          // Advance to the next channel plane.
          Xdata += height;
          Ydata += pooled_height;
        }
      }
      break;
    case 2:
      for (int n = 0; n < X.dim32(0); ++n) {
        for (int c = 0; c < channels; ++c) {
          for (int ph = 0; ph < pooled_height; ++ph) {
            int hstart = ph * stride_h() - pad_t();
            int hend = min(hstart + kernel_h(), height);
            hstart = max(hstart, 0);
            for (int pw = 0; pw < pooled_width; ++pw) {
              int wstart = pw * stride_w() - pad_l();
              int wend = min(wstart + kernel_w(), width);
              wstart = max(wstart, 0);
              const int pool_index = ph * pooled_width + pw;
              T Yh = PoolType::initialize();
              for (int h = hstart; h < hend; ++h) {
                for (int w = wstart; w < wend; ++w) {
                  const int input_index = h * width + w;
                  PoolType::process(Xdata[input_index], Yh);
                }
              }
              PoolType::finalize((hend - hstart) * (wend - wstart), Yh);
              Ydata[pool_index] = Yh;
            }
          }
          // Advance to the next channel plane.
          Xdata += height * width;
          Ydata += pooled_height * pooled_width;
        }
      }
      break;
    case 3:
      for (int n = 0; n < X.dim32(0); ++n) {
        for (int c = 0; c < channels; ++c) {
          for (int ph = 0; ph < pooled_height; ++ph) {
            int hstart = ph * stride_h() - pad_t();
            int hend = min(hstart + kernel_h(), height);
            hstart = max(hstart, 0);
            for (int pw = 0; pw < pooled_width; ++pw) {
              int wstart = pw * stride_w() - pad_l();
              int wend = min(wstart + kernel_w(), width);
              wstart = max(wstart, 0);
              for (int pd = 0; pd < pooled_depth; ++pd) {
                int dstart = pd * stride_[2] - pads_[2];
                int dend = min(dstart + kernel_[2], depth);
                dstart = max(dstart, 0);
                const int pool_index =
                    ph * pooled_width * pooled_depth + pw * pooled_depth + pd;
                T Yh = PoolType::initialize();
                for (int h = hstart; h < hend; ++h) {
                  for (int w = wstart; w < wend; ++w) {
                    for (int d = dstart; d < dend; ++d) {
                      const int input_index = h * width * depth + w * depth + d;
                      PoolType::process(Xdata[input_index], Yh);
                    }
                  }
                }
                PoolType::finalize(
                    (hend - hstart) * (wend - wstart) * (dend - dstart), Yh);
                Ydata[pool_index] = Yh;
              }
            }
          }
          // Advance to the next channel volume.
          Xdata += height * width * depth;
          Ydata += pooled_height * pooled_width * pooled_depth;
        }
      }
      break;
    default:
      CAFFE_THROW("Unsupported pooling size : ", kernel_.size());
  }
  return true;
}
template <typename T, class Context, typename PoolType>
bool PoolOp<T, Context, PoolType>::RunOnDeviceWithOrderNHWC() {
  auto& X = Input(0);
  auto* Y = Output(0);
  int height = X.dim32(1);
  int width = kernel_.size() > 1 ? X.dim32(2) : 1;
  int depth = kernel_.size() > 2 ? X.dim32(3) : 1;
  int channels = X.dim32(X.ndim() - 1);
  ConvPoolOpBase<Context>::SetOutputSize(X, Y, channels);

  EigenMatrixMap<float> Ymat(
      Y->template mutable_data<float>(), channels, Y->size() / channels);
  ConstEigenMatrixMap<float> Xmat(
      X.template data<float>(), channels, X.size() / channels);
  int pooled_height = Y->dim32(1);
  int pooled_width = kernel_.size() > 1 ? Y->dim32(2) : 1;
  int pooled_depth = kernel_.size() > 2 ? Y->dim32(3) : 1;

  switch (kernel_.size()) {
    case 1:
      for (int n = 0; n < X.dim32(0); ++n) {
        for (int ph = 0; ph < pooled_height; ++ph) {
          int hstart = ph * stride_h() - pad_t();
          int hend = min(hstart + kernel_h(), height);
          hstart = max(hstart, 0);
          const int y_col = n * pooled_height + ph;
          Ymat.col(y_col).setConstant(PoolType::initialize());
          for (int h = hstart; h < hend; ++h) {
            const int x_col = n * height + h;
            PoolType::process(x_col, y_col, Xmat, Ymat);
          }
          PoolType::finalize((hend - hstart), y_col, Ymat);
        }
      }
      break;
    case 2:
      for (int n = 0; n < X.dim32(0); ++n) {
        for (int ph = 0; ph < pooled_height; ++ph) {
          int hstart = ph * stride_h() - pad_t();
          int hend = min(hstart + kernel_h(), height);
          hstart = max(hstart, 0);
          for (int pw = 0; pw < pooled_width; ++pw) {
            int wstart = pw * stride_w() - pad_l();
            int wend = min(wstart + kernel_w(), width);
            wstart = max(wstart, 0);
            const int y_col = (n * pooled_height + ph) * pooled_width + pw;
            Ymat.col(y_col).setConstant(PoolType::initialize());
            for (int h = hstart; h < hend; ++h) {
              for (int w = wstart; w < wend; ++w) {
                const int x_col = (n * height + h) * width + w;
                PoolType::process(x_col, y_col, Xmat, Ymat);
              }
            }
            PoolType::finalize((hend - hstart) * (wend - wstart), y_col, Ymat);
          }
        }
      }
      break;
    case 3:
      for (int n = 0; n < X.dim32(0); ++n) {
        for (int ph = 0; ph < pooled_height; ++ph) {
          int hstart = ph * stride_h() - pad_t();
          int hend = min(hstart + kernel_h(), height);
          hstart = max(hstart, 0);
          for (int pw = 0; pw < pooled_width; ++pw) {
            int wstart = pw * stride_w() - pad_l();
            int wend = min(wstart + kernel_w(), width);
            wstart = max(wstart, 0);
            for (int pd = 0; pd < pooled_depth; ++pd) {
              int dstart = pd * stride_[2] - pads_[2];
              int dend = min(dstart + kernel_[2], depth);
              dstart = max(dstart, 0);
              const int y_col = ((n * pooled_height + ph) * pooled_width + pw) *
                      pooled_depth +
                  pd;
              Ymat.col(y_col).setConstant(PoolType::initialize());
              for (int h = hstart; h < hend; ++h) {
                for (int w = wstart; w < wend; ++w) {
                  for (int d = dstart; d < dend; ++d) {
                    const int x_col =
                        ((n * height + h) * width + w) * depth + d;
                    PoolType::process(x_col, y_col, Xmat, Ymat);
                  }
                }
              }
              PoolType::finalize(
                  (hend - hstart) * (wend - wstart) * (dend - dstart),
                  y_col,
                  Ymat);
            }
          }
        }
      }
      break;
    default:
      CAFFE_THROW("Unsupported pooling size : ", kernel_.size());
  }
  return true;
}
const char* kAveragePoolDoc = R"DOC(
consumes an input blob X and applies average pooling across the blob according
to kernel sizes, stride sizes, and pad lengths defined by the ConvPoolOpBase
operator. Average pooling consists of averaging all values of a subset of the
input tensor according to the kernel size and downsampling the data into the
output blob Y for further processing.
)DOC";

const char* kMaxPoolDoc = R"DOC(
consumes an input blob X and applies max pooling across the blob according to
kernel sizes, stride sizes, and pad lengths defined by the ConvPoolOpBase
operator. Max pooling consists of taking the maximum value of a subset of the
input tensor according to the kernel size and downsampling the data into the
output blob Y for further processing.
)DOC";
std::function<void(OpSchema&)> AveragePoolDocGenerator(const char* dim) {
  return [=](OpSchema& schema) {
    string doc = "AveragePool{dim} {pool_doc}";
    ReplaceAll(doc, "{dim}", dim);
    ReplaceAll(doc, "{pool_doc}", kAveragePoolDoc);
    schema.SetDoc(doc);
    schema.Input(
        0,
        "X",
        "Input data tensor from the previous operator; dimensions depend on "
        "whether the NCHW or NHWC operators are being used. For example, in "
        "the former, the input has size (N x C x H x W), where N is the batch "
        "size, C is the number of channels, and H and W are the height and the "
        "width of the data. The corresponding permutation of dimensions is "
        "used in the latter case.");
764 "Output data tensor from average pooling across the input " 765 "tensor. Dimensions will vary based on various kernel, stride, and pad " 770 std::function<void(OpSchema&)> MaxPoolDocGenerator(
const char* dim) {
  return [=](OpSchema& schema) {
    string doc = "MaxPool{dim} {pool_doc}";
    ReplaceAll(doc, "{dim}", dim);
    ReplaceAll(doc, "{pool_doc}", kMaxPoolDoc);
    schema.SetDoc(doc);
    schema.Input(
        0,
        "X",
        "Input data tensor from the previous operator; dimensions depend on "
        "whether the NCHW or NHWC operators are being used. For example, in "
        "the former, the input has size (N x C x H x W), where N is the batch "
        "size, C is the number of channels, and H and W are the height and the "
        "width of the data. The corresponding permutation of dimensions is "
        "used in the latter case.");
788 "Output data tensor from max pooling across the input " 789 "tensor. Dimensions will vary based on various kernel, stride, and pad " 793 REGISTER_CPU_OPERATOR(
797 OPERATOR_SCHEMA(AveragePool)
801 .FillUsing(AveragePoolDocGenerator(
""))
802 .InheritOnnxSchema(
"AveragePool");
REGISTER_CPU_OPERATOR(
    AveragePool1D,
    PoolOp<float, CPUContext, AveragePool<float>>);

OPERATOR_SCHEMA(AveragePool1D)
    .FillUsing(AveragePoolDocGenerator("1D"))
    .InheritOnnxSchema("AveragePool");
REGISTER_CPU_OPERATOR(
    AveragePool2D,
    PoolOp<float, CPUContext, AveragePool<float>>);

OPERATOR_SCHEMA(AveragePool2D)
    .FillUsing(AveragePoolDocGenerator("2D"))
    .InheritOnnxSchema("AveragePool");
REGISTER_CPU_OPERATOR(
    AveragePool3D,
    PoolOp<float, CPUContext, AveragePool<float>>);

OPERATOR_SCHEMA(AveragePool3D)
    .FillUsing(AveragePoolDocGenerator("3D"))
    .InheritOnnxSchema("AveragePool");
REGISTER_CPU_OPERATOR(MaxPool, PoolOp<float, CPUContext, MaxPool<float>>);

OPERATOR_SCHEMA(MaxPool)
    .FillUsing(MaxPoolDocGenerator(""))
    .InheritOnnxSchema("MaxPool");
REGISTER_CPU_OPERATOR(MaxPool1D, PoolOp<float, CPUContext, MaxPool<float>>);

OPERATOR_SCHEMA(MaxPool1D)
    .FillUsing(MaxPoolDocGenerator("1D"))
    .InheritOnnxSchema("MaxPool");
REGISTER_CPU_OPERATOR(MaxPool2D, PoolOp<float, CPUContext, MaxPool<float>>);

OPERATOR_SCHEMA(MaxPool2D)
    .FillUsing(MaxPoolDocGenerator("2D"))
    .InheritOnnxSchema("MaxPool");
REGISTER_CPU_OPERATOR(MaxPool3D, PoolOp<float, CPUContext, MaxPool<float>>);

OPERATOR_SCHEMA(MaxPool3D)
    .FillUsing(MaxPoolDocGenerator("3D"))
    .InheritOnnxSchema("MaxPool");

} // namespace caffe2