Caffe2 - C++ API
A deep learning, cross platform ML framework
pool_op.cc
// TODO(ataei): reduce the apparent redundancy of all the code below.
#include "caffe2/operators/pool_op.h"
#include "caffe2/utils/cpu_neon.h"

namespace caffe2 {

using std::max;
using std::min;

namespace {

#ifdef __ARM_NEON__

bool isNeon4x4p0s0Eligible(
    int inputH,
    int inputW,
    int outputH,
    int outputW,
    int kH,
    int kW,
    int strideH,
    int strideW,
    int padT,
    int padL,
    int padB,
    int padR,
    int dilationH,
    int dilationW,
    const float* input,
    float* output) {
  // Use this kernel only if:
  //   Kernel size is 4x4
  //   Kernel stride is 4x4
  //   Padding is 0
  //   Dilation is 1
  //   Output width and height are even divisors of input width and height
  //   Input width and height are divisible by 4 (should be implied by
  //   all of the above, but just check again)
  //   Input and output pointers are aligned by float32x4_t

  bool kernelOk = (kH == 4) && (kW == 4);
  bool strideOk = (strideH == 4) && (strideW == 4);
  bool padOk = (padT == 0) && (padL == 0) && (padB == 0) && (padR == 0);
  bool dilationOk = (dilationH == 1) && (dilationW == 1);

  bool outputOk = ((inputH % outputH) == 0) && ((inputW % outputW) == 0);
  bool inputOk = (inputW % 4 == 0) && (inputH % 4 == 0);
  bool alignOk = isPointerAligned(input, sizeof(float32x4_t)) &&
      isPointerAligned(output, sizeof(float32x4_t));

  return kernelOk && strideOk && padOk && dilationOk && outputOk && inputOk &&
      alignOk;
}
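
// With a 4x4 kernel, stride 4, and no padding, each output element is the
// mean of one disjoint 4x4 input tile. A single float32x4_t load spans the
// four columns of a tile row, so summing four aligned row loads via
// horizontal_sum_f32 yields the tile total, and one multiply by 1/16
// finishes the average.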

// Vectorizes 4x4p0s0 average pooling for ARM NEON
void avgPoolNeon4x4p0s0Plane(
    int inputH,
    int inputW,
    const float* input,
    float* output) {
  constexpr int kKernelHeight = 4;
  constexpr int kKernelWidth = 4;
  constexpr float kDiv = (1.0f / ((float)kKernelHeight * (float)kKernelWidth));

  // Handle portion that can be unrolled by 4
  constexpr int kUnroll = 4;
  constexpr int kLoadSizeFloat = (sizeof(float32x4_t) / sizeof(float));
  constexpr int kLoadCols = kUnroll * kLoadSizeFloat;

  if (inputW % kLoadCols == 0) {
    //
    // Manually unroll by 4 (kUnroll)
    //

    for (int h = 0; h < inputH; h += kKernelHeight) {
      float* outputRow = output + (h / kKernelHeight) * (inputW / kKernelWidth);
      const float* curInput = input + h * inputW;

      for (int w = 0; w < inputW; w += kLoadCols) {
        float32x4_t out = {};

        {
          float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
          float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
          float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
          float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
          float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3);
          out = vsetq_lane_f32(v0, out, 0);
        }
        curInput += kLoadSizeFloat;

        {
          float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
          float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
          float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
          float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
          float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3);
          out = vsetq_lane_f32(v0, out, 1);
        }
        curInput += kLoadSizeFloat;

        {
          float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
          float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
          float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
          float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
          float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3);
          out = vsetq_lane_f32(v0, out, 2);
        }
        curInput += kLoadSizeFloat;

        {
          float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
          float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
          float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
          float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
          float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3);
          out = vsetq_lane_f32(v0, out, 3);
        }
        curInput += kLoadSizeFloat;

        out = vmulq_f32(out, vdupq_n_f32(kDiv));
        vst1q_f32_aligned(&outputRow[w / kKernelWidth], out);
      }
    }
  } else {
    //
    // Not unrolled
    //

    for (int h = 0; h < inputH; h += kKernelHeight) {
      const float* inputRow = input + h * inputW;
      float* outputRow = output + (h / kKernelHeight) * (inputW / kKernelWidth);

      for (int w = 0; w < inputW; w += kKernelWidth) {
        const float* curInput = inputRow + w;

        float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
        float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
        float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
        float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
        float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3) * kDiv;
        outputRow[w / kKernelWidth] = v0;
      }
    }
  }
}

void runNeonAveragePool4x4p0s0NCHW(
    int N,
    int C,
    int inputH,
    int inputW,
    const float* input,
    float* output) {
  // We only have the 4x4p0s0 implementation at present, which is
  // checked at a higher level
  int outputH = inputH / 4;
  int outputW = inputW / 4;

  for (int n = 0; n < N; ++n) {
    for (int c = 0; c < C; ++c) {
      const float* curInput = input + (n * C + c) * inputH * inputW;
      float* curOutput = output + (n * C + c) * outputH * outputW;

      avgPoolNeon4x4p0s0Plane(inputH, inputW, curInput, curOutput);
    }
  }
}
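
// For example, an N=1, C=3 input of 8x8 planes yields 2x2 output planes:
// each 64-float plane reduces to four averages.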

bool isNeon2x2p0s0Eligible(
    int inputH,
    int inputW,
    int outputH,
    int outputW,
    int kH,
    int kW,
    int strideH,
    int strideW,
    int padT,
    int padL,
    int padB,
    int padR,
    int dilationH,
    int dilationW,
    const float* input,
    float* output) {
  // Use this kernel only if:
  //   Kernel size is 2x2
  //   Kernel stride is 2x2
  //   Padding is 0
  //   Dilation is 1
  //   Output width and height are even divisors of input width and height
  //   Input width and height are divisible by 4 (should be implied by
  //   all of the above, but just check again)
  //   Input and output pointers are aligned by float32x4_t

  bool kernelOk = (kH == 2) && (kW == 2);
  bool strideOk = (strideH == 2) && (strideW == 2);
  bool padOk = (padT == 0) && (padL == 0) && (padB == 0) && (padR == 0);
  bool dilationOk = (dilationH == 1) && (dilationW == 1);

  bool outputOk = ((inputH % outputH) == 0) && ((inputW % outputW) == 0);
  bool inputOk = (inputW % 4 == 0) && (inputH % 4 == 0);
  bool alignOk = isPointerAligned(input, sizeof(float32x4_t)) &&
      isPointerAligned(output, sizeof(float32x4_t));

  return kernelOk && strideOk && padOk && dilationOk && outputOk && inputOk &&
      alignOk;
}
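
// For 2x2 max pooling, vmaxq_f32 takes the columnwise max of two adjacent
// rows, and vpmax_f32 then reduces adjacent column pairs, producing two
// output values per four input columns. The unrolled path below combines
// four such loads into two aligned 4-wide stores (eight outputs) per
// iteration.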

// Vectorizes 2x2p0s0 max pooling for ARM NEON
void maxPoolNeon2x2p0s0Plane(
    int inputH,
    int inputW,
    const float* input,
    float* output) {
  constexpr int kKernelHeight = 2;
  constexpr int kKernelWidth = 2;

  // Handle portion that can be unrolled by 4
  constexpr int kUnroll = 4;
  constexpr int kLoadSizeFloat = (sizeof(float32x4_t) / sizeof(float));
  constexpr int kLoadCols = kUnroll * kLoadSizeFloat;

  if (inputW % kLoadCols == 0) {
    for (int h = 0; h < inputH; h += kKernelHeight) {
      float* outputRow = output + (h / kKernelHeight) * (inputW / kKernelWidth);
      const float* curInput = input + h * inputW;

      for (int w = 0; w < inputW; w += kLoadCols) {
        float32x2_t hmax_0, hmax_1, hmax_2, hmax_3;
        {
          float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
          float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
          float32x4_t vmax = vmaxq_f32(v0_0, v0_1);
          hmax_0 = vpmax_f32(vget_low_f32(vmax), vget_high_f32(vmax));
        }
        curInput += kLoadSizeFloat;
        {
          float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
          float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
          float32x4_t vmax = vmaxq_f32(v0_0, v0_1);
          hmax_1 = vpmax_f32(vget_low_f32(vmax), vget_high_f32(vmax));
        }
        curInput += kLoadSizeFloat;
        {
          float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
          float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
          float32x4_t vmax = vmaxq_f32(v0_0, v0_1);
          hmax_2 = vpmax_f32(vget_low_f32(vmax), vget_high_f32(vmax));
        }
        curInput += kLoadSizeFloat;
        {
          float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
          float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
          float32x4_t vmax = vmaxq_f32(v0_0, v0_1);
          hmax_3 = vpmax_f32(vget_low_f32(vmax), vget_high_f32(vmax));
        }
        curInput += kLoadSizeFloat;

        float32x4_t out_0 = vcombine_f32(hmax_0, hmax_1);
        float32x4_t out_1 = vcombine_f32(hmax_2, hmax_3);
        vst1q_f32_aligned(&outputRow[w / kKernelWidth + 0], out_0);
        vst1q_f32_aligned(&outputRow[w / kKernelWidth + 4], out_1);
      }
    }
  } else {
    // Not unrolled
    for (int h = 0; h < inputH; h += kKernelHeight) {
      const float* inputRow = input + h * inputW;
      float* outputRow = output + (h / kKernelHeight) * (inputW / kKernelWidth);

      for (int w = 0; w < inputW; w += kKernelWidth * 2) {
        const float* curInput = inputRow + w;
        float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
        float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
        float32x4_t vmax = vmaxq_f32(v0_0, v0_1);
        float32x2_t hmax = vpmax_f32(vget_low_f32(vmax), vget_high_f32(vmax));
        vst1_f32(&outputRow[w / kKernelWidth], hmax);
      }
    }
  }
}

void runNeonMaxPool2x2p0s0NCHW(
    int N,
    int C,
    int inputH,
    int inputW,
    const float* input,
    float* output) {
  // We only have the 2x2p0s0 implementation at present, which is
  // checked at a higher level
  int outputH = inputH / 2;
  int outputW = inputW / 2;

  for (int n = 0; n < N; ++n) {
    for (int c = 0; c < C; ++c) {
      const float* curInput = input + (n * C + c) * inputH * inputW;
      float* curOutput = output + (n * C + c) * outputH * outputW;
      maxPoolNeon2x2p0s0Plane(inputH, inputW, curInput, curOutput);
    }
  }
}
#endif // __ARM_NEON__

} // namespace

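// AveragePool and MaxPool are functor types consumed by PoolOp. Each supplies
// initialize() (the reduction identity), process() (accumulate one element or
// one Eigen column), finalize() (e.g. divide by the window size when
// averaging), and runSpecialized() (an optional fast path, used here for the
// NEON kernels above).
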
template <typename T>
class AveragePool {
 public:
  static float initialize() {
    return 0.0;
  }

  static void process(
      const int x_col,
      const int y_col,
      ConstEigenMatrixMap<float>& x_mat,
      EigenMatrixMap<float>& y_mat) {
    y_mat.col(y_col) += x_mat.col(x_col);
  }

  static void process(const T& x_data, T& y_data) {
    y_data += x_data;
  }

  static void finalize(const int size, T& y_data) {
    y_data /= size;
  }

  static void
  finalize(const int size, const int col, EigenMatrixMap<float>& y_mat) {
    y_mat.col(col) /= size;
  }

  static bool runSpecialized(
      int N,
      int C,
      int inputH,
      int inputW,
      int outputH,
      int outputW,
      int kH,
      int kW,
      int strideH,
      int strideW,
      int padT,
      int padL,
      int padB,
      int padR,
      int dilationH,
      int dilationW,
      const float* input,
      float* output) {
#ifdef __ARM_NEON__
    if (isNeon4x4p0s0Eligible(
            inputH,
            inputW,
            outputH,
            outputW,
            kH,
            kW,
            strideH,
            strideW,
            padT,
            padL,
            padB,
            padR,
            dilationH,
            dilationW,
            input,
            output)) {
      runNeonAveragePool4x4p0s0NCHW(N, C, inputH, inputW, input, output);
      return true;
    }
#else
    (void)N;
    (void)C;
    (void)inputH;
    (void)inputW;
    (void)outputH;
    (void)outputW;
    (void)kH;
    (void)kW;
    (void)strideH;
    (void)strideW;
    (void)padT;
    (void)padL;
    (void)padB;
    (void)padR;
    (void)dilationH;
    (void)dilationW;
    (void)input;
    (void)output;
#endif
    return false;
  }
};

template <typename T>
class MaxPool {
 public:
  static float initialize() {
    return std::numeric_limits<float>::lowest();
  }

  static void process(
      const int x_col,
      const int y_col,
      ConstEigenMatrixMap<float>& x_mat,
      EigenMatrixMap<float>& y_mat) {
    y_mat.col(y_col) = y_mat.col(y_col).cwiseMax(x_mat.col(x_col));
  }

  static void process(const T& x_data, T& y_data) {
    if (x_data > y_data) {
      y_data = x_data;
    }
  }

  static void finalize(const int /*size*/, T& /*y_data*/) {}

  static void finalize(
      const int /*size*/,
      const int /*col*/,
      EigenMatrixMap<float>& /*y_mat*/) {}

  static bool runSpecialized(
      int N,
      int C,
      int inputH,
      int inputW,
      int outputH,
      int outputW,
      int kH,
      int kW,
      int strideH,
      int strideW,
      int padT,
      int padL,
      int padB,
      int padR,
      int dilationH,
      int dilationW,
      const float* input,
      float* output) {
#ifdef __ARM_NEON__
    if (isNeon2x2p0s0Eligible(
            inputH,
            inputW,
            outputH,
            outputW,
            kH,
            kW,
            strideH,
            strideW,
            padT,
            padL,
            padB,
            padR,
            dilationH,
            dilationW,
            input,
            output)) {
      runNeonMaxPool2x2p0s0NCHW(N, C, inputH, inputW, input, output);
      return true;
    }
#else
    (void)N;
    (void)C;
    (void)inputH;
    (void)inputW;
    (void)outputH;
    (void)outputW;
    (void)kH;
    (void)kW;
    (void)strideH;
    (void)strideW;
    (void)padT;
    (void)padL;
    (void)padB;
    (void)padR;
    (void)dilationH;
    (void)dilationW;
    (void)input;
    (void)output;
#endif
    return false;
  }
};

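// NCHW pooling: each (n, c) plane is processed independently. For every
// output coordinate, [hstart, hend) (and likewise for width and depth) is the
// kernel window clipped to the input, so padded regions drop out of both the
// reduction and the divisor used by AveragePool::finalize.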
template <typename T, class Context, typename PoolType>
bool PoolOp<T, Context, PoolType>::RunOnDeviceWithOrderNCHW() {
  auto& X = Input(0);
  auto* Y = Output(0);
  ConvPoolOpBase<Context>::SetOutputSize(X, Y, X.dim32(1));

  const float* Xdata = X.template data<float>();
  float* Ydata = Y->template mutable_data<float>();
  // The main loop
  int channels = X.dim32(1);
  int height = X.dim32(2);
  int width = kernel_.size() > 1 ? X.dim32(3) : 1;
  int depth = kernel_.size() > 2 ? X.dim32(4) : 1;
  int pooled_height = Y->dim32(2);
  int pooled_width = kernel_.size() > 1 ? Y->dim32(3) : 1;
  int pooled_depth = kernel_.size() > 2 ? Y->dim32(4) : 1;

  // We specialize certain variants on ARM for vectorization
  if (kernel_.size() == 2 &&
      PoolType::runSpecialized(
          X.dim32(0),
          X.dim32(1),
          X.dim32(2),
          X.dim32(3),
          Y->dim32(2),
          Y->dim32(3),
          kernel_h(),
          kernel_w(),
          stride_h(),
          stride_w(),
          pad_t(),
          pad_l(),
          pad_b(),
          pad_r(),
          dilation_h(),
          dilation_w(),
          Xdata,
          Ydata)) {
    return true;
  }

  switch (kernel_.size()) {
    case 1:
      for (int n = 0; n < X.dim32(0); ++n) {
        for (int c = 0; c < channels; ++c) {
          for (int ph = 0; ph < pooled_height; ++ph) {
            int hstart = ph * stride_h() - pad_t();
            int hend = min(hstart + kernel_h(), height);
            hstart = max(hstart, 0);
            T Yh = PoolType::initialize();
            for (int h = hstart; h < hend; ++h) {
              PoolType::process(Xdata[h], Yh);
            }
            PoolType::finalize(hend - hstart, Yh);
            Ydata[ph] = Yh;
          }
          // Do offset.
          Xdata += height;
          Ydata += pooled_height;
        }
      }
      break;
    case 2:
      for (int n = 0; n < X.dim32(0); ++n) {
        for (int c = 0; c < channels; ++c) {
          for (int ph = 0; ph < pooled_height; ++ph) {
            int hstart = ph * stride_h() - pad_t();
            int hend = min(hstart + kernel_h(), height);
            hstart = max(hstart, 0);
            for (int pw = 0; pw < pooled_width; ++pw) {
              int wstart = pw * stride_w() - pad_l();
              int wend = min(wstart + kernel_w(), width);
              wstart = max(wstart, 0);
              const int pool_index = ph * pooled_width + pw;
              T Yh = PoolType::initialize();
              for (int h = hstart; h < hend; ++h) {
                for (int w = wstart; w < wend; ++w) {
                  const int input_index = h * width + w;
                  PoolType::process(Xdata[input_index], Yh);
                }
              }
              PoolType::finalize((hend - hstart) * (wend - wstart), Yh);
              Ydata[pool_index] = Yh;
            }
          }
          // Do offset.
          Xdata += height * width;
          Ydata += pooled_height * pooled_width;
        }
      }
      break;
    case 3:
      for (int n = 0; n < X.dim32(0); ++n) {
        for (int c = 0; c < channels; ++c) {
          for (int ph = 0; ph < pooled_height; ++ph) {
            int hstart = ph * stride_h() - pad_t();
            int hend = min(hstart + kernel_h(), height);
            hstart = max(hstart, 0);
            for (int pw = 0; pw < pooled_width; ++pw) {
              int wstart = pw * stride_w() - pad_l();
              int wend = min(wstart + kernel_w(), width);
              wstart = max(wstart, 0);
              for (int pd = 0; pd < pooled_depth; ++pd) {
                int dstart = pd * stride_[2] - pads_[2];
                int dend = min(dstart + kernel_[2], depth);
                dstart = max(dstart, 0);
                const int pool_index =
                    ph * pooled_width * pooled_depth + pw * pooled_depth + pd;
                T Yh = PoolType::initialize();
                for (int h = hstart; h < hend; ++h) {
                  for (int w = wstart; w < wend; ++w) {
                    for (int d = dstart; d < dend; ++d) {
                      const int input_index = h * width * depth + w * depth + d;
                      PoolType::process(Xdata[input_index], Yh);
                    }
                  }
                }
                PoolType::finalize(
                    (hend - hstart) * (wend - wstart) * (dend - dstart), Yh);
                Ydata[pool_index] = Yh;
              }
            }
          }
          // Do offset.
          Xdata += height * width * depth;
          Ydata += pooled_height * pooled_width * pooled_depth;
        }
      }
      break;
    default:
      CAFFE_THROW("Unsupported pooling size : ", kernel_.size());
      return false;
  }
  return true;
}

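// NHWC pooling: the tensor is viewed as a channels x (spatial size) Eigen
// matrix, so one column holds every channel at a single spatial location and
// each window reduction becomes a columnwise Eigen operation over all
// channels at once.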
template <typename T, class Context, typename PoolType>
bool PoolOp<T, Context, PoolType>::RunOnDeviceWithOrderNHWC() {
  auto& X = Input(0);
  auto* Y = Output(0);
  int height = X.dim32(1);
  int width = kernel_.size() > 1 ? X.dim32(2) : 1;
  int depth = kernel_.size() > 2 ? X.dim32(3) : 1;
  int channels = X.dim32(X.ndim() - 1);
  ConvPoolOpBase<Context>::SetOutputSize(X, Y, channels);

  EigenMatrixMap<float> Ymat(
      Y->template mutable_data<float>(), channels, Y->size() / channels);
  ConstEigenMatrixMap<float> Xmat(
      X.template data<float>(), channels, X.size() / channels);
  int pooled_height = Y->dim32(1);
  int pooled_width = kernel_.size() > 1 ? Y->dim32(2) : 1;
  int pooled_depth = kernel_.size() > 2 ? Y->dim32(3) : 1;
  // The main loop
  switch (kernel_.size()) {
    case 1:
      for (int n = 0; n < X.dim32(0); ++n) {
        for (int ph = 0; ph < pooled_height; ++ph) {
          int hstart = ph * stride_h() - pad_t();
          int hend = min(hstart + kernel_h(), height);
          hstart = max(hstart, 0);
          const int y_col = n * pooled_height + ph;
          Ymat.col(y_col).setConstant(PoolType::initialize());
          for (int h = hstart; h < hend; ++h) {
            const int x_col = n * height + h;
            PoolType::process(x_col, y_col, Xmat, Ymat);
          }
          PoolType::finalize((hend - hstart), y_col, Ymat);
        }
      }
      break;
    case 2:
      for (int n = 0; n < X.dim32(0); ++n) {
        for (int ph = 0; ph < pooled_height; ++ph) {
          int hstart = ph * stride_h() - pad_t();
          int hend = min(hstart + kernel_h(), height);
          hstart = max(hstart, 0);
          for (int pw = 0; pw < pooled_width; ++pw) {
            int wstart = pw * stride_w() - pad_l();
            int wend = min(wstart + kernel_w(), width);
            wstart = max(wstart, 0);
            const int y_col = (n * pooled_height + ph) * pooled_width + pw;
            Ymat.col(y_col).setConstant(PoolType::initialize());
            for (int h = hstart; h < hend; ++h) {
              for (int w = wstart; w < wend; ++w) {
                const int x_col = (n * height + h) * width + w;
                PoolType::process(x_col, y_col, Xmat, Ymat);
              }
            }
            PoolType::finalize((hend - hstart) * (wend - wstart), y_col, Ymat);
          }
        }
      }
      break;
    case 3:
      for (int n = 0; n < X.dim32(0); ++n) {
        for (int ph = 0; ph < pooled_height; ++ph) {
          int hstart = ph * stride_h() - pad_t();
          int hend = min(hstart + kernel_h(), height);
          hstart = max(hstart, 0);
          for (int pw = 0; pw < pooled_width; ++pw) {
            int wstart = pw * stride_w() - pad_l();
            int wend = min(wstart + kernel_w(), width);
            wstart = max(wstart, 0);
            for (int pd = 0; pd < pooled_depth; ++pd) {
              int dstart = pd * stride_[2] - pads_[2];
              int dend = min(dstart + kernel_[2], depth);
              dstart = max(dstart, 0);
              const int y_col =
                  ((n * pooled_height + ph) * pooled_width + pw) *
                      pooled_depth +
                  pd;
              Ymat.col(y_col).setConstant(PoolType::initialize());
              for (int h = hstart; h < hend; ++h) {
                for (int w = wstart; w < wend; ++w) {
                  for (int d = dstart; d < dend; ++d) {
                    const int x_col =
                        ((n * height + h) * width + w) * depth + d;
                    PoolType::process(x_col, y_col, Xmat, Ymat);
                  }
                }
              }
              PoolType::finalize(
                  (hend - hstart) * (wend - wstart) * (dend - dstart),
                  y_col,
                  Ymat);
            }
          }
        }
      }
      break;
    default:
      CAFFE_THROW("Unsupported pooling size : ", kernel_.size());
      return false;
  }
  return true;
}

const char* kAveragePoolDoc = R"DOC(
consumes an input blob X and applies average pooling across the blob according
to kernel sizes, stride sizes, and pad lengths defined by the ConvPoolOpBase
operator. Average pooling consists of averaging all values of a subset of the
input tensor according to the kernel size and downsampling the data into the
output blob Y for further processing.
)DOC";

const char* kMaxPoolDoc = R"DOC(
consumes an input blob X and applies max pooling across the blob according to
kernel sizes, stride sizes, and pad lengths defined by the ConvPoolOpBase
operator. Max pooling consists of taking the maximum value of a subset of the
input tensor according to the kernel size and downsampling the data into the
output blob Y for further processing.
)DOC";

std::function<void(OpSchema&)> AveragePoolDocGenerator(const char* dim) {
  return [=](OpSchema& schema) {
    string doc = "AveragePool{dim} {pool_doc}";
    ReplaceAll(doc, "{dim}", dim);
    ReplaceAll(doc, "{pool_doc}", kAveragePoolDoc);
    schema.SetDoc(doc);
    schema.Input(
        0,
        "X",
        "Input data tensor from the previous operator; dimensions depend on "
        "whether the NCHW or NHWC operators are being used. For example, in "
        "the former, the input has size (N x C x H x W), where N is the batch "
        "size, C is the number of channels, and H and W are the height and "
        "the width of the data. The corresponding permutation of dimensions "
        "is used in the latter case.");
    schema.Output(
        0,
        "Y",
        "Output data tensor from average pooling across the input "
        "tensor. Dimensions will vary based on various kernel, stride, and "
        "pad sizes.");
  };
}

std::function<void(OpSchema&)> MaxPoolDocGenerator(const char* dim) {
  return [=](OpSchema& schema) {
    string doc = "MaxPool{dim} {pool_doc}";
    ReplaceAll(doc, "{dim}", dim);
    ReplaceAll(doc, "{pool_doc}", kMaxPoolDoc);
    schema.SetDoc(doc);
    schema.Input(
        0,
        "X",
        "Input data tensor from the previous operator; dimensions depend on "
        "whether the NCHW or NHWC operators are being used. For example, in "
        "the former, the input has size (N x C x H x W), where N is the batch "
        "size, C is the number of channels, and H and W are the height and "
        "the width of the data. The corresponding permutation of dimensions "
        "is used in the latter case.");
    schema.Output(
        0,
        "Y",
        "Output data tensor from max pooling across the input "
        "tensor. Dimensions will vary based on various kernel, stride, and "
        "pad sizes.");
  };
}

REGISTER_CPU_OPERATOR(
    AveragePool,
    PoolOp<float, CPUContext, AveragePool<float>>);

OPERATOR_SCHEMA(AveragePool)
    .NumInputs(1)
    .NumOutputs(1)
    .TensorInferenceFunction(ConvPoolOpBase<CPUContext>::TensorInferenceForPool)
    .FillUsing(AveragePoolDocGenerator(""))
    .InheritOnnxSchema("AveragePool");

REGISTER_CPU_OPERATOR(
    AveragePool1D,
    PoolOp<float, CPUContext, AveragePool<float>>);

OPERATOR_SCHEMA(AveragePool1D)
    .NumInputs(1)
    .NumOutputs(1)
    .TensorInferenceFunction(ConvPoolOpBase<CPUContext>::TensorInferenceForPool)
    .FillUsing(AveragePoolDocGenerator("1D"))
    .InheritOnnxSchema("AveragePool");

REGISTER_CPU_OPERATOR(
    AveragePool2D,
    PoolOp<float, CPUContext, AveragePool<float>>);

OPERATOR_SCHEMA(AveragePool2D)
    .NumInputs(1)
    .NumOutputs(1)
    .TensorInferenceFunction(ConvPoolOpBase<CPUContext>::TensorInferenceForPool)
    .FillUsing(AveragePoolDocGenerator("2D"))
    .InheritOnnxSchema("AveragePool");

REGISTER_CPU_OPERATOR(
    AveragePool3D,
    PoolOp<float, CPUContext, AveragePool<float>>);

OPERATOR_SCHEMA(AveragePool3D)
    .NumInputs(1)
    .NumOutputs(1)
    .TensorInferenceFunction(ConvPoolOpBase<CPUContext>::TensorInferenceForPool)
    .FillUsing(AveragePoolDocGenerator("3D"))
    .InheritOnnxSchema("AveragePool");

REGISTER_CPU_OPERATOR(MaxPool, PoolOp<float, CPUContext, MaxPool<float>>);

OPERATOR_SCHEMA(MaxPool)
    .NumInputs(1)
    .NumOutputs(1)
    .TensorInferenceFunction(ConvPoolOpBase<CPUContext>::TensorInferenceForPool)
    .FillUsing(MaxPoolDocGenerator(""))
    .InheritOnnxSchema("MaxPool");

REGISTER_CPU_OPERATOR(MaxPool1D, PoolOp<float, CPUContext, MaxPool<float>>);

OPERATOR_SCHEMA(MaxPool1D)
    .NumInputs(1)
    .NumOutputs(1)
    .TensorInferenceFunction(ConvPoolOpBase<CPUContext>::TensorInferenceForPool)
    .FillUsing(MaxPoolDocGenerator("1D"))
    .InheritOnnxSchema("MaxPool");

REGISTER_CPU_OPERATOR(MaxPool2D, PoolOp<float, CPUContext, MaxPool<float>>);

OPERATOR_SCHEMA(MaxPool2D)
    .NumInputs(1)
    .NumOutputs(1)
    .TensorInferenceFunction(ConvPoolOpBase<CPUContext>::TensorInferenceForPool)
    .FillUsing(MaxPoolDocGenerator("2D"))
    .InheritOnnxSchema("MaxPool");

REGISTER_CPU_OPERATOR(MaxPool3D, PoolOp<float, CPUContext, MaxPool<float>>);

OPERATOR_SCHEMA(MaxPool3D)
    .NumInputs(1)
    .NumOutputs(1)
    .TensorInferenceFunction(ConvPoolOpBase<CPUContext>::TensorInferenceForPool)
    .FillUsing(MaxPoolDocGenerator("3D"))
    .InheritOnnxSchema("MaxPool");
} // namespace caffe2
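
// A minimal usage sketch (not part of the original file): invoking the
// AveragePool operator registered above through the generic Caffe2 C++ API.
// This assumes the standard CreateOperator/AddArgument helpers from
// caffe2/core/operator.h and caffe2/utils/proto_utils.h; blob setup for "X"
// is elided.
//
//   caffe2::OperatorDef def;
//   def.set_type("AveragePool");
//   def.add_input("X"); // NCHW float tensor already present in the workspace
//   def.add_output("Y");
//   caffe2::AddArgument<int>("kernel", 2, &def);
//   caffe2::AddArgument<int>("stride", 2, &def);
//   caffe2::Workspace ws;
//   // ... create and fill the "X" blob in ws ...
//   auto op = caffe2::CreateOperator(def, &ws);
//   op->Run();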