Caffe2 - C++ API
A deep learning, cross-platform ML framework
roi_align_op.cc
#include "roi_align_op.h"

#include "caffe2/utils/eigen_utils.h"
#include "caffe2/utils/math.h"

#ifdef CAFFE2_USE_MKL
#include "caffe2/mkl/operators/operator_fallback_mkl.h"
#endif // CAFFE2_USE_MKL

namespace caffe2 {
namespace {

template <typename T>
struct PreCalc {
  int pos1;
  int pos2;
  int pos3;
  int pos4;
  T w1;
  T w2;
  T w3;
  T w4;
};

template <typename T>
void pre_calc_for_bilinear_interpolate(
    const int height,
    const int width,
    const int pooled_height,
    const int pooled_width,
    const int iy_upper,
    const int ix_upper,
    T roi_start_h,
    T roi_start_w,
    T bin_size_h,
    T bin_size_w,
    int roi_bin_grid_h,
    int roi_bin_grid_w,
    std::vector<PreCalc<T>>& pre_calc) {
  int pre_calc_index = 0;
  for (int ph = 0; ph < pooled_height; ph++) {
    for (int pw = 0; pw < pooled_width; pw++) {
      for (int iy = 0; iy < iy_upper; iy++) {
        const T yy = roi_start_h + ph * bin_size_h +
            static_cast<T>(iy + .5f) * bin_size_h /
                static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
        for (int ix = 0; ix < ix_upper; ix++) {
          const T xx = roi_start_w + pw * bin_size_w +
              static_cast<T>(ix + .5f) * bin_size_w /
                  static_cast<T>(roi_bin_grid_w);

          T x = xx;
          T y = yy;
          // Handle the case where the inverse-mapped sampling point falls
          // outside the feature map boundary.
          if (y < -1.0 || y > height || x < -1.0 || x > width) {
            // empty
            PreCalc<T> pc;
            pc.pos1 = 0;
            pc.pos2 = 0;
            pc.pos3 = 0;
            pc.pos4 = 0;
            pc.w1 = 0;
            pc.w2 = 0;
            pc.w3 = 0;
            pc.w4 = 0;
            pre_calc[pre_calc_index] = pc;
            pre_calc_index += 1;
            continue;
          }

          if (y <= 0) {
            y = 0;
          }
          if (x <= 0) {
            x = 0;
          }

          int y_low = (int)y;
          int x_low = (int)x;
          int y_high;
          int x_high;

          if (y_low >= height - 1) {
            y_high = y_low = height - 1;
            y = (T)y_low;
          } else {
            y_high = y_low + 1;
          }

          if (x_low >= width - 1) {
            x_high = x_low = width - 1;
            x = (T)x_low;
          } else {
            x_high = x_low + 1;
          }

          T ly = y - y_low;
          T lx = x - x_low;
          T hy = 1. - ly, hx = 1. - lx;
          T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;

          // save weights and indices
          PreCalc<T> pc;
          pc.pos1 = y_low * width + x_low;
          pc.pos2 = y_low * width + x_high;
          pc.pos3 = y_high * width + x_low;
          pc.pos4 = y_high * width + x_high;
          pc.w1 = w1;
          pc.w2 = w2;
          pc.w3 = w3;
          pc.w4 = w4;
          pre_calc[pre_calc_index] = pc;

          pre_calc_index += 1;
        }
      }
    }
  }
}

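For reference (this note is not part of the source file): the four weights stored in each PreCalc entry are the standard bilinear interpolation coefficients. Writing l_y = y - y_low and l_x = x - x_low for the fractional offsets computed above, the value that the forward pass later accumulates for one sampling point is

\[
v = w_1\, v(y_\text{low}, x_\text{low}) + w_2\, v(y_\text{low}, x_\text{high}) + w_3\, v(y_\text{high}, x_\text{low}) + w_4\, v(y_\text{high}, x_\text{high}),
\]
\[
w_1 = (1 - l_y)(1 - l_x), \quad w_2 = (1 - l_y)\, l_x, \quad w_3 = l_y\, (1 - l_x), \quad w_4 = l_y\, l_x,
\]

where v(., .) denotes the feature-map value at the given integer location.
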
template <typename T>
void ROIAlignForward(
    const int nthreads,
    const T* bottom_data,
    const T& spatial_scale,
    const int channels,
    const int height,
    const int width,
    const int pooled_height,
    const int pooled_width,
    const int sampling_ratio,
    const T* bottom_rois,
    int roi_cols,
    T* top_data,
    StorageOrder order) {
  DCHECK(roi_cols == 4 || roi_cols == 5);

  int n_rois = nthreads / channels / pooled_width / pooled_height;
  // (n, c, ph, pw) is an element in the pooled output
  // can be parallelized using omp
  // #pragma omp parallel for num_threads(32)
  for (int n = 0; n < n_rois; n++) {
    int index_n = n * channels * pooled_width * pooled_height;

    // roi could have 4 or 5 columns
    const T* offset_bottom_rois = bottom_rois + n * roi_cols;
    int roi_batch_ind = 0;
    if (roi_cols == 5) {
      roi_batch_ind = offset_bottom_rois[0];
      offset_bottom_rois++;
    }

    // Do not use rounding; this implementation detail is critical
    T roi_start_w = offset_bottom_rois[0] * spatial_scale;
    T roi_start_h = offset_bottom_rois[1] * spatial_scale;
    T roi_end_w = offset_bottom_rois[2] * spatial_scale;
    T roi_end_h = offset_bottom_rois[3] * spatial_scale;
    // T roi_start_w = round(offset_bottom_rois[0] * spatial_scale);
    // T roi_start_h = round(offset_bottom_rois[1] * spatial_scale);
    // T roi_end_w = round(offset_bottom_rois[2] * spatial_scale);
    // T roi_end_h = round(offset_bottom_rois[3] * spatial_scale);

    // Force malformed ROIs to be 1x1
    T roi_width = std::max(roi_end_w - roi_start_w, (T)1.);
    T roi_height = std::max(roi_end_h - roi_start_h, (T)1.);
    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);

    // We use roi_bin_grid to sample the grid and mimic integral pooling
    int roi_bin_grid_h = (sampling_ratio > 0)
        ? sampling_ratio
        : ceil(roi_height / pooled_height); // e.g., = 2
    int roi_bin_grid_w =
        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);

    // We do average (integral) pooling inside a bin
    // (a worked example follows this function)
    const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4

    // We want to precalculate the indices and weights shared by all channels;
    // this is the key point of the optimization
    std::vector<PreCalc<T>> pre_calc(
        roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height);
    pre_calc_for_bilinear_interpolate(
        height,
        width,
        pooled_height,
        pooled_width,
        roi_bin_grid_h,
        roi_bin_grid_w,
        roi_start_h,
        roi_start_w,
        bin_size_h,
        bin_size_w,
        roi_bin_grid_h,
        roi_bin_grid_w,
        pre_calc);

    if (order == StorageOrder::NCHW) {
      for (int c = 0; c < channels; c++) {
        int index_n_c = index_n + c * pooled_width * pooled_height;
        const T* offset_bottom_data =
            bottom_data + (roi_batch_ind * channels + c) * height * width;
        int pre_calc_index = 0;

        for (int ph = 0; ph < pooled_height; ph++) {
          for (int pw = 0; pw < pooled_width; pw++) {
            int index = index_n_c + ph * pooled_width + pw;

            T output_val = 0.;
            for (int iy = 0; iy < roi_bin_grid_h; iy++) {
              for (int ix = 0; ix < roi_bin_grid_w; ix++) {
                PreCalc<T> pc = pre_calc[pre_calc_index];
                output_val += pc.w1 * offset_bottom_data[pc.pos1] +
                    pc.w2 * offset_bottom_data[pc.pos2] +
                    pc.w3 * offset_bottom_data[pc.pos3] +
                    pc.w4 * offset_bottom_data[pc.pos4];

                pre_calc_index += 1;
              }
            }
            output_val /= count;

            top_data[index] = output_val;
          } // for pw
        } // for ph
      } // for c
    } // if nchw

    if (order == StorageOrder::NHWC) {
      const T* offset_bottom_data =
          bottom_data + roi_batch_ind * channels * height * width;
      int pre_calc_index = 0;

      for (int ph = 0; ph < pooled_height; ph++) {
        for (int pw = 0; pw < pooled_width; pw++) {
          EVecXf output_vals = EVecXf::Zero(channels);

          for (int iy = 0; iy < roi_bin_grid_h; iy++) {
            for (int ix = 0; ix < roi_bin_grid_w; ix++) {
              PreCalc<T> pc = pre_calc[pre_calc_index];

              ConstEigenVectorMap<T> data_1(
                  offset_bottom_data + channels * pc.pos1, channels);
              ConstEigenVectorMap<T> data_2(
                  offset_bottom_data + channels * pc.pos2, channels);
              ConstEigenVectorMap<T> data_3(
                  offset_bottom_data + channels * pc.pos3, channels);
              ConstEigenVectorMap<T> data_4(
                  offset_bottom_data + channels * pc.pos4, channels);

              output_vals += pc.w1 * data_1 + pc.w2 * data_2 +
                  pc.w3 * data_3 + pc.w4 * data_4;

              pre_calc_index += 1;
            }
          }
          output_vals /= count;

          int index_nhw = index_n + (ph * pooled_width + pw) * channels;
          std::memcpy(
              top_data + index_nhw, output_vals.data(), channels * sizeof(T));
        } // for pw
      } // for ph
    } // if nhwc

  } // for n
}

} // namespace

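As a concrete illustration of the adaptive sampling grid above (the numbers are illustrative, not taken from this file): a 224 x 224 RoI pooled to 7 x 7 on a feature map with spatial_scale = 0.0625 (stride 16) gives

roi_width = roi_height = 224 * 0.0625 = 14, bin_size_h = bin_size_w = 14 / 7 = 2,

and, with sampling_ratio <= 0, roi_bin_grid_h = roi_bin_grid_w = ceil(14 / 7) = 2, so count = 2 * 2 = 4 bilinear samples are averaged for each output bin.
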
template <>
bool RoIAlignOp<float, CPUContext>::RunOnDevice() {
  auto& X = Input(0); // Input data to pool, NCHW
  auto& R = Input(1); // RoIs
  auto* Y = Output(0); // RoI pooled data

  if (R.size() == 0) {
    // Handle empty rois
    if (order_ == StorageOrder::NCHW) {
      Y->Resize(0, X.dim32(1), pooled_height_, pooled_width_);
    } else if (order_ == StorageOrder::NHWC) {
      Y->Resize(0, pooled_height_, pooled_width_, X.dim32(3));
    }
    // The following mutable_data calls are needed to allocate the tensors
    Y->mutable_data<float>();
    return true;
  }

  CAFFE_ENFORCE_EQ(R.ndim(), 2);
  // if R has 5 columns, the first column is the index, otherwise 0
  CAFFE_ENFORCE(R.dim32(1) == 4 || R.dim32(1) == 5);

  assert(sampling_ratio_ >= 0);

  if (order_ == StorageOrder::NCHW) {
    Y->Resize(R.dim32(0), X.dim32(1), pooled_height_, pooled_width_);
    int output_size = Y->size();
    ROIAlignForward<float>(
        output_size,
        X.data<float>(),
        spatial_scale_,
        X.dim32(1),
        X.dim32(2),
        X.dim32(3),
        pooled_height_,
        pooled_width_,
        sampling_ratio_,
        R.data<float>(),
        R.dim32(1),
        Y->mutable_data<float>(),
        order_);
  } else if (order_ == StorageOrder::NHWC) {
    Y->Resize(R.dim32(0), pooled_height_, pooled_width_, X.dim32(3));
    int output_size = Y->size();
    ROIAlignForward<float>(
        output_size,
        X.data<float>(),
        spatial_scale_,
        X.dim32(3),
        X.dim32(1),
        X.dim32(2),
        pooled_height_,
        pooled_width_,
        sampling_ratio_,
        R.data<float>(),
        R.dim32(1),
        Y->mutable_data<float>(),
        order_);
  }

  return true;
}

REGISTER_CPU_OPERATOR(RoIAlign, RoIAlignOp<float, CPUContext>);

#ifdef CAFFE2_HAS_MKL_DNN
REGISTER_MKL_OPERATOR(
    RoIAlign,
    mkl::MKLFallbackOp<RoIAlignOp<float, CPUContext>>);
#endif // CAFFE2_HAS_MKL_DNN

// Input: X, rois; Output: Y
OPERATOR_SCHEMA(RoIAlign)
    .NumInputs(2)
    .NumOutputs(1)
    .SetDoc(R"DOC(
Region of Interest (RoI) align operation as used in Mask R-CNN.
)DOC")
    .Arg(
        "spatial_scale",
        "(float) default 1.0; Spatial scale of the input feature map X "
        "relative to the input image. E.g., 0.0625 if X has a stride of 16 "
        "w.r.t. the input image.")
    .Arg("pooled_h", "(int) default 1; Pooled output Y's height.")
    .Arg("pooled_w", "(int) default 1; Pooled output Y's width.")
    .Arg(
        "sampling_ratio",
        "(int) default -1; number of sampling points in the interpolation grid "
        "used to compute the output value of each pooled output bin. If > 0, "
        "then exactly sampling_ratio x sampling_ratio grid points are used. If "
        "<= 0, then an adaptive number of grid points are used (computed as "
        "ceil(roi_width / pooled_w), and likewise for height).")
    .Input(0, "X", "4D feature map input of shape (N, C, H, W).")
    .Input(
        1,
        "RoIs",
        "2D input of shape (R, 4 or 5) specifying R RoIs "
        "representing: batch index in [0, N - 1], x1, y1, x2, y2. The RoI "
        "coordinates are in the coordinate system of the input image. For "
        "inputs corresponding to a single image, batch index can be excluded "
        "to have just 4 columns.")
    .Output(
        0,
        "Y",
        "4D output of shape (R, C, pooled_h, pooled_w). The r-th batch element "
        "is a pooled feature map corresponding to the r-th RoI.");

} // namespace caffe2
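
For context, here is a minimal usage sketch of the operator described by the schema above, driven through the Caffe2 C++ workspace/operator API. This is not part of the source file; the helper name, blob names, tensor shapes, and argument values are illustrative assumptions, and the exact tensor accessors depend on the Caffe2 version in use.

#include "caffe2/core/operator.h"
#include "caffe2/core/workspace.h"

// run_roi_align_example is a hypothetical helper, not part of Caffe2.
void run_roi_align_example() {
  caffe2::Workspace ws;

  // Feature map X: (N=1, C=256, H=56, W=56), NCHW.
  auto* X = ws.CreateBlob("X")->GetMutable<caffe2::TensorCPU>();
  X->Resize(1, 256, 56, 56);
  X->mutable_data<float>(); // allocate; fill with real features in practice

  // RoIs: shape (R=1, 5) = (batch_index, x1, y1, x2, y2) in image coordinates.
  auto* R = ws.CreateBlob("RoIs")->GetMutable<caffe2::TensorCPU>();
  R->Resize(1, 5);
  float* rois = R->mutable_data<float>();
  rois[0] = 0.f;
  rois[1] = 32.f;
  rois[2] = 32.f;
  rois[3] = 256.f;
  rois[4] = 256.f;

  // Build the operator definition with the arguments documented in the schema.
  caffe2::OperatorDef def;
  def.set_type("RoIAlign");
  def.add_input("X");
  def.add_input("RoIs");
  def.add_output("Y");
  auto add_float_arg = [&def](const char* name, float v) {
    auto* arg = def.add_arg();
    arg->set_name(name);
    arg->set_f(v);
  };
  auto add_int_arg = [&def](const char* name, int v) {
    auto* arg = def.add_arg();
    arg->set_name(name);
    arg->set_i(v);
  };
  add_float_arg("spatial_scale", 0.0625f); // feature map has stride 16
  add_int_arg("pooled_h", 7);
  add_int_arg("pooled_w", 7);
  add_int_arg("sampling_ratio", 2);

  // Create and run; Y will have shape (1, 256, 7, 7).
  auto op = caffe2::CreateOperator(def, &ws);
  op->Run();
}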