// NOTE(review): this listing is a sparse excerpt. The embedded original line
// numbers jump (e.g. 55 -> 66 -> 83), so statements between the visible lines
// are missing. The comments below describe only what the visible lines show.
//
// The next line carries the file's includes followed by the opening of
// pre_calc_for_bilinear_interpolate. From the visible statements, the function
// walks every (ph, pw, iy, ix) sample point and stores one entry per point
// into `pre_calc`.
1 #include "roi_align_op.h" 3 #include "caffe2/utils/eigen_utils.h" 4 #include "caffe2/utils/math.h" 7 #include "caffe2/mkl/operators/operator_fallback_mkl.h" 8 #endif // CAFFE2_USE_MKL 26 void pre_calc_for_bilinear_interpolate(
29 const int pooled_height,
30 const int pooled_width,
39 std::vector<PreCalc<T>>& pre_calc) {
// Write position into pre_calc. Its increment is not visible in this excerpt.
40 int pre_calc_index = 0;
// Loop order is ph -> pw -> iy -> ix.
41 for (
int ph = 0; ph < pooled_height; ph++) {
42 for (
int pw = 0; pw < pooled_width; pw++) {
43 for (
int iy = 0; iy < iy_upper; iy++) {
// Vertical sample coordinate: RoI start, plus ph whole bins, plus the
// (iy + 0.5)-th slice of a bin divided into roi_bin_grid_h slices.
44 const T yy = roi_start_h + ph * bin_size_h +
45 static_cast<T
>(iy + .5f) * bin_size_h /
46 static_cast<T>(roi_bin_grid_h);
47 for (
int ix = 0; ix < ix_upper; ix++) {
// Horizontal sample coordinate, computed the same way along the width.
48 const T xx = roi_start_w + pw * bin_size_w +
49 static_cast<T
>(ix + .5f) * bin_size_w /
50 static_cast<T>(roi_bin_grid_w);
// Out-of-range test: the sample is more than one unit below zero, or beyond
// height / width. The branch body is not visible; presumably it stores an
// empty entry via the assignment below and moves on -- TODO confirm.
55 if (y < -1.0 || y > height || x < -1.0 || x > width) {
66 pre_calc[pre_calc_index] = pc;
// When the low row/column index reaches the last row/column, both the low and
// high neighbours are pinned to that last index.
83 if (y_low >= height - 1) {
84 y_high = y_low = height - 1;
90 if (x_low >= width - 1) {
91 x_high = x_low = width - 1;
// hy / hx are the complements of ly / lx; w1..w4 are their four pairwise
// products (the weights for the four neighbouring positions).
99 T hy = 1. - ly, hx = 1. - lx;
100 T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
// Flattened offsets (row * width + column) of the four neighbours.
104 pc.pos1 = y_low * width + x_low;
105 pc.pos2 = y_low * width + x_high;
106 pc.pos3 = y_high * width + x_low;
107 pc.pos4 = y_high * width + x_high;
112 pre_calc[pre_calc_index] = pc;
// ROIAlignForward: for each RoI, builds a table of interpolation samples
// (pre_calc) and accumulates the weighted input values of every output bin
// into top_data, with separate code paths for NCHW and NHWC storage.
//
// NOTE(review): this listing is a sparse excerpt. The embedded original line
// numbers jump, so some parameters, statements and closing braces are not
// visible. Comments describe only the visible lines.
121 template <
typename T>
122 void ROIAlignForward(
124 const T* bottom_data,
125 const T& spatial_scale,
129 const int pooled_height,
130 const int pooled_width,
131 const int sampling_ratio,
132 const T* bottom_rois,
135 StorageOrder order) {
// Each RoI row has 4 or 5 columns.
// NOTE(review): DCHECK is presumably a debug-only check -- confirm that
// release builds are protected by the caller's own validation.
136 DCHECK(roi_cols == 4 || roi_cols == 5);
// nthreads is divided by the per-RoI output size to get the RoI count.
138 int n_rois = nthreads / channels / pooled_width / pooled_height;
142 for (
int n = 0; n < n_rois; n++) {
// Offset of this RoI's first output element.
143 int index_n = n * channels * pooled_width * pooled_height;
146 const T* offset_bottom_rois = bottom_rois + n * roi_cols;
147 int roi_batch_ind = 0;
// The guarding condition is not visible (presumably roi_cols == 5). The first
// column is read as the batch index and the pointer is advanced past it.
// NOTE(review): the value is converted from T to int and no range check is
// visible before it is used to offset bottom_data -- confirm it is validated.
149 roi_batch_ind = offset_bottom_rois[0];
150 offset_bottom_rois++;
// The next four columns are scaled by spatial_scale.
154 T roi_start_w = offset_bottom_rois[0] * spatial_scale;
155 T roi_start_h = offset_bottom_rois[1] * spatial_scale;
156 T roi_end_w = offset_bottom_rois[2] * spatial_scale;
157 T roi_end_h = offset_bottom_rois[3] * spatial_scale;
// RoI extent is forced to be at least 1 in each dimension.
164 T roi_width = std::max(roi_end_w - roi_start_w, (T)1.);
165 T roi_height = std::max(roi_end_h - roi_start_h, (T)1.);
166 T bin_size_h =
static_cast<T
>(roi_height) / static_cast<T>(pooled_height);
167 T bin_size_w =
static_cast<T
>(roi_width) / static_cast<T>(pooled_width);
// Samples per bin along each axis: sampling_ratio when it is positive,
// otherwise ceil(roi extent / pooled extent).
170 int roi_bin_grid_h = (sampling_ratio > 0)
172 : ceil(roi_height / pooled_height);
174 (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
// Number of samples in one bin; the NHWC path below divides by it.
177 const T count = roi_bin_grid_h * roi_bin_grid_w;
// One PreCalc entry per sample per output bin.
181 std::vector<PreCalc<T>> pre_calc(
182 roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height);
183 pre_calc_for_bilinear_interpolate(
198 if (order == StorageOrder::NCHW) {
199 for (
int c = 0; c < channels; c++) {
200 int index_n_c = index_n + c * pooled_width * pooled_height;
// Start of the (roi_batch_ind, c) plane of the input.
201 const T* offset_bottom_data =
202 bottom_data + (roi_batch_ind * channels + c) * height * width;
// Reset for every channel: the same pre_calc table is read again per channel.
203 int pre_calc_index = 0;
205 for (
int ph = 0; ph < pooled_height; ph++) {
206 for (
int pw = 0; pw < pooled_width; pw++) {
207 int index = index_n_c + ph * pooled_width + pw;
210 for (
int iy = 0; iy < roi_bin_grid_h; iy++) {
211 for (
int ix = 0; ix < roi_bin_grid_w; ix++) {
// Weighted sum of the four neighbour values for this sample.
212 PreCalc<T> pc = pre_calc[pre_calc_index];
213 output_val += pc.w1 * offset_bottom_data[pc.pos1] +
214 pc.w2 * offset_bottom_data[pc.pos2] +
215 pc.w3 * offset_bottom_data[pc.pos3] +
216 pc.w4 * offset_bottom_data[pc.pos4];
223 top_data[index] = output_val;
229 if (order == StorageOrder::NHWC) {
// Start of image roi_batch_ind in the input.
230 const T* offset_bottom_data =
231 bottom_data + roi_batch_ind * channels * height * width;
232 int pre_calc_index = 0;
234 for (
int ph = 0; ph < pooled_height; ph++) {
235 for (
int pw = 0; pw < pooled_width; pw++) {
// NOTE(review): EVecXf is presumably a float Eigen vector, yet this function
// is templated on T and the copy below sizes its transfer with sizeof(T).
// For T other than float these would disagree -- confirm.
236 EVecXf output_vals = EVecXf::Zero(channels);
238 for (
int iy = 0; iy < roi_bin_grid_h; iy++) {
239 for (
int ix = 0; ix < roi_bin_grid_w; ix++) {
240 PreCalc<T> pc = pre_calc[pre_calc_index];
// Views of length `channels` starting at each of the four neighbour positions
// (position index multiplied by channels).
242 ConstEigenVectorMap<T> data_1(
243 offset_bottom_data + channels * pc.pos1, channels);
244 ConstEigenVectorMap<T> data_2(
245 offset_bottom_data + channels * pc.pos2, channels);
246 ConstEigenVectorMap<T> data_3(
247 offset_bottom_data + channels * pc.pos3, channels);
248 ConstEigenVectorMap<T> data_4(
249 offset_bottom_data + channels * pc.pos4, channels);
251 output_vals += pc.w1 * data_1 + pc.w2 * data_2 + pc.w3 * data_3 +
// Average over the samples of the bin.
257 output_vals /= count;
// Output offset of bin (ph, pw) for this RoI; `channels` values are copied
// there (the head of the copy call is not visible in this excerpt).
259 int index_nhw = index_n + (ph * pooled_width + pw) * channels;
261 top_data + index_nhw, output_vals.data(), channels *
sizeof(T));
// RunOnDevice (float, CPU): validates the RoI tensor R, resizes output Y for
// the configured storage order and calls ROIAlignForward<float>.
//
// NOTE(review): this listing is a sparse excerpt. The embedded original line
// numbers jump, so the input lookups, some conditions and the call arguments
// are not visible.
272 bool RoIAlignOp<float, CPUContext>::RunOnDevice() {
// First path: Y is resized with a leading dimension of 0 and mutable_data is
// called. The condition guarding this path is not visible; presumably it is
// the "no RoIs" case -- TODO confirm.
279 if (order_ == StorageOrder::NCHW) {
280 Y->Resize(0, X.dim32(1), pooled_height_, pooled_width_);
281 }
else if (order_ == StorageOrder::NHWC) {
282 Y->Resize(0, pooled_height_, pooled_width_, X.dim32(3));
285 Y->mutable_data<
float>();
// R must be 2-D with 4 or 5 columns.
289 CAFFE_ENFORCE_EQ(R.ndim(), 2);
291 CAFFE_ENFORCE(R.dim32(1) == 4 || R.dim32(1) == 5);
// NOTE(review): plain assert() is compiled out when NDEBUG is defined. It
// also requires sampling_ratio_ >= 0, whereas the operator schema text in
// this file documents a default of -1 and treats "<= 0" as the adaptive mode.
// With that default a debug build would fail here -- confirm intended range.
293 assert(sampling_ratio_ >= 0);
// Main path: Y gets one leading entry per RoI row, channels taken from
// X.dim32(1) for NCHW and X.dim32(3) for NHWC.
295 if (order_ == StorageOrder::NCHW) {
296 Y->Resize(R.dim32(0), X.dim32(1), pooled_height_, pooled_width_);
297 int output_size = Y->size();
298 ROIAlignForward<float>(
310 Y->mutable_data<
float>(),
312 }
else if (order_ == StorageOrder::NHWC) {
313 Y->Resize(R.dim32(0), pooled_height_, pooled_width_, X.dim32(3));
314 int output_size = Y->size();
315 ROIAlignForward<float>(
327 Y->mutable_data<
float>(),
// Registers RoIAlignOp<float, CPUContext> as the CPU operator "RoIAlign".
// When CAFFE2_HAS_MKL_DNN is defined, an MKL registration wrapping the same
// CPU operator in mkl::MKLFallbackOp is added as well.
334 REGISTER_CPU_OPERATOR(RoIAlign, RoIAlignOp<float, CPUContext>);
336 #ifdef CAFFE2_HAS_MKL_DNN 337 REGISTER_MKL_OPERATOR(
339 mkl::MKLFallbackOp<RoIAlignOp<float, CPUContext>>);
// The next line closes the CAFFE2_HAS_MKL_DNN guard and opens the RoIAlign
// operator schema: user-facing documentation for the arguments (spatial
// scale, pooled_h, pooled_w, sampling ratio), inputs X and RoIs, and output.
// NOTE(review): this listing is a sparse excerpt; several chained calls of
// the schema are not visible. The sampling-ratio text documents a default of
// -1, so callers should confirm it agrees with the runtime check on
// sampling_ratio_ in RunOnDevice.
340 #endif // CAFFE2_HAS_MKL_DNN 343 OPERATOR_SCHEMA(RoIAlign)
347 Region of Interest (RoI) align operation as used in Mask R-CNN. 351 "(float) default 1.0; Spatial scale of the input feature map X " 352 "relative to the input image. E.g., 0.0625 if X has a stride of 16 " 353 "w.r.t. the input image.")
354 .Arg(
"pooled_h",
"(int) default 1; Pooled output Y's height.")
355 .Arg(
"pooled_w",
"(int) default 1; Pooled output Y's width.")
358 "(int) default -1; number of sampling points in the interpolation grid " 359 "used to compute the output value of each pooled output bin. If > 0, " 360 "then exactly sampling_ratio x sampling_ratio grid points are used. If " 361 "<= 0, then an adaptive number of grid points are used (computed as " 362 "ceil(roi_width / pooled_w), and likewise for height).")
363 .Input(0,
"X",
"4D feature map input of shape (N, C, H, W).")
367 "2D input of shape (R, 4 or 5) specifying R RoIs " 368 "representing: batch index in [0, N - 1], x1, y1, x2, y2. The RoI " 369 "coordinates are in the coordinate system of the input image. For " 370 "inputs corresponding to a single image, batch index can be excluded " 371 "to have just 4 columns.")
375 "4D output of shape (R, C, pooled_h, pooled_w). The r-th batch element " 376 "is a pooled feature map corresponding to the r-th RoI.");
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...