Caffe2 - C++ API
A deep learning, cross-platform ML framework
conv_op_impl.h
// conv_op_impl.h is the templated implementation of the conv_op.h file.
#ifndef CAFFE2_OPERATORS_CONV_OP_IMPL_H_
#define CAFFE2_OPERATORS_CONV_OP_IMPL_H_

#include "caffe2/core/context.h"
#include "caffe2/core/flags.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#include "caffe2/operators/conv_op.h"
#include "caffe2/operators/conv_pool_op_base.h"
#include "caffe2/utils/math.h"

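// Implementation note: both forward paths below lower convolution onto
// matrix multiplication. Each input image is unfolded into a column buffer
// (im2col) so the convolution becomes a GEMM against the filter matrix;
// the gradient paths run the same lowering in reverse, with a GEMM
// producing the column-buffer gradient and col2im folding it back into
// the input gradient.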
namespace caffe2 {

template <typename T, class Context>
bool ConvOp<T, Context>::RunOnDeviceWithOrderNCHW() {
  const Tensor<Context>& X = Input(INPUT);
  auto& filter = Input(FILTER);
  Tensor<Context>* Y = Output(0);
  const int N = X.dim32(0), C = X.dim32(1);
  CAFFE_ENFORCE_EQ(X.ndim(), filter.ndim());
  const int M = filter.dim32(0);
  CAFFE_ENFORCE(
      C == filter.dim32(1) * group_,
      "Convolution op: input channels do not match: # of input channels ",
      C,
      " is not equal to kernel channels * group: ",
      filter.dim32(1),
      " * ",
      group_);
  CAFFE_ENFORCE(
      M % group_ == 0,
      "The number of output channels is not divisible by group.");

  int kernel_dims_size = 1;
  for (int i = 0; i < kernel_.size(); ++i) {
    CAFFE_ENFORCE(filter.dim32(i + 2) == kernel_[i]);
    kernel_dims_size *= kernel_[i];
  }

  ConvPoolOpBase<Context>::SetOutputSize(X, Y, filter.dim32(0));

  const vector<int> input_dims = GetDims(X);
  const vector<int> output_dims = GetDims(*Y);
  const int input_image_size = this->GetDimsSize(X);
  const int output_image_size = this->GetDimsSize(*Y);

  vector<int> img_shape;
  img_shape.assign(X.dims().begin() + 1, X.dims().end());

  vector<int> buffer_shape;
  buffer_shape.push_back(C / group_ * kernel_dims_size);
  buffer_shape.insert(
      buffer_shape.end(), output_dims.begin(), output_dims.end());

  if (kernel_.size() != 2) {
    SetDeviceTensor(img_shape, &img_shape_device_);
    SetDeviceTensor(buffer_shape, &col_buffer_shape_device_);
  }

  const int col_buffer_size =
      (C / group_) * kernel_dims_size * output_image_size;

  // The dimension of each kernel.
  const int kernel_dim = C / group_ * kernel_dims_size;
  // The offsets corresponding to a single input image and a single output
  // image, respectively.
  const int input_offset = C / group_ * input_image_size;
  const int output_offset = Y->size() / Y->dim32(0) / group_;
  const int filter_offset = filter.size() / group_;

  // The col buffer is stored in CHW order as well - kernel_dim, then the
  // output height and width.
  const T* Xdata = X.template data<T>();
  if (InputSize() == 3) {
    const auto& bias = Input(BIAS);
    CAFFE_ENFORCE(bias.ndim() == 1);
    CAFFE_ENFORCE(bias.dim32(0) == M);
    ConvPoolOpBase<Context>::template SetBiasMultiplier<T>(
        output_image_size, &bias_multiplier_);
  }
  T* Ydata = Y->template mutable_data<T>();

  auto f = [&](Tensor<Context>* col_buffer) {
    col_buffer->Resize(buffer_shape);
    T* col_buffer_data = col_buffer->template mutable_data<T>();
    // Im2col, followed by gemm.
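    // Shapes per image and group (NCHW): the col buffer is
    // (C/group_ * prod(kernel_)) x output_image_size, the filter slice is
    // (M/group_) x kernel_dim, so each weight GEMM below fills an
    // (M/group_) x output_image_size block of Y.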
    for (int image_id = 0; image_id < N; ++image_id) {
      for (int group_id = 0; group_id < group_; ++group_id) {
        if (kernel_.size() == 2) {
          math::Im2col<T, Context, StorageOrder::NCHW>(
              Xdata + group_id * input_offset,
              C / group_,
              input_dims[0],
              input_dims[1],
              kernel_h(),
              kernel_w(),
              dilation_h(),
              dilation_w(),
              pad_t(),
              pad_l(),
              pad_b(),
              pad_r(),
              stride_h(),
              stride_w(),
              col_buffer_data,
              &context_);
        } else {
          math::Im2colNd<T, Context, StorageOrder::NCHW>(
              Xdata + group_id * input_offset,
              img_shape_device_.template data<int>(),
              col_buffer_shape_device_.template data<int>(),
              C * input_image_size,
              col_buffer_size,
              kernel_device_.template data<int>(),
              stride_device_.template data<int>(),
              dilation_device_.template data<int>(),
              pads_device_.template data<int>(),
              kernel_.size(),
              col_buffer_data,
              &context_);
        }
        // Weight term.
        math::Gemm<T, Context>(
            CblasNoTrans,
            CblasNoTrans,
            M / group_,
            output_image_size,
            kernel_dim,
            1,
            filter.template data<T>() + group_id * filter_offset,
            col_buffer_data,
            0,
            Ydata + group_id * output_offset,
            &context_);
      }
      if (InputSize() == 3) {
        // The bias term can be applied outside the group loop, which is
        // more efficient.
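        // This is a rank-1 update: bias is M x 1 and bias_multiplier_ is a
        // 1 x output_image_size vector of ones, so the GEMM adds bias[m] to
        // every spatial position of output channel m.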
        auto* bias_data = Input(BIAS).template data<T>();
        math::Gemm<T, Context>(
            CblasNoTrans,
            CblasNoTrans,
            M,
            output_image_size,
            1,
            1,
            bias_data,
            bias_multiplier_.template data<T>(),
            1,
            Ydata,
            &context_);
      }
      Xdata += input_offset * group_;
      Ydata += output_offset * group_;
    }
  };

  if (FLAGS_caffe2_force_shared_col_buffer || shared_buffer_) {
    runWithSharedBuffer<Context>(ws_, f);
  } else {
    f(&col_buffer_);
  }
  return true;
}

// The NHWC implementation.
template <typename T, class Context>
bool ConvOp<T, Context>::RunOnDeviceWithOrderNHWC() {
  const Tensor<Context>& X = Input(INPUT);
  auto& filter = Input(FILTER);
  Tensor<Context>* Y = Output(0);
  const int N = X.dim32(0), H = X.dim32(1), W = X.dim32(2), C = X.dim32(3);

  CAFFE_ENFORCE_EQ(
      kernel_.size(),
      2,
      "Only 2d convolution is supported for NHWC storage type");

  CAFFE_ENFORCE_EQ(X.ndim(), filter.ndim());
  const int M = filter.dim32(0);
  CAFFE_ENFORCE(filter.dim32(1) == kernel_h());
  CAFFE_ENFORCE(filter.dim32(2) == kernel_w());
  CAFFE_ENFORCE(filter.dim32(3) == C);

  ConvPoolOpBase<Context>::SetOutputSize(X, Y, filter.dim32(0));
  // The dimension of each kernel.
  const int kernel_dim = kernel_h() * kernel_w() * C;
  // The offsets corresponding to a single input image and a single output
  // image, respectively.
  const int input_offset = H * W * C;
  const int output_offset = Y->size() / Y->dim32(0);
  // The output image size is the spatial size of the output.
  const int output_image_size = Y->dim32(1) * Y->dim32(2);
  // The col buffer is stored in HWC order as well - the output height and
  // width, then kernel_dim.
  const T* Xdata = X.template data<T>();
  T* Ydata = Y->template mutable_data<T>();
  // Specialized path for 1-by-1 convolution with stride 1 and pad 0: we
  // can skip im2col.
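  // With a 1x1 kernel, unit stride, and no padding, every output pixel is
  // just a linear map of the C input channels at the same location, so all
  // N * H * W pixels fold into a single (N*H*W) x C times C x M GEMM
  // against the transposed filter.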
  if (kernel_dim == C && Y->dim32(1) == X.dim32(1) &&
      Y->dim32(2) == X.dim32(2) && stride_h() == 1 && stride_w() == 1 &&
      pad_t() == 0 && pad_b() == 0 && pad_l() == 0 && pad_r() == 0) {
    math::Gemm<T, Context>(
        CblasNoTrans,
        CblasTrans,
        N * H * W,
        M,
        C,
        1,
        Xdata,
        filter.template data<T>(),
        0,
        Ydata,
        &context_);
    if (InputSize() == 3) {
      auto& bias = Input(BIAS);
      CAFFE_ENFORCE(1 == bias.ndim());
      CAFFE_ENFORCE(bias.dim32(0) == M);
      if (bias_multiplier_.size() != N * H * W) {
        // If the helper bias multiplier is not the right size, reshape and
        // fill it with ones.
        bias_multiplier_.Resize(vector<TIndex>(1, N * H * W));
        math::Set<T, Context>(
            N * H * W,
            static_cast<T>(1),
            bias_multiplier_.template mutable_data<T>(),
            &context_);
      }
      math::Gemm<T, Context>(
          CblasNoTrans,
          CblasNoTrans,
          N * H * W,
          M,
          1,
          1,
          bias_multiplier_.template data<T>(),
          bias.template data<T>(),
          1,
          Ydata,
          &context_);
    }
  } else {
    if (InputSize() == 3) {
      const auto& bias = Input(BIAS);
      CAFFE_ENFORCE(1 == bias.ndim());
      CAFFE_ENFORCE(bias.dim32(0) == M);
      ConvPoolOpBase<Context>::template SetBiasMultiplier<T>(
          output_image_size, &bias_multiplier_);
    }
    auto f = [&](Tensor<Context>* col_buffer) {
      col_buffer->Resize(
          vector<TIndex>{Y->dim32(1), Y->dim32(2), kernel_h(), kernel_w(), C});
      T* col_buffer_data = col_buffer->template mutable_data<T>();
      // Im2col, followed by gemm.
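      // Shapes per image (NHWC): the col buffer is
      // output_image_size x kernel_dim and the filter is M x kernel_dim,
      // so the weight GEMM below computes Y as an output_image_size x M
      // block against the transposed filter.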
      for (int image_id = 0; image_id < N; ++image_id) {
        math::Im2col<T, Context, StorageOrder::NHWC>(
            Xdata,
            C,
            H,
            W,
            kernel_h(),
            kernel_w(),
            dilation_h(),
            dilation_w(),
            pad_t(),
            pad_l(),
            pad_b(),
            pad_r(),
            stride_h(),
            stride_w(),
            col_buffer_data,
            &context_);
        // Weight term.
        math::Gemm<T, Context>(
            CblasNoTrans,
            CblasTrans,
            output_image_size,
            M,
            kernel_dim,
            1,
            col_buffer_data,
            filter.template data<T>(),
            0,
            Ydata,
            &context_);
        if (InputSize() == 3) {
          // Bias term.
          math::Gemm<T, Context>(
              CblasNoTrans,
              CblasNoTrans,
              output_image_size,
              M,
              1,
              1,
              bias_multiplier_.template data<T>(),
              Input(BIAS).template data<T>(),
              1,
              Ydata,
              &context_);
        }
        Xdata += input_offset;
        Ydata += output_offset;
      }
    };
    if (FLAGS_caffe2_force_shared_col_buffer || shared_buffer_) {
      runWithSharedBuffer<Context>(ws_, f);
    } else {
      f(&col_buffer_);
    }
  }
  return true;
}

template <typename T, class Context>
bool ConvGradientOp<T, Context>::RunOnDeviceWithOrderNCHW() {
  auto& X = Input(INPUT);
  auto& filter = Input(FILTER);
  auto& dY = Input(OUTPUT_GRAD);
  auto* dfilter = Output(FILTER_GRAD);
  const int N = X.dim32(0), C = X.dim32(1);

  const vector<int> input_dims = this->GetDims(X);
  const int input_image_size = this->GetDimsSize(X);

  const vector<int> output_dims = this->GetDims(dY);
  // The output image size is the spatial size of the output.
  const int output_image_size = this->GetDimsSize(dY);

  ConvPoolOpBase<Context>::ComputePads(input_dims);
  CAFFE_ENFORCE_EQ(X.ndim(), filter.ndim());
  const int M = filter.dim32(0);
  CAFFE_ENFORCE(filter.dim32(1) * group_ == C);

  int kernel_dims_size = 1;
  for (int i = 0; i < kernel_.size(); ++i) {
    CAFFE_ENFORCE(filter.dim32(i + 2) == kernel_[i]);
    kernel_dims_size *= kernel_[i];
  }

  CAFFE_ENFORCE(M % group_ == 0);
  dfilter->ResizeLike(filter);
  // The dimension of each kernel.
  const int kernel_dim = C / group_ * kernel_dims_size;
  // The offsets corresponding to a single input image and a single output
  // image, respectively.
  const int input_offset = C / group_ * input_image_size;
  const int output_offset = dY.size() / dY.dim32(0) / group_;
  const int filter_offset = filter.size() / group_;
  // The col buffer is stored in CHW order as well - kernel_dim, then the
  // output height and width.

  vector<int> img_shape;
  img_shape.assign(X.dims().begin() + 1, X.dims().end());
  vector<int> col_buffer_shape;
  col_buffer_shape.push_back(C / group_ * kernel_dims_size);
  col_buffer_shape.insert(
      col_buffer_shape.end(), output_dims.begin(), output_dims.end());
  col_buffer_.Resize(col_buffer_shape);

  if (kernel_.size() != 2) {
    SetDeviceTensor(img_shape, &img_shape_device_);
    SetDeviceTensor(col_buffer_shape, &col_buffer_shape_device_);
  }

  const int col_buffer_size =
      (C / group_) * kernel_dims_size * output_image_size;
  const T* Xdata = X.template data<T>();
  const T* filter_data = filter.template data<T>();
  const T* dYdata = dY.template data<T>();
  T* col_buffer_data = col_buffer_.template mutable_data<T>();
  T* dfilter_data = dfilter->template mutable_data<T>();

  // Pre-set the gradients to zero.
  math::Set<T, Context>(dfilter->size(), 0, dfilter_data, &context_);

  T* dbias_data = nullptr;
  if (!no_bias_) {
    auto* dbias = Output(BIAS_OR_INPUT_GRAD);
    dbias->Resize(M);
    if (bias_multiplier_.size() != output_image_size) {
      // If the helper bias multiplier is not the right size, reshape and
      // fill it with ones.
      bias_multiplier_.Resize(vector<TIndex>(1, output_image_size));
      math::Set<T, Context>(
          output_image_size,
          static_cast<T>(1),
          bias_multiplier_.template mutable_data<T>(),
          &context_);
    }
    dbias_data = dbias->template mutable_data<T>();
    math::Set<T, Context>(dbias->size(), 0, dbias_data, &context_);
  }

  for (int image_id = 0; image_id < N; ++image_id) {
    for (int group_id = 0; group_id < group_; ++group_id) {
      // When we compute the gradient with respect to the filters, we need
      // to do im2col to allow gemm-type computation.
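      // With the input unfolded, dfilter[g] accumulates
      // dY[g] (M/group_ x output_image_size) times the transposed col
      // buffer (output_image_size x kernel_dim), summed over all images.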
      if (kernel_.size() == 2) {
        math::Im2col<T, Context, StorageOrder::NCHW>(
            Xdata + group_id * input_offset,
            C / group_,
            input_dims[0],
            input_dims[1],
            kernel_h(),
            kernel_w(),
            dilation_h(),
            dilation_w(),
            pad_t(),
            pad_l(),
            pad_b(),
            pad_r(),
            stride_h(),
            stride_w(),
            col_buffer_data,
            &context_);
      } else {
        math::Im2colNd<T, Context, StorageOrder::NCHW>(
            Xdata + group_id * input_offset,
            img_shape_device_.template data<int>(),
            col_buffer_shape_device_.template data<int>(),
            C * input_image_size,
            col_buffer_size,
            kernel_device_.template data<int>(),
            stride_device_.template data<int>(),
            dilation_device_.template data<int>(),
            pads_device_.template data<int>(),
            kernel_.size(),
            col_buffer_data,
            &context_);
      }
      // Gradient with respect to filter.
      math::Gemm<T, Context>(
          CblasNoTrans,
          CblasTrans,
          M / group_,
          kernel_dim,
          output_image_size,
          1,
          dYdata + group_id * output_offset,
          col_buffer_data,
          1,
          dfilter_data + group_id * filter_offset,
          &context_);
    }
    if (!no_bias_) {
      // Gradient with respect to bias can be computed independently of the
      // group.
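      // The GEMV reduces dY over all spatial positions: dbias (length M)
      // accumulates dY (M x output_image_size) times the all-ones vector.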
      math::Gemv<T, Context>(
          CblasNoTrans,
          M,
          output_image_size,
          1,
          dYdata,
          bias_multiplier_.template data<T>(),
          1,
          dbias_data,
          &context_);
    }
    Xdata += input_offset * group_;
    dYdata += output_offset * group_;
  }
  if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) {
    // Compute the gradient w.r.t. the input.
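    // This reverses the forward computation: a GEMM with the transposed
    // filter produces the col-buffer gradient (kernel_dim x
    // output_image_size), and col2im scatter-adds the overlapping patches
    // back into dX.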
    auto* dX = Output(no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD);
    dX->ResizeLike(X);
    T* dXdata = dX->template mutable_data<T>();
    dYdata = dY.template data<T>();
    for (int image_id = 0; image_id < N; ++image_id) {
      for (int group_id = 0; group_id < group_; ++group_id) {
        // Compute gradient into col_buffer.
        math::Gemm<T, Context>(
            CblasTrans,
            CblasNoTrans,
            kernel_dim,
            output_image_size,
            M / group_,
            1,
            filter_data + group_id * filter_offset,
            dYdata,
            0,
            col_buffer_data,
            &context_);
        if (kernel_.size() == 2) {
          math::Col2im<T, Context, StorageOrder::NCHW>(
              col_buffer_data,
              C / group_,
              input_dims[0],
              input_dims[1],
              kernel_h(),
              kernel_w(),
              dilation_h(),
              dilation_w(),
              pad_t(),
              pad_l(),
              pad_b(),
              pad_r(),
              stride_h(),
              stride_w(),
              dXdata,
              &context_);
        } else {
          math::Col2imNd<T, Context, StorageOrder::NCHW>(
              col_buffer_data,
              img_shape_device_.template data<int>(),
              col_buffer_shape_device_.template data<int>(),
              C * input_image_size,
              col_buffer_size,
              kernel_device_.template data<int>(),
              stride_device_.template data<int>(),
              dilation_device_.template data<int>(),
              pads_device_.template data<int>(),
              kernel_.size(),
              dXdata,
              &context_);
        }
        dXdata += input_offset;
        dYdata += output_offset;
      }
    }
  }
  return true;
}

template <typename T, class Context>
bool ConvGradientOp<T, Context>::RunOnDeviceWithOrderNHWC() {
  auto& X = Input(INPUT);
  auto& filter = Input(FILTER);
  auto& dY = Input(OUTPUT_GRAD);
  auto* dfilter = Output(FILTER_GRAD);

  const int N = X.dim32(0), H = X.dim32(1), W = X.dim32(2), C = X.dim32(3);
  ConvPoolOpBase<Context>::ComputePads({H, W});
  CAFFE_ENFORCE(4 == filter.ndim());
  const int M = filter.dim32(0);
  CAFFE_ENFORCE(filter.dim32(1) == kernel_h());
  CAFFE_ENFORCE(filter.dim32(2) == kernel_w());
  CAFFE_ENFORCE(filter.dim32(3) == C);
  dfilter->ResizeLike(filter);

  // The dimension of each kernel.
  const int kernel_dim = kernel_h() * kernel_w() * C;
  // The offsets corresponding to a single input image and a single output
  // image, respectively.
  const int input_offset = H * W * C;
  const int output_offset = dY.size() / dY.dim32(0);
  // The output image size is the spatial size of the output.
  const int output_image_size = dY.dim32(1) * dY.dim32(2);
  // The col buffer is stored in HWC order as well - the output height and
  // width, then kernel_dim.
  col_buffer_.Resize(output_image_size, kernel_dim);

  const T* Xdata = X.template data<T>();
  const T* const filter_data = filter.template data<T>();
  const T* const dYdata = dY.template data<T>();
  T* col_buffer_data = col_buffer_.template mutable_data<T>();
  T* dfilter_data = dfilter->template mutable_data<T>();

  // Pre-set the gradients to zero.
  math::Set<T, Context>(dfilter->size(), 0, dfilter_data, &context_);

  T* dbias_data = nullptr;
  if (!no_bias_) {
    auto* dbias = Output(BIAS_OR_INPUT_GRAD);
    dbias->Resize(M);
    dbias_data = dbias->template mutable_data<T>();
    math::Set<T, Context>(dbias->size(), 0, dbias_data, &context_);
    if (bias_multiplier_.size() != output_image_size) {
      // If the helper bias multiplier is not the right size, reshape and
      // fill it with ones.
      bias_multiplier_.Resize(vector<TIndex>(1, output_image_size));
      math::Set<T, Context>(
          output_image_size,
          static_cast<T>(1),
          bias_multiplier_.template mutable_data<T>(),
          &context_);
    }
  }

  for (int image_id = 0; image_id < N; ++image_id) {
    // When we compute the gradient with respect to the filters, we need to
    // do im2col to allow gemm-type computation.
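    // In NHWC the col buffer is output_image_size x kernel_dim, so dfilter
    // (M x kernel_dim) accumulates dY^T (M x output_image_size) times the
    // col buffer, and dbias accumulates the column sums of dY via the GEMV
    // below.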
    math::Im2col<T, Context, StorageOrder::NHWC>(
        Xdata,
        C,
        H,
        W,
        kernel_h(),
        kernel_w(),
        dilation_h(),
        dilation_w(),
        pad_t(),
        pad_l(),
        pad_b(),
        pad_r(),
        stride_h(),
        stride_w(),
        col_buffer_data,
        &context_);
    // Gradient with respect to filter.
    math::Gemm<T, Context>(
        CblasTrans,
        CblasNoTrans,
        M,
        kernel_dim,
        output_image_size,
        1,
        dYdata + output_offset * image_id,
        col_buffer_data,
        1,
        dfilter_data,
        &context_);
    if (!no_bias_) {
      // Gradient with respect to bias.
      math::Gemv<T, Context>(
          CblasTrans,
          output_image_size,
          M,
          1,
          dYdata + output_offset * image_id,
          bias_multiplier_.template data<T>(),
          1,
          dbias_data,
          &context_);
    }
    Xdata += input_offset;
  }

  if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) {
    // Compute the gradient w.r.t. the input.
    auto* dX = Output(no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD);
    dX->ResizeLike(X);
    T* dXdata = dX->template mutable_data<T>();
    for (int image_id = 0; image_id < N; ++image_id) {
      // Compute gradient into col_buffer.
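      // col buffer gradient (output_image_size x kernel_dim) = dY
      // (output_image_size x M) times filter (M x kernel_dim); Col2im then
      // folds it back into dX.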
      math::Gemm<T, Context>(
          CblasNoTrans,
          CblasNoTrans,
          output_image_size,
          kernel_dim,
          M,
          1,
          dYdata + output_offset * image_id,
          filter_data,
          0,
          col_buffer_data,
          &context_);
      math::Col2im<T, Context, StorageOrder::NHWC>(
          col_buffer_data,
          C,
          H,
          W,
          kernel_h(),
          kernel_w(),
          dilation_h(),
          dilation_w(),
          pad_t(),
          pad_l(),
          pad_b(),
          pad_r(),
          stride_h(),
          stride_w(),
          dXdata,
          &context_);
      dXdata += input_offset;
    }
  }
  return true;
}
} // namespace caffe2

#endif // CAFFE2_OPERATORS_CONV_OP_IMPL_H_
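
To make the im2col-plus-GEMM lowering used throughout this file concrete, here is a minimal, self-contained sketch in plain C++ (no Caffe2 types; the helper name im2col_sketch is hypothetical and not part of any API). It unfolds a single-channel 3x3 image for a 2x2 kernel with stride 1 and no padding, then performs the convolution as one small matrix multiply, just as the NCHW path above does per image and group:

#include <cstdio>
#include <vector>

// Unfold a single-channel H x W image into a (kh*kw) x (outH*outW) matrix,
// stride 1, no padding - the same layout the NCHW col buffer uses.
std::vector<float> im2col_sketch(
    const std::vector<float>& img, int H, int W, int kh, int kw) {
  const int outH = H - kh + 1, outW = W - kw + 1;
  std::vector<float> col(kh * kw * outH * outW);
  for (int i = 0; i < kh; ++i) {
    for (int j = 0; j < kw; ++j) {
      for (int y = 0; y < outH; ++y) {
        for (int x = 0; x < outW; ++x) {
          // Row (i, j) of the col buffer holds the input pixel seen by
          // this kernel tap at each output position (y, x).
          col[((i * kw + j) * outH + y) * outW + x] =
              img[(y + i) * W + (x + j)];
        }
      }
    }
  }
  return col;
}

int main() {
  // 3x3 image, one 2x2 filter -> 2x2 output.
  std::vector<float> img = {1, 2, 3, 4, 5, 6, 7, 8, 9};
  std::vector<float> ker = {1, 0, 0, 1}; // top-left + bottom-right taps
  std::vector<float> col = im2col_sketch(img, 3, 3, 2, 2);
  // The convolution is now a (1 x 4) * (4 x 4) GEMM: Y = ker_row * col.
  for (int p = 0; p < 4; ++p) {
    float y = 0;
    for (int k = 0; k < 4; ++k) {
      y += ker[k] * col[k * 4 + p];
    }
    std::printf("%g ", y); // prints: 6 8 12 14
  }
  std::printf("\n");
  return 0;
}

In the real operator, math::Gemm plays the role of the inner loops here, and group convolution simply blocks the filter and channel dimensions into group_ independent GEMMs.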