2 #ifndef CAFFE2_OPERATORS_CONV_OP_IMPL_H_ 3 #define CAFFE2_OPERATORS_CONV_OP_IMPL_H_ 5 #include "caffe2/core/context.h" 7 #include "caffe2/core/logging.h" 8 #include "caffe2/core/operator.h" 9 #include "caffe2/operators/conv_op.h" 10 #include "caffe2/operators/conv_pool_op_base.h" 11 #include "caffe2/utils/math.h" 15 template <
typename T,
class Context>
16 bool ConvOp<T, Context>::RunOnDeviceWithOrderNCHW() {
// Forward convolution for the NCHW storage order. The visible code checks the
// filter against the input, sizes the output, and then, for every image and
// every group, calls Im2col (2 kernel dims) or Im2colNd (otherwise) followed
// by a Gemm; when a third input is present a further Gemm involving the bias
// and bias_multiplier_ is issued.
// NOTE(review): this listing is missing statements (the declarations of X, Y,
// col_buffer and f are not shown, nor are several call openings and closing
// braces), so the comments below describe only what is visible.
18 auto& filter = Input(FILTER);
// N and C are dimensions 0 and 1 of X.
20 const int N = X.dim32(0), C = X.dim32(1);
// The input and the filter must have the same number of dimensions.
21 CAFFE_ENFORCE_EQ(X.ndim(), filter.ndim());
// M is dimension 0 of the filter; it is also the value passed to
// SetOutputSize below.
22 const int M = filter.dim32(0);
// Channel check: C must equal filter.dim32(1) * group_ (the opening line of
// this enforce call is not shown).
24 C == filter.dim32(1) * group_,
25 "Convolution op: input channels does not match: # of input channels ",
27 " is not equal to kernel channels * group:",
// Message of a divisibility check on the output channels; the condition
// itself is not shown here.
33 "The number of output channels is not divisible by group.");
// kernel_dims_size becomes the product of all entries of kernel_; each entry
// must match the corresponding filter dimension, starting at filter dim 2.
35 int kernel_dims_size = 1;
36 for (
int i = 0; i < kernel_.size(); ++i) {
37 CAFFE_ENFORCE(filter.dim32(i + 2) == kernel_[i]);
38 kernel_dims_size *= kernel_[i];
41 ConvPoolOpBase<Context>::SetOutputSize(X, Y, filter.dim32(0));
43 const vector<int> input_dims = GetDims(X);
44 const vector<int> output_dims = GetDims(*Y);
45 const int input_image_size = this->GetDimsSize(X);
46 const int output_image_size = this->GetDimsSize(*Y);
// img_shape is X's dimension list without its first entry.
48 vector<int> img_shape;
49 img_shape.assign(X.dims().begin() + 1, X.dims().end());
// buffer_shape starts with C / group_ * kernel_dims_size; the call continued
// on the next visible line receives the range of output_dims (presumably an
// insert at the end of buffer_shape -- opening line not shown, confirm).
51 vector<int> buffer_shape;
52 buffer_shape.push_back(C / group_ * kernel_dims_size);
54 buffer_shape.end(), output_dims.begin(), output_dims.end());
// Only when kernel_ does not have exactly 2 entries are the shapes copied
// with SetDeviceTensor; img_shape_device_ and col_buffer_shape_device_ are
// read by the Im2colNd call further down.
56 if (kernel_.size() != 2) {
57 SetDeviceTensor(img_shape, &img_shape_device_);
58 SetDeviceTensor(buffer_shape, &col_buffer_shape_device_);
61 const int col_buffer_size =
62 (C / group_) * kernel_dims_size * output_image_size;
65 const int kernel_dim = C / group_ * kernel_dims_size;
// Per-group strides: each is multiplied by group_id below to address one
// group's slice of the input, the output and the filter.
68 const int input_offset = C / group_ * input_image_size;
69 const int output_offset = Y->size() / Y->dim32(0) / group_;
70 const int filter_offset = filter.size() / group_;
74 const T* Xdata = X.template data<T>();
// With three inputs a bias is present: it must be 1-D with M entries, and
// SetBiasMultiplier is called with output_image_size and bias_multiplier_.
75 if (InputSize() == 3) {
76 const auto& bias = Input(BIAS);
77 CAFFE_ENFORCE(bias.ndim() == 1);
78 CAFFE_ENFORCE(bias.dim32(0) == M);
79 ConvPoolOpBase<Context>::template SetBiasMultiplier<T>(
80 output_image_size, &bias_multiplier_);
82 T* Ydata = Y->template mutable_data<T>();
// The column buffer takes the shape computed in buffer_shape above.
85 col_buffer->Resize(buffer_shape);
86 T* col_buffer_data = col_buffer->template mutable_data<T>();
// Outer loop over images, inner loop over groups.
88 for (
int image_id = 0; image_id < N; ++image_id) {
89 for (
int group_id = 0; group_id < group_; ++group_id) {
// Two kernel dimensions use Im2col; the Im2colNd call below additionally
// takes the device-side shape, kernel, stride, dilation and pad tensors.
90 if (kernel_.size() == 2) {
91 math::Im2col<T, Context, StorageOrder::NCHW>(
92 Xdata + group_id * input_offset,
109 math::Im2colNd<T, Context, StorageOrder::NCHW>(
110 Xdata + group_id * input_offset,
111 img_shape_device_.template data<int>(),
112 col_buffer_shape_device_.template data<int>(),
113 C * input_image_size,
115 kernel_device_.template data<int>(),
116 stride_device_.template data<int>(),
117 dilation_device_.template data<int>(),
118 pads_device_.template data<int>(),
// Gemm is passed this group's slice of the filter and this group's slice of
// the output.
124 math::Gemm<T, Context>(
131 filter.template data<T>() + group_id * filter_offset,
134 Ydata + group_id * output_offset,
// Bias handling, only when a third input exists.
137 if (InputSize() == 3) {
140 auto* bias_data = Input(BIAS).template data<T>();
141 math::Gemm<T, Context>(
149 bias_multiplier_.template data<T>(),
// Advance both pointers by all groups of one image.
154 Xdata += input_offset * group_;
155 Ydata += output_offset * group_;
// When the global flag or shared_buffer_ is set, f is run through
// runWithSharedBuffer with the workspace ws_ (the alternative branch is not
// shown in this listing).
159 if (FLAGS_caffe2_force_shared_col_buffer || shared_buffer_) {
160 runWithSharedBuffer<Context>(ws_, f);
// Forward convolution for the NHWC storage order. The visible code checks the
// filter against the input, sizes the output, and then either issues a Gemm
// directly on the input (when the kernel covers a single position with unit
// strides and zero pads) or loops over the images calling Im2col followed by
// Gemm. A bias, if given as a third input, is handled with a further Gemm.
// NOTE(review): this listing is missing statements (the declarations of X, Y,
// col_buffer and f are not shown, nor are several call openings and closing
// braces), so the comments below describe only what is visible.
168 template <
typename T,
class Context>
169 bool ConvOp<T, Context>::RunOnDeviceWithOrderNHWC() {
171 auto& filter = Input(FILTER);
// N, H, W and C are dimensions 0..3 of X.
173 const int N = X.dim32(0), H = X.dim32(1), W = X.dim32(2), C = X.dim32(3);
// Message of the check restricting this path to 2-d convolution; the
// condition itself is not shown here.
178 "Only 2d convolution is supported for NHWC storage type");
// The input and the filter must have the same number of dimensions. This has
// to be the _EQ form: plain CAFFE_ENFORCE would take X.ndim() as the
// condition and filter.ndim() as message text, so no comparison would happen.
180 CAFFE_ENFORCE_EQ(X.ndim(), filter.ndim());
181 const int M = filter.dim32(0);
// Filter dimensions 1, 2 and 3 must equal kernel_h(), kernel_w() and C.
182 CAFFE_ENFORCE(filter.dim32(1) == kernel_h());
183 CAFFE_ENFORCE(filter.dim32(2) == kernel_w());
184 CAFFE_ENFORCE(filter.dim32(3) == C);
186 ConvPoolOpBase<Context>::SetOutputSize(X, Y, filter.dim32(0));
188 const int kernel_dim = kernel_h() * kernel_w() * C;
// Per-image strides of the input and the output.
191 const int input_offset = H * W * C;
192 const int output_offset = Y->size() / Y->dim32(0);
194 const int output_image_size = Y->dim32(1) * Y->dim32(2);
197 const T* Xdata = X.template data<T>();
198 T* Ydata = Y->template mutable_data<T>();
// Special case: kernel_dim == C (kernel_h() * kernel_w() * C equals C), the
// output has the same dims 1 and 2 as the input, strides are 1 and all pads
// are 0. In this branch a Gemm is issued with the filter data and no Im2col
// call is visible.
201 if (kernel_dim == C && Y->dim32(1) == X.dim32(1) &&
202 Y->dim32(2) == X.dim32(2) && stride_h() == 1 && stride_w() == 1 &&
203 pad_t() == 0 && pad_b() == 0 && pad_l() == 0 && pad_r() == 0) {
204 math::Gemm<T, Context>(
212 filter.template data<T>(),
// Bias for the special case: must be 1-D with M entries; bias_multiplier_ is
// resized to N * H * W entries when its size differs and is filled with
// math::Set (the fill value is not shown here).
216 if (InputSize() == 3) {
217 auto& bias = Input(BIAS);
218 CAFFE_ENFORCE(1 == bias.ndim());
219 CAFFE_ENFORCE(bias.dim32(0) == M);
220 if (bias_multiplier_.size() != N * H * W) {
222 bias_multiplier_.Resize(vector<TIndex>(1, N * H * W));
223 math::Set<T, Context>(
226 bias_multiplier_.template mutable_data<T>(),
229 math::Gemm<T, Context>(
236 bias_multiplier_.template data<T>(),
237 bias.template data<T>(),
// General case (presumably the else branch -- the branch keyword is not
// shown): the bias is validated the same way and SetBiasMultiplier is called
// with output_image_size.
243 if (InputSize() == 3) {
244 const auto& bias = Input(BIAS);
245 CAFFE_ENFORCE(1 == bias.ndim());
246 CAFFE_ENFORCE(bias.dim32(0) == M);
247 ConvPoolOpBase<Context>::template SetBiasMultiplier<T>(
248 output_image_size, &bias_multiplier_);
// Shape list {out dim 1, out dim 2, kernel_h, kernel_w, C}; presumably the
// argument of the column-buffer Resize (the call opening is not shown).
252 vector<TIndex>{Y->dim32(1), Y->dim32(2), kernel_h(), kernel_w(), C});
253 T* col_buffer_data = col_buffer->template mutable_data<T>();
// One Im2col + Gemm per image, plus a Gemm with the bias when present.
255 for (
int image_id = 0; image_id < N; ++image_id) {
256 math::Im2col<T, Context, StorageOrder::NHWC>(
274 math::Gemm<T, Context>(
282 filter.template data<T>(),
286 if (InputSize() == 3) {
288 math::Gemm<T, Context>(
295 bias_multiplier_.template data<T>(),
296 Input(BIAS).template data<T>(),
// Advance both pointers by one image.
301 Xdata += input_offset;
302 Ydata += output_offset;
// When the global flag or shared_buffer_ is set, f is run through
// runWithSharedBuffer with the workspace ws_ (the alternative branch is not
// shown in this listing).
305 if (FLAGS_caffe2_force_shared_col_buffer || shared_buffer_) {
306 runWithSharedBuffer<Context>(ws_, f);
// Backward pass of the convolution for the NCHW storage order. The visible
// code resizes and zeroes the filter gradient, then for every image and group
// calls Im2col/Im2colNd and a Gemm that takes the output gradient and the
// filter-gradient slice, plus a Gemv using bias_multiplier_. When an input
// gradient is requested, a second pass calls Gemm with the filter followed by
// Col2im/Col2imNd.
// NOTE(review): this listing is missing statements (several call arguments,
// branch conditions and closing braces are not shown), so the comments below
// describe only what is visible.
314 template <
typename T,
class Context>
315 bool ConvGradientOp<T, Context>::RunOnDeviceWithOrderNCHW() {
316 auto& X = Input(INPUT);
317 auto& filter = Input(FILTER);
318 auto& dY = Input(OUTPUT_GRAD);
319 auto* dfilter = Output(FILTER_GRAD);
// N and C are dimensions 0 and 1 of X.
320 const int N = X.dim32(0), C = X.dim32(1);
322 const vector<int> input_dims = this->GetDims(X);
323 const int input_image_size = this->GetDimsSize(X);
325 const vector<int> output_dims = this->GetDims(dY);
327 const int output_image_size = this->GetDimsSize(dY);
329 ConvPoolOpBase<Context>::ComputePads(input_dims);
// The input and the filter must have the same number of dimensions.
330 CAFFE_ENFORCE_EQ(X.ndim(), filter.ndim());
331 const int M = filter.dim32(0);
// Filter channels times the number of groups must equal the input channels.
332 CAFFE_ENFORCE(filter.dim32(1) * group_ == C);
// kernel_dims_size becomes the product of all entries of kernel_; each entry
// must match the corresponding filter dimension, starting at filter dim 2.
334 int kernel_dims_size = 1;
335 for (
int i = 0; i < kernel_.size(); ++i) {
336 CAFFE_ENFORCE(filter.dim32(i + 2) == kernel_[i]);
337 kernel_dims_size *= kernel_[i];
// M must be divisible by the number of groups.
340 CAFFE_ENFORCE(M % group_ == 0);
// The filter gradient has the same shape as the filter.
341 dfilter->ResizeLike(filter);
343 const int kernel_dim = C / group_ * kernel_dims_size;
// Per-group strides: each is multiplied by group_id below to address one
// group's slice of the input, the output gradient and the filter.
346 const int input_offset = C / group_ * input_image_size;
347 const int output_offset = dY.size() / dY.dim32(0) / group_;
348 const int filter_offset = filter.size() / group_;
// img_shape is X's dimension list without its first entry; col_buffer_shape
// is C / group_ * kernel_dims_size followed by the output dims.
352 vector<int> img_shape;
353 img_shape.assign(X.dims().begin() + 1, X.dims().end());
354 vector<int> col_buffer_shape;
355 col_buffer_shape.push_back(C / group_ * kernel_dims_size);
356 col_buffer_shape.insert(
357 col_buffer_shape.end(), output_dims.begin(), output_dims.end());
358 col_buffer_.Resize(col_buffer_shape);
// Only when kernel_ does not have exactly 2 entries are the shapes copied
// with SetDeviceTensor; img_shape_device_ and col_buffer_shape_device_ are
// read by the Im2colNd / Col2imNd calls further down.
360 if (kernel_.size() != 2) {
361 SetDeviceTensor(img_shape, &img_shape_device_);
362 SetDeviceTensor(col_buffer_shape, &col_buffer_shape_device_);
365 const int col_buffer_size =
366 (C / group_) * kernel_dims_size * output_image_size;
367 const T* Xdata = X.template data<T>();
368 const T* filter_data = filter.template data<T>();
369 const T* dYdata = dY.template data<T>();
370 T* col_buffer_data = col_buffer_.template mutable_data<T>();
371 T* dfilter_data = dfilter->template mutable_data<T>();
// Zero the filter gradient before the per-image loop (presumably because the
// Gemm below accumulates into it -- its scaling arguments are not shown).
374 math::Set<T, Context>(dfilter->size(), 0, dfilter_data, &context_);
// dbias_data stays nullptr unless the bias-gradient setup below runs (the
// condition guarding that setup is not shown in this listing).
376 T* dbias_data =
nullptr;
378 auto* dbias = Output(BIAS_OR_INPUT_GRAD);
// bias_multiplier_ is resized to output_image_size entries when its size
// differs and is filled with math::Set (the fill value is not shown here).
380 if (bias_multiplier_.size() != output_image_size) {
382 bias_multiplier_.Resize(vector<TIndex>(1, output_image_size));
383 math::Set<T, Context>(
386 bias_multiplier_.template mutable_data<T>(),
// The bias gradient is zeroed as well.
389 dbias_data = dbias->template mutable_data<T>();
390 math::Set<T, Context>(dbias->size(), 0, dbias_data, &context_);
// First pass: outer loop over images, inner loop over groups.
393 for (
int image_id = 0; image_id < N; ++image_id) {
394 for (
int group_id = 0; group_id < group_; ++group_id) {
// Two kernel dimensions use Im2col; the Im2colNd call below additionally
// takes the device-side shape, kernel, stride, dilation and pad tensors.
397 if (kernel_.size() == 2) {
398 math::Im2col<T, Context, StorageOrder::NCHW>(
399 Xdata + group_id * input_offset,
416 math::Im2colNd<T, Context, StorageOrder::NCHW>(
417 Xdata + group_id * input_offset,
418 img_shape_device_.template data<int>(),
419 col_buffer_shape_device_.template data<int>(),
420 C * input_image_size,
422 kernel_device_.template data<int>(),
423 stride_device_.template data<int>(),
424 dilation_device_.template data<int>(),
425 pads_device_.template data<int>(),
// Gemm is passed this group's slice of the output gradient and this group's
// slice of the filter gradient.
431 math::Gemm<T, Context>(
438 dYdata + group_id * output_offset,
441 dfilter_data + group_id * filter_offset,
// Gemv call that uses bias_multiplier_ (its remaining arguments and any
// guarding condition are not shown).
446 math::Gemv<T, Context>(
452 bias_multiplier_.template data<T>(),
// Advance both pointers by all groups of one image.
457 Xdata += input_offset * group_;
458 dYdata += output_offset * group_;
// Second pass, run when there are three outputs, or two outputs with
// no_bias_ set. With no_bias_ the input gradient is Output(BIAS_OR_INPUT_GRAD),
// otherwise Output(INPUT_GRAD).
460 if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) {
462 auto* dX = Output(no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD);
464 T* dXdata = dX->template mutable_data<T>();
// Rewind dYdata to the start of dY; the first pass advanced it.
465 dYdata = dY.template data<T>();
466 for (
int image_id = 0; image_id < N; ++image_id) {
467 for (
int group_id = 0; group_id < group_; ++group_id) {
// Gemm is passed this group's slice of the filter.
469 math::Gemm<T, Context>(
476 filter_data + group_id * filter_offset,
// Two kernel dimensions use Col2im, otherwise Col2imNd with the device-side
// shape, kernel, stride, dilation and pad tensors.
481 if (kernel_.size() == 2) {
482 math::Col2im<T, Context, StorageOrder::NCHW>(
500 math::Col2imNd<T, Context, StorageOrder::NCHW>(
502 img_shape_device_.template data<int>(),
503 col_buffer_shape_device_.template data<int>(),
504 C * input_image_size,
506 kernel_device_.template data<int>(),
507 stride_device_.template data<int>(),
508 dilation_device_.template data<int>(),
509 pads_device_.template data<int>(),
// Both pointers move by the per-group strides defined above (in contrast to
// the first pass, no multiplication by group_ here).
514 dXdata += input_offset;
515 dYdata += output_offset;
// Backward pass of the convolution for the NHWC storage order. The visible
// code resizes and zeroes the filter gradient, then for every image calls
// Im2col, a Gemm that takes that image's slice of the output gradient, and a
// Gemv using bias_multiplier_. When an input gradient is requested, a second
// loop calls Gemm followed by Col2im per image.
// NOTE(review): this listing is missing statements (several call arguments,
// branch conditions and closing braces are not shown), so the comments below
// describe only what is visible.
522 template <
typename T,
class Context>
523 bool ConvGradientOp<T, Context>::RunOnDeviceWithOrderNHWC() {
524 auto& X = Input(INPUT);
525 auto& filter = Input(FILTER);
526 auto& dY = Input(OUTPUT_GRAD);
527 auto* dfilter = Output(FILTER_GRAD);
// N, H, W and C are dimensions 0..3 of X.
529 const int N = X.dim32(0), H = X.dim32(1), W = X.dim32(2), C = X.dim32(3);
530 ConvPoolOpBase<Context>::ComputePads({H, W});
// The filter must be 4-dimensional with dims 1, 2 and 3 equal to kernel_h(),
// kernel_w() and C.
531 CAFFE_ENFORCE(4 == filter.ndim());
532 const int M = filter.dim32(0);
533 CAFFE_ENFORCE(filter.dim32(1) == kernel_h());
534 CAFFE_ENFORCE(filter.dim32(2) == kernel_w());
535 CAFFE_ENFORCE(filter.dim32(3) == C);
// The filter gradient has the same shape as the filter.
536 dfilter->ResizeLike(filter);
539 const int kernel_dim = kernel_h() * kernel_w() * C;
// Per-image strides of the input and the output gradient.
542 const int input_offset = H * W * C;
543 const int output_offset = dY.size() / dY.dim32(0);
545 const int output_image_size = dY.dim32(1) * dY.dim32(2);
548 col_buffer_.Resize(output_image_size, kernel_dim);
// Xdata is advanced per image below; filter_data and dYdata are const
// pointers, so dY is addressed as dYdata + output_offset * image_id instead.
550 const T* Xdata = X.template data<T>();
551 const T*
const filter_data = filter.template data<T>();
552 const T*
const dYdata = dY.template data<T>();
553 T* col_buffer_data = col_buffer_.template mutable_data<T>();
554 T* dfilter_data = dfilter->template mutable_data<T>();
// Zero the filter gradient before the per-image loop (presumably because the
// Gemm below accumulates into it -- its scaling arguments are not shown).
557 math::Set<T, Context>(dfilter->size(), 0, dfilter_data, &context_);
// dbias_data stays nullptr unless the bias-gradient setup below runs (the
// condition guarding that setup is not shown in this listing).
559 T* dbias_data =
nullptr;
561 auto* dbias = Output(BIAS_OR_INPUT_GRAD);
// The bias gradient is zeroed; bias_multiplier_ is resized to
// output_image_size entries when its size differs and is filled with
// math::Set (the fill value is not shown here).
563 dbias_data = dbias->template mutable_data<T>();
564 math::Set<T, Context>(dbias->size(), 0, dbias_data, &context_);
565 if (bias_multiplier_.size() != output_image_size) {
567 bias_multiplier_.Resize(vector<TIndex>(1, output_image_size));
568 math::Set<T, Context>(
571 bias_multiplier_.template mutable_data<T>(),
// First loop: one Im2col, one Gemm and one Gemv per image, each of the latter
// two reading this image's slice of dY.
576 for (
int image_id = 0; image_id < N; ++image_id) {
579 math::Im2col<T, Context, StorageOrder::NHWC>(
597 math::Gemm<T, Context>(
604 dYdata + output_offset * image_id,
611 math::Gemv<T, Context>(
616 dYdata + output_offset * image_id,
617 bias_multiplier_.template data<T>(),
// Advance the input pointer by one image.
622 Xdata += input_offset;
// Second loop, run when there are three outputs, or two outputs with no_bias_
// set. With no_bias_ the input gradient is Output(BIAS_OR_INPUT_GRAD),
// otherwise Output(INPUT_GRAD).
625 if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) {
627 auto* dX = Output(no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD);
629 T* dXdata = dX->template mutable_data<T>();
// Per image: Gemm on this image's slice of dY, then Col2im.
630 for (
int image_id = 0; image_id < N; ++image_id) {
632 math::Gemm<T, Context>(
639 dYdata + output_offset * image_id,
644 math::Col2im<T, Context, StorageOrder::NHWC>(
// Advance the input-gradient pointer by one image.
661 dXdata += input_offset;
668 #endif // CAFFE2_OPERATORS_CONV_OP_IMPL_H_ A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...
Commandline flags support for Caffe2.