Caffe2 - C++ API
A deep learning, cross-platform ML framework
conv_op_cudnn.cc
1 #include "caffe2/core/context_gpu.h"
2 
3 #include "caffe2/core/common_gpu.h"
4 #include "caffe2/core/cudnn_wrappers.h"
5 #include "caffe2/operators/conv_op.h"
6 #include "caffe2/operators/conv_op_cache_cudnn.h"
7 #include "caffe2/operators/conv_pool_op_base.h"
8 #include "caffe2/operators/op_utils_cudnn.h"
9 
10 namespace caffe2 {
11 
12 class CudnnConvOpBase : public ConvPoolOpBase<CUDAContext> {
13  public:
14  CudnnConvOpBase(const OperatorDef& operator_def, Workspace* ws)
15  : ConvPoolOpBase<CUDAContext>(operator_def, ws),
16  cudnn_wrapper_(&context_),
17  cudnn_ws_nbytes_limit_(OperatorBase::GetSingleArgument<size_t>(
18  "ws_nbytes_limit",
19  kCONV_CUDNN_WORKSPACE_LIMIT_BYTES)),
20  exhaustive_search_(
21  OperatorBase::GetSingleArgument<int>("exhaustive_search", 0)),
22  deterministic_(
23  OperatorBase::GetSingleArgument<int>("deterministic", 0)),
24  cudnn_state_(OperatorBase::GetSingleArgument<int>("cudnn_state", 0)),
25  force_algo_(OperatorBase::GetRepeatedArgument<int>("force_algo", vector<int>{-1,-1,-1})),
26  enable_tensor_core_(OperatorBase::GetSingleArgument<bool>("enable_tensor_core", 1)) {
27  CHECK(!deterministic_ || !exhaustive_search_);
28  CAFFE_ENFORCE(group_ > 0);
29  CAFFE_ENFORCE(!deterministic_ || !exhaustive_search_);
30  for (int i = 0; i < kernel_.size(); ++i) {
31  OPERATOR_NEEDS_FEATURE(
32  pads_[i] == pads_[kernel_.size() + i],
33  "The current padding scheme leads to unequal padding on the left "
34  "and right, which is not supported by cudnn.");
35  }
36  // Dilated convolution is only supported by some algorithms in cuDNN v6 and later.
37 #if !(CUDNN_VERSION_MIN(6,0,0))
38  OPERATOR_NEEDS_FEATURE(
39  dilation_h() == 1 && dilation_w() == 1,
40  "The cudnn convolution does not support dilation yet.");
41 #endif
42 
43 #if CUDNN_VERSION_MIN(7, 0, 0)
44  // verify TensorCore math is supported
45  enable_tensor_core_ &= TensorCoreAvailable();
46 #else
47  enable_tensor_core_ = false;
48 #endif
49 
50  bool individual_force_algo = OperatorBase::HasArgument("force_algo_fwd") ||
51  OperatorBase::HasArgument("force_algo_dgrad") ||
52  OperatorBase::HasArgument("force_algo_wgrad");
53  if (OperatorBase::HasArgument("force_algo")) {
54  CAFFE_ENFORCE(!individual_force_algo,
55  "Cannot specify both force_algo and any of ",
56  "force_algo_fwd, force_algo_dgrad, force_algo_wgrad");
57  } else {
58  force_algo_ = std::vector<int>{-1,-1,-1};
59  force_algo_[ALGO_FWD] =
60  OperatorBase::GetSingleArgument<int>("force_algo_fwd", -1);
61  force_algo_[ALGO_DGRAD] =
62  OperatorBase::GetSingleArgument<int>("force_algo_dgrad", -1);
63  force_algo_[ALGO_WGRAD] =
64  OperatorBase::GetSingleArgument<int>("force_algo_wgrad", -1);
65  }
66 
67  CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&bottom_desc_));
68  CUDNN_ENFORCE(cudnnCreateFilterDescriptor(&filter_desc_));
69  CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&bias_desc_));
70  CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&top_desc_));
71  CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&top_desc_for_bias_));
72  CUDNN_ENFORCE(cudnnCreateConvolutionDescriptor(&conv_desc_));
73  }
74 
75  ~CudnnConvOpBase() {
76  CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(bottom_desc_));
77  CUDNN_ENFORCE(cudnnDestroyFilterDescriptor(filter_desc_));
78  CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(bias_desc_));
79  CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(top_desc_));
80  CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(top_desc_for_bias_));
81  CUDNN_ENFORCE(cudnnDestroyConvolutionDescriptor(conv_desc_));
82  }
83 
84  protected:
85  // A helper function to set up the tensor Nd descriptor, depending on the
86  // order, the group, and the type given.
87  template <typename T>
88  void SetTensorNdDescriptorWithGroup(
89  int size,
90  cudnnTensorDescriptor_t tensorDesc,
91  int N,
92  int C,
93  int H,
94  int W,
95  int D) {
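 // With cuDNN >= 7, grouped convolution is handled natively via
 // cudnnSetConvolutionGroupCount, so the descriptor uses the full channel
 // count. Older cuDNN versions emulate groups by describing a per-group
 // slice (C / group_ channels) and looping over the groups at run time.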
96 #if CUDNN_VERSION_MIN(7, 0, 0)
97  const int CC = C;
98 #else
99  const int CC = C / group_;
100 #endif
101  switch (order_) {
102  case StorageOrder::NHWC:
103  if (size == 4) {
104  CUDNN_ENFORCE(cudnnSetTensor4dDescriptorEx(
105  tensorDesc,
106  cudnnTypeWrapper<T>::type,
107  N,
108  CC,
109  H,
110  W,
111  H * W * C,
112  1,
113  W * C,
114  C));
115  } else {
116  vector<int> dims = {N, H, W, D, CC};
117  vector<int> strides = {H * W * D * CC, W * D * CC, D * CC, CC, 1};
118  CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
119  tensorDesc,
120  cudnnTypeWrapper<T>::type,
121  size > 3 ? size : 4,
122  dims.data(),
123  strides.data()));
124  }
125  break;
126  case StorageOrder::NCHW:
127  if (size == 4) {
128  CUDNN_ENFORCE(cudnnSetTensor4dDescriptorEx(
129  tensorDesc,
130  cudnnTypeWrapper<T>::type,
131  N,
132  CC,
133  H,
134  W,
135  C * H * W,
136  H * W,
137  W,
138  1));
139  } else {
140  vector<int> dims = {N, CC, H, W, D};
141  vector<int> strides = {CC * H * W * D, H * W * D, W * D, D, 1};
142  CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
143  tensorDesc,
144  cudnnTypeWrapper<T>::type,
145  size > 3 ? size : 4,
146  dims.data(),
147  strides.data()));
148  }
149  break;
150  default:
151  LOG(FATAL) << "Unknown storage order: " << order_;
152  }
153  }
154 
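 // Copies the padding, stride, dilation and mode (and, with cuDNN 6+, the
 // compute type) of one convolution descriptor into another. The gradient op
 // uses this to keep independent descriptors for the backward-data and
 // backward-filter passes.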
155  void DuplicateConvDesc(
156  cudnnConvolutionDescriptor_t input,
157  size_t kernelDims,
158  size_t dilationDims,
159  cudnnConvolutionDescriptor_t copy) {
160  if (kernelDims == 2) {
161  cudnnConvolutionMode_t mode;
162  cudnnDataType_t dataType;
163  int pad_height = 0;
164  int pad_width = 0;
165  int stride_height = 0;
166  int stride_width = 0;
167  int dilation_height = 0;
168  int dilation_width = 0;
169 
170 #if CUDNN_VERSION_MIN(6, 0, 0)
171  CUDNN_ENFORCE(cudnnGetConvolution2dDescriptor(
172  input,
173  &pad_height,
174  &pad_width,
175  &stride_height,
176  &stride_width,
177  &dilation_height,
178  &dilation_width,
179  &mode,
180  &dataType
181  ));
182 #else
183  CUDNN_ENFORCE(cudnnGetConvolution2dDescriptor(
184  input,
185  &pad_height,
186  &pad_width,
187  &stride_height,
188  &stride_width,
189  &dilation_height,
190  &dilation_width,
191  &mode
192  ));
193 #endif
194 
195 #if CUDNN_VERSION_MIN(6, 0, 0)
196  CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor(
197  copy,
198  pad_height,
199  pad_width,
200  stride_height,
201  stride_width,
202  dilation_height,
203  dilation_width,
204  mode,
205  dataType
206  ));
207 #else
208  CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor(
209  copy,
210  pad_height,
211  pad_width,
212  stride_height,
213  stride_width,
214  dilation_height,
215  dilation_width,
216  mode
217  ));
218 #endif
219  } else {
220  cudnnConvolutionMode_t mode;
221  cudnnDataType_t dataType;
222  int arrayLength = 0;
223  vector<int> ones(dilationDims, 1);
224  CUDNN_ENFORCE(cudnnGetConvolutionNdDescriptor(
225  input,
226  kernel_.size(),
227  &arrayLength,
228  pads_.data(),
229  stride_.data(),
230  ones.data(),
231  &mode,
232  &dataType));
233 
234  CUDNN_ENFORCE(cudnnSetConvolutionNdDescriptor(
235  copy,
236  kernel_.size(),
237  pads_.data(),
238  stride_.data(),
239  ones.data(),
240  mode,
241  dataType));
242  }
243  }
244 
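 // Chooses the cudnn compute type for the convolution: float16 inputs use
 // half-precision accumulation only when float16_compute is requested and the
 // device has compute capability 6.0 or newer; everything else falls back to
 // float32 accumulation.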
245  template <typename T>
246  cudnnDataType_t DetermineComputeTypeFromInput(const T& X) {
247  const cudaDeviceProp& prop = GetDeviceProperty(0);
248  cudnnDataType_t computeType = CUDNN_DATA_FLOAT;
249  if (X.template IsType<float16>()) {
250  if (float16_compute_ && prop.major >= 6) {
251  VLOG(1) << "CUDNN Convolution: float16_compute specified and "
252  << "supported, input data is float16 - using float16 "
253  << "compute.";
254  computeType = CUDNN_DATA_HALF;
255  } else if (float16_compute_) {
256  VLOG(1) << "CUDNN Convolution: float16_compute specified but "
257  << "not supported, input data is float16 - using float32 "
258  << "compute.";
259  } else {
260  VLOG(1) << "CUDNN Convolution: float16_compute not specified but "
261  << "input data is float16 - using float32 compute.";
262  }
263  } else {
264  VLOG(1) << "CUDNN Convolution: using float32 compute.";
265  }
266  return computeType;
267  }
268 
269  void SetConvDescFromArguments() {
270 #if CUDNN_VERSION_MIN(6, 0, 0)
271  if (kernel_.size() == 2) {
272  CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor(
273  conv_desc_,
274  pad_t(),
275  pad_l(),
276  stride_h(),
277  stride_w(),
278  dilation_h(),
279  dilation_w(),
280  CUDNN_CROSS_CORRELATION,
281  compute_type_));
282  } else {
283  CUDNN_ENFORCE(cudnnSetConvolutionNdDescriptor(
284  conv_desc_,
285  kernel_.size(),
286  pads_.data(),
287  stride_.data(),
288  dilation_.data(),
289  CUDNN_CROSS_CORRELATION,
290  compute_type_));
291  }
292 #else
293  if (kernel_.size() == 2) {
294  CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor(
295  conv_desc_,
296  pad_t(),
297  pad_l(),
298  stride_h(),
299  stride_w(),
300  1,
301  1,
302  CUDNN_CROSS_CORRELATION));
303  } else {
304  vector<int> ones(dilation_.size(), 1);
305  CUDNN_ENFORCE(cudnnSetConvolutionNdDescriptor(
306  conv_desc_,
307  kernel_.size(),
308  pads_.data(),
309  stride_.data(),
310  ones.data(),
311  CUDNN_CROSS_CORRELATION,
312  compute_type_));
313  }
314 #endif
315  }
316 
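 // Rewrites an existing convolution descriptor in place with a new compute
 // type while preserving its padding, stride and dilation. The exhaustive
 // search below uses this to benchmark both fp32 and fp16 compute passes.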
317  void SetConvDescComputeType(
318  cudnnConvolutionDescriptor_t conv_desc,
319  cudnnDataType_t math) {
320  if (kernel_.size() == 2) {
321  cudnnConvolutionMode_t mode;
322  cudnnDataType_t dataType;
323  int pad_height = 0;
324  int pad_width = 0;
325  int stride_height = 0;
326  int stride_width = 0;
327  int dilation_height = 0;
328  int dilation_width = 0;
329 
330 #if CUDNN_VERSION_MIN(6, 0, 0)
331  CUDNN_ENFORCE(cudnnGetConvolution2dDescriptor(
332  conv_desc,
333  &pad_height,
334  &pad_width,
335  &stride_height,
336  &stride_width,
337  &dilation_height,
338  &dilation_width,
339  &mode,
340  &dataType
341  ));
342 #else
343  CUDNN_ENFORCE(cudnnGetConvolution2dDescriptor(
344  conv_desc,
345  &pad_height,
346  &pad_width,
347  &stride_height,
348  &stride_width,
349  &dilation_height,
350  &dilation_width,
351  &mode
352  ));
353 #endif
354 
355 #if CUDNN_VERSION_MIN(6, 0, 0)
356  CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor(
357  conv_desc,
358  pad_height,
359  pad_width,
360  stride_height,
361  stride_width,
362  dilation_height,
363  dilation_width,
364  mode,
365  math
366  ));
367 #else
368  CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor(
369  conv_desc,
370  pad_height,
371  pad_width,
372  stride_height,
373  stride_width,
374  dilation_height,
375  dilation_width,
376  mode
377  ));
378 #endif
379  } else {
380  cudnnConvolutionMode_t mode;
381  cudnnDataType_t dataType;
382  int arrayLength = 0;
383  vector<int> ones(dilation_.size(), 1);
384  CUDNN_ENFORCE(cudnnGetConvolutionNdDescriptor(
385  conv_desc,
386  kernel_.size(),
387  &arrayLength,
388  pads_.data(),
389  stride_.data(),
390  ones.data(),
391  &mode,
392  &dataType));
393 
394  CUDNN_ENFORCE(cudnnSetConvolutionNdDescriptor(
395  conv_desc,
396  kernel_.size(),
397  pads_.data(),
398  stride_.data(),
399  ones.data(),
400  mode,
401  math));
402  }
403  }
404 
405  vector<TIndex> cudnn_input_dims_;
406  vector<TIndex> cudnn_filter_dims_;
407 
408  CuDNNWrapper cudnn_wrapper_;
409  cudnnTensorDescriptor_t bottom_desc_;
410  cudnnFilterDescriptor_t filter_desc_;
411  cudnnTensorDescriptor_t bias_desc_;
412  cudnnTensorDescriptor_t top_desc_;
413  // top desc for bias add in case we do group convolution
414  cudnnTensorDescriptor_t top_desc_for_bias_;
415  cudnnConvolutionDescriptor_t conv_desc_;
416  const size_t cudnn_ws_nbytes_limit_;
417  size_t cudnn_ws_nbytes_;
418  bool exhaustive_search_;
419  bool deterministic_;
420  size_t cudnn_state_;
421  vector<int> force_algo_; // stored as FWD, dFILTER, dDATA
422  bool enable_tensor_core_;
423  cudnnDataType_t compute_type_;
424 };
425 
426 class CudnnConvOp final : public CudnnConvOpBase {
427  public:
428  CudnnConvOp(const OperatorDef& operator_def, Workspace* ws)
429  : CudnnConvOpBase(operator_def, ws) {}
430 
431  ~CudnnConvOp() {}
432 
433  template <typename T_X, typename T_W, typename T_B, typename T_Y>
434  bool DoRunWithType();
435 
436  bool RunOnDevice() override;
437 
438  private:
439  cudnnConvolutionFwdAlgo_t algo_;
440  using ConvFwdAlgorithmWithCost = std::tuple<cudnnConvolutionFwdAlgo_t, float>;
441  AlgorithmsCache<ConvFwdAlgorithmWithCost> algo_cache_;
442  // Input: X, W, b
443  // Output: Y
444  INPUT_TAGS(INPUT, FILTER, BIAS);
445 };
446 
447 class CudnnConvGradientOp final : public CudnnConvOpBase {
448  public:
449  CudnnConvGradientOp(const OperatorDef& operator_def, Workspace* ws)
450  : CudnnConvOpBase(operator_def, ws),
451  no_bias_(OperatorBase::GetSingleArgument<int>("no_bias", 0)) {
452  CAFFE_ENFORCE(
453  !(no_bias_ && OutputSize() == 3),
454  "If bias is not present, you should not have 3 grad outputs.");
455 
456  CUDNN_ENFORCE(cudnnCreateConvolutionDescriptor(&bwd_data_conv_desc_));
457  CUDNN_ENFORCE(cudnnCreateConvolutionDescriptor(&bwd_filter_conv_desc_));
458  }
459 
460  ~CudnnConvGradientOp() {
461  CUDNN_ENFORCE(cudnnDestroyConvolutionDescriptor(bwd_data_conv_desc_));
462  CUDNN_ENFORCE(cudnnDestroyConvolutionDescriptor(bwd_filter_conv_desc_));
463  }
464 
465  template <
466  typename T_X,
467  typename T_DY,
468  typename T_W,
469  typename T_B,
470  typename T_DX,
471  typename T_DW,
472  typename T_DB>
473  bool DoRunWithType();
474 
475  bool RunOnDevice() override;
476 
477  private:
478  cudnnConvolutionDescriptor_t bwd_filter_conv_desc_;
479  cudnnConvolutionDescriptor_t bwd_data_conv_desc_;
480  cudnnConvolutionBwdFilterAlgo_t bwd_filter_algo_;
481  cudnnConvolutionBwdDataAlgo_t bwd_data_algo_;
482  using ConvBwdFilterAlgorithmWithCost =
483  std::tuple<cudnnConvolutionBwdFilterAlgo_t, float>;
484  using ConvBwdDataAlgorithmWithCost =
485  std::tuple<cudnnConvolutionBwdDataAlgo_t, float>;
486  AlgorithmsCache<ConvBwdFilterAlgorithmWithCost> filter_algo_cache_;
487  AlgorithmsCache<ConvBwdDataAlgorithmWithCost> data_algo_cache_;
488  bool no_bias_;
489  // input: X, W, dY
490  // output: dW, db, and optionally dX
491  INPUT_TAGS(INPUT, FILTER, OUTPUT_GRAD);
492  OUTPUT_TAGS(FILTER_GRAD, BIAS_OR_INPUT_GRAD, INPUT_GRAD);
493 };
494 
496 // Implementations
498 
499 static constexpr std::array<cudnnDataType_t, 2> kComputeTypesToTry = {
500  CUDNN_DATA_FLOAT,
501  CUDNN_DATA_HALF};
502 static constexpr std::array<const char*, 2> kComputePassNames = {
503  "fp32 compute",
504  "fp16 compute"};
505 
506 template <typename T_X, typename T_W, typename T_B, typename T_Y>
507 bool CudnnConvOp::DoRunWithType() {
508  auto& X = Input(INPUT);
509  auto& filter = Input(FILTER);
510  auto* Y = Output(0);
511 
512  // Figure out the output shape
513  CAFFE_ENFORCE(X.ndim() >= 3 && X.ndim() <= 5);
514  CAFFE_ENFORCE(filter.ndim() >= 3 && filter.ndim() <= 5);
515  const int M = filter.dim32(0);
516  ConvPoolOpBase<CUDAContext>::SetOutputSize(X, Y, filter.dim32(0));
517  int N = 0, C = 0, H = 0, W = 0, D = 0, H_out = 0, W_out = 0, D_out = 0;
518  int group_offset_X = 0, group_offset_Y = 0;
519 
520  switch (order_) {
521  case StorageOrder::NHWC:
522  N = X.dim32(0);
523  H = X.dim32(1);
524  W = X.ndim() > 3 ? X.dim32(2) : 1;
525  D = X.ndim() > 4 ? X.dim32(3) : 1;
526  C = X.dim32(X.ndim() - 1);
527  H_out = Y->dim32(1);
528  W_out = Y->ndim() > 3 ? Y->dim32(2) : 1;
529  D_out = Y->ndim() > 4 ? Y->dim32(3) : 1;
530  for (int i = 0; i < kernel_.size(); ++i) {
531  CAFFE_ENFORCE_EQ(filter.dim32(i + 1), kernel_[i]);
532  }
533  CAFFE_ENFORCE_EQ(filter.dim32(filter.ndim() - 1), C / group_);
534  group_offset_X = C / group_;
535  group_offset_Y = M / group_;
536  break;
537  case StorageOrder::NCHW:
538  N = X.dim32(0);
539  C = X.dim32(1);
540  H = X.dim32(2);
541  W = X.ndim() > 3 ? X.dim32(3) : 1;
542  D = X.ndim() > 4 ? X.dim32(4) : 1;
543  H_out = Y->dim32(2);
544  W_out = Y->ndim() > 3 ? Y->dim32(3) : 1;
545  D_out = Y->ndim() > 4 ? Y->dim32(4) : 1;
546  CAFFE_ENFORCE_EQ(filter.dim32(1), C / group_);
547  for (int i = 0; i < kernel_.size(); ++i) {
548  CAFFE_ENFORCE_EQ(filter.dim32(i + 2), kernel_[i]);
549  }
550  group_offset_X = C / group_ * H * W * D;
551  group_offset_Y = M / group_ * H_out * W_out * D_out;
552  break;
553  default:
554  LOG(FATAL) << "Unknown storage order: " << order_;
555  }
556 
557  CAFFE_ENFORCE(
558  C % group_ == 0,
559  "If you set group, the number of input channels should be divisible "
560  "by group.");
561  CAFFE_ENFORCE(
562  M % group_ == 0,
563  "If you set group, the number of output channels should be divisible "
564  "by group.");
565 
566  int group_offset_filter = filter.size() / group_;
567 
568  // Set up the cudnn algorithms & workspace if necessary
569  bool input_changed = (X.dims() != cudnn_input_dims_);
570  bool filter_changed = (filter.dims() != cudnn_filter_dims_);
571  if (input_changed || filter_changed) {
572  VLOG(1) << "Changing the cudnn descriptor configurations.";
573  if (input_changed) {
574  cudnn_input_dims_ = X.dims();
575  SetTensorNdDescriptorWithGroup<T_X>(
576  X.ndim(), bottom_desc_, N, C, H, W, D);
577  }
578  if (filter_changed) {
579  cudnn_filter_dims_ = filter.dims();
580  if (kernel_.size() == 2) {
581 #if CUDNN_VERSION_MIN(7, 0, 0)
582  const int MM = M;
583 #else
584  const int MM = M / group_;
585 #endif
586  CUDNN_ENFORCE(cudnnSetFilter4dDescriptor(
587  filter_desc_,
588  cudnnTypeWrapper<T_W>::type,
589  GetCudnnTensorFormat(order_),
590  MM,
591  C / group_,
592  kernel_h(),
593  kernel_w()));
594  } else {
595  vector<int> dims(filter.dims().begin(), filter.dims().end());
596  dims[0] /= group_;
597 #if !CUDNN_VERSION_MIN(7, 0, 0)
598  order_ == StorageOrder::NCHW ? dims[1] /= group_
599  : dims[filter.ndim() - 1] /= group_;
600 #endif
601  dims[filter.ndim() - 1] /= group_;
602  CUDNN_ENFORCE(cudnnSetFilterNdDescriptor(
603  filter_desc_,
604  cudnnTypeWrapper<T_W>::type,
605  GetCudnnTensorFormat(order_),
606  dims.size(),
607  dims.data()));
608  }
609  if (InputSize() == 3) {
610  if (kernel_.size() == 2) {
611  CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
612  bias_desc_,
613  GetCudnnTensorFormat(order_),
614  cudnnTypeWrapper<T_B>::type,
615  1,
616  M,
617  1,
618  1));
619  } else {
620  std::vector<int> bias_dims(X.ndim(), 1);
621  bias_dims[1] = M;
622  std::vector<int> strides = {M, 1, 1, 1, 1, 1};
623  CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
624  bias_desc_,
625  cudnnTypeWrapper<T_B>::type,
626  X.ndim() > 3 ? X.ndim() : 4,
627  bias_dims.data(),
628  strides.data()));
629  }
630  }
631  }
632  // Set the output
633  SetTensorNdDescriptorWithGroup<T_Y>(
634  X.ndim(), top_desc_, N, M, H_out, W_out, D_out);
635  // Set the output with descriptor useful for bias addition in one run.
636  if (kernel_.size() == 2) {
637  CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
638  top_desc_for_bias_,
639  GetCudnnTensorFormat(order_),
640  cudnnTypeWrapper<T_Y>::type,
641  N,
642  M,
643  H_out,
644  W_out));
645  } else {
646  vector<int> dims = {N, M, H_out, W_out, D_out};
647  vector<int> strides = {M * H_out * W_out * D_out,
648  H_out * W_out * D_out,
649  W_out * D_out,
650  D_out,
651  1};
652  CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
653  top_desc_for_bias_,
654  cudnnTypeWrapper<T_Y>::type,
655  X.ndim() > 3 ? X.ndim() : 4,
656  dims.data(),
657  strides.data()));
658  }
659 
660  compute_type_ = DetermineComputeTypeFromInput(X);
661  SetConvDescFromArguments();
662 
663 #if CUDNN_VERSION_MIN(7, 0, 0)
664  if (enable_tensor_core_) {
665  CUDNN_ENFORCE(
666  cudnnSetConvolutionMathType(conv_desc_, CUDNN_TENSOR_OP_MATH));
667  }
668 
669  // enable cuDNN conv groups
670  CUDNN_CHECK(cudnnSetConvolutionGroupCount(conv_desc_, group_));
671 #endif
672 
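 // Pick the forward algorithm: an explicit force_algo_fwd wins, deterministic
 // mode pins IMPLICIT_PRECOMP_GEMM, exhaustive_search benchmarks candidates
 // with cudnnFindConvolutionForwardAlgorithmEx, and otherwise cuDNN's
 // heuristic chooses within the workspace limit.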
673  if (force_algo_[ALGO_FWD] >= 0) {
674  algo_ = (cudnnConvolutionFwdAlgo_t)force_algo_[ALGO_FWD];
675  } else if (deterministic_) {
676  algo_ = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
677  } else if (exhaustive_search_) {
678  // Even when FP16 compute is supported and requested, try FP32
679  // because it may be faster. However, if FP32 compute is specified,
680  // FP16 is not a suitable alternative - early out from the loop.
681  std::array<ConvFwdAlgorithmWithCost, 2> algosToCompare;
682  for (int i = 0; i < 2; i++) {
683  SetConvDescComputeType(conv_desc_, kComputeTypesToTry[i]);
684 
685  algosToCompare[i] = algo_cache_.getAlgorithm(
686  X.dims(), filter.dims(), kComputeTypesToTry[i], [&]() {
687  VLOG(1) << "CUDNN Convolution fwd: doing exhaustive "
688  << "search for " << kComputePassNames[i];
689  // When we do an exhaustive search, we will ignore the workspace
690  // size limit and simply go for the fastest algorithm. If you
691  // happen to run out of memory later, you will be on your own...
692  int returned_algo_count;
693  std::array<cudnnConvolutionFwdAlgoPerf_t, kNUM_CUDNN_FWD_ALGS>
694  fwd_perf_stat;
695 
696  // No need to clean up the workspace; it is owned by the cudnn state.
697  cudnn_wrapper_.with_cudnn_state(
698  cudnn_state_, [&](CuDNNState* state) {
699  // Actually run the search.
700  CUDNN_ENFORCE(cudnnFindConvolutionForwardAlgorithmEx(
701  state->cudnn_handle(),
702  bottom_desc_,
703  X.template data<T_X>(),
704  filter_desc_,
705  filter.template data<T_W>(),
706  conv_desc_,
707  top_desc_,
708  Y->template mutable_data<T_Y>(),
709  kNUM_CUDNN_FWD_ALGS,
710  &returned_algo_count,
711  fwd_perf_stat.data(),
712  state->workspace().get(cudnn_ws_nbytes_limit_),
713  cudnn_ws_nbytes_limit_));
714  });
715  LogCuDNNPerfStats(fwd_perf_stat, returned_algo_count);
716  float algo_time = fwd_perf_stat[0].status == CUDNN_STATUS_SUCCESS
717  ? fwd_perf_stat[0].time
718  : 1e10;
719  return ConvFwdAlgorithmWithCost(fwd_perf_stat[0].algo, algo_time);
720  });
721 
722  // When set to fp32 compute, don't try fp16
723  if (compute_type_ == CUDNN_DATA_FLOAT) {
724  break;
725  }
726  }
727 
728  if (compute_type_ == CUDNN_DATA_FLOAT) {
729  // For FP32 compute, just use the best FP32 algorithm
730  algo_ = std::get<0>(algosToCompare[0]);
731  } else {
732  // For FP16 compute, choose algo with fastest execution
733  int bestAlgoIndex =
734  (std::get<1>(algosToCompare[0]) < std::get<1>(algosToCompare[1]))
735  ? 0
736  : 1;
737  algo_ = std::get<0>(algosToCompare[bestAlgoIndex]);
738  SetConvDescComputeType(conv_desc_, kComputeTypesToTry[bestAlgoIndex]);
739  }
740  } else {
741  // Get the convolution algorithm based on the workspace limit.
742  CUDNN_ENFORCE(cudnnGetConvolutionForwardAlgorithm(
743  cudnn_wrapper_.inline_cudnn_handle(),
744  bottom_desc_,
745  filter_desc_,
746  conv_desc_,
747  top_desc_,
748  CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
749  cudnn_ws_nbytes_limit_,
750  &algo_));
751  }
752  CUDNN_ENFORCE(cudnnGetConvolutionForwardWorkspaceSize(
753  cudnn_wrapper_.inline_cudnn_handle(),
754  bottom_desc_,
755  filter_desc_,
756  conv_desc_,
757  top_desc_,
758  algo_,
759  &cudnn_ws_nbytes_));
760  VLOG(1) << "CuDNN algorithm: " << algo_;
761  VLOG(1) << "CuDNN workspace size: " << cudnn_ws_nbytes_;
762  }
763 
764  // Now, actually run the computation.
765  // Run directly through cuDNN if possible
766 #if CUDNN_VERSION_MIN(7,0,0)
767  cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) {
768  CUDNN_ENFORCE(cudnnConvolutionForward(
769  state->cudnn_handle(),
770  cudnnTypeWrapper<T_X>::kOne(),
771  bottom_desc_,
772  X.template data<T_X>(),
773  filter_desc_,
774  filter.template data<T_W>(),
775  conv_desc_,
776  algo_,
777  state->workspace().get(cudnn_ws_nbytes_),
778  cudnn_ws_nbytes_,
779  cudnnTypeWrapper<T_Y>::kZero(),
780  top_desc_,
781  Y->template mutable_data<T_Y>()));
782  });
783 #else
784  // otherwise manually run through groups
785  for (int i = 0; i < group_; ++i) {
786  cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) {
787  CUDNN_ENFORCE(cudnnConvolutionForward(
788  state->cudnn_handle(),
789  cudnnTypeWrapper<T_X>::kOne(),
790  bottom_desc_,
791  X.template data<T_X>() + i * group_offset_X,
792  filter_desc_,
793  filter.template data<T_W>() + i * group_offset_filter,
794  conv_desc_,
795  algo_,
796  state->workspace().get(cudnn_ws_nbytes_),
797  cudnn_ws_nbytes_,
798  cudnnTypeWrapper<T_Y>::kZero(),
799  top_desc_,
800  Y->template mutable_data<T_Y>() + i * group_offset_Y));
801  });
802  }
803 #endif
804  // Bias
805  if (InputSize() == 3) {
806  auto& bias = Input(BIAS);
807 
808  CAFFE_ENFORCE_EQ(bias.ndim(), 1);
809  CAFFE_ENFORCE_EQ(bias.dim32(0), M);
810 
811  CUDNN_ENFORCE(cudnnAddTensor(
812  cudnn_wrapper_.inline_cudnn_handle(),
813  cudnnTypeWrapper<T_B>::kOne(),
814  bias_desc_,
815  bias.template data<T_B>(),
816  cudnnTypeWrapper<T_Y>::kOne(),
817  top_desc_for_bias_,
818  Y->template mutable_data<T_Y>()));
819  }
820  // Done.
821  return true;
822 }
823 
824 bool CudnnConvOp::RunOnDevice() {
825  if (Input(0).IsType<float>()) {
826  return DoRunWithType<
827  float, // X
828  float, // W
829  float, // B
830  float>(); // Y
831  } else if (Input(0).IsType<float16>()) {
832  return DoRunWithType<
833  float16, // X
834  float16, // W
835  float16, // B
836  float16>(); // Y
837  } else {
838  LOG(FATAL) << "Only float (32bit) and float16 are supported by "
839  << "cudnn convolution, but input " << debug_def().input(0)
840  << " has [" << Input(0).meta().name() << "]";
841  }
842  return true;
843 }
844 
845 template <
846  typename T_X,
847  typename T_DY,
848  typename T_W,
849  typename T_B,
850  typename T_DX,
851  typename T_DW,
852  typename T_DB>
853 bool CudnnConvGradientOp::DoRunWithType() {
854  auto& X = Input(INPUT);
855  auto& filter = Input(FILTER);
856  auto& dY = Input(OUTPUT_GRAD);
857  auto* dfilter = Output(FILTER_GRAD);
858 
859  CAFFE_ENFORCE(X.ndim() >= 3 && X.ndim() <= 5);
860  CAFFE_ENFORCE(filter.ndim() >= 3 && filter.ndim() <= 5);
861 
862  const int M = filter.dim32(0);
863  int N = 0, C = 0, H = 0, W = 0, D = 0, H_out = 0, W_out = 0, D_out = 0;
864  int group_offset_X = 0, group_offset_Y = 0;
865 
866  switch (order_) {
867  case StorageOrder::NHWC:
868  N = X.dim32(0);
869  H = X.dim32(1);
870  W = X.ndim() > 3 ? X.dim32(2) : 1;
871  D = X.ndim() > 4 ? X.dim32(3) : 1;
872  C = X.dim32(X.ndim() - 1);
873  H_out = dY.dim32(1);
874  W_out = dY.ndim() > 3 ? dY.dim32(2) : 1;
875  D_out = dY.ndim() > 4 ? dY.dim32(3) : 1;
876  for (int i = 0; i < kernel_.size(); ++i) {
877  CAFFE_ENFORCE_EQ(filter.dim32(i + 1), kernel_[i]);
878  }
879  CAFFE_ENFORCE_EQ(filter.dim32(filter.ndim() - 1), C / group_);
880  group_offset_X = C / group_;
881  group_offset_Y = M / group_;
882  break;
883  case StorageOrder::NCHW:
884  N = X.dim32(0);
885  C = X.dim32(1);
886  H = X.dim32(2);
887  W = X.ndim() > 3 ? X.dim32(3) : 1;
888  D = X.ndim() > 4 ? X.dim32(4) : 1;
889  H_out = dY.dim32(2);
890  W_out = dY.ndim() > 3 ? dY.dim32(3) : 1;
891  D_out = dY.ndim() > 4 ? dY.dim32(4) : 1;
892  CAFFE_ENFORCE_EQ(filter.dim32(1), C / group_);
893  for (int i = 0; i < kernel_.size(); ++i) {
894  CAFFE_ENFORCE_EQ(filter.dim32(i + 2), kernel_[i]);
895  }
896  group_offset_X = C / group_ * H * W * D;
897  group_offset_Y = M / group_ * H_out * W_out * D_out;
898  break;
899  default:
900  LOG(FATAL) << "Unknown storage order: " << order_;
901  }
902 
903  CAFFE_ENFORCE(
904  C % group_ == 0,
905  "If you set group, the number of input channels should be divisible "
906  "by group.");
907  CAFFE_ENFORCE(
908  M % group_ == 0,
909  "If you set group, the number of output channels should be divisible "
910  "by group.");
911 
912  int group_offset_filter = filter.size() / group_;
913  if (kernel_.size() == 1) {
914  ConvPoolOpBase<CUDAContext>::ComputePads({H});
915  } else if (kernel_.size() == 2) {
916  ConvPoolOpBase<CUDAContext>::ComputePads({H, W});
917  } else if (kernel_.size() == 3) {
918  ConvPoolOpBase<CUDAContext>::ComputePads({H, W, D});
919  } else {
920  CAFFE_THROW("Unsupported kernel size:", kernel_.size());
921  }
922  dfilter->ResizeLike(filter);
923 
924  // Set up the cudnn algorithms & workspace if necessary
925  bool input_changed = (X.dims() != cudnn_input_dims_);
926  bool filter_changed = (filter.dims() != cudnn_filter_dims_);
927  if (input_changed || filter_changed) {
928  VLOG(1) << "Changing the cudnn descriptor configurations.";
929  if (input_changed) {
930  cudnn_input_dims_ = X.dims();
931  SetTensorNdDescriptorWithGroup<T_X>(
932  X.ndim(), bottom_desc_, N, C, H, W, D);
933  }
934  if (filter_changed) {
935  cudnn_filter_dims_ = filter.dims();
936  if (kernel_.size() == 2) {
937 #if CUDNN_VERSION_MIN(7, 0, 0)
938  const int MM = M;
939 #else
940  const int MM = M / group_;
941 #endif
942  CUDNN_ENFORCE(cudnnSetFilter4dDescriptor(
943  filter_desc_,
944  cudnnTypeWrapper<T_W>::type,
945  GetCudnnTensorFormat(order_),
946  MM,
947  C / group_,
948  kernel_h(),
949  kernel_w()));
950  } else {
951  vector<int> dims(filter.dims().begin(), filter.dims().end());
952 #if !CUDNN_VERSION_MIN(7, 0, 0)
953  dims[0] /= group_;
954 #endif
955  order_ == StorageOrder::NCHW ? dims[1] /= group_
956  : dims[filter.ndim() - 1] /= group_;
957  CUDNN_ENFORCE(cudnnSetFilterNdDescriptor(
958  filter_desc_,
959  cudnnTypeWrapper<T_W>::type,
960  GetCudnnTensorFormat(order_),
961  dims.size(),
962  dims.data()));
963  }
964  if (!no_bias_) {
965  if (kernel_.size() == 2) {
966  CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
967  bias_desc_,
968  GetCudnnTensorFormat(order_),
969  cudnnTypeWrapper<T_B>::type,
970  1,
971  M,
972  1,
973  1));
974  } else {
975  std::vector<int> bias_dims(X.ndim(), 1);
976  bias_dims[1] = M;
977  std::vector<int> strides = {M, 1, 1, 1, 1, 1};
978  CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
979  bias_desc_,
980  cudnnTypeWrapper<T_B>::type,
981  X.ndim() > 3 ? X.ndim() : 4,
982  bias_dims.data(),
983  strides.data()));
984  }
985  }
986  }
987  // Set the output
988  SetTensorNdDescriptorWithGroup<T_DX>(
989  X.ndim(), top_desc_, N, M, H_out, W_out, D_out);
990  // Set the output with descriptor useful for bias addition in one run.
991  if (kernel_.size() == 2) {
992  CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
993  top_desc_for_bias_,
994  GetCudnnTensorFormat(order_),
995  cudnnTypeWrapper<T_DY>::type,
996  N,
997  M,
998  H_out,
999  W_out));
1000  } else {
1001  vector<int> dims = {N, M, H_out, W_out, D_out};
1002  vector<int> strides = {M * H_out * W_out * D_out,
1003  H_out * W_out * D_out,
1004  W_out * D_out,
1005  D_out,
1006  1};
1007  CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
1008  top_desc_for_bias_,
1009  cudnnTypeWrapper<T_DY>::type,
1010  X.ndim() > 3 ? X.ndim() : 4,
1011  dims.data(),
1012  strides.data()));
1013  }
1014 
1015  compute_type_ = DetermineComputeTypeFromInput(X);
1016  SetConvDescFromArguments();
1017 
1018  DuplicateConvDesc(
1019  conv_desc_, kernel_.size(), dilation_.size(), bwd_filter_conv_desc_);
1020  DuplicateConvDesc(
1021  conv_desc_, kernel_.size(), dilation_.size(), bwd_data_conv_desc_);
1022 
1023 #if CUDNN_VERSION_MIN(7, 0, 0)
1024  if (enable_tensor_core_) {
1025  CUDNN_ENFORCE(cudnnSetConvolutionMathType(
1026  bwd_filter_conv_desc_, CUDNN_TENSOR_OP_MATH));
1027  CUDNN_ENFORCE(cudnnSetConvolutionMathType(
1028  bwd_data_conv_desc_, CUDNN_TENSOR_OP_MATH));
1029  }
1030 
1031  // set cuDNN groups if appropriate
1032  CUDNN_CHECK(cudnnSetConvolutionGroupCount(bwd_filter_conv_desc_, group_));
1033  CUDNN_CHECK(cudnnSetConvolutionGroupCount(bwd_data_conv_desc_, group_));
1034 #endif
1035 
1036  // Choose dW algorithm
1037  if (force_algo_[ALGO_WGRAD] >= 0) {
1038  bwd_filter_algo_ =
1039  (cudnnConvolutionBwdFilterAlgo_t)force_algo_[ALGO_WGRAD];
1040  } else if (deterministic_) {
1041  bwd_filter_algo_ = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
1042  } else if (exhaustive_search_) {
1043  // Even when FP16 compute is supported and requested, try FP32
1044  // because it may be faster. However, if FP32 compute is specified,
1045  // FP16 is not a suitable alternative - early out from the loop.
1046  std::array<ConvBwdFilterAlgorithmWithCost, 2> algosToCompare;
1047  for (int i = 0; i < 2; i++) {
1048  SetConvDescComputeType(bwd_filter_conv_desc_, kComputeTypesToTry[i]);
1049 
1050  algosToCompare[i] = filter_algo_cache_.getAlgorithm(
1051  X.dims(), filter.dims(), kComputeTypesToTry[i], [&]() {
1052  VLOG(1) << "CUDNN Convolution bwd: doing filter exhaustive "
1053  << "search for " << kComputePassNames[i];
1054  // When we do an exhaustive search, we will ignore the workspace
1055  // size limit and simply go for the fastest algorithm. If you
1056  // happen to run out of memory later, you will be on your own...
1057  int returned_algo_count;
1058  // We clean up the current workspace memory so that the forward
1059  // algorithm is free to allocate memory.
1060  // Actually run the search.
1061  std::array<
1062  cudnnConvolutionBwdFilterAlgoPerf_t,
1063  kNUM_CUDNN_BWD_FILTER_ALGS>
1064  filter_perf_stat;
1065 
1066  cudnn_wrapper_.with_cudnn_state(
1067  cudnn_state_, [&](CuDNNState* state) {
1068  CUDNN_ENFORCE(cudnnFindConvolutionBackwardFilterAlgorithmEx(
1069  state->cudnn_handle(),
1070  bottom_desc_,
1071  X.template data<T_X>(),
1072  top_desc_,
1073  dY.template data<T_DY>(),
1074  bwd_filter_conv_desc_,
1075  filter_desc_,
1076  dfilter->template mutable_data<T_DW>(),
1077  kNUM_CUDNN_BWD_FILTER_ALGS,
1078  &returned_algo_count,
1079  filter_perf_stat.data(),
1080  state->workspace().get(cudnn_ws_nbytes_limit_),
1081  cudnn_ws_nbytes_limit_));
1082  });
1083  LogCuDNNPerfStats(filter_perf_stat, returned_algo_count);
1084  float algo_time =
1085  filter_perf_stat[0].status == CUDNN_STATUS_SUCCESS
1086  ? filter_perf_stat[0].time
1087  : 1e10;
1088  return ConvBwdFilterAlgorithmWithCost(
1089  filter_perf_stat[0].algo, algo_time);
1090  });
1091 
1092  // When set to fp32 compute, don't try fp16
1093  if (compute_type_ == CUDNN_DATA_FLOAT) {
1094  break;
1095  }
1096  }
1097 
1098  if (compute_type_ == CUDNN_DATA_FLOAT) {
1099  // For FP32 compute, just use the best FP32 algorithm
1100  bwd_filter_algo_ = std::get<0>(algosToCompare[0]);
1101  } else {
1102  // For FP16 compute, choose algo with fastest execution
1103  int bestAlgoIndex =
1104  (std::get<1>(algosToCompare[0]) < std::get<1>(algosToCompare[1]))
1105  ? 0
1106  : 1;
1107  bwd_filter_algo_ = std::get<0>(algosToCompare[bestAlgoIndex]);
1108  SetConvDescComputeType(
1109  bwd_filter_conv_desc_, kComputeTypesToTry[bestAlgoIndex]);
1110  }
1111  } else {
1112  // choose backward algorithm for filter
1113  CUDNN_ENFORCE(cudnnGetConvolutionBackwardFilterAlgorithm(
1114  cudnn_wrapper_.inline_cudnn_handle(),
1115  bottom_desc_,
1116  top_desc_,
1117  bwd_filter_conv_desc_,
1118  filter_desc_,
1119  CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
1120  cudnn_ws_nbytes_limit_,
1121  &bwd_filter_algo_));
1122  }
1123  // Pick dX algo if needed
1124  if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) {
1125  if (force_algo_[ALGO_DGRAD] >= 0) {
1126  bwd_data_algo_ = (cudnnConvolutionBwdDataAlgo_t)force_algo_[ALGO_DGRAD];
1127  } else if (deterministic_) {
1128  bwd_data_algo_ = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
1129  } else if (exhaustive_search_) {
1130  // Even when FP16 compute is supported and requested, try FP32
1131  // because it may be faster. However, if FP32 compute is specified,
1132  // FP16 is not a suitable alternative - early out from the loop.
1133  std::array<ConvBwdDataAlgorithmWithCost, 2> algosToCompare;
1134  for (int i = 0; i < 2; i++) {
1135  SetConvDescComputeType(bwd_data_conv_desc_, kComputeTypesToTry[i]);
1136 
1137  algosToCompare[i] = data_algo_cache_.getAlgorithm(
1138  X.dims(), filter.dims(), kComputeTypesToTry[i], [&]() {
1139  VLOG(1) << "CUDNN Convolution bwd: doing data exhaustive "
1140  << "search for " << kComputePassNames[i];
1141  int returned_algo_count;
1142 
1143  std::array<
1144  cudnnConvolutionBwdDataAlgoPerf_t,
1145  kNUM_CUDNN_BWD_DATA_ALGS>
1146  data_perf_stat;
1147  cudnn_wrapper_.with_cudnn_state(
1148  cudnn_state_, [&](CuDNNState* state) {
1149  auto* dX =
1150  Output(no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD);
1151  dX->ResizeLike(X);
1152  const T_W* filter_data = filter.template data<T_W>();
1153  const T_DY* dYdata = dY.template data<T_DY>();
1154  T_DX* dXdata = dX->template mutable_data<T_DX>();
1155  CUDNN_ENFORCE(cudnnFindConvolutionBackwardDataAlgorithmEx(
1156  state->cudnn_handle(),
1157  filter_desc_,
1158  filter_data,
1159  top_desc_,
1160  dYdata,
1161  bwd_data_conv_desc_,
1162  bottom_desc_,
1163  dXdata,
1164  kNUM_CUDNN_BWD_DATA_ALGS,
1165  &returned_algo_count,
1166  data_perf_stat.data(),
1167  state->workspace().get(cudnn_ws_nbytes_limit_),
1168  cudnn_ws_nbytes_limit_));
1169  });
1170 
1171  LogCuDNNPerfStats(data_perf_stat, returned_algo_count);
1172  float algo_time =
1173  data_perf_stat[0].status == CUDNN_STATUS_SUCCESS
1174  ? data_perf_stat[0].time
1175  : 1e10;
1176  return ConvBwdDataAlgorithmWithCost(
1177  data_perf_stat[0].algo, algo_time);
1178  });
1179 
1180  // When set to fp32 compute, don't try fp16
1181  if (compute_type_ == CUDNN_DATA_FLOAT) {
1182  break;
1183  }
1184  }
1185 
1186  if (compute_type_ == CUDNN_DATA_FLOAT) {
1187  // For FP32 compute, just use the best FP32 algorithm
1188  bwd_data_algo_ = std::get<0>(algosToCompare[0]);
1189  } else {
1190  // For FP16 compute, choose algo with fastest execution
1191  int bestAlgoIndex =
1192  (std::get<1>(algosToCompare[0]) < std::get<1>(algosToCompare[1]))
1193  ? 0
1194  : 1;
1195  bwd_data_algo_ = std::get<0>(algosToCompare[bestAlgoIndex]);
1196  SetConvDescComputeType(
1197  bwd_data_conv_desc_, kComputeTypesToTry[bestAlgoIndex]);
1198  }
1199  } else {
1200  CUDNN_ENFORCE(cudnnGetConvolutionBackwardDataAlgorithm(
1201  cudnn_wrapper_.inline_cudnn_handle(),
1202  filter_desc_,
1203  top_desc_,
1204  bwd_data_conv_desc_,
1205  bottom_desc_,
1206  CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
1207  cudnn_ws_nbytes_limit_,
1208  &bwd_data_algo_));
1209  }
1210  }
1211 
1212  // get workspace size for backwards filter algorithm
1213  size_t bwd_filter_ws_size, bwd_data_ws_size;
1214 
1215  CUDNN_ENFORCE(cudnnGetConvolutionBackwardFilterWorkspaceSize(
1216  cudnn_wrapper_.inline_cudnn_handle(),
1217  bottom_desc_,
1218  top_desc_,
1219  bwd_filter_conv_desc_,
1220  filter_desc_,
1221  bwd_filter_algo_,
1222  &bwd_filter_ws_size));
1223  if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) {
1224  // get workspace size for backwards data algorithm
1225  CUDNN_ENFORCE(cudnnGetConvolutionBackwardDataWorkspaceSize(
1226  cudnn_wrapper_.inline_cudnn_handle(),
1227  filter_desc_,
1228  top_desc_,
1229  bwd_data_conv_desc_,
1230  bottom_desc_,
1231  bwd_data_algo_,
1232  &bwd_data_ws_size));
1233  } else {
1234  bwd_data_ws_size = 0;
1235  }
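 // Both backward passes share a single workspace, so reserve the larger of
 // the two sizes.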
1236  cudnn_ws_nbytes_ = std::max(bwd_filter_ws_size, bwd_data_ws_size);
1237 
1238  VLOG(1) << "CuDNN bwd data & filter algorithm: " << bwd_data_algo_ << ", "
1239  << bwd_filter_algo_;
1240  VLOG(1) << "CuDNN workspace size: " << cudnn_ws_nbytes_;
1241  }
1242 
1243  // Now, actually run the computation.
1244  if (!no_bias_) {
1245  auto* dbias = Output(BIAS_OR_INPUT_GRAD);
1246  dbias->Resize(M);
1247  CUDNN_ENFORCE(cudnnConvolutionBackwardBias(
1248  cudnn_wrapper_.inline_cudnn_handle(),
1249  cudnnTypeWrapper<T_DY>::kOne(),
1250  top_desc_for_bias_,
1251  dY.template data<T_DY>(),
1252  cudnnTypeWrapper<T_DB>::kZero(),
1253  bias_desc_,
1254  dbias->template mutable_data<T_DB>()));
1255  }
1256 
1257 #if CUDNN_VERSION_MIN(7, 0, 0)
1258  cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) {
1259  CUDNN_ENFORCE(cudnnConvolutionBackwardFilter(
1260  state->cudnn_handle(),
1261  cudnnTypeWrapper<T_X>::kOne(),
1262  bottom_desc_,
1263  X.template data<T_X>(),
1264  top_desc_,
1265  dY.template data<T_DY>(),
1266  bwd_filter_conv_desc_,
1267  bwd_filter_algo_,
1268  state->workspace().get(cudnn_ws_nbytes_),
1269  cudnn_ws_nbytes_,
1270  cudnnTypeWrapper<T_DW>::kZero(),
1271  filter_desc_,
1272  dfilter->template mutable_data<T_DW>()));
1273  if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) {
1274  // Compute the gradient w.r.t. the input.
1275  auto* dX = Output(no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD);
1276  dX->ResizeLike(X);
1277  CUDNN_ENFORCE(cudnnConvolutionBackwardData(
1278  state->cudnn_handle(),
1279  cudnnTypeWrapper<T_W>::kOne(),
1280  filter_desc_,
1281  filter.template data<T_W>(),
1282  top_desc_,
1283  dY.template data<T_DY>(),
1284  bwd_data_conv_desc_,
1285  bwd_data_algo_,
1286  state->workspace().get(cudnn_ws_nbytes_),
1287  cudnn_ws_nbytes_,
1288  cudnnTypeWrapper<T_DX>::kZero(),
1289  bottom_desc_,
1290  dX->template mutable_data<T_DX>()));
1291  }
1292  });
1293 #else
1294  for (int i = 0; i < group_; ++i) {
1295  cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) {
1296  CUDNN_ENFORCE(cudnnConvolutionBackwardFilter(
1297  state->cudnn_handle(),
1298  cudnnTypeWrapper<T_X>::kOne(),
1299  bottom_desc_,
1300  X.template data<T_X>() + i * group_offset_X,
1301  top_desc_,
1302  dY.template data<T_DY>() + i * group_offset_Y,
1303  bwd_filter_conv_desc_,
1304  bwd_filter_algo_,
1305  state->workspace().get(cudnn_ws_nbytes_),
1306  cudnn_ws_nbytes_,
1307  cudnnTypeWrapper<T_DW>::kZero(),
1308  filter_desc_,
1309  dfilter->template mutable_data<T_DW>() + i * group_offset_filter));
1310  if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) {
1311  // Compute the gradient w.r.t. the input.
1312  auto* dX = Output(no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD);
1313  dX->ResizeLike(X);
1314  CUDNN_ENFORCE(cudnnConvolutionBackwardData(
1315  state->cudnn_handle(),
1316  cudnnTypeWrapper<T_W>::kOne(),
1317  filter_desc_,
1318  filter.template data<T_W>() + i * group_offset_filter,
1319  top_desc_,
1320  dY.template data<T_DY>() + i * group_offset_Y,
1321  bwd_data_conv_desc_,
1322  bwd_data_algo_,
1323  state->workspace().get(cudnn_ws_nbytes_),
1324  cudnn_ws_nbytes_,
1325  cudnnTypeWrapper<T_DX>::kZero(),
1326  bottom_desc_,
1327  dX->template mutable_data<T_DX>() + i * group_offset_X));
1328  }
1329  });
1330  }
1331 #endif
1332  return true;
1333 }
1334 
1335 // TODO(Yangqing): a lot of the function contents are very similar. Consider
1336 // consolidating them.
1337 bool CudnnConvGradientOp::RunOnDevice() {
1338  if (Input(0).IsType<float>()) {
1339  return DoRunWithType<
1340  float, // X
1341  float, // dY
1342  float, // W
1343  float, // b
1344  float, // dX
1345  float, // dW
1346  float>(); // db
1347  } else if (Input(0).IsType<float16>()) {
1348  return DoRunWithType<
1349  float16, // X
1350  float16, // dY
1351  float16, // W
1352  float16, // b
1353  float16, // dX
1354  float16, // dW
1355  float16>(); // db
1356  } else {
1357  LOG(FATAL) << "Unsupported input types";
1358  }
1359  return true;
1360 }
1361 
1362 REGISTER_CUDNN_OPERATOR(Conv, CudnnConvOp);
1363 REGISTER_CUDNN_OPERATOR(ConvGradient, CudnnConvGradientOp);
1364 
1365 REGISTER_CUDNN_OPERATOR(Conv1D, CudnnConvOp);
1366 REGISTER_CUDNN_OPERATOR(Conv1DGradient, CudnnConvGradientOp);
1367 
1368 REGISTER_CUDNN_OPERATOR(Conv2D, CudnnConvOp);
1369 REGISTER_CUDNN_OPERATOR(Conv2DGradient, CudnnConvGradientOp);
1370 
1371 REGISTER_CUDNN_OPERATOR(Conv3D, CudnnConvOp);
1372 REGISTER_CUDNN_OPERATOR(Conv3DGradient, CudnnConvGradientOp);
1373 
1374 } // namespace caffe2
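For reference, a minimal usage sketch showing how the arguments read in the CudnnConvOpBase constructor above can be set on an OperatorDef. It is illustrative only: the blob names "X", "W", "b" and "Y" are hypothetical, and it assumes the AddArgument helper from caffe2/utils/proto_utils.h, CreateOperator from caffe2/core/operator.h, and CUDA tensors already present in the workspace.

#include <memory>
#include "caffe2/core/operator.h"
#include "caffe2/core/workspace.h"
#include "caffe2/utils/proto_utils.h"

void RunCudnnConvSketch(caffe2::Workspace* ws) {
  caffe2::OperatorDef def;
  def.set_type("Conv");
  def.set_engine("CUDNN");  // selects CudnnConvOp via REGISTER_CUDNN_OPERATOR(Conv, ...)
  def.add_input("X");       // activations (hypothetical blob names)
  def.add_input("W");       // filters
  def.add_input("b");       // optional bias
  def.add_output("Y");
  def.mutable_device_option()->set_device_type(caffe2::CUDA);
  caffe2::AddArgument<int>("kernel", 3, &def);
  caffe2::AddArgument<int>("exhaustive_search", 1, &def);  // benchmark algorithms once
  caffe2::AddArgument<int>("ws_nbytes_limit", 64 * 1024 * 1024, &def);
  caffe2::AddArgument<int>("enable_tensor_core", 1, &def);
  // force_algo_fwd / force_algo_dgrad / force_algo_wgrad can pin algorithms instead.
  std::unique_ptr<caffe2::OperatorBase> op = caffe2::CreateOperator(def, ws);
  CAFFE_ENFORCE(op->Run());
}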