1 #include "caffe2/core/context_gpu.h" 3 #include "caffe2/core/common_gpu.h" 4 #include "caffe2/core/cudnn_wrappers.h" 5 #include "caffe2/operators/conv_op.h" 6 #include "caffe2/operators/conv_op_cache_cudnn.h" 7 #include "caffe2/operators/conv_pool_op_base.h" 8 #include "caffe2/operators/op_utils_cudnn.h" 16 cudnn_wrapper_(&context_),
        cudnn_ws_nbytes_limit_(OperatorBase::GetSingleArgument<size_t>(
            "ws_nbytes_limit",
            kCONV_CUDNN_WORKSPACE_LIMIT_BYTES)),
        exhaustive_search_(
            OperatorBase::GetSingleArgument<int>("exhaustive_search", 0)),
        deterministic_(
            OperatorBase::GetSingleArgument<int>("deterministic", 0)),
        cudnn_state_(OperatorBase::GetSingleArgument<int>("cudnn_state", 0)),
        force_algo_(OperatorBase::GetRepeatedArgument<int>(
            "force_algo", vector<int>{-1, -1, -1})),
        enable_tensor_core_(
            OperatorBase::GetSingleArgument<bool>("enable_tensor_core", 1)) {
    CAFFE_ENFORCE(group_ > 0);
    // deterministic and exhaustive_search are mutually exclusive: the
    // exhaustive benchmark is free to pick a non-deterministic algorithm.
    CAFFE_ENFORCE(!deterministic_ || !exhaustive_search_);
    for (int i = 0; i < kernel_.size(); ++i) {
      OPERATOR_NEEDS_FEATURE(
          pads_[i] == pads_[kernel_.size() + i],
          "The current padding scheme leads to unequal padding on the left "
          "and right, which is not supported by cudnn.");
    }
    // dilated convolution is only supported from cuDNN 6 onwards
#if !(CUDNN_VERSION_MIN(6, 0, 0))
    OPERATOR_NEEDS_FEATURE(
        dilation_h() == 1 && dilation_w() == 1,
        "The cudnn convolution does not support dilation yet.");
#endif
#if CUDNN_VERSION_MIN(7, 0, 0)
    // verify that TensorCore math is actually available on this device
    enable_tensor_core_ &= TensorCoreAvailable();
#else
    enable_tensor_core_ = false;
#endif
    bool individual_force_algo = OperatorBase::HasArgument("force_algo_fwd") ||
        OperatorBase::HasArgument("force_algo_dgrad") ||
        OperatorBase::HasArgument("force_algo_wgrad");
    if (OperatorBase::HasArgument("force_algo")) {
      CAFFE_ENFORCE(
          !individual_force_algo,
          "Cannot specify both force_algo and any of ",
          "force_algo_fwd, force_algo_dgrad, force_algo_wgrad");
    } else {
      force_algo_ = std::vector<int>{-1, -1, -1};
      force_algo_[ALGO_FWD] =
          OperatorBase::GetSingleArgument<int>("force_algo_fwd", -1);
      force_algo_[ALGO_DGRAD] =
          OperatorBase::GetSingleArgument<int>("force_algo_dgrad", -1);
      force_algo_[ALGO_WGRAD] =
          OperatorBase::GetSingleArgument<int>("force_algo_wgrad", -1);
    }
    CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&bottom_desc_));
    CUDNN_ENFORCE(cudnnCreateFilterDescriptor(&filter_desc_));
    CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&bias_desc_));
    CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&top_desc_));
    CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&top_desc_for_bias_));
    CUDNN_ENFORCE(cudnnCreateConvolutionDescriptor(&conv_desc_));
  }
  ~CudnnConvOpBase() {
    CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(bottom_desc_));
    CUDNN_ENFORCE(cudnnDestroyFilterDescriptor(filter_desc_));
    CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(bias_desc_));
    CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(top_desc_));
    CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(top_desc_for_bias_));
    CUDNN_ENFORCE(cudnnDestroyConvolutionDescriptor(conv_desc_));
  }
 protected:
  // A helper function to set up the tensor Nd descriptor, depending on the
  // order, the group, and the type given.
  template <typename T>
  void SetTensorNdDescriptorWithGroup(
      int size,
      cudnnTensorDescriptor_t tensorDesc,
      int N,
      int C,
      int H,
      int W,
      int D) {
    // With cuDNN 7's native group support the full channel count is used;
    // before that, each per-group slice gets its own descriptor.
#if CUDNN_VERSION_MIN(7, 0, 0)
    const int CC = C;
#else
    const int CC = C / group_;
#endif
    switch (order_) {
      case StorageOrder::NHWC:
        if (size == 4) {
          CUDNN_ENFORCE(cudnnSetTensor4dDescriptorEx(
              tensorDesc,
              cudnnTypeWrapper<T>::type,
              N, CC, H, W,
              H * W * CC, 1, W * CC, CC));
        } else {
          vector<int> dims = {N, H, W, D, CC};
          vector<int> strides = {H * W * D * CC, W * D * CC, D * CC, CC, 1};
          CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
              tensorDesc,
              cudnnTypeWrapper<T>::type,
              size > 3 ? size : 4,
              dims.data(),
              strides.data()));
        }
        break;
      case StorageOrder::NCHW:
        if (size == 4) {
          CUDNN_ENFORCE(cudnnSetTensor4dDescriptorEx(
              tensorDesc,
              cudnnTypeWrapper<T>::type,
              N, CC, H, W,
              CC * H * W, H * W, W, 1));
        } else {
          vector<int> dims = {N, CC, H, W, D};
          vector<int> strides = {CC * H * W * D, H * W * D, W * D, D, 1};
          CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
              tensorDesc,
              cudnnTypeWrapper<T>::type,
              size > 3 ? size : 4,
              dims.data(),
              strides.data()));
        }
        break;
      default:
        LOG(FATAL) << "Unknown storage order: " << order_;
    }
  }
  // Copy a convolution descriptor so that a backward pass can hold its own
  // (the compute type may be retuned per pass later).
  void DuplicateConvDesc(
      cudnnConvolutionDescriptor_t input,
      size_t kernelDims,
      size_t dilationDims,
      cudnnConvolutionDescriptor_t copy) {
    if (kernelDims == 2) {
      cudnnConvolutionMode_t mode;
      cudnnDataType_t dataType;
      int pad_height = 0;
      int pad_width = 0;
      int stride_height = 0;
      int stride_width = 0;
      int dilation_height = 0;
      int dilation_width = 0;

#if CUDNN_VERSION_MIN(6, 0, 0)
      CUDNN_ENFORCE(cudnnGetConvolution2dDescriptor(
          input,
          &pad_height, &pad_width,
          &stride_height, &stride_width,
          &dilation_height, &dilation_width,
          &mode, &dataType));
#else
      CUDNN_ENFORCE(cudnnGetConvolution2dDescriptor(
          input,
          &pad_height, &pad_width,
          &stride_height, &stride_width,
          &dilation_height, &dilation_width,
          &mode));
#endif

#if CUDNN_VERSION_MIN(6, 0, 0)
      CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor(
          copy,
          pad_height, pad_width,
          stride_height, stride_width,
          dilation_height, dilation_width,
          mode, dataType));
#else
      CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor(
          copy,
          pad_height, pad_width,
          stride_height, stride_width,
          dilation_height, dilation_width,
          mode));
#endif
    } else {
      cudnnConvolutionMode_t mode;
      cudnnDataType_t dataType;
      int arrayLength = 0;
      vector<int> ones(dilationDims, 1);
      CUDNN_ENFORCE(cudnnGetConvolutionNdDescriptor(
          input,
          kernelDims,
          &arrayLength,
          pads_.data(),
          stride_.data(),
          ones.data(),
          &mode,
          &dataType));
      CUDNN_ENFORCE(cudnnSetConvolutionNdDescriptor(
          copy,
          kernelDims,
          pads_.data(),
          stride_.data(),
          ones.data(),
          mode,
          dataType));
    }
  }
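  // Note: the gradient op below duplicates conv_desc_ into
  // bwd_filter_conv_desc_ and bwd_data_conv_desc_ so that the exhaustive
  // search can retune the compute type of each backward pass independently.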
  template <typename T>
  cudnnDataType_t DetermineComputeTypeFromInput(const T& X) {
    const cudaDeviceProp& prop = GetDeviceProperty(CaffeCudaGetDevice());
    cudnnDataType_t computeType = CUDNN_DATA_FLOAT;
    if (X.template IsType<float16>()) {
      if (float16_compute_ && prop.major >= 6) {
        VLOG(1) << "CUDNN Convolution: float16_compute specified and "
                << "supported, input data is float16 - using float16 "
                << "compute.";
        computeType = CUDNN_DATA_HALF;
      } else if (float16_compute_) {
        VLOG(1) << "CUDNN Convolution: float16_compute specified but "
                << "not supported, input data is float16 - using float32 "
                << "compute.";
      } else {
        VLOG(1) << "CUDNN Convolution: float16_compute not specified but "
                << "input data is float16 - using float32 compute.";
      }
    } else {
      VLOG(1) << "CUDNN Convolution: using float32 compute.";
    }
    return computeType;
  }
  void SetConvDescFromArguments() {
#if CUDNN_VERSION_MIN(6, 0, 0)
    if (kernel_.size() == 2) {
      CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor(
          conv_desc_,
          pad_t(), pad_l(),
          stride_h(), stride_w(),
          dilation_h(), dilation_w(),
          CUDNN_CROSS_CORRELATION,
          compute_type_));
    } else {
      CUDNN_ENFORCE(cudnnSetConvolutionNdDescriptor(
          conv_desc_,
          kernel_.size(),
          pads_.data(),
          stride_.data(),
          dilation_.data(),
          CUDNN_CROSS_CORRELATION,
          compute_type_));
    }
#else
    if (kernel_.size() == 2) {
      // cuDNN < 6 has no dilation support in the 2d descriptor.
      CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor(
          conv_desc_,
          pad_t(), pad_l(),
          stride_h(), stride_w(),
          1, 1,
          CUDNN_CROSS_CORRELATION));
    } else {
      vector<int> ones(dilation_.size(), 1);
      CUDNN_ENFORCE(cudnnSetConvolutionNdDescriptor(
          conv_desc_,
          kernel_.size(),
          pads_.data(),
          stride_.data(),
          ones.data(),
          CUDNN_CROSS_CORRELATION,
          compute_type_));
    }
#endif
  }
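  // Note: CUDNN_CROSS_CORRELATION matches the convention used by most deep
  // learning frameworks; the kernel is applied without the flip that a
  // mathematical convolution (CUDNN_CONVOLUTION) would perform.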
  void SetConvDescComputeType(
      cudnnConvolutionDescriptor_t conv_desc,
      cudnnDataType_t math) {
    if (kernel_.size() == 2) {
      cudnnConvolutionMode_t mode;
      cudnnDataType_t dataType;
      int pad_height = 0;
      int pad_width = 0;
      int stride_height = 0;
      int stride_width = 0;
      int dilation_height = 0;
      int dilation_width = 0;

#if CUDNN_VERSION_MIN(6, 0, 0)
      CUDNN_ENFORCE(cudnnGetConvolution2dDescriptor(
          conv_desc,
          &pad_height, &pad_width,
          &stride_height, &stride_width,
          &dilation_height, &dilation_width,
          &mode, &dataType));
#else
      CUDNN_ENFORCE(cudnnGetConvolution2dDescriptor(
          conv_desc,
          &pad_height, &pad_width,
          &stride_height, &stride_width,
          &dilation_height, &dilation_width,
          &mode));
#endif

#if CUDNN_VERSION_MIN(6, 0, 0)
      CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor(
          conv_desc,
          pad_height, pad_width,
          stride_height, stride_width,
          dilation_height, dilation_width,
          mode, math));
#else
      CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor(
          conv_desc,
          pad_height, pad_width,
          stride_height, stride_width,
          dilation_height, dilation_width,
          mode));
#endif
    } else {
      cudnnConvolutionMode_t mode;
      cudnnDataType_t dataType;
      int arrayLength = 0;
      vector<int> ones(dilation_.size(), 1);
      CUDNN_ENFORCE(cudnnGetConvolutionNdDescriptor(
          conv_desc,
          kernel_.size(),
          &arrayLength,
          pads_.data(),
          stride_.data(),
          ones.data(),
          &mode,
          &dataType));
      CUDNN_ENFORCE(cudnnSetConvolutionNdDescriptor(
          conv_desc,
          kernel_.size(),
          pads_.data(),
          stride_.data(),
          ones.data(),
          mode,
          math));
    }
  }
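  // Note: this helper re-reads every geometric field of the descriptor and
  // writes it back unchanged, swapping only the compute type; the exhaustive
  // search below uses it to benchmark the same convolution under fp32 and
  // fp16 math.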
  vector<TIndex> cudnn_input_dims_;
  vector<TIndex> cudnn_filter_dims_;

  CuDNNWrapper cudnn_wrapper_;
  cudnnTensorDescriptor_t bottom_desc_;
  cudnnFilterDescriptor_t filter_desc_;
  cudnnTensorDescriptor_t bias_desc_;
  cudnnTensorDescriptor_t top_desc_;
  // top desc for the bias add; it covers the full output (no group split)
  cudnnTensorDescriptor_t top_desc_for_bias_;
  cudnnConvolutionDescriptor_t conv_desc_;
  const size_t cudnn_ws_nbytes_limit_;
  size_t cudnn_ws_nbytes_;
  bool exhaustive_search_;
  bool deterministic_;
  size_t cudnn_state_;
  // forced algorithms, indexed by ALGO_FWD / ALGO_DGRAD / ALGO_WGRAD
  vector<int> force_algo_;
  bool enable_tensor_core_;
  cudnnDataType_t compute_type_;
};
class CudnnConvOp final : public CudnnConvOpBase {
 public:
  CudnnConvOp(const OperatorDef& operator_def, Workspace* ws)
      : CudnnConvOpBase(operator_def, ws) {}

  ~CudnnConvOp() {}

  template <typename T_X, typename T_W, typename T_B, typename T_Y>
  bool DoRunWithType();

  bool RunOnDevice() override;

 private:
  cudnnConvolutionFwdAlgo_t algo_;
  using ConvFwdAlgorithmWithCost = std::tuple<cudnnConvolutionFwdAlgo_t, float>;
  AlgorithmsCache<ConvFwdAlgorithmWithCost> algo_cache_;
  // Input: X, W, b
  // Output: Y
  INPUT_TAGS(INPUT, FILTER, BIAS);
};
class CudnnConvGradientOp final : public CudnnConvOpBase {
 public:
  CudnnConvGradientOp(const OperatorDef& operator_def, Workspace* ws)
      : CudnnConvOpBase(operator_def, ws),
        no_bias_(OperatorBase::GetSingleArgument<int>("no_bias", 0)) {
    CAFFE_ENFORCE(
        !(no_bias_ && OutputSize() == 3),
        "If bias is not present, you should not have 3 grad output.");

    CUDNN_ENFORCE(cudnnCreateConvolutionDescriptor(&bwd_data_conv_desc_));
    CUDNN_ENFORCE(cudnnCreateConvolutionDescriptor(&bwd_filter_conv_desc_));
  }

  ~CudnnConvGradientOp() {
    CUDNN_ENFORCE(cudnnDestroyConvolutionDescriptor(bwd_data_conv_desc_));
    CUDNN_ENFORCE(cudnnDestroyConvolutionDescriptor(bwd_filter_conv_desc_));
  }

  template <
      typename T_X,
      typename T_DY,
      typename T_W,
      typename T_B,
      typename T_DX,
      typename T_DW,
      typename T_DB>
  bool DoRunWithType();

  bool RunOnDevice() override;

 private:
  cudnnConvolutionDescriptor_t bwd_filter_conv_desc_;
  cudnnConvolutionDescriptor_t bwd_data_conv_desc_;
  cudnnConvolutionBwdFilterAlgo_t bwd_filter_algo_;
  cudnnConvolutionBwdDataAlgo_t bwd_data_algo_;
  using ConvBwdFilterAlgorithmWithCost =
      std::tuple<cudnnConvolutionBwdFilterAlgo_t, float>;
  using ConvBwdDataAlgorithmWithCost =
      std::tuple<cudnnConvolutionBwdDataAlgo_t, float>;
  AlgorithmsCache<ConvBwdFilterAlgorithmWithCost> filter_algo_cache_;
  AlgorithmsCache<ConvBwdDataAlgorithmWithCost> data_algo_cache_;

  bool no_bias_;
  // input: X, W, dY
  // output: dW, db, and optionally dX
  INPUT_TAGS(INPUT, FILTER, OUTPUT_GRAD);
  OUTPUT_TAGS(FILTER_GRAD, BIAS_OR_INPUT_GRAD, INPUT_GRAD);
};
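// Note on the output tags: when no_bias is set the operator produces only
// {dW, dX}, so slot 1 (BIAS_OR_INPUT_GRAD) doubles as the input gradient;
// with a bias the outputs are {dW, db} or {dW, db, dX}. The code below
// consistently uses Output(no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD).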
static constexpr std::array<cudnnDataType_t, 2> kComputeTypesToTry = {
    CUDNN_DATA_FLOAT,
    CUDNN_DATA_HALF};
static constexpr std::array<const char*, 2> kComputePassNames = {
    "fp32 compute",
    "fp16 compute"};
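// The fp32 entry must stay at index 0: when compute_type_ is CUDNN_DATA_FLOAT
// the search loops below break after the first pass and read algosToCompare[0]
// directly, so only float16 inputs ever trigger the second, fp16-compute pass.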
template <typename T_X, typename T_W, typename T_B, typename T_Y>
bool CudnnConvOp::DoRunWithType() {
  auto& X = Input(INPUT);
  auto& filter = Input(FILTER);
  auto* Y = Output(0);

  // Figure out the output shape
  CAFFE_ENFORCE(X.ndim() >= 3 && X.ndim() <= 5);
  CAFFE_ENFORCE(filter.ndim() >= 3 && filter.ndim() <= 5);
  const int M = filter.dim32(0);
  ConvPoolOpBase<CUDAContext>::SetOutputSize(X, Y, M);

  int N = 0, C = 0, H = 0, W = 0, D = 0, H_out = 0, W_out = 0, D_out = 0;
  int group_offset_X = 0, group_offset_Y = 0;
  switch (order_) {
    case StorageOrder::NHWC:
      N = X.dim32(0);
      H = X.dim32(1);
      W = X.ndim() > 3 ? X.dim32(2) : 1;
      D = X.ndim() > 4 ? X.dim32(3) : 1;
      C = X.dim32(X.ndim() - 1);
      H_out = Y->dim32(1);
      W_out = Y->ndim() > 3 ? Y->dim32(2) : 1;
      D_out = Y->ndim() > 4 ? Y->dim32(3) : 1;
      for (int i = 0; i < kernel_.size(); ++i) {
        CAFFE_ENFORCE_EQ(filter.dim32(i + 1), kernel_[i]);
      }
      CAFFE_ENFORCE_EQ(filter.dim32(filter.ndim() - 1), C / group_);
      group_offset_X = C / group_;
      group_offset_Y = M / group_;
      break;
    case StorageOrder::NCHW:
      N = X.dim32(0);
      C = X.dim32(1);
      H = X.dim32(2);
      W = X.ndim() > 3 ? X.dim32(3) : 1;
      D = X.ndim() > 4 ? X.dim32(4) : 1;
      H_out = Y->dim32(2);
      W_out = Y->ndim() > 3 ? Y->dim32(3) : 1;
      D_out = Y->ndim() > 4 ? Y->dim32(4) : 1;
      CAFFE_ENFORCE_EQ(filter.dim32(1), C / group_);
      for (int i = 0; i < kernel_.size(); ++i) {
        CAFFE_ENFORCE_EQ(filter.dim32(i + 2), kernel_[i]);
      }
      group_offset_X = C / group_ * H * W * D;
      group_offset_Y = M / group_ * H_out * W_out * D_out;
      break;
    default:
      LOG(FATAL) << "Unknown storage order: " << order_;
  }
559 "If you set group, the number of input channels should be divisible " 563 "If you set group, the number of output channels should be divisible " 566 int group_offset_filter = filter.size() / group_;
  // Set up the cudnn descriptors and algorithms if the configuration changed.
  bool input_changed = (X.dims() != cudnn_input_dims_);
  bool filter_changed = (filter.dims() != cudnn_filter_dims_);
  if (input_changed || filter_changed) {
    VLOG(1) << "Changing the cudnn descriptor configurations.";
    if (input_changed) {
      cudnn_input_dims_ = X.dims();
      SetTensorNdDescriptorWithGroup<T_X>(
          X.ndim(), bottom_desc_, N, C, H, W, D);
    }
    if (filter_changed) {
      cudnn_filter_dims_ = filter.dims();
      if (kernel_.size() == 2) {
#if CUDNN_VERSION_MIN(7, 0, 0)
        const int MM = M;
#else
        const int MM = M / group_;
#endif
        CUDNN_ENFORCE(cudnnSetFilter4dDescriptor(
            filter_desc_,
            cudnnTypeWrapper<T_W>::type,
            GetCudnnTensorFormat(order_),
            MM,
            C / group_,
            kernel_h(),
            kernel_w()));
      } else {
        vector<int> dims(filter.dims().begin(), filter.dims().end());
#if !CUDNN_VERSION_MIN(7, 0, 0)
        // We only need to divide dims by group_ when CUDNN version < 7.0;
        // from 7.0 onwards the group count on the descriptor handles this.
        order_ == StorageOrder::NCHW ? dims[1] /= group_
                                     : dims[filter.ndim() - 1] /= group_;
#endif
        CUDNN_ENFORCE(cudnnSetFilterNdDescriptor(
            filter_desc_,
            cudnnTypeWrapper<T_W>::type,
            GetCudnnTensorFormat(order_),
            dims.size(),
            dims.data()));
      }
    }
    // Set the bias descriptor (only when a bias input is supplied).
    if (InputSize() == 3) {
      if (kernel_.size() == 2) {
        CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
            bias_desc_,
            GetCudnnTensorFormat(order_),
            cudnnTypeWrapper<T_B>::type,
            1,
            M,
            1,
            1));
      } else {
        std::vector<int> bias_dims(X.ndim(), 1);
        bias_dims[1] = M;
        std::vector<int> strides = {M, 1, 1, 1, 1, 1};
        CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
            bias_desc_,
            cudnnTypeWrapper<T_B>::type,
            X.ndim() > 3 ? X.ndim() : 4,
            bias_dims.data(),
            strides.data()));
      }
    }
    // Set the output descriptor (grouped) and the full-output descriptor used
    // for the bias addition.
    SetTensorNdDescriptorWithGroup<T_Y>(
        X.ndim(), top_desc_, N, M, H_out, W_out, D_out);
    if (kernel_.size() == 2) {
      CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
          top_desc_for_bias_,
          GetCudnnTensorFormat(order_),
          cudnnTypeWrapper<T_B>::type,
          N,
          M,
          H_out,
          W_out));
    } else {
      vector<int> dims = {N, M, H_out, W_out, D_out};
      vector<int> strides = {M * H_out * W_out * D_out,
                             H_out * W_out * D_out,
                             W_out * D_out,
                             D_out,
                             1};
      CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
          top_desc_for_bias_,
          cudnnTypeWrapper<T_B>::type,
          X.ndim() > 3 ? X.ndim() : 4,
          dims.data(),
          strides.data()));
    }
    // Set the convolution descriptor.
    compute_type_ = DetermineComputeTypeFromInput(X);
    SetConvDescFromArguments();

#if CUDNN_VERSION_MIN(7, 0, 0)
    if (enable_tensor_core_) {
      CUDNN_ENFORCE(
          cudnnSetConvolutionMathType(conv_desc_, CUDNN_TENSOR_OP_MATH));
    }

    // enable cuDNN's native group convolution
    CUDNN_CHECK(cudnnSetConvolutionGroupCount(conv_desc_, group_));
#endif
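    // Note: with cuDNN >= 7 the descriptor-level group count above replaces
    // the manual per-group loop used further down for older versions; the
    // whole batch is then convolved in a single cudnnConvolutionForward call.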
    if (force_algo_[ALGO_FWD] >= 0) {
      algo_ = (cudnnConvolutionFwdAlgo_t)force_algo_[ALGO_FWD];
    } else if (deterministic_) {
      algo_ = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
    } else if (exhaustive_search_) {
      // Benchmark both compute passes (fp32 always; fp16 only for float16
      // inputs) and keep the algorithm with the lower measured time.
      std::array<ConvFwdAlgorithmWithCost, 2> algosToCompare;
      for (int i = 0; i < 2; i++) {
        SetConvDescComputeType(conv_desc_, kComputeTypesToTry[i]);

        algosToCompare[i] = algo_cache_.getAlgorithm(
            X.dims(), filter.dims(), kComputeTypesToTry[i], [&]() {
              VLOG(1) << "CUDNN Convolution fwd: doing exhaustive "
                      << "search for " << kComputePassNames[i];
              // When we do an exhaustive search, we will ignore the workspace
              // size limit and simply go for the fastest algorithm.
              int returned_algo_count;
              std::array<cudnnConvolutionFwdAlgoPerf_t, kNUM_CUDNN_FWD_ALGS>
                  fwd_perf_stat;

              cudnn_wrapper_.with_cudnn_state(
                  cudnn_state_, [&](CuDNNState* state) {
                    CUDNN_ENFORCE(cudnnFindConvolutionForwardAlgorithmEx(
                        state->cudnn_handle(),
                        bottom_desc_,
                        X.template data<T_X>(),
                        filter_desc_,
                        filter.template data<T_W>(),
                        conv_desc_,
                        top_desc_,
                        Y->template mutable_data<T_Y>(),
                        kNUM_CUDNN_FWD_ALGS,
                        &returned_algo_count,
                        fwd_perf_stat.data(),
                        state->workspace().get(cudnn_ws_nbytes_limit_),
                        cudnn_ws_nbytes_limit_));
                  });
              LogCuDNNPerfStats(fwd_perf_stat, returned_algo_count);
              float algo_time = fwd_perf_stat[0].status == CUDNN_STATUS_SUCCESS
                  ? fwd_perf_stat[0].time
                  : 1e10;
              return ConvFwdAlgorithmWithCost(fwd_perf_stat[0].algo, algo_time);
            });

        // A fp32-compute convolution never tries the fp16 pass.
        if (compute_type_ == CUDNN_DATA_FLOAT) {
          break;
        }
      }

      if (compute_type_ == CUDNN_DATA_FLOAT) {
        // Only the fp32 pass ran; take its best algorithm.
        algo_ = std::get<0>(algosToCompare[0]);
      } else {
        // Keep whichever compute pass produced the faster algorithm, and
        // point conv_desc_ at that compute type again.
        const int bestAlgoIndex =
            (std::get<1>(algosToCompare[0]) < std::get<1>(algosToCompare[1]))
            ? 0
            : 1;
        algo_ = std::get<0>(algosToCompare[bestAlgoIndex]);
        SetConvDescComputeType(conv_desc_, kComputeTypesToTry[bestAlgoIndex]);
      }
    } else {
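      // Without exhaustive search, fall back to cuDNN's workspace-bounded
      // heuristic below. (The benchmarked path above memoizes its result in
      // algo_cache_, keyed on the input/filter shapes and compute type, so
      // the costly Find*Ex sweep reruns only when shapes change.)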
      CUDNN_ENFORCE(cudnnGetConvolutionForwardAlgorithm(
          cudnn_wrapper_.inline_cudnn_handle(),
          bottom_desc_,
          filter_desc_,
          conv_desc_,
          top_desc_,
          CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
          cudnn_ws_nbytes_limit_,
          &algo_));
    }

    CUDNN_ENFORCE(cudnnGetConvolutionForwardWorkspaceSize(
        cudnn_wrapper_.inline_cudnn_handle(),
        bottom_desc_,
        filter_desc_,
        conv_desc_,
        top_desc_,
        algo_,
        &cudnn_ws_nbytes_));
    VLOG(1) << "CuDNN algorithm: " << algo_;
    VLOG(1) << "CuDNN workspace size: " << cudnn_ws_nbytes_;
  }
  // Now, actually run the computation.
#if CUDNN_VERSION_MIN(7, 0, 0)
  // Run directly through cuDNN, which handles the groups internally.
  cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) {
    CUDNN_ENFORCE(cudnnConvolutionForward(
        state->cudnn_handle(),
        cudnnTypeWrapper<T_X>::kOne(),
        bottom_desc_,
        X.template data<T_X>(),
        filter_desc_,
        filter.template data<T_W>(),
        conv_desc_,
        algo_,
        state->workspace().get(cudnn_ws_nbytes_),
        cudnn_ws_nbytes_,
        cudnnTypeWrapper<T_Y>::kZero(),
        top_desc_,
        Y->template mutable_data<T_Y>()));
  });
#else
  // Otherwise, loop over the groups, offsetting the data pointers by hand.
  for (int i = 0; i < group_; ++i) {
    cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) {
      CUDNN_ENFORCE(cudnnConvolutionForward(
          state->cudnn_handle(),
          cudnnTypeWrapper<T_X>::kOne(),
          bottom_desc_,
          X.template data<T_X>() + i * group_offset_X,
          filter_desc_,
          filter.template data<T_W>() + i * group_offset_filter,
          conv_desc_,
          algo_,
          state->workspace().get(cudnn_ws_nbytes_),
          cudnn_ws_nbytes_,
          cudnnTypeWrapper<T_Y>::kZero(),
          top_desc_,
          Y->template mutable_data<T_Y>() + i * group_offset_Y));
    });
  }
#endif
  // Add the bias, broadcast over N and the spatial dimensions.
  if (InputSize() == 3) {
    auto& bias = Input(BIAS);

    CAFFE_ENFORCE_EQ(bias.ndim(), 1);
    CAFFE_ENFORCE_EQ(bias.dim32(0), M);

    CUDNN_ENFORCE(cudnnAddTensor(
        cudnn_wrapper_.inline_cudnn_handle(),
        cudnnTypeWrapper<T_B>::kOne(),
        bias_desc_,
        bias.template data<T_B>(),
        cudnnTypeWrapper<T_Y>::kOne(),
        top_desc_for_bias_,
        Y->template mutable_data<T_Y>()));
  }
  return true;
}
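// Illustrative example (not from the original source): a net entry that would
// exercise this operator through the CUDNN engine looks roughly like
//   op { type: "Conv" engine: "CUDNN"
//        input: "X" input: "W" input: "b" output: "Y"
//        arg { name: "kernel" i: 3 } arg { name: "exhaustive_search" i: 1 } }
// where "exhaustive_search" triggers the benchmarking path implemented above.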
bool CudnnConvOp::RunOnDevice() {
  if (Input(0).IsType<float>()) {
    return DoRunWithType<
        float, // X
        float, // W
        float, // B
        float>(); // Y
  } else if (Input(0).IsType<float16>()) {
    return DoRunWithType<
        float16, // X
        float16, // W
        float16, // B
        float16>(); // Y
  } else {
    LOG(FATAL) << "Only float (32bit) and float16 are supported by "
               << "cudnn convolution, but input " << debug_def().input(0)
               << " has [" << Input(0).meta().name() << "]";
  }
  return true;
}
template <
    typename T_X,
    typename T_DY,
    typename T_W,
    typename T_B,
    typename T_DX,
    typename T_DW,
    typename T_DB>
bool CudnnConvGradientOp::DoRunWithType() {
  auto& X = Input(INPUT);
  auto& filter = Input(FILTER);
  auto& dY = Input(OUTPUT_GRAD);
  auto* dfilter = Output(FILTER_GRAD);

  CAFFE_ENFORCE(X.ndim() >= 3 && X.ndim() <= 5);
  CAFFE_ENFORCE(filter.ndim() >= 3 && filter.ndim() <= 5);

  const int M = filter.dim32(0);
  int N = 0, C = 0, H = 0, W = 0, D = 0, H_out = 0, W_out = 0, D_out = 0;
  int group_offset_X = 0, group_offset_Y = 0;
  switch (order_) {
    case StorageOrder::NHWC:
      N = X.dim32(0);
      H = X.dim32(1);
      W = X.ndim() > 3 ? X.dim32(2) : 1;
      D = X.ndim() > 4 ? X.dim32(3) : 1;
      C = X.dim32(X.ndim() - 1);
      H_out = dY.dim32(1);
      W_out = dY.ndim() > 3 ? dY.dim32(2) : 1;
      D_out = dY.ndim() > 4 ? dY.dim32(3) : 1;
      for (int i = 0; i < kernel_.size(); ++i) {
        CAFFE_ENFORCE_EQ(filter.dim32(i + 1), kernel_[i]);
      }
      CAFFE_ENFORCE_EQ(filter.dim32(filter.ndim() - 1), C / group_);
      group_offset_X = C / group_;
      group_offset_Y = M / group_;
      break;
    case StorageOrder::NCHW:
      N = X.dim32(0);
      C = X.dim32(1);
      H = X.dim32(2);
      W = X.ndim() > 3 ? X.dim32(3) : 1;
      D = X.ndim() > 4 ? X.dim32(4) : 1;
      H_out = dY.dim32(2);
      W_out = dY.ndim() > 3 ? dY.dim32(3) : 1;
      D_out = dY.ndim() > 4 ? dY.dim32(4) : 1;
      CAFFE_ENFORCE_EQ(filter.dim32(1), C / group_);
      for (int i = 0; i < kernel_.size(); ++i) {
        CAFFE_ENFORCE_EQ(filter.dim32(i + 2), kernel_[i]);
      }
      group_offset_X = C / group_ * H * W * D;
      group_offset_Y = M / group_ * H_out * W_out * D_out;
      break;
    default:
      LOG(FATAL) << "Unknown storage order: " << order_;
  }
905 "If you set group, the number of input channels should be divisible " 909 "If you set group, the number of output channels should be divisible " 912 int group_offset_filter = filter.size() / group_;
913 if (kernel_.size() == 1) {
915 }
else if (kernel_.size() == 2) {
917 }
else if (kernel_.size() == 3) {
920 CAFFE_THROW(
"Unsupported kernel size:", kernel_.size());
922 dfilter->ResizeLike(filter);
  // Set up the cudnn descriptors and algorithms if the configuration changed.
  bool input_changed = (X.dims() != cudnn_input_dims_);
  bool filter_changed = (filter.dims() != cudnn_filter_dims_);
  if (input_changed || filter_changed) {
    VLOG(1) << "Changing the cudnn descriptor configurations.";
    if (input_changed) {
      cudnn_input_dims_ = X.dims();
      SetTensorNdDescriptorWithGroup<T_X>(
          X.ndim(), bottom_desc_, N, C, H, W, D);
    }
    if (filter_changed) {
      cudnn_filter_dims_ = filter.dims();
      if (kernel_.size() == 2) {
#if CUDNN_VERSION_MIN(7, 0, 0)
        const int MM = M;
#else
        const int MM = M / group_;
#endif
        CUDNN_ENFORCE(cudnnSetFilter4dDescriptor(
            filter_desc_,
            cudnnTypeWrapper<T_W>::type,
            GetCudnnTensorFormat(order_),
            MM,
            C / group_,
            kernel_h(),
            kernel_w()));
      } else {
        vector<int> dims(filter.dims().begin(), filter.dims().end());
#if !CUDNN_VERSION_MIN(7, 0, 0)
        // We only need to divide dims by group_ when CUDNN version < 7.0;
        // from 7.0 onwards the group count on the descriptor handles this.
        order_ == StorageOrder::NCHW ? dims[1] /= group_
                                     : dims[filter.ndim() - 1] /= group_;
#endif
        CUDNN_ENFORCE(cudnnSetFilterNdDescriptor(
            filter_desc_,
            cudnnTypeWrapper<T_W>::type,
            GetCudnnTensorFormat(order_),
            dims.size(),
            dims.data()));
      }
    }
    // Set the bias descriptor.
    if (kernel_.size() == 2) {
      CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
          bias_desc_,
          GetCudnnTensorFormat(order_),
          cudnnTypeWrapper<T_B>::type,
          1,
          M,
          1,
          1));
    } else {
      std::vector<int> bias_dims(X.ndim(), 1);
      bias_dims[1] = M;
      std::vector<int> strides = {M, 1, 1, 1, 1, 1};
      CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
          bias_desc_,
          cudnnTypeWrapper<T_B>::type,
          X.ndim() > 3 ? X.ndim() : 4,
          bias_dims.data(),
          strides.data()));
    }
    // Set the output-gradient descriptor (grouped) and the full-output
    // descriptor used for the bias gradient.
    SetTensorNdDescriptorWithGroup<T_DX>(
        X.ndim(), top_desc_, N, M, H_out, W_out, D_out);
    if (kernel_.size() == 2) {
      CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
          top_desc_for_bias_,
          GetCudnnTensorFormat(order_),
          cudnnTypeWrapper<T_B>::type,
          N,
          M,
          H_out,
          W_out));
    } else {
      vector<int> dims = {N, M, H_out, W_out, D_out};
      vector<int> strides = {M * H_out * W_out * D_out,
                             H_out * W_out * D_out,
                             W_out * D_out,
                             D_out,
                             1};
      CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
          top_desc_for_bias_,
          cudnnTypeWrapper<T_B>::type,
          X.ndim() > 3 ? X.ndim() : 4,
          dims.data(),
          strides.data()));
    }
    // Set the convolution descriptor and duplicate it for the two backward
    // passes.
    compute_type_ = DetermineComputeTypeFromInput(X);
    SetConvDescFromArguments();

    DuplicateConvDesc(
        conv_desc_, kernel_.size(), dilation_.size(), bwd_filter_conv_desc_);
    DuplicateConvDesc(
        conv_desc_, kernel_.size(), dilation_.size(), bwd_data_conv_desc_);

#if CUDNN_VERSION_MIN(7, 0, 0)
    if (enable_tensor_core_) {
      CUDNN_ENFORCE(cudnnSetConvolutionMathType(
          bwd_filter_conv_desc_, CUDNN_TENSOR_OP_MATH));
      CUDNN_ENFORCE(cudnnSetConvolutionMathType(
          bwd_data_conv_desc_, CUDNN_TENSOR_OP_MATH));
    }

    // enable cuDNN's native group convolution for both backward passes
    CUDNN_CHECK(cudnnSetConvolutionGroupCount(bwd_filter_conv_desc_, group_));
    CUDNN_CHECK(cudnnSetConvolutionGroupCount(bwd_data_conv_desc_, group_));
#endif
    // Choose the filter-gradient (dW) algorithm.
    if (force_algo_[ALGO_WGRAD] >= 0) {
      bwd_filter_algo_ =
          (cudnnConvolutionBwdFilterAlgo_t)force_algo_[ALGO_WGRAD];
    } else if (deterministic_) {
      bwd_filter_algo_ = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
    } else if (exhaustive_search_) {
      // Benchmark both compute passes and keep the cheaper algorithm.
      std::array<ConvBwdFilterAlgorithmWithCost, 2> algosToCompare;
      for (int i = 0; i < 2; i++) {
        SetConvDescComputeType(bwd_filter_conv_desc_, kComputeTypesToTry[i]);

        algosToCompare[i] = filter_algo_cache_.getAlgorithm(
            X.dims(), filter.dims(), kComputeTypesToTry[i], [&]() {
              VLOG(1) << "CUDNN Convolution bwd: doing filter exhaustive "
                      << "search for " << kComputePassNames[i];
              // When we do an exhaustive search, we will ignore the workspace
              // size limit and simply go for the fastest algorithm.
              int returned_algo_count;
              std::array<
                  cudnnConvolutionBwdFilterAlgoPerf_t,
                  kNUM_CUDNN_BWD_FILTER_ALGS>
                  filter_perf_stat;

              cudnn_wrapper_.with_cudnn_state(
                  cudnn_state_, [&](CuDNNState* state) {
                    CUDNN_ENFORCE(cudnnFindConvolutionBackwardFilterAlgorithmEx(
                        state->cudnn_handle(),
                        bottom_desc_,
                        X.template data<T_X>(),
                        top_desc_,
                        dY.template data<T_DY>(),
                        bwd_filter_conv_desc_,
                        filter_desc_,
                        dfilter->template mutable_data<T_DW>(),
                        kNUM_CUDNN_BWD_FILTER_ALGS,
                        &returned_algo_count,
                        filter_perf_stat.data(),
                        state->workspace().get(cudnn_ws_nbytes_limit_),
                        cudnn_ws_nbytes_limit_));
                  });
              LogCuDNNPerfStats(filter_perf_stat, returned_algo_count);
              float algo_time =
                  filter_perf_stat[0].status == CUDNN_STATUS_SUCCESS
                  ? filter_perf_stat[0].time
                  : 1e10;
              return ConvBwdFilterAlgorithmWithCost(
                  filter_perf_stat[0].algo, algo_time);
            });

        // A fp32-compute convolution never tries the fp16 pass.
        if (compute_type_ == CUDNN_DATA_FLOAT) {
          break;
        }
      }

      if (compute_type_ == CUDNN_DATA_FLOAT) {
        // Only the fp32 pass ran; take its best algorithm.
        bwd_filter_algo_ = std::get<0>(algosToCompare[0]);
      } else {
        const int bestAlgoIndex =
            (std::get<1>(algosToCompare[0]) < std::get<1>(algosToCompare[1]))
            ? 0
            : 1;
        bwd_filter_algo_ = std::get<0>(algosToCompare[bestAlgoIndex]);
        SetConvDescComputeType(
            bwd_filter_conv_desc_, kComputeTypesToTry[bestAlgoIndex]);
      }
    } else {
      // choose backward algorithm for filter
      CUDNN_ENFORCE(cudnnGetConvolutionBackwardFilterAlgorithm(
          cudnn_wrapper_.inline_cudnn_handle(),
          bottom_desc_,
          top_desc_,
          bwd_filter_conv_desc_,
          filter_desc_,
          CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
          cudnn_ws_nbytes_limit_,
          &bwd_filter_algo_));
    }
    // Choose the data-gradient (dX) algorithm, only if dX is requested.
    if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) {
      if (force_algo_[ALGO_DGRAD] >= 0) {
        bwd_data_algo_ = (cudnnConvolutionBwdDataAlgo_t)force_algo_[ALGO_DGRAD];
      } else if (deterministic_) {
        bwd_data_algo_ = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
      } else if (exhaustive_search_) {
        std::array<ConvBwdDataAlgorithmWithCost, 2> algosToCompare;
        for (int i = 0; i < 2; i++) {
          SetConvDescComputeType(bwd_data_conv_desc_, kComputeTypesToTry[i]);

          algosToCompare[i] = data_algo_cache_.getAlgorithm(
              X.dims(), filter.dims(), kComputeTypesToTry[i], [&]() {
                VLOG(1) << "CUDNN Convolution bwd: doing data exhaustive "
                        << "search for " << kComputePassNames[i];
                int returned_algo_count;

                std::array<
                    cudnnConvolutionBwdDataAlgoPerf_t,
                    kNUM_CUDNN_BWD_DATA_ALGS>
                    data_perf_stat;
                cudnn_wrapper_.with_cudnn_state(
                    cudnn_state_, [&](CuDNNState* state) {
                      auto* dX =
                          Output(no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD);
                      dX->ResizeLike(X);
                      const T_W* filter_data = filter.template data<T_W>();
                      const T_DY* dYdata = dY.template data<T_DY>();
                      T_DX* dXdata = dX->template mutable_data<T_DX>();
                      CUDNN_ENFORCE(cudnnFindConvolutionBackwardDataAlgorithmEx(
                          state->cudnn_handle(),
                          filter_desc_,
                          filter_data,
                          top_desc_,
                          dYdata,
                          bwd_data_conv_desc_,
                          bottom_desc_,
                          dXdata,
                          kNUM_CUDNN_BWD_DATA_ALGS,
                          &returned_algo_count,
                          data_perf_stat.data(),
                          state->workspace().get(cudnn_ws_nbytes_limit_),
                          cudnn_ws_nbytes_limit_));
                    });

                LogCuDNNPerfStats(data_perf_stat, returned_algo_count);
                float algo_time =
                    data_perf_stat[0].status == CUDNN_STATUS_SUCCESS
                    ? data_perf_stat[0].time
                    : 1e10;
                return ConvBwdDataAlgorithmWithCost(
                    data_perf_stat[0].algo, algo_time);
              });

          // A fp32-compute convolution never tries the fp16 pass.
          if (compute_type_ == CUDNN_DATA_FLOAT) {
            break;
          }
        }

        if (compute_type_ == CUDNN_DATA_FLOAT) {
          bwd_data_algo_ = std::get<0>(algosToCompare[0]);
        } else {
          const int bestAlgoIndex =
              (std::get<1>(algosToCompare[0]) < std::get<1>(algosToCompare[1]))
              ? 0
              : 1;
          bwd_data_algo_ = std::get<0>(algosToCompare[bestAlgoIndex]);
          SetConvDescComputeType(
              bwd_data_conv_desc_, kComputeTypesToTry[bestAlgoIndex]);
        }
      } else {
        CUDNN_ENFORCE(cudnnGetConvolutionBackwardDataAlgorithm(
            cudnn_wrapper_.inline_cudnn_handle(),
            filter_desc_,
            top_desc_,
            bwd_data_conv_desc_,
            bottom_desc_,
            CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
            cudnn_ws_nbytes_limit_,
            &bwd_data_algo_));
      }
    }
    // Compute the workspace requirement for the chosen algorithms; the two
    // backward passes share one buffer sized for the larger of the two.
    size_t bwd_filter_ws_size, bwd_data_ws_size;

    CUDNN_ENFORCE(cudnnGetConvolutionBackwardFilterWorkspaceSize(
        cudnn_wrapper_.inline_cudnn_handle(),
        bottom_desc_,
        top_desc_,
        bwd_filter_conv_desc_,
        filter_desc_,
        bwd_filter_algo_,
        &bwd_filter_ws_size));
    if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) {
      CUDNN_ENFORCE(cudnnGetConvolutionBackwardDataWorkspaceSize(
          cudnn_wrapper_.inline_cudnn_handle(),
          filter_desc_,
          top_desc_,
          bwd_data_conv_desc_,
          bottom_desc_,
          bwd_data_algo_,
          &bwd_data_ws_size));
    } else {
      bwd_data_ws_size = 0;
    }
    cudnn_ws_nbytes_ = std::max(bwd_filter_ws_size, bwd_data_ws_size);

    VLOG(1) << "CuDNN bwd data & filter algorithm: " << bwd_data_algo_ << ", "
            << bwd_filter_algo_;
    VLOG(1) << "CuDNN workspace size: " << cudnn_ws_nbytes_;
  }
  // Now, actually run the computation.
  if (!no_bias_) {
    auto* dbias = Output(BIAS_OR_INPUT_GRAD);
    dbias->Resize(M);
    CUDNN_ENFORCE(cudnnConvolutionBackwardBias(
        cudnn_wrapper_.inline_cudnn_handle(),
        cudnnTypeWrapper<T_DY>::kOne(),
        top_desc_for_bias_,
        dY.template data<T_DY>(),
        cudnnTypeWrapper<T_DB>::kZero(),
        bias_desc_,
        dbias->template mutable_data<T_DB>()));
  }
#if CUDNN_VERSION_MIN(7, 0, 0)
  cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) {
    CUDNN_ENFORCE(cudnnConvolutionBackwardFilter(
        state->cudnn_handle(),
        cudnnTypeWrapper<T_X>::kOne(),
        bottom_desc_,
        X.template data<T_X>(),
        top_desc_,
        dY.template data<T_DY>(),
        bwd_filter_conv_desc_,
        bwd_filter_algo_,
        state->workspace().get(cudnn_ws_nbytes_),
        cudnn_ws_nbytes_,
        cudnnTypeWrapper<T_DW>::kZero(),
        filter_desc_,
        dfilter->template mutable_data<T_DW>()));
    if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) {
      // Compute the gradient w.r.t. the input.
      auto* dX = Output(no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD);
      dX->ResizeLike(X);
      CUDNN_ENFORCE(cudnnConvolutionBackwardData(
          state->cudnn_handle(),
          cudnnTypeWrapper<T_W>::kOne(),
          filter_desc_,
          filter.template data<T_W>(),
          top_desc_,
          dY.template data<T_DY>(),
          bwd_data_conv_desc_,
          bwd_data_algo_,
          state->workspace().get(cudnn_ws_nbytes_),
          cudnn_ws_nbytes_,
          cudnnTypeWrapper<T_DX>::kZero(),
          bottom_desc_,
          dX->template mutable_data<T_DX>()));
    }
  });
#else
  for (int i = 0; i < group_; ++i) {
    cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) {
      CUDNN_ENFORCE(cudnnConvolutionBackwardFilter(
          state->cudnn_handle(),
          cudnnTypeWrapper<T_X>::kOne(),
          bottom_desc_,
          X.template data<T_X>() + i * group_offset_X,
          top_desc_,
          dY.template data<T_DY>() + i * group_offset_Y,
          bwd_filter_conv_desc_,
          bwd_filter_algo_,
          state->workspace().get(cudnn_ws_nbytes_),
          cudnn_ws_nbytes_,
          cudnnTypeWrapper<T_DW>::kZero(),
          filter_desc_,
          dfilter->template mutable_data<T_DW>() + i * group_offset_filter));
      if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) {
        auto* dX = Output(no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD);
        dX->ResizeLike(X);
        CUDNN_ENFORCE(cudnnConvolutionBackwardData(
            state->cudnn_handle(),
            cudnnTypeWrapper<T_W>::kOne(),
            filter_desc_,
            filter.template data<T_W>() + i * group_offset_filter,
            top_desc_,
            dY.template data<T_DY>() + i * group_offset_Y,
            bwd_data_conv_desc_,
            bwd_data_algo_,
            state->workspace().get(cudnn_ws_nbytes_),
            cudnn_ws_nbytes_,
            cudnnTypeWrapper<T_DX>::kZero(),
            bottom_desc_,
            dX->template mutable_data<T_DX>() + i * group_offset_X));
      }
    });
  }
#endif
  return true;
}
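// Note: in the pre-cuDNN-7 branch above, dX is computed inside the same
// per-group lambda as dfilter, with pointer offsets mirroring those of the
// forward pass (group_offset_X on the input side, group_offset_Y on dY).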
bool CudnnConvGradientOp::RunOnDevice() {
  if (Input(0).IsType<float>()) {
    return DoRunWithType<
        float, //  X
        float, // dY
        float, //  W
        float, //  b
        float, // dX
        float, // dW
        float>(); // db
  } else if (Input(0).IsType<float16>()) {
    return DoRunWithType<
        float16, //  X
        float16, // dY
        float16, //  W
        float16, //  b
        float16, // dX
        float16, // dW
        float16>(); // db
  } else {
    LOG(FATAL) << "Unsupported input types";
  }
  return true;
}

REGISTER_CUDNN_OPERATOR(Conv, CudnnConvOp);
REGISTER_CUDNN_OPERATOR(ConvGradient, CudnnConvGradientOp);

}  // namespace caffe2