// NOTE(review): this chunk is a corrupted extraction (original line numbers are
// fused into the text and many interior lines are missing). Recover the full
// file from upstream Caffe2 (conv_transpose_op_cudnn.cc) before making code
// changes; the lines below are kept byte-identical.
//
// CudnnConvTransposeOpBase constructor fragment: reads operator arguments —
// a cuDNN workspace byte limit (default kCONV_CUDNN_WORKSPACE_LIMIT_BYTES),
// "exhaustive_search", "deterministic", "cudnn_state", "force_algo" (a
// 3-element vector defaulting to {-1,-1,-1}), and "enable_tensor_core".
1 #include "caffe2/core/context_gpu.h" 2 #include "caffe2/core/cudnn_wrappers.h" 3 #include "caffe2/operators/conv_op_cache_cudnn.h" 4 #include "caffe2/operators/conv_transpose_op.h" 5 #include "caffe2/operators/op_utils_cudnn.h" 13 cudnn_wrapper_(&context_),
14 cudnn_ws_nbytes_limit_(OperatorBase::GetSingleArgument<size_t>(
16 kCONV_CUDNN_WORKSPACE_LIMIT_BYTES)),
18 OperatorBase::GetSingleArgument<int>(
"exhaustive_search", 0)),
20 OperatorBase::GetSingleArgument<int>(
"deterministic", 0)),
21 cudnn_state_(OperatorBase::GetSingleArgument<int>(
"cudnn_state", 0)),
22 force_algo_(OperatorBase::GetRepeatedArgument<int>(
24 vector<int>{-1, -1, -1})),
26 OperatorBase::GetSingleArgument<bool>(
"enable_tensor_core", 1)) {
// "deterministic" and "exhaustive_search" are mutually exclusive.
27 CAFFE_ENFORCE(!deterministic_ || !exhaustive_search_);
// A combined "force_algo" argument may not be mixed with the per-pass
// force_algo_fwd / force_algo_dgrad / force_algo_wgrad arguments; when the
// per-pass form is used, each slot defaults to -1 (meaning "not forced").
34 !individual_force_algo,
35 "Cannot specify both force_algo and any of",
36 "force_algo_fwd, force_algo_dgrad, force_algo_wgrad");
38 force_algo_ = std::vector<int>{-1, -1, -1};
39 force_algo_[ALGO_FWD] =
40 OperatorBase::GetSingleArgument<int>(
"force_algo_fwd", -1);
41 force_algo_[ALGO_DGRAD] =
42 OperatorBase::GetSingleArgument<int>(
"force_algo_dgrad", -1);
43 force_algo_[ALGO_WGRAD] =
44 OperatorBase::GetSingleArgument<int>(
"force_algo_wgrad", -1);
// Creates the cuDNN descriptors owned by this op; the bias tensor descriptor
// is only created when a third (bias) input is present. The matching
// destroys appear in the destructor fragment below.
47 CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&bottom_desc_));
48 CUDNN_ENFORCE(cudnnCreateFilterDescriptor(&filter_desc_));
49 if (InputSize() == 3) {
50 CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&bias_desc_));
52 CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&top_desc_));
53 CUDNN_ENFORCE(cudnnCreateConvolutionDescriptor(&conv_desc_));
// Destructor fragment (extraction is incomplete; interior lines missing).
// Destroys the cuDNN descriptors symmetrically with their creation in the
// constructor; bias_desc_ is only destroyed when a bias input exists, matching
// the conditional create.
57 CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(bottom_desc_));
58 CUDNN_ENFORCE(cudnnDestroyFilterDescriptor(filter_desc_));
59 if (InputSize() == 3) {
60 CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(bias_desc_));
62 CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(top_desc_));
63 CUDNN_ENFORCE(cudnnDestroyConvolutionDescriptor(conv_desc_));
// Protected member fragment of the base op.
// cudnn_input_dims_/cudnn_filter_dims_ cache the last-seen tensor shapes so
// RunOnDevice can skip descriptor re-configuration when shapes are unchanged.
67 vector<TIndex> cudnn_input_dims_;
68 vector<TIndex> cudnn_filter_dims_;
// cuDNN descriptors created/destroyed in the ctor/dtor fragments above.
71 cudnnTensorDescriptor_t bottom_desc_;
72 cudnnFilterDescriptor_t filter_desc_;
73 cudnnTensorDescriptor_t bias_desc_;
74 cudnnTensorDescriptor_t top_desc_;
75 cudnnConvolutionDescriptor_t conv_desc_;
// Workspace budget (from the ws limit argument) vs. the size actually
// requested for the chosen algorithm(s).
76 const size_t cudnn_ws_nbytes_limit_;
77 size_t cudnn_ws_nbytes_;
78 bool exhaustive_search_;
// Per-pass forced algorithm ids (-1 = not forced), indexed by
// ALGO_FWD / ALGO_DGRAD / ALGO_WGRAD.
81 vector<int> force_algo_;
82 bool enable_tensor_core_;
// Forward ConvTranspose op fragment. The forward pass is computed with the
// cuDNN *backward-data* primitive (see cudnnConvolutionBackwardData in the
// RunOnDevice fragment), hence the bwd_data_algo_ member here.
93 bool RunOnDevice()
override;
97 cudnnConvolutionBwdDataAlgo_t bwd_data_algo_;
100 INPUT_TAGS(INPUT, FILTER, BIAS);
// Gradient op fragment. Reads the "no_bias" argument (default false) and
// rejects the inconsistent combination of no_bias with 3 gradient outputs.
103 template <
typename T>
108 no_bias_(OperatorBase::GetSingleArgument<bool>(
"no_bias",
false)) {
110 !(no_bias_ && OutputSize() == 3),
111 "If bias is not present, you should not have 3 grad output.");
116 bool RunOnDevice()
override;
// The input gradient of a transposed convolution is a regular forward
// convolution, so this op carries a forward algo plus a bwd-filter algo.
119 cudnnConvolutionFwdAlgo_t algo_;
120 cudnnConvolutionBwdFilterAlgo_t bwd_filter_algo_;
126 INPUT_TAGS(INPUT, FILTER, OUTPUT_GRAD);
// When no_bias_ is set, the second output slot holds dX instead of dbias.
127 OUTPUT_TAGS(FILTER_GRAD, BIAS_OR_INPUT_GRAD, INPUT_GRAD);
// Forward RunOnDevice fragment (many interior lines missing in this
// extraction). Flow visible here: read X and filter, switch on storage order
// to derive N/M/H/W, validate filter dims, optionally validate bias, rebuild
// cuDNN descriptors when shapes changed, select a backward-data algorithm
// (forced / deterministic / exhaustive-search / heuristic), size the
// workspace, then run cudnnConvolutionBackwardData and add the bias.
134 template <
typename T>
136 auto& X = Input(INPUT);
137 auto& filter = Input(FILTER);
141 case StorageOrder::NHWC:
144 case StorageOrder::NCHW:
148 LOG(FATAL) <<
"Unknown storage order: " << order_;
152 int N = 0, M = 0, H = 0, W = 0, H_out = 0, W_out = 0;
154 case StorageOrder::NHWC:
// NOTE(review): the next two lines check filter.dim32(1) twice; the first
// presumably should check a different dimension — confirm against upstream.
161 CAFFE_ENFORCE_EQ(filter.dim32(1), kernel_h());
162 CAFFE_ENFORCE_EQ(filter.dim32(1), kernel_h());
163 CAFFE_ENFORCE_EQ(filter.dim32(2), kernel_w());
164 CAFFE_ENFORCE_EQ(filter.dim32(3), C);
166 case StorageOrder::NCHW:
173 CAFFE_ENFORCE_EQ(filter.dim32(1), C);
174 CAFFE_ENFORCE_EQ(filter.dim32(2), kernel_h());
175 CAFFE_ENFORCE_EQ(filter.dim32(3), kernel_w());
178 LOG(FATAL) <<
"Unknown storage order: " << order_;
// Bias, when present, must be a 1-D tensor of length C.
181 if (InputSize() == 3) {
182 auto& bias = Input(BIAS);
183 CAFFE_ENFORCE_EQ(bias.ndim(), 1);
184 CAFFE_ENFORCE_EQ(bias.dim32(0), C);
// Descriptors (and the algorithm choice below) are only refreshed when the
// input or filter shape differs from the cached dims.
188 bool input_changed = (X.dims() != cudnn_input_dims_);
189 bool filter_changed = (filter.dims() != cudnn_filter_dims_);
191 if (input_changed || filter_changed) {
192 VLOG(1) <<
"Changing the cudnn descriptor configurations.";
194 cudnn_input_dims_ = X.dims();
195 CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
204 if (filter_changed) {
205 cudnn_filter_dims_ = filter.dims();
206 CUDNN_ENFORCE(cudnnSetFilter4dDescriptor(
214 if (InputSize() == 3) {
215 CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
226 CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
// cuDNN only supports symmetric padding; asymmetric schemes are rejected.
238 "The current padding scheme leads to unequal padding on the top and " 239 "bottom, which is not supported by cudnn.");
243 "The current padding scheme leads to unequal padding on the left " 244 "and right, which is not supported by cudnn.");
// cudnnSetConvolution2dDescriptor gained a dataType parameter in cuDNN 6,
// hence the version-guarded dual call sites.
246 #if CUDNN_VERSION_MIN(6,0,0) 247 CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor(
255 CUDNN_CROSS_CORRELATION,
258 CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor(
266 CUDNN_CROSS_CORRELATION));
// cuDNN 7+: opt in to TensorCore math when requested.
268 #if CUDNN_VERSION_MIN(7, 0, 0) 271 if (enable_tensor_core_) {
273 cudnnSetConvolutionMathType(conv_desc_, CUDNN_TENSOR_OP_MATH));
// Algorithm selection priority: explicit force_algo, then the deterministic
// fixed choice (BWD_DATA_ALGO_1), then cached exhaustive search, else the
// heuristic Get call bounded by the workspace limit.
276 if (force_algo_[ALGO_DGRAD] >= 0) {
277 bwd_data_algo_ = (cudnnConvolutionBwdDataAlgo_t)force_algo_[ALGO_DGRAD];
278 }
else if (deterministic_) {
279 bwd_data_algo_ = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
280 }
else if (exhaustive_search_) {
// The search result is memoized per (X dims, filter dims) in
// data_algo_cache_, so Find only runs once per shape.
282 data_algo_cache_.getAlgorithm(X.dims(), filter.dims(), 0, [&]() {
283 int returned_algo_count;
285 cudnnConvolutionBwdDataAlgoPerf_t,
286 kNUM_CUDNN_BWD_DATA_ALGS>
288 cudnn_wrapper_.with_cudnn_state(
290 state->workspace().reset();
291 CUDNN_ENFORCE(cudnnFindConvolutionBackwardDataAlgorithm(
292 state->cudnn_handle(),
297 kNUM_CUDNN_BWD_DATA_ALGS,
298 &returned_algo_count,
299 data_perf_stat.data()));
302 LogCuDNNPerfStats(data_perf_stat, returned_algo_count);
// Perf results are sorted by cuDNN; take the fastest.
303 return data_perf_stat[0].algo;
306 CUDNN_ENFORCE(cudnnGetConvolutionBackwardDataAlgorithm(
307 cudnn_wrapper_.inline_cudnn_handle(),
312 CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
313 cudnn_ws_nbytes_limit_,
317 size_t bwd_data_ws_size;
318 CUDNN_ENFORCE(cudnnGetConvolutionBackwardDataWorkspaceSize(
319 cudnn_wrapper_.inline_cudnn_handle(),
326 cudnn_ws_nbytes_ = bwd_data_ws_size;
327 VLOG(1) <<
"CuDNN algorithm: " << bwd_data_algo_;
328 VLOG(1) <<
"CuDNN workspace size: " << bwd_data_ws_size;
// Forward pass of ConvTranspose == cuDNN backward-data on (filter, X) -> Y.
333 cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](
CuDNNState* state) {
334 CUDNN_ENFORCE(cudnnConvolutionBackwardData(
335 state->cudnn_handle(),
338 filter.template data<T>(),
340 X.template data<T>(),
343 state->workspace().get(cudnn_ws_nbytes_),
347 Y->template mutable_data<T>()));
// Bias (when present) is broadcast-added onto Y.
350 if (InputSize() == 3) {
351 CUDNN_ENFORCE(cudnnAddTensor(
352 cudnn_wrapper_.inline_cudnn_handle(),
355 Input(BIAS).template data<T>(),
358 Y->template mutable_data<T>()));
// Gradient RunOnDevice fragment (interior lines missing in this extraction).
// Computes dfilter via cudnnConvolutionBackwardFilter, dbias via
// cudnnConvolutionBackwardBias, and dX via cudnnConvolutionForward (the input
// gradient of a transposed convolution is a regular forward convolution).
366 template <
typename T>
368 auto& X = Input(INPUT);
369 auto& filter = Input(FILTER);
370 auto& dY = Input(OUTPUT_GRAD);
371 auto* dfilter = Output(FILTER_GRAD);
372 CAFFE_ENFORCE_EQ(X.ndim(), 4);
373 CAFFE_ENFORCE_EQ(filter.ndim(), 4);
376 case StorageOrder::NHWC:
379 case StorageOrder::NCHW:
383 LOG(FATAL) <<
"Unknown storage order: " << order_;
386 int N = 0, M = 0, H = 0, W = 0, H_out = 0, W_out = 0;
388 case StorageOrder::NHWC:
// NOTE(review): filter.dim32(1) is checked twice here (same as in the
// forward op); the first check presumably targets another dim — confirm.
395 CAFFE_ENFORCE_EQ(filter.dim32(1), kernel_h());
396 CAFFE_ENFORCE_EQ(filter.dim32(1), kernel_h());
397 CAFFE_ENFORCE_EQ(filter.dim32(2), kernel_w());
398 CAFFE_ENFORCE_EQ(filter.dim32(3), C);
400 case StorageOrder::NCHW:
407 CAFFE_ENFORCE_EQ(filter.dim32(1), C);
408 CAFFE_ENFORCE_EQ(filter.dim32(2), kernel_h());
409 CAFFE_ENFORCE_EQ(filter.dim32(3), kernel_w());
412 LOG(FATAL) <<
"Unknown storage order: " << order_;
416 dfilter->ResizeLike(filter);
// Descriptor/algorithm state is refreshed only on shape change, mirroring
// the forward op.
419 bool input_changed = (X.dims() != cudnn_input_dims_);
420 bool filter_changed = (filter.dims() != cudnn_filter_dims_);
421 if (input_changed || filter_changed) {
422 VLOG(1) <<
"Changing the cudnn descriptor configurations.";
424 cudnn_input_dims_ = X.dims();
425 CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
434 if (filter_changed) {
435 cudnn_filter_dims_ = filter.dims();
436 CUDNN_ENFORCE(cudnnSetFilter4dDescriptor(
445 CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
456 CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
// cuDNN requires symmetric padding.
468 "The current padding scheme leads to unequal padding on the top and " 469 "bottom, which is not supported by cudnn.");
473 "The current padding scheme leads to unequal padding on the left " 474 "and right, which is not supported by cudnn.");
475 #if CUDNN_VERSION_MIN(6,0,0) 476 CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor(
484 CUDNN_CROSS_CORRELATION,
487 CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor(
495 CUDNN_CROSS_CORRELATION));
497 #if CUDNN_VERSION_MIN(7, 0, 0) 500 if (enable_tensor_core_) {
502 cudnnSetConvolutionMathType(conv_desc_, CUDNN_TENSOR_OP_MATH));
// Algorithm selection: forced wgrad algo; else the deterministic pair
// (FWD_ALGO_IMPLICIT_PRECOMP_GEMM for dX + BWD_FILTER_ALGO_1 for dfilter);
// else cached exhaustive search for both; else heuristic Get calls.
505 if (force_algo_[ALGO_WGRAD] >= 0) {
507 (cudnnConvolutionBwdFilterAlgo_t)force_algo_[ALGO_WGRAD];
508 }
else if (deterministic_) {
509 algo_ = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
510 bwd_filter_algo_ = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
511 }
else if (exhaustive_search_) {
513 filter_algo_cache_.getAlgorithm(X.dims(), filter.dims(), 0, [&]() {
514 LOG(INFO) <<
"CUDNN Convolution bwd: doing exhaustive search.";
520 int returned_algo_count;
526 cudnnConvolutionBwdFilterAlgoPerf_t,
527 kNUM_CUDNN_BWD_FILTER_ALGS>
530 cudnn_wrapper_.with_cudnn_state(
532 state->workspace().reset();
533 CUDNN_ENFORCE(cudnnFindConvolutionBackwardFilterAlgorithm(
534 state->cudnn_handle(),
539 kNUM_CUDNN_BWD_FILTER_ALGS,
540 &returned_algo_count,
541 filter_perf_stat.data()));
543 LogCuDNNPerfStats(filter_perf_stat, returned_algo_count);
544 return filter_perf_stat[0].algo;
548 forward_algo_cache_.getAlgorithm(X.dims(), filter.dims(), 0, [&]() {
549 int returned_algo_count;
550 std::array<cudnnConvolutionFwdAlgoPerf_t, kNUM_CUDNN_FWD_ALGS>
552 cudnn_wrapper_.with_cudnn_state(
554 state->workspace().reset();
555 CUDNN_ENFORCE(cudnnFindConvolutionForwardAlgorithm(
556 state->cudnn_handle(),
// NOTE(review): kNUM_CUDNN_BWD_DATA_ALGS is passed to
// cudnnFindConvolutionForwardAlgorithm, whose perf array (above) is sized
// kNUM_CUDNN_FWD_ALGS — likely should be kNUM_CUDNN_FWD_ALGS; confirm
// against upstream before changing.
561 kNUM_CUDNN_BWD_DATA_ALGS,
562 &returned_algo_count,
563 fwd_perf_stat.data()));
566 LogCuDNNPerfStats(fwd_perf_stat, returned_algo_count);
567 return fwd_perf_stat[0].algo;
// Heuristic fallback: pick the fastest algos that fit the workspace limit.
571 CUDNN_ENFORCE(cudnnGetConvolutionBackwardFilterAlgorithm(
572 cudnn_wrapper_.inline_cudnn_handle(),
577 CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
578 cudnn_ws_nbytes_limit_,
581 CUDNN_ENFORCE(cudnnGetConvolutionForwardAlgorithm(
582 cudnn_wrapper_.inline_cudnn_handle(),
587 CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
588 cudnn_ws_nbytes_limit_,
// The shared workspace must fit the larger of the two passes.
592 size_t bwd_filter_ws_size, fwd_ws_size;
593 CUDNN_ENFORCE(cudnnGetConvolutionBackwardFilterWorkspaceSize(
594 cudnn_wrapper_.inline_cudnn_handle(),
600 &bwd_filter_ws_size));
602 CUDNN_ENFORCE(cudnnGetConvolutionForwardWorkspaceSize(
603 cudnn_wrapper_.inline_cudnn_handle(),
610 cudnn_ws_nbytes_ = std::max(bwd_filter_ws_size, fwd_ws_size);
612 VLOG(1) <<
"CuDNN bwd algorithm: " << bwd_filter_algo_ <<
", " << algo_;
613 VLOG(1) <<
"CuDNN workspace size: " << cudnn_ws_nbytes_;
// dbias is computed first (only when the bias output slot is in use).
618 auto* dbias = Output(BIAS_OR_INPUT_GRAD);
620 CUDNN_ENFORCE(cudnnConvolutionBackwardBias(
621 cudnn_wrapper_.inline_cudnn_handle(),
624 dY.template data<T>(),
627 dbias->template mutable_data<T>()));
630 cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](
CuDNNState* state) {
631 CUDNN_ENFORCE(cudnnConvolutionBackwardFilter(
632 state->cudnn_handle(),
635 dY.template data<T>(),
637 X.template data<T>(),
640 state->workspace().get(cudnn_ws_nbytes_),
644 dfilter->template mutable_data<T>()));
// dX is requested either as the explicit third output, or — when no_bias_
// — as the second output (BIAS_OR_INPUT_GRAD slot is repurposed).
646 if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) {
648 auto* dX = Output(no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD);
650 CUDNN_ENFORCE(cudnnConvolutionForward(
651 state->cudnn_handle(),
654 dY.template data<T>(),
656 filter.template data<T>(),
659 state->workspace().get(cudnn_ws_nbytes_),
663 dX->template mutable_data<T>()));
// Operator registration fragment (the forward-op registration and the macro's
// closing arguments are missing from this extraction).
670 REGISTER_CUDNN_OPERATOR(
671 ConvTransposeGradient,
cudnnTensorFormat_t GetCudnnTensorFormat(const StorageOrder &order)
A wrapper function to convert the Caffe storage order to cudnn storage order enum values...
Workspace is a class that holds all the related objects created during runtime: (1) all blobs...
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...
bool HasArgument(const string &name) const
Checks if the operator has an argument of the given name.
CuDNNWrapper is a class that wraps the cudnn handles and cudnn workspaces.
bool TensorCoreAvailable()
Return the availability of TensorCores for math.
cudnnTypeWrapper is a wrapper class that allows us to refer to the cudnn type in a template function...