Caffe2 - C++ API
A deep learning, cross-platform ML framework
conv_op_cudnn.cc
1 #include "caffe2/core/context_gpu.h"
2 
3 #include "caffe2/core/common_gpu.h"
4 #include "caffe2/core/cudnn_wrappers.h"
5 #include "caffe2/operators/conv_op.h"
6 #include "caffe2/operators/conv_op_cache_cudnn.h"
7 #include "caffe2/operators/conv_pool_op_base.h"
8 #include "caffe2/operators/op_utils_cudnn.h"
9 
10 namespace caffe2 {
11 
12 class CudnnConvOpBase : public ConvPoolOpBase<CUDAContext> {
13  public:
14  CudnnConvOpBase(const OperatorDef& operator_def, Workspace* ws)
15  : ConvPoolOpBase<CUDAContext>(operator_def, ws),
16  cudnn_wrapper_(&context_),
17  cudnn_ws_nbytes_limit_(OperatorBase::GetSingleArgument<size_t>(
18  "ws_nbytes_limit",
19  kCONV_CUDNN_WORKSPACE_LIMIT_BYTES)),
20  exhaustive_search_(
21  OperatorBase::GetSingleArgument<int>("exhaustive_search", 0)),
22  deterministic_(
23  OperatorBase::GetSingleArgument<int>("deterministic", 0)),
24  cudnn_state_(OperatorBase::GetSingleArgument<int>("cudnn_state", 0)),
25  force_algo_(OperatorBase::GetRepeatedArgument<int>("force_algo", vector<int>{-1,-1,-1})),
26  enable_tensor_core_(OperatorBase::GetSingleArgument<bool>("enable_tensor_core", 1)) {
27  CHECK(!deterministic_ || !exhaustive_search_);
28  CAFFE_ENFORCE(group_ > 0);
29  CAFFE_ENFORCE(!deterministic_ || !exhaustive_search_);
30  for (int i = 0; i < kernel_.size(); ++i) {
31  OPERATOR_NEEDS_FEATURE(
32  pads_[i] == pads_[kernel_.size() + i],
33  "The current padding scheme leads to unequal padding on the left "
34  "and right, which is not supported by cudnn.");
35  }
36  // Dilated convolution is only supported by some algorithms in cuDNN v6 and later.
37 #if !(CUDNN_VERSION_MIN(6,0,0))
38  OPERATOR_NEEDS_FEATURE(
39  dilation_h() == 1 && dilation_w() == 1,
40  "The cudnn convolution does not support dilation yet.");
41 #endif
42 
43 #if CUDNN_VERSION_MIN(7, 0, 0)
44  // verify TensorCore math is supported
45  enable_tensor_core_ &= TensorCoreAvailable();
46 #else
47  enable_tensor_core_ = false;
48 #endif
49 
50  bool individual_force_algo = OperatorBase::HasArgument("force_algo_fwd") ||
51  OperatorBase::HasArgument("force_algo_dgrad") ||
52  OperatorBase::HasArgument("force_algo_wgrad");
53  if (OperatorBase::HasArgument("force_algo")) {
54  CAFFE_ENFORCE(!individual_force_algo,
55  "Cannot specify both force_algo and any of ",
56  "force_algo_fwd, force_algo_dgrad, force_algo_wgrad");
57  } else {
58  force_algo_ = std::vector<int>{-1,-1,-1};
59  force_algo_[ALGO_FWD] =
60  OperatorBase::GetSingleArgument<int>("force_algo_fwd", -1);
61  force_algo_[ALGO_DGRAD] =
62  OperatorBase::GetSingleArgument<int>("force_algo_dgrad", -1);
63  force_algo_[ALGO_WGRAD] =
64  OperatorBase::GetSingleArgument<int>("force_algo_wgrad", -1);
65  }
66 
67  CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&bottom_desc_));
68  CUDNN_ENFORCE(cudnnCreateFilterDescriptor(&filter_desc_));
69  CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&bias_desc_));
70  CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&top_desc_));
71  CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&top_desc_for_bias_));
72  CUDNN_ENFORCE(cudnnCreateConvolutionDescriptor(&conv_desc_));
73  }
74 
75  ~CudnnConvOpBase() {
76  CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(bottom_desc_));
77  CUDNN_ENFORCE(cudnnDestroyFilterDescriptor(filter_desc_));
78  CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(bias_desc_));
79  CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(top_desc_));
80  CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(top_desc_for_bias_));
81  CUDNN_ENFORCE(cudnnDestroyConvolutionDescriptor(conv_desc_));
82  }
83 
84  protected:
85  // A helper function to set up the tensor Nd descriptor, depending on the
86  // order, the group, and the type given.
87  template <typename T>
88  void SetTensorNdDescriptorWithGroup(
89  int size,
90  cudnnTensorDescriptor_t tensorDesc,
91  int N,
92  int C,
93  int H,
94  int W,
95  int D) {
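 // With cuDNN >= 7, grouped convolution is handled natively via
 // cudnnSetConvolutionGroupCount, so the descriptor uses the full channel
 // count. Older cuDNN versions emulate groups by describing a per-group
 // slice (C / group_ channels) and looping over the groups at run time.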
96 #if CUDNN_VERSION_MIN(7, 0, 0)
97  const int CC = C;
98 #else
99  const int CC = C / group_;
100 #endif
101  switch (order_) {
102  case StorageOrder::NHWC:
103  if (size == 4) {
104  CUDNN_ENFORCE(cudnnSetTensor4dDescriptorEx(
105  tensorDesc,
106  cudnnTypeWrapper<T>::type,
107  N,
108  CC,
109  H,
110  W,
111  H * W * C,
112  1,
113  W * C,
114  C));
115  } else {
116  vector<int> dims = {N, H, W, D, CC};
117  vector<int> strides = {H * W * D * CC, W * D * CC, D * CC, CC, 1};
118  CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
119  tensorDesc,
120  cudnnTypeWrapper<T>::type,
121  size > 3 ? size : 4,
122  dims.data(),
123  strides.data()));
124  }
125  break;
126  case StorageOrder::NCHW:
127  if (size == 4) {
128  CUDNN_ENFORCE(cudnnSetTensor4dDescriptorEx(
129  tensorDesc,
130  cudnnTypeWrapper<T>::type,
131  N,
132  CC,
133  H,
134  W,
135  C * H * W,
136  H * W,
137  W,
138  1));
139  } else {
140  vector<int> dims = {N, CC, H, W, D};
141  vector<int> strides = {CC * H * W * D, H * W * D, W * D, D, 1};
142  CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
143  tensorDesc,
144  cudnnTypeWrapper<T>::type,
145  size > 3 ? size : 4,
146  dims.data(),
147  strides.data()));
148  }
149  break;
150  default:
151  LOG(FATAL) << "Unknown storage order: " << order_;
152  }
153  }
154 
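 // Copies the padding, stride, dilation and mode (and, with cuDNN 6+, the
 // compute type) of one convolution descriptor into another. The gradient op
 // uses this to keep independent descriptors for the backward-data and
 // backward-filter passes.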
155  void DuplicateConvDesc(
156  cudnnConvolutionDescriptor_t input,
157  size_t kernelDims,
158  size_t dilationDims,
159  cudnnConvolutionDescriptor_t copy) {
160  if (kernelDims == 2) {
161  cudnnConvolutionMode_t mode;
162  cudnnDataType_t dataType;
163  int pad_height = 0;
164  int pad_width = 0;
165  int stride_height = 0;
166  int stride_width = 0;
167  int dilation_height = 0;
168  int dilation_width = 0;
169 
170 #if CUDNN_VERSION_MIN(6, 0, 0)
171  CUDNN_ENFORCE(cudnnGetConvolution2dDescriptor(
172  input,
173  &pad_height,
174  &pad_width,
175  &stride_height,
176  &stride_width,
177  &dilation_height,
178  &dilation_width,
179  &mode,
180  &dataType
181  ));
182 #else
183  CUDNN_ENFORCE(cudnnGetConvolution2dDescriptor(
184  input,
185  &pad_height,
186  &pad_width,
187  &stride_height,
188  &stride_width,
189  &dilation_height,
190  &dilation_width,
191  &mode
192  ));
193 #endif
194 
195 #if CUDNN_VERSION_MIN(6, 0, 0)
196  CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor(
197  copy,
198  pad_height,
199  pad_width,
200  stride_height,
201  stride_width,
202  dilation_height,
203  dilation_width,
204  mode,
205  dataType
206  ));
207 #else
208  CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor(
209  copy,
210  pad_height,
211  pad_width,
212  stride_height,
213  stride_width,
214  dilation_height,
215  dilation_width,
216  mode
217  ));
218 #endif
219  } else {
220  cudnnConvolutionMode_t mode;
221  cudnnDataType_t dataType;
222  int arrayLength = 0;
223  vector<int> ones(dilationDims, 1);
224  CUDNN_ENFORCE(cudnnGetConvolutionNdDescriptor(
225  input,
226  kernel_.size(),
227  &arrayLength,
228  pads_.data(),
229  stride_.data(),
230  ones.data(),
231  &mode,
232  &dataType));
233 
234  CUDNN_ENFORCE(cudnnSetConvolutionNdDescriptor(
235  copy,
236  kernel_.size(),
237  pads_.data(),
238  stride_.data(),
239  ones.data(),
240  mode,
241  dataType));
242  }
243  }
244 
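 // Chooses the cudnn compute type for the convolution: float16 inputs use
 // half-precision accumulation only when float16_compute is requested and the
 // device has compute capability 6.0 or newer; everything else falls back to
 // float32 accumulation.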
245  template <typename T>
246  cudnnDataType_t DetermineComputeTypeFromInput(const T& X) {
247  const cudaDeviceProp& prop = GetDeviceProperty(0);
248  cudnnDataType_t computeType = CUDNN_DATA_FLOAT;
249  if (X.template IsType<float16>()) {
250  if (float16_compute_ && prop.major >= 6) {
251  VLOG(1) << "CUDNN Convolution: float16_compute specified and "
252  << "supported, input data is float16 - using float16 "
253  << "compute.";
254  computeType = CUDNN_DATA_HALF;
255  } else if (float16_compute_) {
256  VLOG(1) << "CUDNN Convolution: float16_compute specified but "
257  << "not supported, input data is float16 - using float32 "
258  << "compute.";
259  } else {
260  VLOG(1) << "CUDNN Convolution: float16_compute not specified but "
261  << "input data is float16 - using float32 compute.";
262  }
263  } else {
264  VLOG(1) << "CUDNN Convolution: using float32 compute.";
265  }
266  return computeType;
267  }
268 
269  void SetConvDescFromArguments() {
270 #if CUDNN_VERSION_MIN(6, 0, 0)
271  if (kernel_.size() == 2) {
272  CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor(
273  conv_desc_,
274  pad_t(),
275  pad_l(),
276  stride_h(),
277  stride_w(),
278  dilation_h(),
279  dilation_w(),
280  CUDNN_CROSS_CORRELATION,
281  compute_type_));
282  } else {
283  CUDNN_ENFORCE(cudnnSetConvolutionNdDescriptor(
284  conv_desc_,
285  kernel_.size(),
286  pads_.data(),
287  stride_.data(),
288  dilation_.data(),
289  CUDNN_CROSS_CORRELATION,
290  compute_type_));
291  }
292 #else
293  if (kernel_.size() == 2) {
294  CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor(
295  conv_desc_,
296  pad_t(),
297  pad_l(),
298  stride_h(),
299  stride_w(),
300  1,
301  1,
302  CUDNN_CROSS_CORRELATION));
303  } else {
304  vector<int> ones(dilation_.size(), 1);
305  CUDNN_ENFORCE(cudnnSetConvolutionNdDescriptor(
306  conv_desc_,
307  kernel_.size(),
308  pads_.data(),
309  stride_.data(),
310  ones.data(),
311  CUDNN_CROSS_CORRELATION,
312  compute_type_));
313  }
314 #endif
315  }
316 
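 // Rewrites an existing convolution descriptor in place with a new compute
 // type while preserving its padding, stride and dilation. The exhaustive
 // search below uses this to benchmark both fp32 and fp16 compute passes.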
317  void SetConvDescComputeType(
318  cudnnConvolutionDescriptor_t conv_desc,
319  cudnnDataType_t math) {
320  if (kernel_.size() == 2) {
321  cudnnConvolutionMode_t mode;
322  cudnnDataType_t dataType;
323  int pad_height = 0;
324  int pad_width = 0;
325  int stride_height = 0;
326  int stride_width = 0;
327  int dilation_height = 0;
328  int dilation_width = 0;
329 
330 #if CUDNN_VERSION_MIN(6, 0, 0)
331  CUDNN_ENFORCE(cudnnGetConvolution2dDescriptor(
332  conv_desc,
333  &pad_height,
334  &pad_width,
335  &stride_height,
336  &stride_width,
337  &dilation_height,
338  &dilation_width,
339  &mode,
340  &dataType
341  ));
342 #else
343  CUDNN_ENFORCE(cudnnGetConvolution2dDescriptor(
344  conv_desc,
345  &pad_height,
346  &pad_width,
347  &stride_height,
348  &stride_width,
349  &dilation_height,
350  &dilation_width,
351  &mode
352  ));
353 #endif
354 
355 #if CUDNN_VERSION_MIN(6, 0, 0)
356  CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor(
357  conv_desc,
358  pad_height,
359  pad_width,
360  stride_height,
361  stride_width,
362  dilation_height,
363  dilation_width,
364  mode,
365  math
366  ));
367 #else
368  CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor(
369  conv_desc,
370  pad_height,
371  pad_width,
372  stride_height,
373  stride_width,
374  dilation_height,
375  dilation_width,
376  mode
377  ));
378 #endif
379  } else {
380  cudnnConvolutionMode_t mode;
381  cudnnDataType_t dataType;
382  int arrayLength = 0;
383  vector<int> ones(dilation_.size(), 1);
384  CUDNN_ENFORCE(cudnnGetConvolutionNdDescriptor(
385  conv_desc,
386  kernel_.size(),
387  &arrayLength,
388  pads_.data(),
389  stride_.data(),
390  ones.data(),
391  &mode,
392  &dataType));
393 
394  CUDNN_ENFORCE(cudnnSetConvolutionNdDescriptor(
395  conv_desc,
396  kernel_.size(),
397  pads_.data(),
398  stride_.data(),
399  ones.data(),
400  mode,
401  math));
402  }
403  }
404 
405  vector<TIndex> cudnn_input_dims_;
406  vector<TIndex> cudnn_filter_dims_;
407 
408  CuDNNWrapper cudnn_wrapper_;
409  cudnnTensorDescriptor_t bottom_desc_;
410  cudnnFilterDescriptor_t filter_desc_;
411  cudnnTensorDescriptor_t bias_desc_;
412  cudnnTensorDescriptor_t top_desc_;
413  // top desc for bias add in case we do group convolution
414  cudnnTensorDescriptor_t top_desc_for_bias_;
415  cudnnConvolutionDescriptor_t conv_desc_;
416  const size_t cudnn_ws_nbytes_limit_;
417  size_t cudnn_ws_nbytes_;
418  bool exhaustive_search_;
419  bool deterministic_;
420  size_t cudnn_state_;
421  vector<int> force_algo_; // stored as FWD, dFILTER, dDATA
422  bool enable_tensor_core_;
423  cudnnDataType_t compute_type_;
424 };
425 
426 class CudnnConvOp final : public CudnnConvOpBase {
427  public:
428  CudnnConvOp(const OperatorDef& operator_def, Workspace* ws)
429  : CudnnConvOpBase(operator_def, ws) {}
430 
431  ~CudnnConvOp() {}
432 
433  template <typename T_X, typename T_W, typename T_B, typename T_Y>
434  bool DoRunWithType();
435 
436  bool RunOnDevice() override;
437 
438  private:
439  cudnnConvolutionFwdAlgo_t algo_;
440  using ConvFwdAlgorithmWithCost = std::tuple<cudnnConvolutionFwdAlgo_t, float>;
441  AlgorithmsCache<ConvFwdAlgorithmWithCost> algo_cache_;
442  // Input: X, W, b
443  // Output: Y
444  INPUT_TAGS(INPUT, FILTER, BIAS);
445 };
446 
447 class CudnnConvGradientOp final : public CudnnConvOpBase {
448  public:
449  CudnnConvGradientOp(const OperatorDef& operator_def, Workspace* ws)
450  : CudnnConvOpBase(operator_def, ws),
451  no_bias_(OperatorBase::GetSingleArgument<int>("no_bias", 0)) {
452  CAFFE_ENFORCE(
453  !(no_bias_ && OutputSize() == 3),
454  "If bias is not present, you should not have 3 grad outputs.");
455 
456  CUDNN_ENFORCE(cudnnCreateConvolutionDescriptor(&bwd_data_conv_desc_));
457  CUDNN_ENFORCE(cudnnCreateConvolutionDescriptor(&bwd_filter_conv_desc_));
458  }
459 
460  ~CudnnConvGradientOp() {
461  CUDNN_ENFORCE(cudnnDestroyConvolutionDescriptor(bwd_data_conv_desc_));
462  CUDNN_ENFORCE(cudnnDestroyConvolutionDescriptor(bwd_filter_conv_desc_));
463  }
464 
465  template <
466  typename T_X,
467  typename T_DY,
468  typename T_W,
469  typename T_B,
470  typename T_DX,
471  typename T_DW,
472  typename T_DB>
473  bool DoRunWithType();
474 
475  bool RunOnDevice() override;
476 
477  private:
478  cudnnConvolutionDescriptor_t bwd_filter_conv_desc_;
479  cudnnConvolutionDescriptor_t bwd_data_conv_desc_;
480  cudnnConvolutionBwdFilterAlgo_t bwd_filter_algo_;
481  cudnnConvolutionBwdDataAlgo_t bwd_data_algo_;
482  using ConvBwdFilterAlgorithmWithCost =
483  std::tuple<cudnnConvolutionBwdFilterAlgo_t, float>;
484  using ConvBwdDataAlgorithmWithCost =
485  std::tuple<cudnnConvolutionBwdDataAlgo_t, float>;
486  AlgorithmsCache<ConvBwdFilterAlgorithmWithCost> filter_algo_cache_;
487  AlgorithmsCache<ConvBwdDataAlgorithmWithCost> data_algo_cache_;
488  bool no_bias_;
489  // input: X, W, dY
490  // output: dW, db, and optionally dX
491  INPUT_TAGS(INPUT, FILTER, OUTPUT_GRAD);
492  OUTPUT_TAGS(FILTER_GRAD, BIAS_OR_INPUT_GRAD, INPUT_GRAD);
493 };
494 
496 // Implementations
498 
499 static constexpr std::array<cudnnDataType_t, 2> kComputeTypesToTry = {
500  CUDNN_DATA_FLOAT,
501  CUDNN_DATA_HALF};
502 static constexpr std::array<const char*, 2> kComputePassNames = {
503  "fp32 compute",
504  "fp16 compute"};
505 
506 template <typename T_X, typename T_W, typename T_B, typename T_Y>
507 bool CudnnConvOp::DoRunWithType() {
508  auto& X = Input(INPUT);
509  auto& filter = Input(FILTER);
510  auto* Y = Output(0);
511 
512  // Figure out the output shape
513  CAFFE_ENFORCE(X.ndim() >= 3 && X.ndim() <= 5);
514  CAFFE_ENFORCE(filter.ndim() >= 3 && filter.ndim() <= 5);
515  const int M = filter.dim32(0);
516  ConvPoolOpBase<CUDAContext>::SetOutputSize(X, Y, filter.dim32(0));
517  int N = 0, C = 0, H = 0, W = 0, D = 0, H_out = 0, W_out = 0, D_out = 0;
518  int group_offset_X = 0, group_offset_Y = 0;
519 
520  switch (order_) {
521  case StorageOrder::NHWC:
522  N = X.dim32(0);
523  H = X.dim32(1);
524  W = X.ndim() > 3 ? X.dim32(2) : 1;
525  D = X.ndim() > 4 ? X.dim32(3) : 1;
526  C = X.dim32(X.ndim() - 1);
527  H_out = Y->dim32(1);
528  W_out = Y->ndim() > 3 ? Y->dim32(2) : 1;
529  D_out = Y->ndim() > 4 ? Y->dim32(3) : 1;
530  for (int i = 0; i < kernel_.size(); ++i) {
531  CAFFE_ENFORCE_EQ(filter.dim32(i + 1), kernel_[i]);
532  }
533  CAFFE_ENFORCE_EQ(filter.dim32(filter.ndim() - 1), C / group_);
534  group_offset_X = C / group_;
535  group_offset_Y = M / group_;
536  break;
537  case StorageOrder::NCHW:
538  N = X.dim32(0);
539  C = X.dim32(1);
540  H = X.dim32(2);
541  W = X.ndim() > 3 ? X.dim32(3) : 1;
542  D = X.ndim() > 4 ? X.dim32(4) : 1;
543  H_out = Y->dim32(2);
544  W_out = Y->ndim() > 3 ? Y->dim32(3) : 1;
545  D_out = Y->ndim() > 4 ? Y->dim32(4) : 1;
546  CAFFE_ENFORCE_EQ(filter.dim32(1), C / group_);
547  for (int i = 0; i < kernel_.size(); ++i) {
548  CAFFE_ENFORCE_EQ(filter.dim32(i + 2), kernel_[i]);
549  }
550  group_offset_X = C / group_ * H * W * D;
551  group_offset_Y = M / group_ * H_out * W_out * D_out;
552  break;
553  default:
554  LOG(FATAL) << "Unknown storage order: " << order_;
555  }
556 
557  CAFFE_ENFORCE(
558  C % group_ == 0,
559  "If you set group, the number of input channels should be divisible "
560  "by group.");
561  CAFFE_ENFORCE(
562  M % group_ == 0,
563  "If you set group, the number of output channels should be divisible "
564  "by group.");
565 
566  int group_offset_filter = filter.size() / group_;
567 
568  // Set up the cudnn algorithms & workspace if necessary
569  bool input_changed = (X.dims() != cudnn_input_dims_);
570  bool filter_changed = (filter.dims() != cudnn_filter_dims_);
571  if (input_changed || filter_changed) {
572  VLOG(1) << "Changing the cudnn descriptor configurations.";
573  if (input_changed) {
574  cudnn_input_dims_ = X.dims();
575  SetTensorNdDescriptorWithGroup<T_X>(
576  X.ndim(), bottom_desc_, N, C, H, W, D);
577  }
578  if (filter_changed) {
579  cudnn_filter_dims_ = filter.dims();
580  if (kernel_.size() == 2) {
581 #if CUDNN_VERSION_MIN(7, 0, 0)
582  const int MM = M;
583 #else
584  const int MM = M / group_;
585 #endif
586  CUDNN_ENFORCE(cudnnSetFilter4dDescriptor(
587  filter_desc_,
588  cudnnTypeWrapper<T_W>::type,
589  GetCudnnTensorFormat(order_),
590  MM,
591  C / group_,
592  kernel_h(),
593  kernel_w()));
594  } else {
595  vector<int> dims(filter.dims().begin(), filter.dims().end());
596  dims[0] /= group_;
597 #if !CUDNN_VERSION_MIN(7, 0, 0)
598  order_ == StorageOrder::NCHW ? dims[1] /= group_
599  : dims[filter.ndim() - 1] /= group_;
600 #endif
601  dims[filter.ndim() - 1] /= group_;
602  CUDNN_ENFORCE(cudnnSetFilterNdDescriptor(
603  filter_desc_,
604  cudnnTypeWrapper<T_W>::type,
605  GetCudnnTensorFormat(order_),
606  dims.size(),
607  dims.data()));
608  }
609  if (InputSize() == 3) {
610  if (kernel_.size() == 2) {
611  CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
612  bias_desc_,
613  GetCudnnTensorFormat(order_),
614  cudnnTypeWrapper<T_B>::type,
615  1,
616  M,
617  1,
618  1));
619  } else {
620  std::vector<int> bias_dims(X.ndim(), 1);
621  bias_dims[1] = M;
622  std::vector<int> strides = {M, 1, 1, 1, 1, 1};
623  CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
624  bias_desc_,
625  cudnnTypeWrapper<T_B>::type,
626  X.ndim() > 3 ? X.ndim() : 4,
627  bias_dims.data(),
628  strides.data()));
629  }
630  }
631  }
632  // Set the output
633  SetTensorNdDescriptorWithGroup<T_Y>(
634  X.ndim(), top_desc_, N, M, H_out, W_out, D_out);
635  // Set the output with descriptor useful for bias addition in one run.
636  if (kernel_.size() == 2) {
637  CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
638  top_desc_for_bias_,
639  GetCudnnTensorFormat(order_),
640  cudnnTypeWrapper<T_Y>::type,
641  N,
642  M,
643  H_out,
644  W_out));
645  } else {
646  vector<int> dims = {N, M, H_out, W_out, D_out};
647  vector<int> strides = {M * H_out * W_out * D_out,
648  H_out * W_out * D_out,
649  W_out * D_out,
650  D_out,
651  1};
652  CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
653  top_desc_for_bias_,
654  cudnnTypeWrapper<T_Y>::type,
655  X.ndim() > 3 ? X.ndim() : 4,
656  dims.data(),
657  strides.data()));
658  }
659 
660  compute_type_ = DetermineComputeTypeFromInput(X);
661  SetConvDescFromArguments();
662 
663 #if CUDNN_VERSION_MIN(7, 0, 0)
664  if (enable_tensor_core_) {
665  CUDNN_ENFORCE(
666  cudnnSetConvolutionMathType(conv_desc_, CUDNN_TENSOR_OP_MATH));
667  }
668 
669  // enable cuDNN conv groups
670  CUDNN_CHECK(cudnnSetConvolutionGroupCount(conv_desc_, group_));
671 #endif
672 
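 // Pick the forward algorithm: an explicit force_algo_fwd wins, deterministic
 // mode pins IMPLICIT_PRECOMP_GEMM, exhaustive_search benchmarks candidates
 // with cudnnFindConvolutionForwardAlgorithmEx, and otherwise cuDNN's
 // heuristic chooses within the workspace limit.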
673  if (force_algo_[ALGO_FWD] >= 0) {
674  algo_ = (cudnnConvolutionFwdAlgo_t)force_algo_[ALGO_FWD];
675  } else if (deterministic_) {
676  algo_ = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
677  } else if (exhaustive_search_) {
678  // Even when FP16 compute is supported and requested, try FP32
679  // because it may be faster. However, if FP32 compute is specified,
680  // FP16 is not a suitable alternative - early out from the loop.
681  std::array<ConvFwdAlgorithmWithCost, 2> algosToCompare;
682  for (int i = 0; i < 2; i++) {
683  SetConvDescComputeType(conv_desc_, kComputeTypesToTry[i]);
684 
685  algosToCompare[i] = algo_cache_.getAlgorithm(
686  X.dims(), filter.dims(), kComputeTypesToTry[i], [&]() {
687  VLOG(1) << "CUDNN Convolution fwd: doing exhaustive "
688  << "search for " << kComputePassNames[i];
689  // When we do an exhaustive search, we will ignore the workspace
690  // size limit and simply go for the fastest algorithm. If you
691  // happen to run out of memory later, you will be on your own...
692  int returned_algo_count;
693  std::array<cudnnConvolutionFwdAlgoPerf_t, kNUM_CUDNN_FWD_ALGS>
694  fwd_perf_stat;
695 
696  // No need to clean up the workspace; it is owned by the cudnn state.
697  cudnn_wrapper_.with_cudnn_state(
698  cudnn_state_, [&](CuDNNState* state) {
699  // Actually run the search.
700  CUDNN_ENFORCE(cudnnFindConvolutionForwardAlgorithmEx(
701  state->cudnn_handle(),
702  bottom_desc_,
703  X.template data<T_X>(),
704  filter_desc_,
705  filter.template data<T_W>(),
706  conv_desc_,
707  top_desc_,
708  Y->template mutable_data<T_Y>(),
709  kNUM_CUDNN_FWD_ALGS,
710  &returned_algo_count,
711  fwd_perf_stat.data(),
712  state->workspace().get(cudnn_ws_nbytes_limit_),
713  cudnn_ws_nbytes_limit_));
714  });
715  LogCuDNNPerfStats(fwd_perf_stat, returned_algo_count);
716  float algo_time = fwd_perf_stat[0].status == CUDNN_STATUS_SUCCESS
717  ? fwd_perf_stat[0].time
718  : 1e10;
719  return ConvFwdAlgorithmWithCost(fwd_perf_stat[0].algo, algo_time);
720  });
721 
722  // When set to fp32 compute, don't try fp16
723  if (compute_type_ == CUDNN_DATA_FLOAT) {
724  break;
725  }
726  }
727 
728  if (compute_type_ == CUDNN_DATA_FLOAT) {
729  // For FP32 compute, just use the best FP32 algorithm
730  algo_ = std::get<0>(algosToCompare[0]);
731  } else {
732  // For FP16 compute, choose algo with fastest execution
733  int bestAlgoIndex =
734  (std::get<1>(algosToCompare[0]) < std::get<1>(algosToCompare[1]))
735  ? 0
736  : 1;
737  algo_ = std::get<0>(algosToCompare[bestAlgoIndex]);
738  SetConvDescComputeType(conv_desc_, kComputeTypesToTry[bestAlgoIndex]);
739  }
740  } else {
741  // Get the convolution algorithm based on the workspace limit.
742  CUDNN_ENFORCE(cudnnGetConvolutionForwardAlgorithm(
743  cudnn_wrapper_.inline_cudnn_handle(),
744  bottom_desc_,
745  filter_desc_,
746  conv_desc_,
747  top_desc_,
748  CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
749  cudnn_ws_nbytes_limit_,
750  &algo_));
751  }
752  CUDNN_ENFORCE(cudnnGetConvolutionForwardWorkspaceSize(
753  cudnn_wrapper_.inline_cudnn_handle(),
754  bottom_desc_,
755  filter_desc_,
756  conv_desc_,
757  top_desc_,
758  algo_,
759  &cudnn_ws_nbytes_));
760  VLOG(1) << "CuDNN algorithm: " << algo_;
761  VLOG(1) << "CuDNN workspace size: " << cudnn_ws_nbytes_;
762  }
763 
764  // Now, actually run the computation.
765  // Run directly through cuDNN if possible
766 #if CUDNN_VERSION_MIN(7,0,0)
767  cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) {
768  CUDNN_ENFORCE(cudnnConvolutionForward(
769  state->cudnn_handle(),
770  cudnnTypeWrapper<T_X>::kOne(),
771  bottom_desc_,
772  X.template data<T_X>(),
773  filter_desc_,
774  filter.template data<T_W>(),
775  conv_desc_,
776  algo_,
777  state->workspace().get(cudnn_ws_nbytes_),
778  cudnn_ws_nbytes_,
779  cudnnTypeWrapper<T_Y>::kZero(),
780  top_desc_,
781  Y->template mutable_data<T_Y>()));
782  });
783 #else
784  // otherwise manually run through groups
785  for (int i = 0; i < group_; ++i) {
786  cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) {
787  CUDNN_ENFORCE(cudnnConvolutionForward(
788  state->cudnn_handle(),
789  cudnnTypeWrapper<T_X>::kOne(),
790  bottom_desc_,
791  X.template data<T_X>() + i * group_offset_X,
792  filter_desc_,
793  filter.template data<T_W>() + i * group_offset_filter,
794  conv_desc_,
795  algo_,
796  state->workspace().get(cudnn_ws_nbytes_),
797  cudnn_ws_nbytes_,
798  cudnnTypeWrapper<T_Y>::kZero(),
799  top_desc_,
800  Y->template mutable_data<T_Y>() + i * group_offset_Y));
801  });
802  }
803 #endif
804  // Bias
805  if (InputSize() == 3) {
806  auto& bias = Input(BIAS);
807 
808  CAFFE_ENFORCE_EQ(bias.ndim(), 1);
809  CAFFE_ENFORCE_EQ(bias.dim32(0), M);
810 
811  CUDNN_ENFORCE(cudnnAddTensor(
812  cudnn_wrapper_.inline_cudnn_handle(),
813  cudnnTypeWrapper<T_B>::kOne(),
814  bias_desc_,
815  bias.template data<T_B>(),
816  cudnnTypeWrapper<T_Y>::kOne(),
817  top_desc_for_bias_,
818  Y->template mutable_data<T_Y>()));
819  }
820  // Done.
821  return true;
822 }
823 
824 bool CudnnConvOp::RunOnDevice() {
825  if (Input(0).IsType<float>()) {
826  return DoRunWithType<
827  float, // X
828  float, // W
829  float, // B
830  float>(); // Y
831  } else if (Input(0).IsType<float16>()) {
832  return DoRunWithType<
833  float16, // X
834  float16, // W
835  float16, // B
836  float16>(); // Y
837  } else {
838  LOG(FATAL) << "Only float (32bit) and float16 are supported by "
839  << "cudnn convolution, but input " << debug_def().input(0)
840  << " has [" << Input(0).meta().name() << "]";
841  }
842  return true;
843 }
844 
845 template <
846  typename T_X,
847  typename T_DY,
848  typename T_W,
849  typename T_B,
850  typename T_DX,
851  typename T_DW,
852  typename T_DB>
853 bool CudnnConvGradientOp::DoRunWithType() {
854  auto& X = Input(INPUT);
855  auto& filter = Input(FILTER);
856  auto& dY = Input(OUTPUT_GRAD);
857  auto* dfilter = Output(FILTER_GRAD);
858 
859  CAFFE_ENFORCE(X.ndim() >= 3 && X.ndim() <= 5);
860  CAFFE_ENFORCE(filter.ndim() >= 3 && filter.ndim() <= 5);
861 
862  const int M = filter.dim32(0);
863  int N = 0, C = 0, H = 0, W = 0, D = 0, H_out = 0, W_out = 0, D_out = 0;
864  int group_offset_X = 0, group_offset_Y = 0;
865 
866  switch (order_) {
867  case StorageOrder::NHWC:
868  N = X.dim32(0);
869  H = X.dim32(1);
870  W = X.ndim() > 3 ? X.dim32(2) : 1;
871  D = X.ndim() > 4 ? X.dim32(3) : 1;
872  C = X.dim32(X.ndim() - 1);
873  H_out = dY.dim32(1);
874  W_out = dY.ndim() > 3 ? dY.dim32(2) : 1;
875  D_out = dY.ndim() > 4 ? dY.dim32(3) : 1;
876  for (int i = 0; i < kernel_.size(); ++i) {
877  CAFFE_ENFORCE_EQ(filter.dim32(i + 1), kernel_[i]);
878  }
879  CAFFE_ENFORCE_EQ(filter.dim32(filter.ndim() - 1), C / group_);
880  group_offset_X = C / group_;
881  group_offset_Y = M / group_;
882  break;
883  case StorageOrder::NCHW:
884  N = X.dim32(0);
885  C = X.dim32(1);
886  H = X.dim32(2);
887  W = X.ndim() > 3 ? X.dim32(3) : 1;
888  D = X.ndim() > 4 ? X.dim32(4) : 1;
889  H_out = dY.dim32(2);
890  W_out = dY.ndim() > 3 ? dY.dim32(3) : 1;
891  D_out = dY.ndim() > 4 ? dY.dim32(4) : 1;
892  CAFFE_ENFORCE_EQ(filter.dim32(1), C / group_);
893  for (int i = 0; i < kernel_.size(); ++i) {
894  CAFFE_ENFORCE_EQ(filter.dim32(i + 2), kernel_[i]);
895  }
896  group_offset_X = C / group_ * H * W * D;
897  group_offset_Y = M / group_ * H_out * W_out * D_out;
898  break;
899  default:
900  LOG(FATAL) << "Unknown storage order: " << order_;
901  }
902 
903  CAFFE_ENFORCE(
904  C % group_ == 0,
905  "If you set group, the number of input channels should be divisible "
906  "by group.");
907  CAFFE_ENFORCE(
908  M % group_ == 0,
909  "If you set group, the number of output channels should be divisible "
910  "by group.");
911 
912  int group_offset_filter = filter.size() / group_;
913  if (kernel_.size() == 1) {
914  ConvPoolOpBase<CUDAContext>::ComputePads({H});
915  } else if (kernel_.size() == 2) {
916  ConvPoolOpBase<CUDAContext>::ComputePads({H, W});
917  } else if (kernel_.size() == 3) {
918  ConvPoolOpBase<CUDAContext>::ComputePads({H, W, D});
919  } else {
920  CAFFE_THROW("Unsupported kernel size:", kernel_.size());
921  }
922  dfilter->ResizeLike(filter);
923 
924  // Set up the cudnn algorithms & workspace if necessary
925  bool input_changed = (X.dims() != cudnn_input_dims_);
926  bool filter_changed = (filter.dims() != cudnn_filter_dims_);
927  if (input_changed || filter_changed) {
928  VLOG(1) << "Changing the cudnn descriptor configurations.";
929  if (input_changed) {
930  cudnn_input_dims_ = X.dims();
931  SetTensorNdDescriptorWithGroup<T_X>(
932  X.ndim(), bottom_desc_, N, C, H, W, D);
933  }
934  if (filter_changed) {
935  cudnn_filter_dims_ = filter.dims();
936  if (kernel_.size() == 2) {
937 #if CUDNN_VERSION_MIN(7, 0, 0)
938  const int MM = M;
939 #else
940  const int MM = M / group_;
941 #endif
942  CUDNN_ENFORCE(cudnnSetFilter4dDescriptor(
943  filter_desc_,
944  cudnnTypeWrapper<T_W>::type,
945  GetCudnnTensorFormat(order_),
946  MM,
947  C / group_,
948  kernel_h(),
949  kernel_w()));
950  } else {
951  vector<int> dims(filter.dims().begin(), filter.dims().end());
952 #if !CUDNN_VERSION_MIN(7, 0, 0)
953  dims[0] /= group_;
954 #endif
955  order_ == StorageOrder::NCHW ? dims[1] /= group_
956  : dims[filter.ndim() - 1] /= group_;
957  CUDNN_ENFORCE(cudnnSetFilterNdDescriptor(
958  filter_desc_,
959  cudnnTypeWrapper<T_W>::type,
960  GetCudnnTensorFormat(order_),
961  dims.size(),
962  dims.data()));
963  }
964  if (!no_bias_) {
965  if (kernel_.size() == 2) {
966  CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
967  bias_desc_,
968  GetCudnnTensorFormat(order_),
969  cudnnTypeWrapper<T_B>::type,
970  1,
971  M,
972  1,
973  1));
974  } else {
975  std::vector<int> bias_dims(X.ndim(), 1);
976  bias_dims[1] = M;
977  std::vector<int> strides = {M, 1, 1, 1, 1, 1};
978  CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
979  bias_desc_,
980  cudnnTypeWrapper<T_B>::type,
981  X.ndim() > 3 ? X.ndim() : 4,
982  bias_dims.data(),
983  strides.data()));
984  }
985  }
986  }
987  // Set the output
988  SetTensorNdDescriptorWithGroup<T_DX>(
989  X.ndim(), top_desc_, N, M, H_out, W_out, D_out);
990  // Set the output with descriptor useful for bias addition in one run.
991  if (kernel_.size() == 2) {
992  CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
993  top_desc_for_bias_,
994  GetCudnnTensorFormat(order_),
995  cudnnTypeWrapper<T_DY>::type,
996  N,
997  M,
998  H_out,
999  W_out));
1000  } else {
1001  vector<int> dims = {N, M, H_out, W_out, D_out};
1002  vector<int> strides = {M * H_out * W_out * D_out,
1003  H_out * W_out * D_out,
1004  W_out * D_out,
1005  D_out,
1006  1};
1007  CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
1008  top_desc_for_bias_,
1009  cudnnTypeWrapper<T_DY>::type,
1010  X.ndim() > 3 ? X.ndim() : 4,
1011  dims.data(),
1012  strides.data()));
1013  }
1014 
1015  compute_type_ = DetermineComputeTypeFromInput(X);
1016  SetConvDescFromArguments();
1017 
1018  DuplicateConvDesc(
1019  conv_desc_, kernel_.size(), dilation_.size(), bwd_filter_conv_desc_);
1020  DuplicateConvDesc(
1021  conv_desc_, kernel_.size(), dilation_.size(), bwd_data_conv_desc_);
1022 
1023 #if CUDNN_VERSION_MIN(7, 0, 0)
1024  if (enable_tensor_core_) {
1025  CUDNN_ENFORCE(cudnnSetConvolutionMathType(
1026  bwd_filter_conv_desc_, CUDNN_TENSOR_OP_MATH));
1027  CUDNN_ENFORCE(cudnnSetConvolutionMathType(
1028  bwd_data_conv_desc_, CUDNN_TENSOR_OP_MATH));
1029  }
1030 
1031  // set cuDNN groups if appropriate
1032  CUDNN_CHECK(cudnnSetConvolutionGroupCount(bwd_filter_conv_desc_, group_));
1033  CUDNN_CHECK(cudnnSetConvolutionGroupCount(bwd_data_conv_desc_, group_));
1034 #endif
1035 
1036  // Choose dW algorithm
1037  if (force_algo_[ALGO_WGRAD] >= 0) {
1038  bwd_filter_algo_ =
1039  (cudnnConvolutionBwdFilterAlgo_t)force_algo_[ALGO_WGRAD];
1040  } else if (deterministic_) {
1041  bwd_filter_algo_ = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
1042  } else if (exhaustive_search_) {
1043  // Even when FP16 compute is supported and requested, try FP32
1044  // because it may be faster. However, if FP32 compute is specified,
1045  // FP16 is not a suitable alternative - early out from the loop.
1046  std::array<ConvBwdFilterAlgorithmWithCost, 2> algosToCompare;
1047  for (int i = 0; i < 2; i++) {
1048  SetConvDescComputeType(bwd_filter_conv_desc_, kComputeTypesToTry[i]);
1049 
1050  algosToCompare[i] = filter_algo_cache_.getAlgorithm(
1051  X.dims(), filter.dims(), kComputeTypesToTry[i], [&]() {
1052  VLOG(1) << "CUDNN Convolution bwd: doing filter exhaustive "
1053  << "search for " << kComputePassNames[i];
1054  // When we do an exhaustive search, we will ignore the workspace
1055  // size limit and simply go for the fastest algorithm. If you
1056  // happen to run out of memory later, you will be on your own...
1057  int returned_algo_count;
1058  // We clean up the current workspace memory so that the forward
1059  // algorithm is free to allocate memory.
1060  // Actually run the search.
1061  std::array<
1062  cudnnConvolutionBwdFilterAlgoPerf_t,
1063  kNUM_CUDNN_BWD_FILTER_ALGS>
1064  filter_perf_stat;
1065 
1066  cudnn_wrapper_.with_cudnn_state(
1067  cudnn_state_, [&](CuDNNState* state) {
1068  CUDNN_ENFORCE(cudnnFindConvolutionBackwardFilterAlgorithmEx(
1069  state->cudnn_handle(),
1070  bottom_desc_,
1071  X.template data<T_X>(),
1072  top_desc_,
1073  dY.template data<T_DY>(),
1074  bwd_filter_conv_desc_,
1075  filter_desc_,
1076  dfilter->template mutable_data<T_DW>(),
1077  kNUM_CUDNN_BWD_FILTER_ALGS,
1078  &returned_algo_count,
1079  filter_perf_stat.data(),
1080  state->workspace().get(cudnn_ws_nbytes_limit_),
1081  cudnn_ws_nbytes_limit_));
1082  });
1083  LogCuDNNPerfStats(filter_perf_stat, returned_algo_count);
1084  float algo_time =
1085  filter_perf_stat[0].status == CUDNN_STATUS_SUCCESS
1086  ? filter_perf_stat[0].time
1087  : 1e10;
1088  return ConvBwdFilterAlgorithmWithCost(
1089  filter_perf_stat[0].algo, algo_time);
1090  });
1091 
1092  // When set to fp32 compute, don't try fp16
1093  if (compute_type_ == CUDNN_DATA_FLOAT) {
1094  break;
1095  }
1096  }
1097 
1098  if (compute_type_ == CUDNN_DATA_FLOAT) {
1099  // For FP32 compute, just use the best FP32 algorithm
1100  bwd_filter_algo_ = std::get<0>(algosToCompare[0]);
1101  } else {
1102  // For FP16 compute, choose algo with fastest execution
1103  int bestAlgoIndex =
1104  (std::get<1>(algosToCompare[0]) < std::get<1>(algosToCompare[1]))
1105  ? 0
1106  : 1;
1107  bwd_filter_algo_ = std::get<0>(algosToCompare[bestAlgoIndex]);
1108  SetConvDescComputeType(
1109  bwd_filter_conv_desc_, kComputeTypesToTry[bestAlgoIndex]);
1110  }
1111  } else {
1112  // choose backward algorithm for filter
1113  CUDNN_ENFORCE(cudnnGetConvolutionBackwardFilterAlgorithm(
1114  cudnn_wrapper_.inline_cudnn_handle(),
1115  bottom_desc_,
1116  top_desc_,
1117  bwd_filter_conv_desc_,
1118  filter_desc_,
1119  CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
1120  cudnn_ws_nbytes_limit_,
1121  &bwd_filter_algo_));
1122  }
1123  // Pick dX algo if needed
1124  if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) {
1125  if (force_algo_[ALGO_DGRAD] >= 0) {
1126  bwd_data_algo_ = (cudnnConvolutionBwdDataAlgo_t)force_algo_[ALGO_DGRAD];
1127  } else if (deterministic_) {
1128  bwd_data_algo_ = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
1129  } else if (exhaustive_search_) {
1130  // Even when FP16 compute is supported and requested, try FP32
1131  // because it may be faster. However, if FP32 compute is specified,
1132  // FP16 is not a suitable alternative - early out from the loop.
1133  std::array<ConvBwdDataAlgorithmWithCost, 2> algosToCompare;
1134  for (int i = 0; i < 2; i++) {
1135  SetConvDescComputeType(bwd_data_conv_desc_, kComputeTypesToTry[i]);
1136 
1137  algosToCompare[i] = data_algo_cache_.getAlgorithm(
1138  X.dims(), filter.dims(), kComputeTypesToTry[i], [&]() {
1139  VLOG(1) << "CUDNN Convolution bwd: doing data exhaustive "
1140  << "search for " << kComputePassNames[i];
1141  int returned_algo_count;
1142 
1143  std::array<
1144  cudnnConvolutionBwdDataAlgoPerf_t,
1145  kNUM_CUDNN_BWD_DATA_ALGS>
1146  data_perf_stat;
1147  cudnn_wrapper_.with_cudnn_state(
1148  cudnn_state_, [&](CuDNNState* state) {
1149  auto* dX =
1150  Output(no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD);
1151  dX->ResizeLike(X);
1152  const T_W* filter_data = filter.template data<T_W>();
1153  const T_DY* dYdata = dY.template data<T_DY>();
1154  T_DX* dXdata = dX->template mutable_data<T_DX>();
1155  CUDNN_ENFORCE(cudnnFindConvolutionBackwardDataAlgorithmEx(
1156  state->cudnn_handle(),
1157  filter_desc_,
1158  filter_data,
1159  top_desc_,
1160  dYdata,
1161  bwd_data_conv_desc_,
1162  bottom_desc_,
1163  dXdata,
1164  kNUM_CUDNN_BWD_DATA_ALGS,
1165  &returned_algo_count,
1166  data_perf_stat.data(),
1167  state->workspace().get(cudnn_ws_nbytes_limit_),
1168  cudnn_ws_nbytes_limit_));
1169  });
1170 
1171  LogCuDNNPerfStats(data_perf_stat, returned_algo_count);
1172  float algo_time =
1173  data_perf_stat[0].status == CUDNN_STATUS_SUCCESS
1174  ? data_perf_stat[0].time
1175  : 1e10;
1176  return ConvBwdDataAlgorithmWithCost(
1177  data_perf_stat[0].algo, algo_time);
1178  });
1179 
1180  // When set to fp32 compute, don't try fp16
1181  if (compute_type_ == CUDNN_DATA_FLOAT) {
1182  break;
1183  }
1184  }
1185 
1186  if (compute_type_ == CUDNN_DATA_FLOAT) {
1187  // For FP32 compute, just use the best FP32 algorithm
1188  bwd_data_algo_ = std::get<0>(algosToCompare[0]);
1189  } else {
1190  // For FP16 compute, choose algo with fastest execution
1191  int bestAlgoIndex =
1192  (std::get<1>(algosToCompare[0]) < std::get<1>(algosToCompare[1]))
1193  ? 0
1194  : 1;
1195  bwd_data_algo_ = std::get<0>(algosToCompare[bestAlgoIndex]);
1196  SetConvDescComputeType(
1197  bwd_data_conv_desc_, kComputeTypesToTry[bestAlgoIndex]);
1198  }
1199  } else {
1200  CUDNN_ENFORCE(cudnnGetConvolutionBackwardDataAlgorithm(
1201  cudnn_wrapper_.inline_cudnn_handle(),
1202  filter_desc_,
1203  top_desc_,
1204  bwd_data_conv_desc_,
1205  bottom_desc_,
1206  CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
1207  cudnn_ws_nbytes_limit_,
1208  &bwd_data_algo_));
1209  }
1210  }
1211 
1212  // get workspace size for backwards filter algorithm
1213  size_t bwd_filter_ws_size, bwd_data_ws_size;
1214 
1215  CUDNN_ENFORCE(cudnnGetConvolutionBackwardFilterWorkspaceSize(
1216  cudnn_wrapper_.inline_cudnn_handle(),
1217  bottom_desc_,
1218  top_desc_,
1219  bwd_filter_conv_desc_,
1220  filter_desc_,
1221  bwd_filter_algo_,
1222  &bwd_filter_ws_size));
1223  if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) {
1224  // get workspace size for backwards data algorithm
1225  CUDNN_ENFORCE(cudnnGetConvolutionBackwardDataWorkspaceSize(
1226  cudnn_wrapper_.inline_cudnn_handle(),
1227  filter_desc_,
1228  top_desc_,
1229  bwd_data_conv_desc_,
1230  bottom_desc_,
1231  bwd_data_algo_,
1232  &bwd_data_ws_size));
1233  } else {
1234  bwd_data_ws_size = 0;
1235  }
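 // Both backward passes share a single workspace, so reserve the larger of
 // the two sizes.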
1236  cudnn_ws_nbytes_ = std::max(bwd_filter_ws_size, bwd_data_ws_size);
1237 
1238  VLOG(1) << "CuDNN bwd data & filter algorithm: " << bwd_data_algo_ << ", "
1239  << bwd_filter_algo_;
1240  VLOG(1) << "CuDNN workspace size: " << cudnn_ws_nbytes_;
1241  }
1242 
1243  // Now, actually run the computation.
1244  if (!no_bias_) {
1245  auto* dbias = Output(BIAS_OR_INPUT_GRAD);
1246  dbias->Resize(M);
1247  CUDNN_ENFORCE(cudnnConvolutionBackwardBias(
1248  cudnn_wrapper_.inline_cudnn_handle(),
1249  cudnnTypeWrapper<T_DY>::kOne(),
1250  top_desc_for_bias_,
1251  dY.template data<T_DY>(),
1252  cudnnTypeWrapper<T_DB>::kZero(),
1253  bias_desc_,
1254  dbias->template mutable_data<T_DB>()));
1255  }
1256 
1257 #if CUDNN_VERSION_MIN(7, 0, 0)
1258  cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) {
1259  CUDNN_ENFORCE(cudnnConvolutionBackwardFilter(
1260  state->cudnn_handle(),
1261  cudnnTypeWrapper<T_X>::kOne(),
1262  bottom_desc_,
1263  X.template data<T_X>(),
1264  top_desc_,
1265  dY.template data<T_DY>(),
1266  bwd_filter_conv_desc_,
1267  bwd_filter_algo_,
1268  state->workspace().get(cudnn_ws_nbytes_),
1269  cudnn_ws_nbytes_,
1270  cudnnTypeWrapper<T_DW>::kZero(),
1271  filter_desc_,
1272  dfilter->template mutable_data<T_DW>()));
1273  if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) {
1274  // Compute the gradient w.r.t. the input.
1275  auto* dX = Output(no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD);
1276  dX->ResizeLike(X);
1277  CUDNN_ENFORCE(cudnnConvolutionBackwardData(
1278  state->cudnn_handle(),
1279  cudnnTypeWrapper<T_W>::kOne(),
1280  filter_desc_,
1281  filter.template data<T_W>(),
1282  top_desc_,
1283  dY.template data<T_DY>(),
1284  bwd_data_conv_desc_,
1285  bwd_data_algo_,
1286  state->workspace().get(cudnn_ws_nbytes_),
1287  cudnn_ws_nbytes_,
1288  cudnnTypeWrapper<T_DX>::kZero(),
1289  bottom_desc_,
1290  dX->template mutable_data<T_DX>()));
1291  }
1292  });
1293 #else
1294  for (int i = 0; i < group_; ++i) {
1295  cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) {
1296  CUDNN_ENFORCE(cudnnConvolutionBackwardFilter(
1297  state->cudnn_handle(),
1298  cudnnTypeWrapper<T_X>::kOne(),
1299  bottom_desc_,
1300  X.template data<T_X>() + i * group_offset_X,
1301  top_desc_,
1302  dY.template data<T_DY>() + i * group_offset_Y,
1303  bwd_filter_conv_desc_,
1304  bwd_filter_algo_,
1305  state->workspace().get(cudnn_ws_nbytes_),
1306  cudnn_ws_nbytes_,
1307  cudnnTypeWrapper<T_DW>::kZero(),
1308  filter_desc_,
1309  dfilter->template mutable_data<T_DW>() + i * group_offset_filter));
1310  if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) {
1311  // Compute the gradient w.r.t. the input.
1312  auto* dX = Output(no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD);
1313  dX->ResizeLike(X);
1314  CUDNN_ENFORCE(cudnnConvolutionBackwardData(
1315  state->cudnn_handle(),
1316  cudnnTypeWrapper<T_W>::kOne(),
1317  filter_desc_,
1318  filter.template data<T_W>() + i * group_offset_filter,
1319  top_desc_,
1320  dY.template data<T_DY>() + i * group_offset_Y,
1321  bwd_data_conv_desc_,
1322  bwd_data_algo_,
1323  state->workspace().get(cudnn_ws_nbytes_),
1324  cudnn_ws_nbytes_,
1325  cudnnTypeWrapper<T_DX>::kZero(),
1326  bottom_desc_,
1327  dX->template mutable_data<T_DX>() + i * group_offset_X));
1328  }
1329  });
1330  }
1331 #endif
1332  return true;
1333 }
1334 
1335 // TODO(Yangqing): a lot of the function contents are very similar. Consider
1336 // consolidating them.
1337 bool CudnnConvGradientOp::RunOnDevice() {
1338  if (Input(0).IsType<float>()) {
1339  return DoRunWithType<
1340  float, // X
1341  float, // dY
1342  float, // W
1343  float, // b
1344  float, // dX
1345  float, // dW
1346  float>(); // db
1347  } else if (Input(0).IsType<float16>()) {
1348  return DoRunWithType<
1349  float16, // X
1350  float16, // dY
1351  float16, // W
1352  float16, // b
1353  float16, // dX
1354  float16, // dW
1355  float16>(); // db
1356  } else {
1357  LOG(FATAL) << "Unsupported input types";
1358  }
1359  return true;
1360 }
1361 
1362 REGISTER_CUDNN_OPERATOR(Conv, CudnnConvOp);
1363 REGISTER_CUDNN_OPERATOR(ConvGradient, CudnnConvGradientOp);
1364 
1365 REGISTER_CUDNN_OPERATOR(Conv1D, CudnnConvOp);
1366 REGISTER_CUDNN_OPERATOR(Conv1DGradient, CudnnConvGradientOp);
1367 
1368 REGISTER_CUDNN_OPERATOR(Conv2D, CudnnConvOp);
1369 REGISTER_CUDNN_OPERATOR(Conv2DGradient, CudnnConvGradientOp);
1370 
1371 REGISTER_CUDNN_OPERATOR(Conv3D, CudnnConvOp);
1372 REGISTER_CUDNN_OPERATOR(Conv3DGradient, CudnnConvGradientOp);
1373 
1374 } // namespace caffe2
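For reference, a minimal usage sketch showing how the arguments read in the CudnnConvOpBase constructor above can be set on an OperatorDef. It is illustrative only: the blob names "X", "W", "b" and "Y" are hypothetical, and it assumes the AddArgument helper from caffe2/utils/proto_utils.h, CreateOperator from caffe2/core/operator.h, and CUDA tensors already present in the workspace.

#include <memory>
#include "caffe2/core/operator.h"
#include "caffe2/core/workspace.h"
#include "caffe2/utils/proto_utils.h"

void RunCudnnConvSketch(caffe2::Workspace* ws) {
  caffe2::OperatorDef def;
  def.set_type("Conv");
  def.set_engine("CUDNN");  // selects CudnnConvOp via REGISTER_CUDNN_OPERATOR(Conv, ...)
  def.add_input("X");       // activations (hypothetical blob names)
  def.add_input("W");       // filters
  def.add_input("b");       // optional bias
  def.add_output("Y");
  def.mutable_device_option()->set_device_type(caffe2::CUDA);
  caffe2::AddArgument<int>("kernel", 3, &def);
  caffe2::AddArgument<int>("exhaustive_search", 1, &def);  // benchmark algorithms once
  caffe2::AddArgument<int>("ws_nbytes_limit", 64 * 1024 * 1024, &def);
  caffe2::AddArgument<int>("enable_tensor_core", 1, &def);
  // force_algo_fwd / force_algo_dgrad / force_algo_wgrad can pin algorithms instead.
  std::unique_ptr<caffe2::OperatorBase> op = caffe2::CreateOperator(def, ws);
  CAFFE_ENFORCE(op->Run());
}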