Caffe2 - C++ API
A deep learning, cross-platform ML framework
relu_op_cudnn.cc
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/cudnn_wrappers.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/types.h"

namespace caffe2 {

class CuDNNReluOp final : public Operator<CUDAContext> {
 public:
  CuDNNReluOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<CUDAContext>(operator_def, ws),
        cudnn_wrapper_(&context_),
        order_(StringToStorageOrder(
            OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {
    CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&data_desc_));
    CUDNN_ENFORCE(cudnnCreateActivationDescriptor(&activ_desc_));
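    // CUDNN_PROPAGATE_NAN lets NaN inputs propagate through the activation;
    // the trailing 0.0 is cuDNN's "coef" argument, which is only consulted by
    // the clipped-ReLU/ELU modes and is ignored for CUDNN_ACTIVATION_RELU.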
    CUDNN_ENFORCE(cudnnSetActivationDescriptor(
        activ_desc_, CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, 0.0));
  }

  ~CuDNNReluOp() {
    CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(data_desc_));
    CUDNN_ENFORCE(cudnnDestroyActivationDescriptor(activ_desc_));
  }

  template <typename T>
  bool DoRunWithType() {
    const auto& X = Input(0);
    auto* Y = Output(0);

    // Return if X is empty.
    if (X.size() == 0) {
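      // Even when X is empty we call mutable_data<T>() so that the output
      // blob is materialized with the correct data type.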
      Y->mutable_data<T>();
      return true;
    }

    // See if we need to reshape.
    if (X.dims() != cudnn_input_dims_) {
      VLOG(1) << "Setting descriptors.";
      cudnn_input_dims_ = X.dims();
      int C = 1, H = 1, W = 1;
      if (X.ndim() == 4) {
        // Normal 4-dimensional tensors for images.
        C = (order_ == StorageOrder::NCHW ? X.dim32(1) : X.dim32(3));
        H = (order_ == StorageOrder::NCHW ? X.dim32(2) : X.dim32(1));
        W = (order_ == StorageOrder::NCHW ? X.dim32(3) : X.dim32(2));
      } else {
        // If X is not 4-dimensional, we will simply use H = 1 and W = 1
        // and wrap everything into C.
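        // For example, a 2-D tensor of shape (N, D) is described to cuDNN as
        // a 4-D tensor of shape (N, D, 1, 1).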
        C = X.size() / X.dim32(0);
      }
      CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
          data_desc_,
          GetCudnnTensorFormat(order_),
          cudnnTypeWrapper<T>::type,
          X.dim32(0),
          C,
          H,
          W));
    }
    CUDNN_ENFORCE(cudnnActivationForward(
        cudnn_wrapper_.inline_cudnn_handle(),
        activ_desc_,
        cudnnTypeWrapper<T>::kOne(),
        data_desc_,
        X.template data<T>(),
        cudnnTypeWrapper<T>::kZero(),
        data_desc_,
        Y->template mutable_data<T>()));
    return true;
  }

  bool RunOnDevice() override {
    // Dispatch based on the contents of the input tensor.
    const auto& X = Input(0);
    auto* Y = Output(0);
    Y->ResizeLike(X);

    if (X.IsType<float>()) {
      return DoRunWithType<float>();
    } else if (X.IsType<float16>()) {
      return DoRunWithType<float16>();
    } else {
      LOG(FATAL) << "Unsupported input types";
    }
    return true;
  }

 protected:
  CuDNNWrapper cudnn_wrapper_;
  cudnnTensorDescriptor_t data_desc_;
  cudnnActivationDescriptor_t activ_desc_;
  vector<TIndex> cudnn_input_dims_;
  StorageOrder order_;
};

// Note: You can see that in CuDNNReluGradientOp, we abused the cudnn interface
// by passing in the output tensor for both bottom and top. This is dependent on
// the assumption that the Relu gradient actually does not rely on the bottom
// data, or it treats input=0 the same way as input<0. This is of course not
// very safe, but we have been running in this way in Caffe for a while so it
// *might* be safe to assume so.
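// Concretely, the ReLU gradient is dX = dY * (X > 0), and since Y = max(X, 0)
// the mask (Y > 0) is identical to (X > 0) as long as the subgradient at
// X == 0 is taken to be 0, so dX can be computed from the output alone.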
class CuDNNReluGradientOp final : public Operator<CUDAContext> {
 public:
  CuDNNReluGradientOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<CUDAContext>(operator_def, ws),
        cudnn_wrapper_(&context_),
        order_(StringToStorageOrder(
            OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {
    CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&data_desc_));
    CUDNN_ENFORCE(cudnnCreateActivationDescriptor(&activ_desc_));
    CUDNN_ENFORCE(cudnnSetActivationDescriptor(
        activ_desc_, CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, 0.0));
  }

  ~CuDNNReluGradientOp() {
    CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(data_desc_));
    CUDNN_ENFORCE(cudnnDestroyActivationDescriptor(activ_desc_));
  }

  template <typename T>
  bool DoRunWithType() {
    const auto& Y = Input(0);
    const auto& dY = Input(1);
    auto* dX = Output(0);

    // Return if Y is empty.
    if (Y.size() == 0) {
      dX->mutable_data<T>();
      return true;
    }

    // See if we need to reshape.
    if (Y.dims() != cudnn_input_dims_) {
      VLOG(1) << "Setting descriptors.";
      cudnn_input_dims_ = Y.dims();
      int C = 1, H = 1, W = 1;
      if (Y.ndim() == 4) {
        // Normal 4-dimensional tensors for images.
        C = (order_ == StorageOrder::NCHW ? Y.dim32(1) : Y.dim32(3));
        H = (order_ == StorageOrder::NCHW ? Y.dim32(2) : Y.dim32(1));
        W = (order_ == StorageOrder::NCHW ? Y.dim32(3) : Y.dim32(2));
      } else {
        // If Y is not 4-dimensional, we will simply use H = 1 and W = 1
        // and wrap everything into C.
        C = Y.size() / Y.dim32(0);
      }
      CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
          data_desc_,
          GetCudnnTensorFormat(order_),
          cudnnTypeWrapper<T>::type,
          Y.dim32(0),
          C,
          H,
          W));
    }
    CUDNN_ENFORCE(cudnnActivationBackward(
        cudnn_wrapper_.inline_cudnn_handle(),
        activ_desc_,
        cudnnTypeWrapper<T>::kOne(),
        data_desc_,
        Y.template data<T>(),
        data_desc_,
        dY.template data<T>(),
        data_desc_,
        // Note: strictly speaking, we should be using the input data in this
        // case, but for the ReLU case we rely on the underlying implementation
        // that only the output is needed to calculate the ReLU gradient. This
        // will enable us to do memory optimization for in-place ReLU. To
        // ensure this is correct, a unit test is provided at
        // caffe2/python/operator_test/relu_op_test.py
        Y.template data<T>(),
        cudnnTypeWrapper<T>::kZero(),
        data_desc_,
        dX->template mutable_data<T>()));
    return true;
  }

  bool RunOnDevice() override {
    const auto& Y = Input(0);
    auto* dX = Output(0);
    dX->ResizeLike(Y);

    if (Y.IsType<float>()) {
      return DoRunWithType<float>();
    } else if (Y.IsType<float16>()) {
      return DoRunWithType<float16>();
    } else {
      LOG(FATAL) << "Unsupported input types";
    }
    return true;
  }

 protected:
  CuDNNWrapper cudnn_wrapper_;
  cudnnTensorDescriptor_t data_desc_;
  cudnnActivationDescriptor_t activ_desc_;
  vector<TIndex> cudnn_input_dims_;
  StorageOrder order_;
  // Input: Y, dY; Output: dX
};

namespace {
REGISTER_CUDNN_OPERATOR(Relu, CuDNNReluOp);
REGISTER_CUDNN_OPERATOR(ReluGradient, CuDNNReluGradientOp);
} // namespace
} // namespace caffe2
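
For reference, the output-only gradient identity that the comment above and the in-place memory optimization depend on can be checked with a minimal, dependency-free C++ sketch. This is an illustration only; it is not part of Caffe2 and does not touch the cuDNN path:

#include <cassert>
#include <vector>

int main() {
  const std::vector<float> X = {-2.0f, -0.5f, 0.0f, 0.5f, 2.0f};
  const std::vector<float> dY = {0.1f, 0.2f, 0.3f, 0.4f, 0.5f};

  for (size_t i = 0; i < X.size(); ++i) {
    const float Y = X[i] > 0 ? X[i] : 0.0f;               // forward: Y = max(X, 0)
    const float dX_from_input = X[i] > 0 ? dY[i] : 0.0f;  // uses bottom data (X)
    const float dX_from_output = Y > 0 ? dY[i] : 0.0f;    // uses top data (Y) only
    // Identical everywhere, including X == 0, because (Y > 0) == (X > 0).
    assert(dX_from_input == dX_from_output);
  }
  return 0;
}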