Caffe2 - C++ API
A deep learning, cross-platform ML framework
relu_op_cudnn.cc
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/cudnn_wrappers.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/types.h"

namespace caffe2 {

class CuDNNReluOp final : public Operator<CUDAContext> {
 public:
  CuDNNReluOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<CUDAContext>(operator_def, ws),
        cudnn_wrapper_(&context_),
        order_(StringToStorageOrder(
            OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {
    CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&data_desc_));
    CUDNN_ENFORCE(cudnnCreateActivationDescriptor(&activ_desc_));
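    // CUDNN_PROPAGATE_NAN lets NaN inputs propagate through the activation;
    // the trailing 0.0 is cuDNN's "coef" argument, which is only consulted by
    // the clipped-ReLU/ELU modes and is ignored for CUDNN_ACTIVATION_RELU.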
    CUDNN_ENFORCE(cudnnSetActivationDescriptor(
        activ_desc_, CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, 0.0));
  }

  ~CuDNNReluOp() {
    CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(data_desc_));
    CUDNN_ENFORCE(cudnnDestroyActivationDescriptor(activ_desc_));
  }

  template <typename T>
  bool DoRunWithType() {
    const auto& X = Input(0);
    auto* Y = Output(0);

    // Return if X is empty.
    if (X.size() == 0) {
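      // Even when X is empty we call mutable_data<T>() so that the output
      // blob is materialized with the correct data type.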
      Y->mutable_data<T>();
      return true;
    }

    // See if we need to reshape.
    if (X.dims() != cudnn_input_dims_) {
      VLOG(1) << "Setting descriptors.";
      cudnn_input_dims_ = X.dims();
      int C = 1, H = 1, W = 1;
      if (X.ndim() == 4) {
        // Normal 4-dimensional tensors for images.
        C = (order_ == StorageOrder::NCHW ? X.dim32(1) : X.dim32(3));
        H = (order_ == StorageOrder::NCHW ? X.dim32(2) : X.dim32(1));
        W = (order_ == StorageOrder::NCHW ? X.dim32(3) : X.dim32(2));
      } else {
        // If X is not 4-dimensional, we will simply use H = 1 and W = 1
        // and wrap everything into C.
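        // For example, a 2-D tensor of shape (N, D) is described to cuDNN as
        // a 4-D tensor of shape (N, D, 1, 1).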
        C = X.size() / X.dim32(0);
      }
      CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
          data_desc_,
          GetCudnnTensorFormat(order_),
          cudnnTypeWrapper<T>::type,
          X.dim32(0),
          C,
          H,
          W));
    }
    CUDNN_ENFORCE(cudnnActivationForward(
        cudnn_wrapper_.inline_cudnn_handle(),
        activ_desc_,
        cudnnTypeWrapper<T>::kOne(),
        data_desc_,
        X.template data<T>(),
        cudnnTypeWrapper<T>::kZero(),
        data_desc_,
        Y->template mutable_data<T>()));
    return true;
  }

  bool RunOnDevice() override {
    // Dispatch based on the contents of the input tensor.
    const auto& X = Input(0);
    auto* Y = Output(0);
    Y->ResizeLike(X);

    if (X.IsType<float>()) {
      return DoRunWithType<float>();
    } else if (X.IsType<float16>()) {
      return DoRunWithType<float16>();
    } else {
      LOG(FATAL) << "Unsupported input types";
    }
    return true;
  }

 protected:
  CuDNNWrapper cudnn_wrapper_;
  cudnnTensorDescriptor_t data_desc_;
  cudnnActivationDescriptor_t activ_desc_;
  vector<TIndex> cudnn_input_dims_;
  StorageOrder order_;
};

// Note: You can see that in CuDNNReluGradientOp, we abused the cudnn interface
// by passing in the output tensor for both bottom and top. This is dependent on
// the assumption that the Relu gradient actually does not rely on the bottom
// data, or it treats input=0 the same way as input<0. This is of course not
// very safe, but we have been running in this way in Caffe for a while so it
// *might* be safe to assume so.
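// Concretely, the ReLU gradient is dX = dY * (X > 0), and since Y = max(X, 0)
// the mask (Y > 0) is identical to (X > 0) as long as the subgradient at
// X == 0 is taken to be 0, so dX can be computed from the output alone.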
class CuDNNReluGradientOp final : public Operator<CUDAContext> {
 public:
  CuDNNReluGradientOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<CUDAContext>(operator_def, ws),
        cudnn_wrapper_(&context_),
        order_(StringToStorageOrder(
            OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {
    CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&data_desc_));
    CUDNN_ENFORCE(cudnnCreateActivationDescriptor(&activ_desc_));
    CUDNN_ENFORCE(cudnnSetActivationDescriptor(
        activ_desc_, CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, 0.0));
  }

  ~CuDNNReluGradientOp() {
    CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(data_desc_));
    CUDNN_ENFORCE(cudnnDestroyActivationDescriptor(activ_desc_));
  }

  template <typename T>
  bool DoRunWithType() {
    const auto& Y = Input(0);
    const auto& dY = Input(1);
    auto* dX = Output(0);

    // Return if Y is empty.
    if (Y.size() == 0) {
      dX->mutable_data<T>();
      return true;
    }

    // See if we need to reshape.
    if (Y.dims() != cudnn_input_dims_) {
      VLOG(1) << "Setting descriptors.";
      cudnn_input_dims_ = Y.dims();
      int C = 1, H = 1, W = 1;
      if (Y.ndim() == 4) {
        // Normal 4-dimensional tensors for images.
        C = (order_ == StorageOrder::NCHW ? Y.dim32(1) : Y.dim32(3));
        H = (order_ == StorageOrder::NCHW ? Y.dim32(2) : Y.dim32(1));
        W = (order_ == StorageOrder::NCHW ? Y.dim32(3) : Y.dim32(2));
      } else {
        // If Y is not 4-dimensional, we will simply use H = 1 and W = 1
        // and wrap everything into C.
        C = Y.size() / Y.dim32(0);
      }
      CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
          data_desc_,
          GetCudnnTensorFormat(order_),
          cudnnTypeWrapper<T>::type,
          Y.dim32(0),
          C,
          H,
          W));
    }
    CUDNN_ENFORCE(cudnnActivationBackward(
        cudnn_wrapper_.inline_cudnn_handle(),
        activ_desc_,
        cudnnTypeWrapper<T>::kOne(),
        data_desc_,
        Y.template data<T>(),
        data_desc_,
        dY.template data<T>(),
        data_desc_,
        // Note: strictly speaking, we should be using the input data in this
        // case, but for the ReLU case we rely on the underlying implementation
        // that only the output is needed to calculate the ReLU gradient. This
        // will enable us to do memory optimization for in-place ReLU. To
        // ensure this is correct, a unit test is provided at
        // caffe2/python/operator_test/relu_op_test.py
        Y.template data<T>(),
        cudnnTypeWrapper<T>::kZero(),
        data_desc_,
        dX->template mutable_data<T>()));
    return true;
  }

  bool RunOnDevice() override {
    const auto& Y = Input(0);
    auto* dX = Output(0);
    dX->ResizeLike(Y);

    if (Y.IsType<float>()) {
      return DoRunWithType<float>();
    } else if (Y.IsType<float16>()) {
      return DoRunWithType<float16>();
    } else {
      LOG(FATAL) << "Unsupported input types";
    }
    return true;
  }

 protected:
  CuDNNWrapper cudnn_wrapper_;
  cudnnTensorDescriptor_t data_desc_;
  cudnnActivationDescriptor_t activ_desc_;
  vector<TIndex> cudnn_input_dims_;
  StorageOrder order_;
  // Input: Y, dY; Output: dX
};

namespace {
REGISTER_CUDNN_OPERATOR(Relu, CuDNNReluOp);
REGISTER_CUDNN_OPERATOR(ReluGradient, CuDNNReluGradientOp);
} // namespace
} // namespace caffe2
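
For reference, the output-only gradient identity that the comment above and the in-place memory optimization depend on can be checked with a minimal, dependency-free C++ sketch. This is an illustration only; it is not part of Caffe2 and does not touch the cuDNN path:

#include <cassert>
#include <vector>

int main() {
  const std::vector<float> X = {-2.0f, -0.5f, 0.0f, 0.5f, 2.0f};
  const std::vector<float> dY = {0.1f, 0.2f, 0.3f, 0.4f, 0.5f};

  for (size_t i = 0; i < X.size(); ++i) {
    const float Y = X[i] > 0 ? X[i] : 0.0f;               // forward: Y = max(X, 0)
    const float dX_from_input = X[i] > 0 ? dY[i] : 0.0f;  // uses bottom data (X)
    const float dX_from_output = Y > 0 ? dY[i] : 0.0f;    // uses top data (Y) only
    // Identical everywhere, including X == 0, because (Y > 0) == (X > 0).
    assert(dX_from_input == dX_from_output);
  }
  return 0;
}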