Caffe2 - C++ API
A deep learning, cross-platform ML framework
dropout_op_cudnn.cc
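cuDNN-backed implementations of the Dropout forward and gradient operators for the CUDA context. The forward op saves its random-number-generator states in a shared scratch blob; the gradient op restores the dropout descriptor from those saved states, which is why cuDNN v7 is required below.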
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/cudnn_wrappers.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/types.h"

namespace caffe2 {

// cudnnRestoreDropoutDescriptor is needed for correctness and
// doesn't exist prior to cuDNN v7
#if CUDNN_VERSION_MIN(7,0,0)

class CuDNNDropoutOp final : public Operator<CUDAContext> {
 public:
  USE_OPERATOR_FUNCTIONS(CUDAContext);

  CuDNNDropoutOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<CUDAContext>(operator_def, ws),
        cudnn_wrapper_(&context_),
        ratio_(OperatorBase::GetSingleArgument<float>("ratio", 0.5)),
        is_test_(
            OperatorBase::GetSingleArgument<int>(OpSchema::Arg_IsTest, 0)),
        states_initialized_(false),
        random_seed_(operator_def.device_option().random_seed()) {
    CAFFE_ENFORCE_GE(ratio_, 0);
    CAFFE_ENFORCE_LT(ratio_, 1);
    CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&data_desc_));

    CUDNN_ENFORCE(cudnnCreateDropoutDescriptor(&dropout_desc_));
    CUDNN_ENFORCE(cudnnDropoutGetStatesSize(
        cudnn_wrapper_.inline_cudnn_handle(),
        reinterpret_cast<size_t*>(&states_size_in_bytes_)));

    if (!is_test_) {
      scratch_blob_ =
          ws->CreateBlob(scratch_blob_name(operator_def.output(1)));
      CAFFE_ENFORCE(scratch_blob_);
    }
  }

  ~CuDNNDropoutOp() noexcept {
    CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(data_desc_));
    CUDNN_ENFORCE(cudnnDestroyDropoutDescriptor(dropout_desc_));
  }

  template <typename T, typename M>
  bool DoRunWithType();

  bool RunOnDevice() override;

  static string scratch_blob_name(string mask_blob_name) {
    return "cudnn_dropout_scratch_" + mask_blob_name;
  }

 protected:
  CuDNNWrapper cudnn_wrapper_;
  cudnnTensorDescriptor_t data_desc_;
  cudnnDropoutDescriptor_t dropout_desc_;

  vector<TIndex> cudnn_input_dims_;

  float ratio_;
  bool is_test_;

  Blob* scratch_blob_ = nullptr;

  size_t states_size_in_bytes_, reserve_space_size_in_bytes_;
  // Input: X, Output: Y, mask_and_states

  // track whether states have been initialized - only needs to happen once
  bool states_initialized_;

  // random seed
  unsigned long long random_seed_;
};

class CuDNNDropoutGradientOp final : public Operator<CUDAContext> {
 public:
  USE_OPERATOR_FUNCTIONS(CUDAContext);
  CuDNNDropoutGradientOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<CUDAContext>(operator_def, ws),
        cudnn_wrapper_(&context_),
        ratio_(OperatorBase::GetSingleArgument<float>("ratio", 0.5)),
        is_test_(
            OperatorBase::GetSingleArgument<int>(OpSchema::Arg_IsTest, 0)),
        states_initialized_(false),
        random_seed_(operator_def.device_option().random_seed()) {
    CAFFE_ENFORCE_GE(ratio_, 0);
    CAFFE_ENFORCE_LT(ratio_, 1);
    CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&data_desc_));

    CUDNN_ENFORCE(cudnnCreateDropoutDescriptor(&dropout_desc_));
    CUDNN_ENFORCE(cudnnDropoutGetStatesSize(
        cudnn_wrapper_.inline_cudnn_handle(),
        reinterpret_cast<size_t*>(&states_size_in_bytes_)));

    // Share scratch with the forward op
    scratch_blob_ =
        ws->GetBlob(CuDNNDropoutOp::scratch_blob_name(operator_def.input(1)));
    CAFFE_ENFORCE(scratch_blob_);
  }

  ~CuDNNDropoutGradientOp() noexcept {
    CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(data_desc_));
    CUDNN_ENFORCE(cudnnDestroyDropoutDescriptor(dropout_desc_));
  }

  template <typename T, typename M>
  bool DoRunWithType();

  bool RunOnDevice() override;

 protected:
  CuDNNWrapper cudnn_wrapper_;
  cudnnTensorDescriptor_t data_desc_;
  cudnnDropoutDescriptor_t dropout_desc_;

  vector<TIndex> cudnn_input_dims_;

  Blob* scratch_blob_;

  float ratio_;
  bool is_test_;

  size_t states_size_in_bytes_, reserve_space_size_in_bytes_;
  // Input: dY, mask_and_states, Output: dX

  // only need to initialize states once (size is static)
  bool states_initialized_;

  unsigned long long random_seed_;
};

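// Forward pass. In test mode dropout is the identity, so the input is
// simply copied through. In training mode the tensor descriptor, the mask
// (cuDNN reserve space), and the RNG states are resized whenever the input
// shape changes, and the dropout descriptor is seeded exactly once, under
// the CUDA mutex because cudnnSetDropoutDescriptor clashes with NCCL.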
template <typename T, typename M>
bool CuDNNDropoutOp::DoRunWithType() {
  const auto& X = Input(0);
  auto* Y = Output(0);

  auto size_prod = 1;
  for (auto dim : X.dims()) {
    size_prod *= dim;
  }
  // now actually run the computation
  if (is_test_) {
    if (Y != &X) {
      context_.Copy<T, CUDAContext, CUDAContext>(
          X.size(), X.template data<T>(), Y->template mutable_data<T>());
    }
    return true;
  } else {
    auto* mask = Output(1);
    // Reshape tensor descriptors if necessary
    if (X.dims() != cudnn_input_dims_ && !is_test_) {
      CAFFE_ENFORCE(scratch_blob_);
      Tensor<CUDAContext>* states =
          scratch_blob_->GetMutable<Tensor<CUDAContext>>();
      cudnn_input_dims_ = X.dims();
      CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
          data_desc_,
          GetCudnnTensorFormat(StorageOrder::NCHW),
          cudnnTypeWrapper<T>::type,
          size_prod,
          1,
          1,
          1));

      // get the reserve space we need
      CUDNN_ENFORCE(cudnnDropoutGetReserveSpaceSize(
          data_desc_, &reserve_space_size_in_bytes_));

      mask->Resize(reserve_space_size_in_bytes_);
      states->Resize(states_size_in_bytes_);

      if (!states_initialized_) {
        // set the dropout descriptor (note: need to allocate the states data
        // before acquiring the mutex)
        uint8_t* states_data = states->mutable_data<uint8_t>();
        {
          // Need to protect as clashes with NCCL
          std::lock_guard<std::mutex> lk(CUDAContext::mutex());
          CUDNN_ENFORCE(cudnnSetDropoutDescriptor(
              dropout_desc_,
              cudnn_wrapper_.inline_cudnn_handle(),
              ratio_,
              states_data,
              states_size_in_bytes_,
              random_seed_));
        }
        states_initialized_ = true;
      }
    }
    CUDNN_ENFORCE(cudnnDropoutForward(
        cudnn_wrapper_.inline_cudnn_handle(),
        dropout_desc_,
        data_desc_,
        X.template data<T>(),
        data_desc_,
        Y->template mutable_data<T>(),
        mask->mutable_data<uint8_t>(),
        reserve_space_size_in_bytes_));
  }
  return true;
}

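// Dispatch on the input's element type: T is the stored data type (float
// or float16). The second template parameter follows the usual cuDNN-op
// convention of a separate math/accumulation type, float in both cases,
// though this particular op does not otherwise use it.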
bool CuDNNDropoutOp::RunOnDevice() {
  // dispatch based on contents of tensor(s)
  const auto& X = Input(0);
  auto* Y = Output(0);
  Y->ResizeLike(X);

  if (X.IsType<float>()) {
    return DoRunWithType<float, float>();
  } else if (X.IsType<float16>()) {
    return DoRunWithType<float16, float>();
  }
  return false;
}

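// Backward pass. Rather than re-seeding (which would generate fresh RNG
// states), cudnnRestoreDropoutDescriptor points the descriptor at the
// states the forward op saved in the shared scratch blob, so the gradient
// applies exactly the same mask that was applied to X.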
template <typename T, typename M>
bool CuDNNDropoutGradientOp::DoRunWithType() {
  const auto& dY = Input(0);
  const auto& mask = Input(1);
  const Tensor<CUDAContext>& states =
      scratch_blob_->Get<Tensor<CUDAContext>>();
  auto* dX = Output(0);

  auto size_prod = 1;
  for (auto dim : dY.dims()) {
    size_prod *= dim;
  }

  if (!states_initialized_) {
    // set the dropout descriptor
    {
      // Need to protect as clashes with NCCL
      std::lock_guard<std::mutex> lk(CUDAContext::mutex());
      CUDNN_ENFORCE(cudnnRestoreDropoutDescriptor(
          dropout_desc_,
          cudnn_wrapper_.inline_cudnn_handle(),
          ratio_,
          const_cast<uint8_t*>(states.data<uint8_t>()),
          states_size_in_bytes_,
          random_seed_));
    }
    states_initialized_ = true;
  }

  if (dY.dims() != cudnn_input_dims_) {
    cudnn_input_dims_ = dY.dims();
    CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
        data_desc_,
        GetCudnnTensorFormat(StorageOrder::NCHW),
        cudnnTypeWrapper<T>::type,
        size_prod,
        1,
        1,
        1));

    // get the reserve space we need
    CUDNN_ENFORCE(cudnnDropoutGetReserveSpaceSize(
        data_desc_, &reserve_space_size_in_bytes_));
  }

  // run the computation
  void* mask_data = const_cast<void*>(mask.raw_data());
  CUDNN_ENFORCE(cudnnDropoutBackward(
      cudnn_wrapper_.inline_cudnn_handle(),
      dropout_desc_,
      data_desc_,
      dY.data<T>(),
      data_desc_,
      dX->template mutable_data<T>(),
      mask_data,
      reserve_space_size_in_bytes_));
  return true;
}

bool CuDNNDropoutGradientOp::RunOnDevice() {
  // dispatch based on contents of tensor(s)
  const auto& dY = Input(0);
  auto* dX = Output(0);

  dX->ResizeLike(dY);

  if (dY.IsType<float>()) {
    return DoRunWithType<float, float>();
  } else if (dY.IsType<float16>()) {
    return DoRunWithType<float16, float>();
  }
  return false;
}

namespace {
REGISTER_CUDNN_OPERATOR(Dropout, CuDNNDropoutOp);
REGISTER_CUDNN_OPERATOR(DropoutGrad, CuDNNDropoutGradientOp);
} // namespace

#endif // CUDNN_VERSION_MIN(7,0,0)

} // namespace caffe2
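
For illustration, a minimal sketch of driving this operator through the Caffe2 C++ API is shown below. It assumes a CUDA-enabled build with the CUDNN engine registered as above; the blob names and the ratio value are placeholders, not part of this file.

// Minimal usage sketch (assumptions: CUDA build of Caffe2; blob names
// "X", "Y", and "mask_and_states" are illustrative placeholders).
#include "caffe2/core/operator.h"
#include "caffe2/core/workspace.h"

void run_cudnn_dropout_example() {
  caffe2::OperatorDef def;
  def.set_type("Dropout");
  def.set_engine("CUDNN"); // select the cuDNN implementation registered above
  def.add_input("X");
  def.add_output("Y");
  def.add_output("mask_and_states"); // second output carries the dropout mask
  def.mutable_device_option()->set_device_type(caffe2::CUDA);
  auto* ratio = def.add_arg();
  ratio->set_name("ratio");
  ratio->set_f(0.5f); // must satisfy 0 <= ratio < 1, as enforced above

  caffe2::Workspace ws;
  // ... create and fill the CUDA tensor blob "X" in ws first ...
  auto op = caffe2::CreateOperator(def, &ws);
  CAFFE_ENFORCE(op->Run());
}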