Caffe2 - C++ API
A deep learning, cross platform ML framework
context_gpu.h
1 #ifndef CAFFE2_CORE_CONTEXT_GPU_H_
2 #define CAFFE2_CORE_CONTEXT_GPU_H_
3 
4 #include <ctime>
5 #include <mutex>
6 
7 #include "caffe2/core/common_cudnn.h"
8 #include "caffe2/core/common_gpu.h"
9 #include "caffe2/core/context.h"
10 #include "caffe2/core/logging.h"
11 #include "caffe2/core/numa.h"
12 #include "caffe2/core/tensor.h"
13 #include "caffe2/core/types.h"
14 #include "caffe2/proto/caffe2.pb.h"
15 
16 namespace caffe2 {
17 
// The memory pool strategies Caffe2 can use for CUDA allocations.
enum class CudaMemoryPoolType {
  NONE = 0, // no pooling: each allocation goes straight to the CUDA runtime
  CUB = 1,  // pooled allocations — presumably NVIDIA CUB's caching allocator; confirm
};
22 
/**
 * Gets the current memory pool type used by Caffe2.
 */
CudaMemoryPoolType GetCudaMemoryPoolType();
29 
 friend class CUDAContext;

 private:
  // NOTE(review): the constructor's signature line appears to have been lost
  // in extraction (original line 43); the loop below is its body. It resets
  // each per-GPU vector of streams and cuBLAS/cuDNN handles to empty; the
  // actual CUDA objects are created lazily in the Get* accessors below.
  for (int i = 0; i < CAFFE2_COMPILE_TIME_MAX_GPUS; ++i) {
    cuda_streams_[i] = vector<cudaStream_t>();
    cublas_handles_[i] = vector<cublasHandle_t>();
    cudnn_handles_[i] = vector<cudnnHandle_t>();
  }
 }
50 
51  cudaStream_t GetStream(int gpu, int stream_id) {
52  vector<cudaStream_t>& gpu_streams = cuda_streams_[gpu];
53  if (gpu_streams.size() <= stream_id) {
54  gpu_streams.resize(stream_id + 1, nullptr);
55  }
56  if (!gpu_streams[stream_id]) {
57  DeviceGuard guard(gpu);
58  CUDA_ENFORCE(cudaStreamCreateWithFlags(
59  &gpu_streams[stream_id], cudaStreamNonBlocking));
60  }
61  return gpu_streams[stream_id];
62  }
63 
64  cublasHandle_t GetHandle(int gpu, int stream_id) {
65  DeviceGuard guard(gpu);
66  vector<cublasHandle_t>& gpu_handles = cublas_handles_[gpu];
67  if (gpu_handles.size() <= stream_id) {
68  gpu_handles.resize(stream_id + 1, nullptr);
69  }
70  if (!gpu_handles[stream_id]) {
71  CUBLAS_ENFORCE(cublasCreate(&gpu_handles[stream_id]));
72  // The default is CUBLAS_POINTER_MODE_HOST. You can override
73  // it after obtaining the cublas handle, but do that with
74  // caution.
75  CUBLAS_ENFORCE(cublasSetPointerMode(
76  gpu_handles[stream_id], CUBLAS_POINTER_MODE_HOST));
77  CUBLAS_ENFORCE(
78  cublasSetStream(gpu_handles[stream_id], GetStream(gpu, stream_id)));
79  }
80  return gpu_handles[stream_id];
81  }
82 
83  cudnnHandle_t GetCudnnHandle(int gpu, int stream_id) {
84  DeviceGuard guard(gpu);
85  vector<cudnnHandle_t>& gpu_handles = cudnn_handles_[gpu];
86  if (gpu_handles.size() <= stream_id) {
87  gpu_handles.resize(stream_id + 1, nullptr);
88  }
89  if (!gpu_handles[stream_id]) {
90  CUDNN_ENFORCE(cudnnCreate(&gpu_handles[stream_id]));
91  CUDNN_ENFORCE(
92  cudnnSetStream(gpu_handles[stream_id], GetStream(gpu, stream_id)));
93  }
94  return gpu_handles[stream_id];
95  }
96 
97  ~ThreadLocalCUDAObjects() noexcept {
98  for (int i = 0; i < CAFFE2_COMPILE_TIME_MAX_GPUS; ++i) {
99  for (auto& handle : cublas_handles_[i]) {
100  if (handle) {
101  CUBLAS_CHECK(cublasDestroy(handle));
102  }
103  }
104  for (auto& stream : cuda_streams_[i]) {
105  if (stream) {
106  CUDA_CHECK(cudaStreamDestroy(stream));
107  }
108  }
109  for (auto& handle : cudnn_handles_[i]) {
110  if (handle) {
111  CUDNN_CHECK(cudnnDestroy(handle));
112  }
113  }
114  }
115  }
  // Per-GPU vectors indexed by stream id; grown lazily by the Get* methods.
  vector<cudaStream_t> cuda_streams_[CAFFE2_COMPILE_TIME_MAX_GPUS];
  vector<cublasHandle_t> cublas_handles_[CAFFE2_COMPILE_TIME_MAX_GPUS];
  vector<cudnnHandle_t> cudnn_handles_[CAFFE2_COMPILE_TIME_MAX_GPUS];
119 };
120 
/**
 * The CUDA execution context: carries a GPU id and a stream id, and hands
 * out the streams and cuBLAS/cuDNN handles backing them. The actual CUDA
 * objects live in the thread-local ThreadLocalCUDAObjects pool
 * (cuda_objects_) and are created lazily on first use.
 */
class CUDAContext final {
 public:
  // The default cuda context constructor.
  explicit CUDAContext(const int gpu_id = -1);
  explicit CUDAContext(const DeviceOption& option);

  ~CUDAContext() {
    if (curand_generator_) {
      CURAND_CHECK(curandDestroyGenerator(curand_generator_));
    }
    // Drain the current stream so async work issued through this context
    // (e.g. CopyBytes) completes before the context goes away.
    FinishDeviceComputation();
  }

  // Makes this context's GPU the current CUDA device and selects the stream
  // used by subsequent operations on this context.
  inline void SwitchToDevice(int stream_id) {
    set_stream_id(stream_id);
    CaffeCudaSetDevice(gpu_id_);
  }
  inline void SwitchToDevice() {
    SwitchToDevice(0);
  }

  // Waits on a previously recorded event (dispatched through the CUDA
  // event registry).
  inline void WaitEvent(const Event& ev) {
    ev.Wait(CUDA, this);
  }

  // Records this context's current state into `ev`.
  inline void Record(Event* ev, const char* err_msg = nullptr) const {
    CAFFE_ENFORCE(ev, "Event must not be null.");
    ev->Record(CUDA, this, err_msg);
  }

  // Synchronizes the current stream, then surfaces any pending CUDA error
  // as a caffe2 exception.
  void FinishDeviceComputation() {
    cudaStreamSynchronize(cuda_objects_.GetStream(gpu_id_, stream_id_));
    cudaError_t error = cudaGetLastError();
    if (error != cudaSuccess) {
      CAFFE_THROW("Encountered CUDA error: ", cudaGetErrorString(error));
    }
  }

  inline int cuda_gpu_id() const {
    return gpu_id_;
  }

  // The stream this context currently operates on.
  inline cudaStream_t cuda_stream() {
    return cuda_stream(gpu_id_, stream_id_);
  }

  inline cudaStream_t cuda_stream() const {
    return cuda_stream(gpu_id_, stream_id_);
  }

  // Fetches (creating on demand) the stream for an arbitrary
  // (gpu, stream) pair from the thread-local pool.
  static cudaStream_t cuda_stream(int gpu_id, int stream_id) {
    return cuda_objects_.GetStream(gpu_id, stream_id);
  }

  // cuBLAS handle bound to this context's GPU and current stream.
  cublasHandle_t cublas_handle() {
    return cuda_objects_.GetHandle(gpu_id_, stream_id_);
  }

  // cuDNN handle bound to this context's GPU and current stream.
  cudnnHandle_t cudnn_handle() {
    return cuda_objects_.GetCudnnHandle(gpu_id_, stream_id_);
  }

  // Lazily creates the curand generator (seeded from random_seed_), and
  // rebinds it to the current stream on every call so it follows stream
  // switches.
  curandGenerator_t& curand_generator() {
    if (!curand_generator_) {
      DeviceGuard guard(gpu_id_);
      CURAND_ENFORCE(
          curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT));
      CURAND_ENFORCE(
          curandSetPseudoRandomGeneratorSeed(curand_generator_, random_seed_));
      CHECK_NOTNULL(curand_generator_);
    }
    CURAND_ENFORCE(curandSetStream(curand_generator_, cuda_stream()));
    return curand_generator_;
  }

  // Allocates `nbytes` of CUDA memory (strategy depends on the configured
  // memory pool type); returns the pointer paired with its deleter.
  static std::pair<void*, MemoryDeleter> New(size_t nbytes);

  // Get a mutex to lock out cudaMalloc / cudaFree calls when
  // NCCL kernels are being launched. Should remove threat of
  // deadlocks
  static std::mutex& mutex();

  // Functions to query memory stats. Only available if flag
  // --caffe2_gpu_memory_tracking is enabled.
  static std::vector<long> TotalMemoryByGpu();
  static std::vector<long> MaxMemoryByGpu();

  // Asynchronous copy on this context's stream; cudaMemcpyDefault lets the
  // driver infer the direction from the pointer attributes.
  template <class SrcContext, class DstContext>
  inline void CopyBytes(size_t nbytes, const void* src, void* dst) {
    CUDA_ENFORCE(cudaMemcpyAsync(
        dst,
        src,
        nbytes,
        cudaMemcpyDefault,
        cuda_objects_.GetStream(gpu_id_, stream_id_)));
  }

  // Typed element copy implemented on top of CopyBytes.
  template <typename T, class SrcContext, class DstContext>
  inline void Copy(int n, const T* src, T* dst) {
    CopyBytes<SrcContext, DstContext>(n * sizeof(T),
                                 static_cast<const void*>(src),
                                 static_cast<void*>(dst));
  }

  // Copies `n` items described by `meta`. Types with a custom per-item copy
  // function are rejected — only raw-copyable fundamentals are supported.
  template <class SrcContext, class DstContext>
  inline void
  CopyItems(const TypeMeta& meta, size_t n, const void* src, void* dst) {
    CAFFE_ENFORCE(!meta.copy(), "CUDAContext requires fundamental types.");
    CopyBytes<SrcContext, DstContext>(n * meta.itemsize(), src, dst);
  }

  // By default CUDA operators have async device parts
  static bool HasAsyncPartDefault() {
    return true;
  }

  static bool SupportsAsyncScheduling() {
    return true;
  }

  // True when the given stream has completed all queued work.
  static bool IsStreamFree(const DeviceOption& option, int stream_id) {
    auto stream = CUDAContext::cuda_stream(option.cuda_gpu_id(), stream_id);
    return cudaStreamQuery(stream) == cudaSuccess;
  }

 protected:
  static void Delete(void* data);
  void set_stream_id(int stream_id) {
    stream_id_ = stream_id;
  }

  int gpu_id_;
  int stream_id_ = 0;
  int random_seed_;
  // Created lazily by curand_generator(); destroyed in the destructor.
  curandGenerator_t curand_generator_{nullptr};
  // Thread-local pool shared by all CUDAContext instances on this thread.
  static thread_local ThreadLocalCUDAObjects cuda_objects_;
};
258 
259 // For the CPU context, we also allow a (probably expensive) function
260 // to copy the data from a cuda context. Inside the function, we create
261 // a temporary CUDAContext object to carry out the copy. From the caller's
262 // side, these functions are synchronous with respect to the host, similar
263 // to a normal CPUContext::CopyBytes<CPUContext, CPUContext> call.
264 template<>
265 inline void CPUContext::CopyBytes<CUDAContext, CPUContext>(
266  size_t nbytes, const void* src, void* dst) {
267  CUDAContext context(GetGPUIDForPointer(src));
268  context.CopyBytes<CUDAContext, CPUContext>(nbytes, src, dst);
269 }
270 template<>
271 inline void CPUContext::CopyBytes<CPUContext, CUDAContext>(
272  size_t nbytes, const void* src, void* dst) {
273  CUDAContext context(GetGPUIDForPointer(dst));
274  context.CopyBytes<CPUContext, CUDAContext>(nbytes, src, dst);
275 }
276 
  // Nothing to construct or tear down here: all state lives in the base
  // allocator member and the CUDA runtime.
  PinnedCPUAllocator() {}
  ~PinnedCPUAllocator() override {}
289  std::pair<void*, MemoryDeleter> New(size_t nbytes) override {
290  void* data;
291  std::lock_guard<std::mutex> lock(CUDAContext::mutex());
292  if (IsNUMAEnabled()) {
293  auto ptr_and_deleter = baseAllocator_.New(nbytes);
294  data = ptr_and_deleter.first;
295  CAFFE_ENFORCE(data);
296  CUDA_ENFORCE(cudaHostRegister(data, nbytes, cudaHostRegisterDefault));
297  } else {
298  CUDA_ENFORCE(cudaMallocHost(&data, nbytes));
299  }
300  memset(data, 0, nbytes);
301  return {data, Delete};
302  }
303 
  // Returns the deleter that matches memory obtained from New().
  MemoryDeleter GetDeleter() override {
    return Delete;
  }
307 
308  private:
309  static void Delete(void* data) {
310  // Caffe2 uses a lazy way to figure out if one is actually going to use GPUs
311  // or not. If a CUDAContext::New() call is made, inside the CUDAContext
312  // function we will switch the cpu side allocator to a PinnedCPUAllocator.
313  // But, if one calls CPUContext::New() before any cuda allocations,
314  // PinnedCPUAllocator can still delete the corresponding memory.
315  std::lock_guard<std::mutex> lock(CUDAContext::mutex());
316  if (IsNUMAEnabled()) {
317  CUDA_ENFORCE(cudaHostUnregister(data));
318  DefaultCPUAllocator::Delete(data);
319  } else {
320  cudaError_t err = cudaFreeHost(data);
321  if (err == cudaErrorInvalidValue) {
322  free(data);
323  // Calling cudaGetLastError will reset the cuda error.
324  cudaGetLastError();
325  } else {
326  // For all other errors, still do a cuda check.
327  CUDA_ENFORCE(err);
328  }
329  }
330  }
331 
  // Underlying CPU allocator used on the NUMA-enabled path in New()/Delete().
  DefaultCPUAllocator baseAllocator_;
333 };
334 
// For simplicity, we will typedef Tensor<CUDAContext> to TensorCUDA.
337 
338 } // namespace caffe2
339 
340 #endif // CAFFE2_CORE_CONTEXT_GPU_H_
An allocator that does the CPU memory allocation with pinned memory.
Definition: context_gpu.h:286
A struct to host thread-local cuda objects.
Definition: context_gpu.h:39
The CPU Context, representing the bare minimum of what a Context class in Caffe2 should implement...
Definition: context.h:66
CudaMemoryPoolType GetCudaMemoryPoolType()
Gets the current memory pool type used by Caffe2.
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...
TypedCopy copy() const
Returns the typed copy function pointer for individual items.
Definition: typeid.h:155
int GetGPUIDForPointer(const void *ptr)
Gets the GPU id that the current pointer is located at.
Definition: common_gpu.cc:133
void CaffeCudaSetDevice(const int id)
Sets the current GPU id for subsequent CUDA calls.
Definition: common_gpu.cc:122
TypeMeta is a thin class that allows us to store the type of a container such as a blob...
Definition: typeid.h:88
const size_t & itemsize() const
Returns the size of the item.
Definition: typeid.h:143