Caffe2 - C++ API
A deep learning, cross platform ML framework
common_gpu.h
1 #ifndef CAFFE2_CORE_COMMON_GPU_H_
2 #define CAFFE2_CORE_COMMON_GPU_H_
3 
4 #include <assert.h>
5 #include <cuda.h>
6 #include <cuda_runtime.h>
7 
8 // Disable strict aliasing errors for CUDA 9.
9 // The cuda_fp16.h header in CUDA 9 RC triggers this diagnostic.
10 // It is included by cusparse.h as well, so guarding the
11 // inclusion of that header here is not enough.
12 #if CUDA_VERSION >= 9000
13 #ifdef __GNUC__
14 #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
15 #pragma GCC diagnostic push
16 #endif
17 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
18 #endif // __GNUC__
19 #endif // CUDA_VERSION >= 9000
20 
21 #include <cublas_v2.h>
22 #include <curand.h>
23 #include <driver_types.h>
24 
25 #include "caffe2/core/logging.h"
26 #include "caffe2/core/common.h"
27 
28 // This is a macro defined for cuda fp16 support. By default, cuda fp16 is
29 // supported by NVCC 7.5, but it is also included in the Tegra X1 platform with
30 // a (custom?) NVCC 7.0. As a result, we would normally just check the cuda
31 // version here, but would also allow a user to pass in the flag
32 // CAFFE_HAS_CUDA_FP16 manually.
33 
34 #ifndef CAFFE_HAS_CUDA_FP16
35 #if CUDA_VERSION >= 7050
36 #define CAFFE_HAS_CUDA_FP16
37 #endif // CUDA_VERSION >= 7050
38 #endif // CAFFE_HAS_CUDA_FP16
39 
40 #ifdef CAFFE_HAS_CUDA_FP16
41 #include <cuda_fp16.h>
42 #endif
43 
44 // Re-enable strict aliasing diagnostic if it was disabled.
45 #if CUDA_VERSION >= 9000
46 #ifdef __GNUC__
47 #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
48 #pragma GCC diagnostic pop
49 #endif
50 #endif // __GNUC__
51 #endif // CUDA_VERSION >= 9000
52 
// The maximum number of GPUs that Caffe2 supports at compile time.
// NOTE(review): presumably used to size static per-device state elsewhere
// in the codebase -- confirm against the .cc users.
56 #define CAFFE2_COMPILE_TIME_MAX_GPUS 16
57 
// NOTE(review): presumably the largest device-group size considered when
// probing GPU peer-to-peer access patterns -- confirm against common_gpu.cc.
64 #define CAFFE2_CUDA_MAX_PEER_SIZE 8
65 
66 namespace caffe2 {
67 
68 #if CUDA_VERSION >= 9000
69 
// Empty tag type, only defined for CUDA 9+ builds.
// NOTE(review): presumably used as an engine template tag to select
// TensorCore-enabled math paths -- confirm against operator code.
72 class TensorCoreEngine {};
73 #endif
74 
// Reports the CUDA version that Caffe2 was compiled against.
78 inline int CudaVersion() { return CUDA_VERSION; }
79 
// Returns the number of CUDA devices visible to this process.
83 int NumCudaDevices();
84 
// Checks whether the current running session has at least one CUDA GPU.
99 inline bool HasCudaGPU() { return NumCudaDevices() > 0; }
100 
// Gets the id of the currently active GPU.
104 int CaffeCudaGetDevice();
105 
// Sets the active GPU to the given device id.
109 void CaffeCudaSetDevice(const int id);
110 
// Gets the GPU id on which the given pointer is located.
114 int GetGPUIDForPointer(const void* ptr);
115 
// Gets the device property struct for the given device id.
119 const cudaDeviceProp& GetDeviceProperty(const int device);
120 
// Runs a device query on the given device and logs the results to LOG(INFO).
124 void DeviceQuery(const int deviceid);
125 
// Fills *pattern with a boolean matrix describing GPU peer-access
// capability between device pairs. NOTE(review): the bool return value
// presumably signals success/failure -- confirm against common_gpu.cc.
133 bool GetCudaPeerAccessPattern(vector<vector<bool> >* pattern);
134 
// Returns the availability of TensorCores for math.
138 bool TensorCoreAvailable();
139 
// Returns a human-readable string for a cublas status code.
143 const char* cublasGetErrorString(cublasStatus_t error);
144 
// Returns a human-readable string for a curand status code.
148 const char* curandGetErrorString(curandStatus_t error);
149 
150 // CUDA: various checks for different function calls.
// Checks a cudaError_t expression: throws (via CAFFE_ENFORCE_EQ) with
// file/line and the CUDA error string on failure. Extra arguments, if any,
// are appended to the failure message.
151 #define CUDA_ENFORCE(condition, ...) \
152  do { \
153  cudaError_t error = condition; \
154  CAFFE_ENFORCE_EQ( \
155  error, \
156  cudaSuccess, \
157  "Error at: ", \
158  __FILE__, \
159  ":", \
160  __LINE__, \
161  ": ", \
162  cudaGetErrorString(error), ##__VA_ARGS__); \
163  } while (0)
// Like CUDA_ENFORCE, but aborts through glog's CHECK instead of throwing.
164 #define CUDA_CHECK(condition) \
165  do { \
166  cudaError_t error = condition; \
167  CHECK(error == cudaSuccess) << cudaGetErrorString(error); \
168  } while (0)
169 
// Driver-API (CUresult) variant of CUDA_ENFORCE: throws with the name of
// the driver error on failure.
170 #define CUDA_DRIVERAPI_ENFORCE(condition) \
171  do { \
172  CUresult result = condition; \
173  if (result != CUDA_SUCCESS) { \
174  const char* msg; \
175  cuGetErrorName(result, &msg); \
176  CAFFE_THROW("Error at: ", __FILE__, ":", __LINE__, ": ", msg); \
177  } \
178  } while (0)
// Driver-API variant that logs FATAL (aborting) instead of throwing.
179 #define CUDA_DRIVERAPI_CHECK(condition) \
180  do { \
181  CUresult result = condition; \
182  if (result != CUDA_SUCCESS) { \
183  const char* msg; \
184  cuGetErrorName(result, &msg); \
185  LOG(FATAL) << "Error at: " << __FILE__ << ":" << __LINE__ << ": " \
186  << msg; \
187  } \
188  } while (0)
189 
// Checks a cublasStatus_t expression: throws with a human-readable cublas
// error string (see cublasGetErrorString) on failure.
190 #define CUBLAS_ENFORCE(condition) \
191  do { \
192  cublasStatus_t status = condition; \
193  CAFFE_ENFORCE_EQ( \
194  status, \
195  CUBLAS_STATUS_SUCCESS, \
196  "Error at: ", \
197  __FILE__, \
198  ":", \
199  __LINE__, \
200  ": ", \
201  ::caffe2::cublasGetErrorString(status)); \
202  } while (0)
// cublas variant that aborts through CHECK instead of throwing.
203 #define CUBLAS_CHECK(condition) \
204  do { \
205  cublasStatus_t status = condition; \
206  CHECK(status == CUBLAS_STATUS_SUCCESS) \
207  << ::caffe2::cublasGetErrorString(status); \
208  } while (0)
209 
// Checks a curandStatus_t expression: throws with a human-readable curand
// error string (see curandGetErrorString) on failure.
210 #define CURAND_ENFORCE(condition) \
211  do { \
212  curandStatus_t status = condition; \
213  CAFFE_ENFORCE_EQ( \
214  status, \
215  CURAND_STATUS_SUCCESS, \
216  "Error at: ", \
217  __FILE__, \
218  ":", \
219  __LINE__, \
220  ": ", \
221  ::caffe2::curandGetErrorString(status)); \
222  } while (0)
// curand variant that aborts through CHECK instead of throwing.
223 #define CURAND_CHECK(condition) \
224  do { \
225  curandStatus_t status = condition; \
226  CHECK(status == CURAND_STATUS_SUCCESS) \
227  << ::caffe2::curandGetErrorString(status); \
228  } while (0)
229 
// Grid-stride loop helper for 1-D CUDA kernels: iterates `i` over [0, n)
// across all launched threads, independent of the launch configuration.
// The explicit size_t casts widen the arithmetic *before* multiplying:
// blockIdx.x * blockDim.x and blockDim.x * gridDim.x are otherwise computed
// in unsigned int and silently wrap once the total thread count exceeds
// 2^32 - 1.
#define CUDA_1D_KERNEL_LOOP(i, n)                                          \
  for (size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x +           \
           threadIdx.x;                                                    \
       i < (n);                                                            \
       i += static_cast<size_t>(blockDim.x) * gridDim.x)
233 
234 // CUDA_KERNEL_ASSERT is a macro that wraps an assert() call inside cuda
235 // kernels. This is not supported by Apple platforms so we special case it.
236 // See http://docs.nvidia.com/cuda/cuda-c-programming-guide/#assertion
237 #ifdef __APPLE__
// On Apple the macro expands to nothing, so the asserted expression is not
// evaluated at all -- do not rely on side effects inside the condition.
238 #define CUDA_KERNEL_ASSERT(...)
239 #else // __APPLE__
240 #define CUDA_KERNEL_ASSERT(...) assert(__VA_ARGS__)
241 #endif // __APPLE__
242 
243 // The following helper functions are here so that you can write a kernel call
244 // when you are not particularly interested in maxing out the kernels'
245 // performance. Usually, this will give you a reasonable speed, but if you
246 // really want to find the best performance, it is advised that you tune the
247 // size of the blocks and grids more reasonably.
248 // A legacy note: this is derived from the old good Caffe days, when I simply
249 // hard-coded the number of threads and wanted to keep backward compatibility
250 // for different computation capabilities.
251 // For more info on CUDA compute capabilities, visit the NVidia website at:
252 // http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities
253 
// Number of CUDA threads per block used by the default launch helpers.
// 512 is kept for backward compatibility; 1024 has been observed to bring
// little gain for generic kernels (warp size is 32, so blindly maxing out
// the block size is not optimal).
constexpr int CAFFE_CUDA_NUM_THREADS = 512;
// Upper bound on the number of blocks used by the default launch helpers.
// 4096 is a conservative choice that works for compute capability 2.x
// (whose per-dimension limit is 65536). Ideally this would be derived from
// the runtime device; that remains a todo item.
constexpr int CAFFE_MAXIMUM_NUM_BLOCKS = 4096;

/**
 * Computes the number of blocks needed to cover N threads with the default
 * block size, clamped to the range [1, CAFFE_MAXIMUM_NUM_BLOCKS].
 */
inline int CAFFE_GET_BLOCKS(const int N) {
  // Ceiling division of N by the default block size.
  const int blocks =
      (N + CAFFE_CUDA_NUM_THREADS - 1) / CAFFE_CUDA_NUM_THREADS;
  // CUDA does not allow an empty grid, so launch at least one block.
  if (blocks < 1) {
    return 1;
  }
  // Never exceed the compile-time block-count ceiling.
  return blocks < CAFFE_MAXIMUM_NUM_BLOCKS ? blocks
                                           : CAFFE_MAXIMUM_NUM_BLOCKS;
}
277 
278 class DeviceGuard {
279  public:
280  explicit DeviceGuard(int newDevice) : previous_(CaffeCudaGetDevice()) {
281  if (previous_ != newDevice) {
282  CaffeCudaSetDevice(newDevice);
283  }
284  }
285 
286  ~DeviceGuard() noexcept {
287  CaffeCudaSetDevice(previous_);
288  }
289 
290  private:
291  int previous_;
292 };
293 
294 } // namespace caffe2
295 #endif // CAFFE2_CORE_COMMON_GPU_H_
void DeviceQuery(const int device)
Runs a device query function and prints out the results to LOG(INFO).
Definition: common_gpu.cc:183
int CudaVersion()
A runtime function to report the cuda version that Caffe2 is built with.
Definition: common_gpu.h:78
bool HasCudaGPU()
Check if the current running session has a cuda gpu present.
Definition: common_gpu.h:99
bool GetCudaPeerAccessPattern(vector< vector< bool > > *pattern)
Return a peer access pattern by returning a matrix (in the format of a nested vector) of boolean values indicating whether each pair of devices can access each other.
Definition: common_gpu.cc:218
int NumCudaDevices()
Returns the number of devices.
Definition: common_gpu.cc:26
A global dictionary that holds information about what Caffe2 modules have been loaded in the current runtime environment.
int GetGPUIDForPointer(const void *ptr)
Gets the GPU id that the current pointer is located at.
Definition: common_gpu.cc:133
int CaffeCudaGetDevice()
Gets the current GPU id.
Definition: common_gpu.cc:109
void CaffeCudaSetDevice(const int id)
Sets the current GPU id.
Definition: common_gpu.cc:122
const cudaDeviceProp & GetDeviceProperty(const int deviceid)
Gets the device property for the given device.
Definition: common_gpu.cc:166
const char * curandGetErrorString(curandStatus_t error)
Return a human readable curand error string.
Definition: common_gpu.cc:281
int CAFFE_GET_BLOCKS(const int N)
Compute the number of blocks needed to run N threads.
Definition: common_gpu.h:269
const char * cublasGetErrorString(cublasStatus_t error)
Return a human readable cublas error string.
Definition: common_gpu.cc:250
bool TensorCoreAvailable()
Return the availability of TensorCores for math.
Definition: common_gpu.cc:238