Caffe2 - C++ API
A deep learning, cross platform ML framework
common_gpu.cc
1 #include "caffe2/core/common_gpu.h"
2 
3 #include <atomic>
4 #include <cstdlib>
5 #include <sstream>
6 
7 #include "caffe2/core/asan.h"
8 #include "caffe2/core/common.h"
9 #include "caffe2/core/init.h"
10 #include "caffe2/core/logging.h"
11 
// Command-line flag. When enabled, Caffe2 caches the current CUDA device id in
// a thread-local variable (see gCurrentDevice below) so that repeated
// CaffeCudaGetDevice()/CaffeCudaSetDevice() calls can skip the CUDA runtime.
// Safe only if all device switches in the process go through Caffe2.
CAFFE2_DEFINE_bool(
    caffe2_cuda_full_device_control,
    false,
    "If true, assume all the cudaSetDevice and cudaGetDevice calls will be "
    "controlled by Caffe2, and non-Caffe2 code will ensure that the entry and "
    "exit point has the same cuda device. Under the hood, Caffe2 will use "
    "thread local variables to cache the device, in order to speed up set and "
    "get device calls. This is an experimental feature that may have non "
    "trivial side effects, so use it with care and only enable it if you are "
    "absolutely sure. Also, this flag should not be changed after the program "
    "initializes.");
23 
24 namespace caffe2 {
25 
27  if (getenv("CAFFE2_DEBUG_CUDA_INIT_ORDER")) {
28  static bool first = true;
29  if (first) {
30  first = false;
31  std::cerr << "DEBUG: caffe2::NumCudaDevices() invoked for the first time"
32  << std::endl;
33  }
34  }
35  static int count = -1;
36  if (count < 0) {
37  auto err = cudaGetDeviceCount(&count);
38  switch (err) {
39  case cudaSuccess:
40  // Everything is good.
41  break;
42  case cudaErrorNoDevice:
43  count = 0;
44  break;
45  case cudaErrorInsufficientDriver:
46  LOG(WARNING) << "Insufficient cuda driver. Cannot use cuda.";
47  count = 0;
48  break;
49  case cudaErrorInitializationError:
50  LOG(WARNING) << "Cuda driver initialization failed, you might not "
51  "have a cuda gpu.";
52  count = 0;
53  break;
54  case cudaErrorUnknown:
55  LOG(ERROR) << "Found an unknown error - this may be due to an "
56  "incorrectly set up environment, e.g. changing env "
57  "variable CUDA_VISIBLE_DEVICES after program start. "
58  "I will set the available devices to be zero.";
59  count = 0;
60  break;
61  case cudaErrorMemoryAllocation:
62 #if CAFFE2_ASAN_ENABLED
63  // In ASAN mode, we know that a cudaErrorMemoryAllocation error will
64  // pop up.
65  LOG(ERROR) << "It is known that CUDA does not work well with ASAN. As "
66  "a result we will simply shut down CUDA support. If you "
67  "would like to use GPUs, turn off ASAN.";
68  count = 0;
69  break;
70 #else // CAFFE2_ASAN_ENABLED
71  // If we are not in ASAN mode and we get cudaErrorMemoryAllocation,
72  // this means that something is wrong before NumCudaDevices() call.
73  LOG(FATAL) << "Unexpected error from cudaGetDeviceCount(). Did you run "
74  "some cuda functions before calling NumCudaDevices() "
75  "that might have already set an error? Error: "
76  << err;
77  break;
78 #endif // CAFFE2_ASAN_ENABLED
79  default:
80  LOG(FATAL) << "Unexpected error from cudaGetDeviceCount(). Did you run "
81  "some cuda functions before calling NumCudaDevices() "
82  "that might have already set an error? Error: "
83  << err;
84  }
85  }
86  return count;
87 }
88 
namespace {
// Process-wide default device id, returned by GetDefaultGPUID() and set by
// SetDefaultGPUID(). Not thread-synchronized; expected to be configured once
// during startup.
int gDefaultGPUID = 0;
// Per-thread cache of the current CUDA device.
// Only used when FLAGS_caffe2_cuda_full_device_control is set true.
// -1 means "not yet cached for this thread".
thread_local int gCurrentDevice = -1;
} // namespace
94 
95 void SetDefaultGPUID(const int deviceid) {
96  CAFFE_ENFORCE_LT(
97  deviceid,
99  "The default gpu id should be smaller than the number of gpus "
100  "on this machine: ",
101  deviceid,
102  " vs ",
103  NumCudaDevices());
104  gDefaultGPUID = deviceid;
105 }
106 
107 int GetDefaultGPUID() { return gDefaultGPUID; }
108 
110  if (FLAGS_caffe2_cuda_full_device_control) {
111  if (gCurrentDevice < 0) {
112  CUDA_ENFORCE(cudaGetDevice(&gCurrentDevice));
113  }
114  return gCurrentDevice;
115  } else {
116  int gpu_id = 0;
117  CUDA_ENFORCE(cudaGetDevice(&gpu_id));
118  return gpu_id;
119  }
120 }
121 
122 void CaffeCudaSetDevice(const int id) {
123  if (FLAGS_caffe2_cuda_full_device_control) {
124  if (gCurrentDevice != id) {
125  CUDA_ENFORCE(cudaSetDevice(id));
126  }
127  gCurrentDevice = id;
128  } else {
129  CUDA_ENFORCE(cudaSetDevice(id));
130  }
131 }
132 
133 int GetGPUIDForPointer(const void* ptr) {
134  cudaPointerAttributes attr;
135  cudaError_t err = cudaPointerGetAttributes(&attr, ptr);
136 
137  if (err == cudaErrorInvalidValue) {
138  // Occurs when the pointer is in the CPU address space that is
139  // unmanaged by CUDA; make sure the last error state is cleared,
140  // since it is persistent
141  err = cudaGetLastError();
142  CHECK(err == cudaErrorInvalidValue);
143  return -1;
144  }
145 
146  // Otherwise, there must be no error
147  CUDA_ENFORCE(err);
148 
149  if (attr.memoryType == cudaMemoryTypeHost) {
150  return -1;
151  }
152 
153  return attr.device;
154 }
155 
158  for (int i = 0; i < NumCudaDevices(); ++i) {
159  CUDA_ENFORCE(cudaGetDeviceProperties(&props[i], i));
160  }
161  }
162 
163  vector<cudaDeviceProp> props;
164 };
165 
166 const cudaDeviceProp& GetDeviceProperty(const int deviceid) {
167  // According to C++11 standard section 6.7, static local variable init is
168  // thread safe. See
169  // https://stackoverflow.com/questions/8102125/is-local-static-variable-initialization-thread-safe-in-c11
170  // for details.
171  static CudaDevicePropWrapper props;
172  CAFFE_ENFORCE_LT(
173  deviceid,
174  NumCudaDevices(),
175  "The gpu id should be smaller than the number of gpus ",
176  "on this machine: ",
177  deviceid,
178  " vs ",
179  NumCudaDevices());
180  return props.props[deviceid];
181 }
182 
// Runs a device query on `device` and prints its properties (memory sizes,
// block/grid limits, clock rate, multiprocessor count, etc.) to LOG(INFO).
// Aborts via GetDeviceProperty's enforcement if `device` is out of range.
void DeviceQuery(const int device) {
  const cudaDeviceProp& prop = GetDeviceProperty(device);
  std::stringstream ss;
  ss << std::endl;
  ss << "Device id: " << device << std::endl;
  ss << "Major revision number: " << prop.major << std::endl;
  ss << "Minor revision number: " << prop.minor << std::endl;
  ss << "Name: " << prop.name << std::endl;
  ss << "Total global memory: " << prop.totalGlobalMem << std::endl;
  ss << "Total shared memory per block: " << prop.sharedMemPerBlock
     << std::endl;
  ss << "Total registers per block: " << prop.regsPerBlock << std::endl;
  ss << "Warp size: " << prop.warpSize << std::endl;
  ss << "Maximum memory pitch: " << prop.memPitch << std::endl;
  ss << "Maximum threads per block: " << prop.maxThreadsPerBlock
     << std::endl;
  ss << "Maximum dimension of block: "
     << prop.maxThreadsDim[0] << ", " << prop.maxThreadsDim[1] << ", "
     << prop.maxThreadsDim[2] << std::endl;
  ss << "Maximum dimension of grid: "
     << prop.maxGridSize[0] << ", " << prop.maxGridSize[1] << ", "
     << prop.maxGridSize[2] << std::endl;
  ss << "Clock rate: " << prop.clockRate << std::endl;
  ss << "Total constant memory: " << prop.totalConstMem << std::endl;
  ss << "Texture alignment: " << prop.textureAlignment << std::endl;
  ss << "Concurrent copy and execution: "
     << (prop.deviceOverlap ? "Yes" : "No") << std::endl;
  ss << "Number of multiprocessors: " << prop.multiProcessorCount
     << std::endl;
  ss << "Kernel execution timeout: "
     << (prop.kernelExecTimeoutEnabled ? "Yes" : "No") << std::endl;
  LOG(INFO) << ss.str();
  return;
}
217 
218 bool GetCudaPeerAccessPattern(vector<vector<bool> >* pattern) {
219  int gpu_count;
220  if (cudaGetDeviceCount(&gpu_count) != cudaSuccess) return false;
221  pattern->clear();
222  pattern->resize(gpu_count, vector<bool>(gpu_count, false));
223  for (int i = 0; i < gpu_count; ++i) {
224  for (int j = 0; j < gpu_count; ++j) {
225  int can_access = true;
226  if (i != j) {
227  if (cudaDeviceCanAccessPeer(&can_access, i, j)
228  != cudaSuccess) {
229  return false;
230  }
231  }
232  (*pattern)[i][j] = static_cast<bool>(can_access);
233  }
234  }
235  return true;
236 }
237 
// Returns whether the current device has TensorCore hardware available for
// math (compute capability 7.0+, i.e. Volta or newer).
//
// NOTE(review): the extracted source was missing this signature line; it is
// reconstructed here to make the definition well-formed.
bool TensorCoreAvailable() {
  // requires CUDA 9.0 and above
#if CUDA_VERSION < 9000
  return false;
#else
  int device = CaffeCudaGetDevice();
  auto& prop = GetDeviceProperty(device);

  // TensorCores were introduced with compute capability major version 7.
  return prop.major >= 7;
#endif
}
249 
// Returns a human-readable string naming the given cuBLAS status code.
// Codes introduced in later CUDA versions are guarded by CUDA_VERSION checks
// so the file still compiles against older toolkits.
const char* cublasGetErrorString(cublasStatus_t error) {
  switch (error) {
    case CUBLAS_STATUS_SUCCESS:
      return "CUBLAS_STATUS_SUCCESS";
    case CUBLAS_STATUS_NOT_INITIALIZED:
      return "CUBLAS_STATUS_NOT_INITIALIZED";
    case CUBLAS_STATUS_ALLOC_FAILED:
      return "CUBLAS_STATUS_ALLOC_FAILED";
    case CUBLAS_STATUS_INVALID_VALUE:
      return "CUBLAS_STATUS_INVALID_VALUE";
    case CUBLAS_STATUS_ARCH_MISMATCH:
      return "CUBLAS_STATUS_ARCH_MISMATCH";
    case CUBLAS_STATUS_MAPPING_ERROR:
      return "CUBLAS_STATUS_MAPPING_ERROR";
    case CUBLAS_STATUS_EXECUTION_FAILED:
      return "CUBLAS_STATUS_EXECUTION_FAILED";
    case CUBLAS_STATUS_INTERNAL_ERROR:
      return "CUBLAS_STATUS_INTERNAL_ERROR";
#if CUDA_VERSION >= 6000
    case CUBLAS_STATUS_NOT_SUPPORTED:
      return "CUBLAS_STATUS_NOT_SUPPORTED";
#if CUDA_VERSION >= 6050
    case CUBLAS_STATUS_LICENSE_ERROR:
      return "CUBLAS_STATUS_LICENSE_ERROR";
#endif // CUDA_VERSION >= 6050
#endif // CUDA_VERSION >= 6000
  }
  // To suppress compiler warning.
  return "Unrecognized cublas error string";
}
280 
281 const char* curandGetErrorString(curandStatus_t error) {
282  switch (error) {
283  case CURAND_STATUS_SUCCESS:
284  return "CURAND_STATUS_SUCCESS";
285  case CURAND_STATUS_VERSION_MISMATCH:
286  return "CURAND_STATUS_VERSION_MISMATCH";
287  case CURAND_STATUS_NOT_INITIALIZED:
288  return "CURAND_STATUS_NOT_INITIALIZED";
289  case CURAND_STATUS_ALLOCATION_FAILED:
290  return "CURAND_STATUS_ALLOCATION_FAILED";
291  case CURAND_STATUS_TYPE_ERROR:
292  return "CURAND_STATUS_TYPE_ERROR";
293  case CURAND_STATUS_OUT_OF_RANGE:
294  return "CURAND_STATUS_OUT_OF_RANGE";
295  case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
296  return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
297  case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
298  return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
299  case CURAND_STATUS_LAUNCH_FAILURE:
300  return "CURAND_STATUS_LAUNCH_FAILURE";
301  case CURAND_STATUS_PREEXISTING_FAILURE:
302  return "CURAND_STATUS_PREEXISTING_FAILURE";
303  case CURAND_STATUS_INITIALIZATION_FAILED:
304  return "CURAND_STATUS_INITIALIZATION_FAILED";
305  case CURAND_STATUS_ARCH_MISMATCH:
306  return "CURAND_STATUS_ARCH_MISMATCH";
307  case CURAND_STATUS_INTERNAL_ERROR:
308  return "CURAND_STATUS_INTERNAL_ERROR";
309  }
310  // To suppress compiler warning.
311  return "Unrecognized curand error string";
312 }
313 
// Turn on the flag g_caffe2_has_cuda_linked to true for HasCudaRuntime()
// function.
namespace {
// Constructed during static initialization of this translation unit; its
// constructor calls internal::SetCudaRuntimeFlag(), so merely linking this
// file into a binary makes HasCudaRuntime() report true.
class CudaRuntimeFlagFlipper {
 public:
  CudaRuntimeFlagFlipper() {
    internal::SetCudaRuntimeFlag();
  }
};
// Single static instance whose only purpose is the constructor side effect.
static CudaRuntimeFlagFlipper g_flipper;
} // namespace
325 
326 } // namespace caffe2
void DeviceQuery(const int device)
Runs a device query function and prints out the results to LOG(INFO).
Definition: common_gpu.cc:183
bool GetCudaPeerAccessPattern(vector< vector< bool > > *pattern)
Return a peer access pattern by returning a matrix (in the format of a nested vector) of boolean valu...
Definition: common_gpu.cc:218
int NumCudaDevices()
Returns the number of devices.
Definition: common_gpu.cc:26
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...
int GetGPUIDForPointer(const void *ptr)
Gets the GPU id that the current pointer is located at.
Definition: common_gpu.cc:133
int CaffeCudaGetDevice()
Gets the current GPU id.
Definition: common_gpu.cc:109
void CaffeCudaSetDevice(const int id)
Sets the current GPU id for the calling thread.
Definition: common_gpu.cc:122
const cudaDeviceProp & GetDeviceProperty(const int deviceid)
Gets the device property for the given device.
Definition: common_gpu.cc:166
const char * curandGetErrorString(curandStatus_t error)
Return a human readable curand error string.
Definition: common_gpu.cc:281
const char * cublasGetErrorString(cublasStatus_t error)
Return a human readable cublas error string.
Definition: common_gpu.cc:250
bool TensorCoreAvailable()
Return the availability of TensorCores for math.
Definition: common_gpu.cc:238