1 #include "caffe2/core/common_gpu.h" 7 #include "caffe2/core/asan.h" 8 #include "caffe2/core/common.h" 9 #include "caffe2/core/init.h" 10 #include "caffe2/core/logging.h" 13 caffe2_cuda_full_device_control,
15 "If true, assume all the cudaSetDevice and cudaGetDevice calls will be " 16 "controlled by Caffe2, and non-Caffe2 code will ensure that the entry and " 17 "exit point has the same cuda device. Under the hood, Caffe2 will use " 18 "thread local variables to cache the device, in order to speed up set and " 19 "get device calls. This is an experimental feature that may have non " 20 "trivial side effects, so use it with care and only enable it if you are " 21 "absolutely sure. Also, this flag should not be changed after the program " 27 if (getenv(
"CAFFE2_DEBUG_CUDA_INIT_ORDER")) {
28 static bool first =
true;
31 std::cerr <<
"DEBUG: caffe2::NumCudaDevices() invoked for the first time" 35 static int count = -1;
37 auto err = cudaGetDeviceCount(&count);
42 case cudaErrorNoDevice:
45 case cudaErrorInsufficientDriver:
46 LOG(WARNING) <<
"Insufficient cuda driver. Cannot use cuda.";
49 case cudaErrorInitializationError:
50 LOG(WARNING) <<
"Cuda driver initialization failed, you might not " 54 case cudaErrorUnknown:
55 LOG(ERROR) <<
"Found an unknown error - this may be due to an " 56 "incorrectly set up environment, e.g. changing env " 57 "variable CUDA_VISIBLE_DEVICES after program start. " 58 "I will set the available devices to be zero.";
61 case cudaErrorMemoryAllocation:
62 #if CAFFE2_ASAN_ENABLED 65 LOG(ERROR) <<
"It is known that CUDA does not work well with ASAN. As " 66 "a result we will simply shut down CUDA support. If you " 67 "would like to use GPUs, turn off ASAN.";
70 #else // CAFFE2_ASAN_ENABLED 73 LOG(FATAL) <<
"Unexpected error from cudaGetDeviceCount(). Did you run " 74 "some cuda functions before calling NumCudaDevices() " 75 "that might have already set an error? Error: " 78 #endif // CAFFE2_ASAN_ENABLED 80 LOG(FATAL) <<
"Unexpected error from cudaGetDeviceCount(). Did you run " 81 "some cuda functions before calling NumCudaDevices() " 82 "that might have already set an error? Error: " 90 int gDefaultGPUID = 0;
92 thread_local
int gCurrentDevice = -1;
95 void SetDefaultGPUID(
const int deviceid) {
99 "The default gpu id should be smaller than the number of gpus " 104 gDefaultGPUID = deviceid;
107 int GetDefaultGPUID() {
return gDefaultGPUID; }
110 if (FLAGS_caffe2_cuda_full_device_control) {
111 if (gCurrentDevice < 0) {
112 CUDA_ENFORCE(cudaGetDevice(&gCurrentDevice));
114 return gCurrentDevice;
117 CUDA_ENFORCE(cudaGetDevice(&gpu_id));
123 if (FLAGS_caffe2_cuda_full_device_control) {
124 if (gCurrentDevice !=
id) {
125 CUDA_ENFORCE(cudaSetDevice(
id));
129 CUDA_ENFORCE(cudaSetDevice(
id));
134 cudaPointerAttributes attr;
135 cudaError_t err = cudaPointerGetAttributes(&attr, ptr);
137 if (err == cudaErrorInvalidValue) {
141 err = cudaGetLastError();
142 CHECK(err == cudaErrorInvalidValue);
149 if (attr.memoryType == cudaMemoryTypeHost) {
159 CUDA_ENFORCE(cudaGetDeviceProperties(&props[i], i));
163 vector<cudaDeviceProp> props;
175 "The gpu id should be smaller than the number of gpus ",
180 return props.props[deviceid];
185 std::stringstream ss;
187 ss <<
"Device id: " << device << std::endl;
188 ss <<
"Major revision number: " << prop.major << std::endl;
189 ss <<
"Minor revision number: " << prop.minor << std::endl;
190 ss <<
"Name: " << prop.name << std::endl;
191 ss <<
"Total global memory: " << prop.totalGlobalMem << std::endl;
192 ss <<
"Total shared memory per block: " << prop.sharedMemPerBlock
194 ss <<
"Total registers per block: " << prop.regsPerBlock << std::endl;
195 ss <<
"Warp size: " << prop.warpSize << std::endl;
196 ss <<
"Maximum memory pitch: " << prop.memPitch << std::endl;
197 ss <<
"Maximum threads per block: " << prop.maxThreadsPerBlock
199 ss <<
"Maximum dimension of block: " 200 << prop.maxThreadsDim[0] <<
", " << prop.maxThreadsDim[1] <<
", " 201 << prop.maxThreadsDim[2] << std::endl;
202 ss <<
"Maximum dimension of grid: " 203 << prop.maxGridSize[0] <<
", " << prop.maxGridSize[1] <<
", " 204 << prop.maxGridSize[2] << std::endl;
205 ss <<
"Clock rate: " << prop.clockRate << std::endl;
206 ss <<
"Total constant memory: " << prop.totalConstMem << std::endl;
207 ss <<
"Texture alignment: " << prop.textureAlignment << std::endl;
208 ss <<
"Concurrent copy and execution: " 209 << (prop.deviceOverlap ?
"Yes" :
"No") << std::endl;
210 ss <<
"Number of multiprocessors: " << prop.multiProcessorCount
212 ss <<
"Kernel execution timeout: " 213 << (prop.kernelExecTimeoutEnabled ?
"Yes" :
"No") << std::endl;
214 LOG(INFO) << ss.str();
220 if (cudaGetDeviceCount(&gpu_count) != cudaSuccess)
return false;
222 pattern->resize(gpu_count, vector<bool>(gpu_count,
false));
223 for (
int i = 0; i < gpu_count; ++i) {
224 for (
int j = 0; j < gpu_count; ++j) {
225 int can_access =
true;
227 if (cudaDeviceCanAccessPeer(&can_access, i, j)
232 (*pattern)[i][j] =
static_cast<bool>(can_access);
// Return the availability of TensorCores for math: requires the CUDA 9+
// toolkit at compile time and a compute-capability 7.0+ device at runtime.
bool TensorCoreAvailable() {
#if CUDA_VERSION < 9000
  // TensorCore math needs the CUDA 9 toolkit or newer.
  return false;
#else
  // Volta (SM 7.0) and newer expose TensorCores.
  int device = CaffeCudaGetDevice();
  auto& prop = GetDeviceProperty(device);
  return prop.major >= 7;
#endif
}
252 case CUBLAS_STATUS_SUCCESS:
253 return "CUBLAS_STATUS_SUCCESS";
254 case CUBLAS_STATUS_NOT_INITIALIZED:
255 return "CUBLAS_STATUS_NOT_INITIALIZED";
256 case CUBLAS_STATUS_ALLOC_FAILED:
257 return "CUBLAS_STATUS_ALLOC_FAILED";
258 case CUBLAS_STATUS_INVALID_VALUE:
259 return "CUBLAS_STATUS_INVALID_VALUE";
260 case CUBLAS_STATUS_ARCH_MISMATCH:
261 return "CUBLAS_STATUS_ARCH_MISMATCH";
262 case CUBLAS_STATUS_MAPPING_ERROR:
263 return "CUBLAS_STATUS_MAPPING_ERROR";
264 case CUBLAS_STATUS_EXECUTION_FAILED:
265 return "CUBLAS_STATUS_EXECUTION_FAILED";
266 case CUBLAS_STATUS_INTERNAL_ERROR:
267 return "CUBLAS_STATUS_INTERNAL_ERROR";
268 #if CUDA_VERSION >= 6000 269 case CUBLAS_STATUS_NOT_SUPPORTED:
270 return "CUBLAS_STATUS_NOT_SUPPORTED";
271 #if CUDA_VERSION >= 6050 272 case CUBLAS_STATUS_LICENSE_ERROR:
273 return "CUBLAS_STATUS_LICENSE_ERROR";
274 #endif // CUDA_VERSION >= 6050 275 #endif // CUDA_VERSION >= 6000 278 return "Unrecognized cublas error string";
283 case CURAND_STATUS_SUCCESS:
284 return "CURAND_STATUS_SUCCESS";
285 case CURAND_STATUS_VERSION_MISMATCH:
286 return "CURAND_STATUS_VERSION_MISMATCH";
287 case CURAND_STATUS_NOT_INITIALIZED:
288 return "CURAND_STATUS_NOT_INITIALIZED";
289 case CURAND_STATUS_ALLOCATION_FAILED:
290 return "CURAND_STATUS_ALLOCATION_FAILED";
291 case CURAND_STATUS_TYPE_ERROR:
292 return "CURAND_STATUS_TYPE_ERROR";
293 case CURAND_STATUS_OUT_OF_RANGE:
294 return "CURAND_STATUS_OUT_OF_RANGE";
295 case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
296 return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
297 case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
298 return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
299 case CURAND_STATUS_LAUNCH_FAILURE:
300 return "CURAND_STATUS_LAUNCH_FAILURE";
301 case CURAND_STATUS_PREEXISTING_FAILURE:
302 return "CURAND_STATUS_PREEXISTING_FAILURE";
303 case CURAND_STATUS_INITIALIZATION_FAILED:
304 return "CURAND_STATUS_INITIALIZATION_FAILED";
305 case CURAND_STATUS_ARCH_MISMATCH:
306 return "CURAND_STATUS_ARCH_MISMATCH";
307 case CURAND_STATUS_INTERNAL_ERROR:
308 return "CURAND_STATUS_INTERNAL_ERROR";
311 return "Unrecognized curand error string";
317 class CudaRuntimeFlagFlipper {
319 CudaRuntimeFlagFlipper() {
320 internal::SetCudaRuntimeFlag();
323 static CudaRuntimeFlagFlipper g_flipper;
void DeviceQuery(const int device)
Runs a device query function and prints out the results to LOG(INFO).
bool GetCudaPeerAccessPattern(vector< vector< bool > > *pattern)
Return a peer access pattern by returning a matrix (in the format of a nested vector) of boolean valu...
int NumCudaDevices()
Returns the number of devices.
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...
int GetGPUIDForPointer(const void *ptr)
Gets the GPU id that the current pointer is located at.
int CaffeCudaGetDevice()
Gets the current GPU id.
void CaffeCudaSetDevice(const int id)
Sets the current GPU id to the given id.
const cudaDeviceProp & GetDeviceProperty(const int deviceid)
Gets the device property for the given device.
const char * curandGetErrorString(curandStatus_t error)
Return a human readable curand error string.
const char * cublasGetErrorString(cublasStatus_t error)
Return a human readable cublas error string.
bool TensorCoreAvailable()
Return the availability of TensorCores for math.