1 #ifndef CAFFE2_CORE_CONTEXT_GPU_H_ 2 #define CAFFE2_CORE_CONTEXT_GPU_H_ 7 #include "caffe2/core/common_cudnn.h" 8 #include "caffe2/core/common_gpu.h" 9 #include "caffe2/core/context.h" 10 #include "caffe2/core/logging.h" 11 #include "caffe2/core/numa.h" 12 #include "caffe2/core/tensor.h" 13 #include "caffe2/core/types.h" 14 #include "caffe2/proto/caffe2.pb.h" 18 enum class CudaMemoryPoolType {
// NOTE(review): fragment of the ThreadLocalCUDAObjects constructor — the
// enclosing signature line was lost in extraction; confirm against the full
// source. Initializes one empty stream/handle cache per possible GPU slot.
44 for (
int i = 0; i < CAFFE2_COMPILE_TIME_MAX_GPUS; ++i) {
45 cuda_streams_[i] = vector<cudaStream_t>();
46 cublas_handles_[i] = vector<cublasHandle_t>();
47 cudnn_handles_[i] = vector<cudnnHandle_t>();
51 cudaStream_t GetStream(
int gpu,
int stream_id) {
52 vector<cudaStream_t>& gpu_streams = cuda_streams_[gpu];
53 if (gpu_streams.size() <= stream_id) {
54 gpu_streams.resize(stream_id + 1,
nullptr);
56 if (!gpu_streams[stream_id]) {
58 CUDA_ENFORCE(cudaStreamCreateWithFlags(
59 &gpu_streams[stream_id], cudaStreamNonBlocking));
61 return gpu_streams[stream_id];
64 cublasHandle_t GetHandle(
int gpu,
int stream_id) {
66 vector<cublasHandle_t>& gpu_handles = cublas_handles_[gpu];
67 if (gpu_handles.size() <= stream_id) {
68 gpu_handles.resize(stream_id + 1,
nullptr);
70 if (!gpu_handles[stream_id]) {
71 CUBLAS_ENFORCE(cublasCreate(&gpu_handles[stream_id]));
75 CUBLAS_ENFORCE(cublasSetPointerMode(
76 gpu_handles[stream_id], CUBLAS_POINTER_MODE_HOST));
78 cublasSetStream(gpu_handles[stream_id], GetStream(gpu, stream_id)));
80 return gpu_handles[stream_id];
83 cudnnHandle_t GetCudnnHandle(
int gpu,
int stream_id) {
85 vector<cudnnHandle_t>& gpu_handles = cudnn_handles_[gpu];
86 if (gpu_handles.size() <= stream_id) {
87 gpu_handles.resize(stream_id + 1,
nullptr);
89 if (!gpu_handles[stream_id]) {
90 CUDNN_ENFORCE(cudnnCreate(&gpu_handles[stream_id]));
92 cudnnSetStream(gpu_handles[stream_id], GetStream(gpu, stream_id)));
94 return gpu_handles[stream_id];
// NOTE(review): fragment of the ThreadLocalCUDAObjects destructor — the
// signature line was lost in extraction; confirm against the full source.
// Destroys every cached cuBLAS handle, CUDA stream, and cuDNN handle for
// each GPU slot. Uses *_CHECK (not *_ENFORCE) since throwing from a
// destructor would be fatal.
98 for (
int i = 0; i < CAFFE2_COMPILE_TIME_MAX_GPUS; ++i) {
99 for (
auto& handle : cublas_handles_[i]) {
101 CUBLAS_CHECK(cublasDestroy(handle));
104 for (
auto& stream : cuda_streams_[i]) {
106 CUDA_CHECK(cudaStreamDestroy(stream));
109 for (
auto& handle : cudnn_handles_[i]) {
111 CUDNN_CHECK(cudnnDestroy(handle));
// Per-GPU caches, indexed first by gpu id (array index) and then by
// stream id (vector index). Entries are created lazily by the Get* methods.
116 vector<cudaStream_t> cuda_streams_[CAFFE2_COMPILE_TIME_MAX_GPUS];
117 vector<cublasHandle_t> cublas_handles_[CAFFE2_COMPILE_TIME_MAX_GPUS];
118 vector<cudnnHandle_t> cudnn_handles_[CAFFE2_COMPILE_TIME_MAX_GPUS];
// NOTE(review): fragment of the CUDAContext destructor — the signature line
// was lost in extraction; confirm against the full source. Tears down the
// lazily created curand generator, then waits for outstanding work on this
// context's stream before the object goes away.
128 if (curand_generator_) {
129 CURAND_CHECK(curandDestroyGenerator(curand_generator_));
131 FinishDeviceComputation();
// Makes this context current by selecting which stream subsequent calls use.
// NOTE(review): extraction appears to have dropped the tail of this method
// (presumably a device-switch call such as CaffeCudaSetDevice(gpu_id_)) —
// confirm against the full source.
134 inline void SwitchToDevice(
int stream_id) {
135 set_stream_id(stream_id);
// Convenience overload; body lost in extraction (presumably delegates to
// SwitchToDevice(0) — TODO confirm against the full source).
138 inline void SwitchToDevice() {
// Makes this context wait on the given event; body lost in extraction —
// confirm against the full source.
142 inline void WaitEvent(
const Event& ev) {
146 inline void Record(
Event* ev,
const char* err_msg =
nullptr)
const {
147 CAFFE_ENFORCE(ev,
"Event must not be null.");
148 ev->Record(CUDA,
this, err_msg);
151 void FinishDeviceComputation() {
152 cudaStreamSynchronize(cuda_objects_.GetStream(gpu_id_, stream_id_));
153 cudaError_t error = cudaGetLastError();
154 if (error != cudaSuccess) {
155 CAFFE_THROW(
"Encountered CUDA error: ", cudaGetErrorString(error));
// Returns the GPU id this context is bound to; body lost in extraction
// (presumably returns gpu_id_ — confirm against the full source).
159 inline int cuda_gpu_id()
const {
163 inline cudaStream_t cuda_stream() {
164 return cuda_stream(gpu_id_, stream_id_);
167 inline cudaStream_t cuda_stream()
const {
168 return cuda_stream(gpu_id_, stream_id_);
171 static cudaStream_t cuda_stream(
int gpu_id,
int stream_id) {
172 return cuda_objects_.GetStream(gpu_id, stream_id);
175 cublasHandle_t cublas_handle() {
176 return cuda_objects_.GetHandle(gpu_id_, stream_id_);
179 cudnnHandle_t cudnn_handle() {
180 return cuda_objects_.GetCudnnHandle(gpu_id_, stream_id_);
// Lazily creates this context's curand generator (seeded with random_seed_)
// and re-binds it to the current stream on every call before returning it.
// NOTE(review): extraction dropped the lines preceding the curand calls —
// the dangling close-parens on the create/seed calls show they were wrapped
// (presumably in CURAND_ENFORCE) — confirm against the full source.
183 curandGenerator_t& curand_generator() {
184 if (!curand_generator_) {
187 curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT));
189 curandSetPseudoRandomGeneratorSeed(curand_generator_, random_seed_));
190 CHECK_NOTNULL(curand_generator_);
192 CURAND_ENFORCE(curandSetStream(curand_generator_, cuda_stream()));
193 return curand_generator_;
// Allocates nbytes of GPU memory; returns the pointer and a matching deleter.
196 static std::pair<void*, MemoryDeleter> New(
size_t nbytes);
// Global mutex guarding CUDA allocation/deallocation bookkeeping; also used
// by PinnedCPUAllocator below.
201 static std::mutex& mutex();
// Per-GPU memory counters, in bytes.
205 static std::vector<long> TotalMemoryByGpu();
206 static std::vector<long> MaxMemoryByGpu();
// Asynchronously copies nbytes between src and dst on this context's stream.
// NOTE(review): extraction dropped the argument lines of the cudaMemcpyAsync
// call (dst/src/nbytes/copy-kind) between the two visible lines — confirm
// against the full source.
208 template <
class SrcContext,
class DstContext>
209 inline void CopyBytes(
size_t nbytes,
const void* src,
void* dst) {
210 CUDA_ENFORCE(cudaMemcpyAsync(
215 cuda_objects_.GetStream(gpu_id_, stream_id_)));
218 template <
typename T,
class SrcContext,
class DstContext>
219 inline void Copy(
int n,
const T* src, T* dst) {
220 CopyBytes<SrcContext, DstContext>(n *
sizeof(T),
221 static_cast<const void*>(src),
222 static_cast<void*
>(dst));
225 template <
class SrcContext,
class DstContext>
227 CopyItems(
const TypeMeta& meta,
size_t n,
const void* src,
void* dst) {
228 CAFFE_ENFORCE(!meta.
copy(),
"CUDAContext requires fundamental types.");
229 CopyBytes<SrcContext, DstContext>(n * meta.
itemsize(), src, dst);
// Whether ops on this context have an asynchronous part by default; body
// lost in extraction — confirm against the full source.
233 static bool HasAsyncPartDefault() {
// Whether this context supports asynchronous scheduling; body lost in
// extraction — confirm against the full source.
237 static bool SupportsAsyncScheduling() {
241 static bool IsStreamFree(
const DeviceOption& option,
int stream_id) {
242 auto stream = CUDAContext::cuda_stream(option.cuda_gpu_id(), stream_id);
243 return cudaStreamQuery(stream) == cudaSuccess;
// Deleter paired with New(); frees memory allocated by this context's
// allocator (declaration only).
247 static void Delete(
void* data);
248 void set_stream_id(
int stream_id) {
249 stream_id_ = stream_id;
// Lazily created in curand_generator(); destroyed in the destructor.
255 curandGenerator_t curand_generator_{
nullptr};
// Specializations of CPUContext::CopyBytes for device<->host transfers.
// Bodies lost in extraction — confirm against the full source.
265 inline void CPUContext::CopyBytes<CUDAContext, CPUContext>(
266 size_t nbytes,
const void* src,
void* dst) {
271 inline void CPUContext::CopyBytes<CPUContext, CUDAContext>(
272 size_t nbytes,
const void* src,
void* dst) {
// Allocates nbytes of pinned (page-locked) host memory, zero-filled, under
// the global CUDAContext mutex. With NUMA enabled, memory comes from the
// base CPU allocator and is pinned via cudaHostRegister; otherwise
// cudaMallocHost is used directly.
// NOTE(review): extraction dropped several lines here (the declaration of
// `data` and the if/else brace structure between the two branches) —
// confirm against the full source.
289 std::pair<void*, MemoryDeleter> New(
size_t nbytes)
override {
291 std::lock_guard<std::mutex> lock(CUDAContext::mutex());
292 if (IsNUMAEnabled()) {
293 auto ptr_and_deleter = baseAllocator_.New(nbytes);
294 data = ptr_and_deleter.first;
296 CUDA_ENFORCE(cudaHostRegister(data, nbytes, cudaHostRegisterDefault));
298 CUDA_ENFORCE(cudaMallocHost(&data, nbytes));
300 memset(data, 0, nbytes);
301 return {data, Delete};
// Returns the deleter matching New(); body lost in extraction (presumably
// returns Delete — confirm against the full source).
304 MemoryDeleter GetDeleter()
override {
// Frees memory obtained from New, under the global CUDAContext mutex. With
// NUMA enabled, unpins the pages (cudaHostUnregister) and defers to the
// default CPU allocator; otherwise calls cudaFreeHost.
// NOTE(review): the tail of this function — the handling of
// cudaErrorInvalidValue from cudaFreeHost — is cut off in this extract.
309 static void Delete(
void* data) {
315 std::lock_guard<std::mutex> lock(CUDAContext::mutex());
316 if (IsNUMAEnabled()) {
317 CUDA_ENFORCE(cudaHostUnregister(data));
318 DefaultCPUAllocator::Delete(data);
320 cudaError_t err = cudaFreeHost(data);
321 if (err == cudaErrorInvalidValue) {
340 #endif // CAFFE2_CORE_CONTEXT_GPU_H_
An allocator that does the CPU memory allocation with pinned memory.
A struct to host thread-local cuda objects.
The CPU Context, representing the bare minimum of what a Context class in Caffe2 should implement...
CudaMemoryPoolType GetCudaMemoryPoolType()
Gets the current memory pool type used by Caffe2.
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...
int GetGPUIDForPointer(const void *ptr)
Gets the GPU id that the current pointer is located at.
void CaffeCudaSetDevice(const int id)
Sets the current GPU id to the given id.