Caffe2 - C++ API
A deep learning, cross-platform ML framework
core_overhead_benchmark.cc
1 
17 #include "benchmark/benchmark.h"
18 
19 #include "caffe2/core/context.h"
20 #include "caffe2/core/context_gpu.h"
21 #include "caffe2/core/operator.h"
22 
// Skips the enclosing benchmark when caffe2::NumCudaDevices() reports zero
// devices. Must be expanded inside a void function that has a variable named
// `state` exposing SkipWithError(const char*) (i.e. a benchmark::State&).
//
// The body is wrapped in do { ... } while (0) so that the expansion plus the
// caller's trailing semicolon forms exactly one statement. This keeps the
// macro safe inside an unbraced if/else, where a bare `if { ... }` expansion
// would either fail to compile or capture the caller's `else`.
#define CAFFE2_SKIP_IF_NO_GPU                                        \
  do {                                                               \
    if (!caffe2::NumCudaDevices()) {                                 \
      state.SkipWithError("No CUDA available, skipping benchmark."); \
      return;                                                        \
    }                                                                \
  } while (0)
28 
29 using namespace caffe2;
30 
31 static void BM_CUDAContextCreation(benchmark::State& state) {
32  CAFFE2_SKIP_IF_NO_GPU;
33  volatile CUDAContext context_so_we_do_initialization_work;
34  while (state.KeepRunning()) {
35  volatile CUDAContext context;
36  }
37 }
38 BENCHMARK(BM_CUDAContextCreation);
39 
40 static void BM_CUDAContextStreamAccess(benchmark::State& state) {
41  CAFFE2_SKIP_IF_NO_GPU;
42  CUDAContext context;
43  while (state.KeepRunning()) {
44  volatile cudaStream_t stream = context.cuda_stream();
45  }
46 }
47 BENCHMARK(BM_CUDAContextStreamAccess);
48 
49 static void BM_cudaGetDevice(benchmark::State& state) {
50  CAFFE2_SKIP_IF_NO_GPU;
51  int id;
52  while (state.KeepRunning()) {
53  CUDA_ENFORCE(cudaGetDevice(&id));
54  }
55 }
56 BENCHMARK(BM_cudaGetDevice);
57 
58 static void BM_cudaSetDevice(benchmark::State& state) {
59  CAFFE2_SKIP_IF_NO_GPU;
60  int total = NumCudaDevices();
61  int i = 0;
62  while (state.KeepRunning()) {
63  CUDA_ENFORCE(cudaSetDevice((i++) % total));
64  }
65 }
66 BENCHMARK(BM_cudaSetDevice);
67 
68 static void BM_cudaSetAndGetDevice(benchmark::State& state) {
69  CAFFE2_SKIP_IF_NO_GPU;
70  int total = NumCudaDevices();
71  int i = 0;
72  int id;
73  while (state.KeepRunning()) {
74  CUDA_ENFORCE(cudaSetDevice((i++) % total));
75  CUDA_ENFORCE(cudaGetDevice(&id));
76  }
77 }
78 BENCHMARK(BM_cudaSetAndGetDevice);
79 
80 static void BM_cudaSetSameDevice(benchmark::State& state) {
81  CAFFE2_SKIP_IF_NO_GPU;
82  while (state.KeepRunning()) {
83  CUDA_ENFORCE(cudaSetDevice(0));
84  }
85 }
86 BENCHMARK(BM_cudaSetSameDevice);
87 
88 static void BM_cudaStreamCreateSyncDelete(benchmark::State& state) {
89  CAFFE2_SKIP_IF_NO_GPU;
90  cudaStream_t stream;
91  while (state.KeepRunning()) {
92  CUDA_ENFORCE(cudaStreamCreate(&stream));
93  CUDA_ENFORCE(cudaStreamSynchronize(stream));
94  CUDA_ENFORCE(cudaStreamDestroy(stream));
95  }
96 }
97 BENCHMARK(BM_cudaStreamCreateSyncDelete);
98 
99 static void BM_cudaStreamSynchronize(benchmark::State& state) {
100  CAFFE2_SKIP_IF_NO_GPU;
101  cudaStream_t stream;
102  CUDA_ENFORCE(cudaStreamCreate(&stream));
103  while (state.KeepRunning()) {
104  CUDA_ENFORCE(cudaStreamSynchronize(stream));
105  }
106 }
107 BENCHMARK(BM_cudaStreamSynchronize);
108 
109 static void BM_cudaEventRecord(benchmark::State& state) {
110  CAFFE2_SKIP_IF_NO_GPU;
111  cudaStream_t stream;
112  cudaEvent_t event;
113  CUDA_ENFORCE(cudaStreamCreate(&stream));
114  CUDA_ENFORCE(cudaEventCreateWithFlags(
115  &event, cudaEventDefault | cudaEventDisableTiming));
116  while (state.KeepRunning()) {
117  CUDA_ENFORCE(cudaEventRecord(event, stream));
118  }
119 }
120 BENCHMARK(BM_cudaEventRecord);
121 
122 static void BM_cudaStreamWaitEventThenStreamSynchronize(
123  benchmark::State& state) {
124  CAFFE2_SKIP_IF_NO_GPU;
125  cudaStream_t stream;
126  cudaEvent_t event;
127  CUDA_ENFORCE(cudaStreamCreate(&stream));
128  CUDA_ENFORCE(cudaEventCreateWithFlags(
129  &event, cudaEventDefault | cudaEventDisableTiming));
130  CUDA_ENFORCE(cudaEventRecord(event, stream));
131  CUDA_ENFORCE(cudaStreamWaitEvent(stream, event, 0));
132  CUDA_ENFORCE(cudaStreamSynchronize(stream));
133  while (state.KeepRunning()) {
134  CUDA_ENFORCE(cudaStreamWaitEvent(stream, event, 0));
135  CUDA_ENFORCE(cudaStreamSynchronize(stream));
136  }
137 }
138 BENCHMARK(BM_cudaStreamWaitEventThenStreamSynchronize);
139 
140 static void BM_CudaPointerAffinity(benchmark::State& state) {
141  CAFFE2_SKIP_IF_NO_GPU;
142  TensorCUDA tensor(vector<TIndex>{1, 2, 3, 4});
143  float* ptr = tensor.mutable_data<float>();
144  while (state.KeepRunning()) {
145  volatile int id = GetGPUIDForPointer(ptr);
146  }
147 }
148 BENCHMARK(BM_CudaPointerAffinity);
149 
150 namespace {
151 template <class Context>
152 class DummyEmptyOp : public Operator<Context> {
153  public:
154  DummyEmptyOp(const OperatorDef& def, Workspace* ws)
155  : Operator<Context>(def, ws) {}
156 
157  bool RunOnDevice() final { return true; }
158 };
159 
160 REGISTER_CPU_OPERATOR(DummyEmpty, DummyEmptyOp<CPUContext>);
161 REGISTER_CUDA_OPERATOR(DummyEmpty, DummyEmptyOp<CUDAContext>);
162 OPERATOR_SCHEMA(DummyEmpty);
163 } // namespace
164 
165 static void BM_OperatorCreationCPU(benchmark::State& state) {
166  std::unique_ptr<OperatorBase> op;
167  OperatorDef def;
168  Workspace ws;
169  def.set_type("DummyEmpty");
170  def.mutable_device_option()->set_device_type(CPU);
171  while (state.KeepRunning()) {
172  op = CreateOperator(def, &ws);
173  }
174 }
175 BENCHMARK(BM_OperatorCreationCPU);
176 
177 static void BM_OperatorCreationCUDA(benchmark::State& state) {
178  CAFFE2_SKIP_IF_NO_GPU;
179  std::unique_ptr<OperatorBase> op;
180  OperatorDef def;
181  Workspace ws;
182  def.set_type("DummyEmpty");
183  def.mutable_device_option()->set_device_type(CUDA);
184  while (state.KeepRunning()) {
185  op = CreateOperator(def, &ws);
186  }
187 }
188 BENCHMARK(BM_OperatorCreationCUDA);
189 
190 static void BM_RawAllocDeallocCPU(benchmark::State& state) {
191  while (state.KeepRunning()) {
192  // Allocating only 1 byte in order to measure the overhead.
193  auto ptr_and_deleter = GetCPUAllocator()->New(1);
194  // Deallocate.
195  ptr_and_deleter.second(ptr_and_deleter.first);
196  }
197 }
198 BENCHMARK(BM_RawAllocDeallocCPU);
199 
200 static void BM_TensorAllocDeallocCPU(benchmark::State& state) {
201  Tensor<CPUContext> tensor;
202  // small allocation
203  tensor.Resize(32, 32);
204  while (state.KeepRunning()) {
205  CHECK(tensor.mutable_data<float>());
206  tensor.FreeMemory();
207  }
208 }
209 BENCHMARK(BM_TensorAllocDeallocCPU);
210 
211 static void BM_TensorAllocDeallocCUDA(benchmark::State& state) {
212  CAFFE2_SKIP_IF_NO_GPU;
213  Tensor<CUDAContext> tensor;
214  // small allocation
215  tensor.Resize(32, 32);
216  while (state.KeepRunning()) {
217  CHECK(tensor.mutable_data<float>());
218  tensor.FreeMemory();
219  }
220 }
221 BENCHMARK(BM_TensorAllocDeallocCUDA);
222 
223 BENCHMARK_MAIN()
T * mutable_data()
Returns a typed pointer of the underlying storage.
Definition: tensor.h:578
void FreeMemory()
Release whatever memory the tensor was holding but keep size and type information.
Definition: tensor.h:353
Workspace is a class that holds all the related objects created during runtime: (1) all blobs...
Definition: workspace.h:47
int NumCudaDevices()
Returns the number of devices.
Definition: common_gpu.cc:26
void Resize(Ts...dim_source)
Resizes a tensor.
Definition: tensor.h:288
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...
int GetGPUIDForPointer(const void *ptr)
Gets the GPU id that the current pointer is located at.
Definition: common_gpu.cc:133