Caffe2 - C++ API
A deep learning, cross-platform ML framework
TestGLConvolution.cc
1 
2 #include "caffe2/core/operator.h"
3 #include "caffe2/core/timer.h"
4 #include "caffe2/core/workspace.h"
5 #include "caffe2/utils/math.h"
6 
7 #include "../core/GL.h"
8 #include "../core/GLLogging.h"
9 #include "../core/arm_neon_support.h"
10 #include "../operators/gl_tiling_utils.h"
11 #include "TestGLConvolution.h"
12 
13 #include <vector>
14 
15 void AddNoiseInput(const std::vector<caffe2::TIndex>& shape,
16  const std::string& name,
17  caffe2::Workspace* ws) {
18  caffe2::CPUContext context;
19  caffe2::Blob* blob = ws->CreateBlob(name);
20  auto* tensor = blob->GetMutable<caffe2::TensorCPU>();
21  tensor->Resize(shape);
22 
23  caffe2::math::RandGaussian<float, caffe2::CPUContext>(
24  tensor->size(), 0.0f, 10.0f, tensor->mutable_data<float>(), &context);
25 }
26 
27 double BenchOp(const std::string& typ,
28  int inputC,
29  int outputC,
30  int kW,
31  int kH,
32  int stride,
33  int inW,
34  int inH,
35  bool transposed,
36  caffe2::Workspace* ws = nullptr) {
37  caffe2::Workspace localWs;
38  if (!ws) {
39  ws = &localWs;
40  }
41 
42  const char* engine = transposed ? "MOBILE" : "NNPACK";
43 
44  caffe2::OperatorDef def1;
45  def1.set_name("test");
46  def1.set_type(typ);
47  def1.set_engine(engine);
48  def1.add_input("X");
49  def1.add_input("W");
50  def1.add_input("B");
51  def1.add_output("Y");
52 
53  def1.add_arg()->CopyFrom(caffe2::MakeArgument("kernel_h", kH));
54  def1.add_arg()->CopyFrom(caffe2::MakeArgument("kernel_w", kW));
55  def1.add_arg()->CopyFrom(caffe2::MakeArgument("stride_h", stride));
56  def1.add_arg()->CopyFrom(caffe2::MakeArgument("stride_w", stride));
57  def1.add_arg()->CopyFrom(caffe2::MakeArgument("pad_t", 0));
58  def1.add_arg()->CopyFrom(caffe2::MakeArgument("pad_l", 0));
59  def1.add_arg()->CopyFrom(caffe2::MakeArgument("pad_b", 0));
60  def1.add_arg()->CopyFrom(caffe2::MakeArgument("pad_r", 0));
61  def1.add_arg()->CopyFrom(caffe2::MakeArgument("convolution_transform_strategy", std::string("PRECOMPUTE")));
62 
63  AddNoiseInput(std::vector<caffe2::TIndex>{1, inputC, inH, inW}, "X", ws);
64  if (transposed) {
65  AddNoiseInput(std::vector<caffe2::TIndex>{inputC, outputC, kH, kW}, "W", ws);
66  } else {
67  AddNoiseInput(std::vector<caffe2::TIndex>{outputC, inputC, kH, kW}, "W", ws);
68  }
69  AddNoiseInput(std::vector<caffe2::TIndex>{outputC}, "B", ws);
70 
71  std::unique_ptr<caffe2::OperatorBase> op1(CreateOperator(def1, ws));
72 
73  // Measure one iteration
74  caffe2::Timer timer;
75  timer.Start();
76 
77  op1->Run();
78 
79  float one_iteration = timer.MilliSeconds();
80 
81  int target_iterations = std::max((int)(1000 / one_iteration), 1);
82  int warmup_iterations = std::max((int)(200 / one_iteration), 1);
83 
84  // warm up
85  for (int i = 0; i < warmup_iterations; i++) {
86  op1->Run();
87  }
88 
89  timer.Start();
90 
91  int runs = target_iterations;
92  for (int i = 0; i < runs; i++) {
93  op1->Run();
94  }
95 
96  auto total_t = timer.MilliSeconds();
97 
98  gl_log(GL_LOG,
99  "%s(%d -> %d, %dx%d - %dx%d - %s) took: %.4f ms/iter\n",
100  typ.c_str(),
101  inputC,
102  outputC,
103  inW,
104  inH,
105  kW,
106  kH,
107  engine,
108  timer.MilliSeconds() / (float)runs);
109  return double(total_t) / runs;
110 }
111 
112 template <typename T>
113 static double BenchGLConvolution(int input_channels,
114  int output_channels,
115  int kernel_width,
116  int kernel_height,
117  int input_width,
118  int input_height,
119  int input_padding,
120  int input_stride,
121  bool transposed,
122  caffe2::Workspace* ws = nullptr) {
123  int tile_x = 1, tile_y = 1;
124  caffe2::squareFactors((input_channels + 3) / 4, tile_x, tile_y);
125 
126  gl_log(GL_LOG, "Input Tiles Factors: %d, %d\n", tile_x, tile_y);
127 
128  caffe2::Workspace localWs;
129  if (!ws) {
130  ws = &localWs;
131  }
132 
133  AddNoiseInput(
134  std::vector<caffe2::TIndex>{1, input_channels, input_height, input_width}, "X_cpu", ws);
135  if (transposed) {
136  AddNoiseInput(
137  std::vector<caffe2::TIndex>{input_channels, output_channels, kernel_height, kernel_width},
138  "W",
139  ws);
140  } else {
141  AddNoiseInput(
142  std::vector<caffe2::TIndex>{output_channels, input_channels, kernel_height, kernel_width},
143  "W",
144  ws);
145  }
146  AddNoiseInput(std::vector<caffe2::TIndex>{output_channels}, "b", ws);
147 
148  caffe2::NetDef netdef;
149  {
150  auto& op = *(netdef.add_op());
151  op.set_type("CopyToOpenGL");
152  op.add_input("X_cpu");
153  op.add_output("X_gl");
154  {
155  auto& arg = *(op.add_arg());
156  arg.set_name("tile_x");
157  arg.set_i(tile_x);
158  }
159  {
160  auto& arg = *(op.add_arg());
161  arg.set_name("tile_y");
162  arg.set_i(tile_y);
163  }
164  }
165 
166  {
167  auto& op = *(netdef.add_op());
168  op.set_type(transposed ? "OpenGLConvTranspose" : "OpenGLConv");
169  op.add_input("X_gl");
170  {
171  op.add_input("W");
172  op.add_input("b");
173  }
174  {
175  auto& arg = *(op.add_arg());
176  arg.set_name("order");
177  arg.set_s("NCHW");
178  }
179  {
180  auto& arg = *(op.add_arg());
181  arg.set_name("kernel");
182  arg.set_i(kernel_height);
183  }
184  {
185  auto& arg = *(op.add_arg());
186  arg.set_name("pad");
187  arg.set_i(input_padding);
188  }
189  {
190  auto& arg = *(op.add_arg());
191  arg.set_name("stride");
192  arg.set_i(input_stride);
193  }
194  {
195  auto& arg = *(op.add_arg());
196  arg.set_name("is_last");
197  arg.set_i(1);
198  }
199  op.add_output("Y_gl");
200  }
201 
202  std::vector<std::unique_ptr<caffe2::OperatorBase>> ops;
203 
204  for (auto& op : netdef.op()) {
205  ops.push_back(CreateOperator(op, ws));
206  }
207 
208  // Run the Copy Operator
209  ops[0]->Run();
210 
211  // Make sure the tested operator is precompiled
212  ops[1]->Run();
213  glFinish();
214 
215  // Measure one iteration
216  caffe2::Timer timer;
217  timer.Start();
218 
219  ops[1]->Run();
220  glFinish();
221 
222  float one_iteration = timer.MilliSeconds();
223 
224  int target_iterations = std::max((int)(1000 / one_iteration), 1);
225  int warmup_iterations = std::max((int)(200 / one_iteration), 1);
226 
227  // warm up
228  for (int i = 0; i < warmup_iterations; i++) {
229  ops[1]->Run();
230  }
231  glFinish();
232 
233  timer.Start();
234 
235  int runs = target_iterations;
236  for (int i = 0; i < runs; i++) {
237  ops[1]->Run();
238  }
239  glFinish();
240 
241  const double gpuIterTime = double(timer.MilliSeconds()) / runs;
242 
243  gl_log(GL_LOG,
244  "%s(%d -> %d, %dx%d - %dx%d - OpenGL) took: %.4f ms/iter\n",
245  transposed ? "ConvTranspose" : "Conv",
246  input_channels,
247  output_channels,
248  input_width,
249  input_height,
250  kernel_width,
251  kernel_height,
252  gpuIterTime);
253 
254  return gpuIterTime;
255 }
256 
257 void TestGLConvolution() {
259  ws.GetThreadPool()->setMinWorkSize(0);
260 
261  // small input sizes
262  // std::vector<int> sizes({14, 26, 52, 104});
263  // std::vector<int> channels({128, 64}); // not working for 512 and 256 channels yet
264  // std::vector<int> channels({512, 256, 128, 64});
265 
266  // large input sizes
267  // std::vector<int> sizes({208, 312, 416, 720, 1080});
268  // std::vector<int> channels({16, 4});
269  //
270  std::vector<int> sizes({14, 26, 52, 104, 208});
271  // std::vector<int> channels({24, 16, 4});
272 
273  // std::vector<int> sizes({14});
274  std::vector<int> channels({32, 64, 128, 192, 256, 384, 512});
275 
276  std::vector<int> kernels({3});
277 
278  bool transposed = false;
279 
280  int stride = 1;
281 
282  for (const auto& space : sizes) {
283  for (const auto& input_channel : channels) {
284  int output_channel = input_channel;
285  /* for (const auto& output_channel : channels) */ {
286  for (const auto& kernel : kernels) {
287  const double gpuIterTime = BenchGLConvolution<float16_t>(
288  input_channel, output_channel, kernel, kernel, space, space, 0, stride, transposed, &ws);
289  const double cpuIterTime = BenchOp(transposed ? "ConvTranspose" : "Conv",
290  input_channel,
291  output_channel,
292  kernel,
293  kernel,
294  stride,
295  space,
296  space,
297  transposed,
298  &ws);
299  const double flops = double(input_channel) * output_channel * kernel * kernel *
300  (kernel == 1 ? space : space - 2) * (kernel == 1 ? space : space - 2) * 2;
301  // gl_log(GL_LOG,
302  printf(
303  "Conv: X: %ix%i \tC: %i -> %i\tK: %ix%i\t16b GPU GFLOPS: %.2f\t32b CPU GFLOPS:"
304  "%.2f\tratio: "
305  "%.2f\n",
306  space,
307  space,
308  input_channel,
309  output_channel,
310  kernel,
311  kernel,
312  flops / gpuIterTime / 1E6,
313  flops / cpuIterTime / 1E6,
314  cpuIterTime / gpuIterTime);
315  }
316  }
317  }
318  }
319 
320  // // ConvTranspose
321  // BenchGLConvolution<float16_t>(16, 16, 3, 3, 640, 360, 0, 2, true);
322  // BenchGLConvolution<float16_t>(16, 16, 4, 4, 640, 360, 0, 2, true);
323  // BenchGLConvolution<float16_t>(16, 16, 5, 5, 640, 360, 0, 2, true);
324  // BenchGLConvolution<float16_t>(16, 16, 6, 6, 640, 360, 0, 2, true);
325  // BenchGLConvolution<float16_t>(16, 16, 7, 7, 640, 360, 0, 2, true);
326  // BenchGLConvolution<float16_t>(16, 16, 8, 8, 640, 360, 0, 2, true);
327  // BenchGLConvolution<float16_t>(16, 16, 9, 9, 640, 360, 0, 2, true);
328  //
329  // BenchOp("ConvTranspose", 16, 16, 3, 3, 2, 640, 360, true);
330  // BenchOp("ConvTranspose", 16, 16, 4, 4, 2, 640, 360, true);
331  // BenchOp("ConvTranspose", 16, 16, 5, 5, 2, 640, 360, true);
332  // BenchOp("ConvTranspose", 16, 16, 6, 6, 2, 640, 360, true);
333  // BenchOp("ConvTranspose", 16, 16, 7, 7, 2, 640, 360, true);
334  // BenchOp("ConvTranspose", 16, 16, 8, 8, 2, 640, 360, true);
335  // BenchOp("ConvTranspose", 16, 16, 9, 9, 2, 640, 360, true);
336  //
337  // // Conv
338  // BenchGLConvolution<float16_t>(16, 16, 3, 3, 1280, 720, 0, 1, false);
339  // BenchGLConvolution<float16_t>(16, 16, 4, 4, 1280, 720, 0, 1, false);
340  // BenchGLConvolution<float16_t>(16, 16, 5, 5, 1280, 720, 0, 1, false);
341  // BenchGLConvolution<float16_t>(16, 16, 6, 6, 1280, 720, 0, 1, false);
342  // BenchGLConvolution<float16_t>(16, 16, 7, 7, 1280, 720, 0, 1, false);
343  // BenchGLConvolution<float16_t>(16, 16, 8, 8, 1280, 720, 0, 1, false);
344  // BenchGLConvolution<float16_t>(16, 16, 9, 9, 1280, 720, 0, 1, false);
345  //
346  // BenchOp("Conv", 16, 16, 3, 3, 1, 1280, 720, false);
347  // BenchOp("Conv", 16, 16, 4, 4, 1, 1280, 720, false);
348  // BenchOp("Conv", 16, 16, 5, 5, 1, 1280, 720, false);
349  // BenchOp("Conv", 16, 16, 6, 6, 1, 1280, 720, false);
350  // BenchOp("Conv", 16, 16, 7, 7, 1, 1280, 720, false);
351  // BenchOp("Conv", 16, 16, 8, 8, 1, 1280, 720, false);
352  // BenchOp("Conv", 16, 16, 9, 9, 1, 1280, 720, false);
353 
354  // BenchGLConvolution<float16_t>(16, 16, 3, 3, 80, 45, 0, 1, false);
355  // BenchGLConvolution<float16_t>(16, 16, 3, 3, 160, 90, 0, 1, false);
356  // BenchGLConvolution<float16_t>(16, 16, 3, 3, 320, 180, 0, 1, false);
357  // BenchGLConvolution<float16_t>(16, 16, 3, 3, 640, 360, 0, 1, false);
358  // BenchGLConvolution<float16_t>(16, 16, 3, 3, 1280, 720, 0, 1, false);
359  //
360  // BenchOp("Conv", 16, 16, 3, 3, 1, 80, 45, false);
361  // BenchOp("Conv", 16, 16, 3, 3, 1, 160, 90, false);
362  // BenchOp("Conv", 16, 16, 3, 3, 1, 320, 180, false);
363  // BenchOp("Conv", 16, 16, 3, 3, 1, 640, 360, false);
364  // BenchOp("Conv", 16, 16, 3, 3, 1, 1280, 720, false);
365  //
366  // BenchGLConvolution<float16_t>(128, 128, 3, 3, 14, 14, 0, 1, false);
367  // BenchGLConvolution<float16_t>(256, 256, 3, 3, 14, 14, 0, 1, false);
368  // BenchGLConvolution<float16_t>(128, 128, 3, 3, 28, 28, 0, 1, false);
369  // BenchGLConvolution<float16_t>(256, 256, 3, 3, 28, 28, 0, 1, false);
370  // BenchGLConvolution<float16_t>(128, 128, 3, 3, 56, 56, 0, 1, false);
371  // BenchGLConvolution<float16_t>(256, 256, 3, 3, 56, 56, 0, 1, false);
372  // BenchGLConvolution<float16_t>(64, 64, 7, 7, 128, 128, 0, 1, false);
373  //
374  // BenchOp("Conv", 128, 128, 3, 3, 1, 14, 14, false);
375  // BenchOp("Conv", 256, 256, 3, 3, 1, 14, 14, false);
376  // BenchOp("Conv", 128, 128, 3, 3, 1, 28, 28, false);
377  // BenchOp("Conv", 256, 256, 3, 3, 1, 28, 28, false);
378  // BenchOp("Conv", 128, 128, 3, 3, 1, 56, 56, false);
379  // BenchOp("Conv", 256, 256, 3, 3, 1, 56, 56, false);
380  // BenchOp("Conv", 64, 64, 7, 7, 1, 128, 128, false);
381 }
Blob is a general container that hosts a typed pointer.
Definition: blob.h:25
Blob * CreateBlob(const string &name)
Creates a blob of the given name.
Definition: workspace.cc:104
void Start()
Starts a timer.
Definition: timer.h:24
The CPU Context, representing the bare minimum of what a Context class in Caffe2 should implement...
Definition: context.h:66
Workspace is a class that holds all the related objects created during runtime: (1) all blobs...
Definition: workspace.h:47
float MilliSeconds()
Returns the elapsed time in milliseconds.
Definition: timer.h:32
T * GetMutable(bool *is_new_object=nullptr)
Gets a mutable pointer to the stored object.
Definition: blob.h:101
A simple timer object for measuring time.
Definition: timer.h:16