// NOTE(review): this text is a damaged listing. The bare integers at the start
// of statements are line numbers from the original file, and several original
// lines (e.g. 17-20) are not present here.
//
// The fused first line carries the file's #include directives followed by the
// start of AddNoiseInput.
//
// AddNoiseInput: resizes a tensor to `shape` and fills its float storage with
// Gaussian random values via caffe2::math::RandGaussian on a CPUContext.
// How `tensor` and `context` are obtained from `name` is on lines that are not
// shown; the later call sites pass a workspace as a third argument.
2 #include "caffe2/core/operator.h" 3 #include "caffe2/core/timer.h" 4 #include "caffe2/core/workspace.h" 5 #include "caffe2/utils/math.h" 7 #include "../core/GL.h" 8 #include "../core/GLLogging.h" 9 #include "../core/arm_neon_support.h" 10 #include "../operators/gl_tiling_utils.h" 11 #include "TestGLConvolution.h" 15 void AddNoiseInput(
const std::vector<caffe2::TIndex>& shape,
16 const std::string& name,
// The tensor takes the requested shape before it is filled.
21 tensor->Resize(shape);
// Arguments 0.0f and 10.0f are presumably the mean and standard deviation of
// the distribution (confirm against RandGaussian's declaration).
23 caffe2::math::RandGaussian<float, caffe2::CPUContext>(
24 tensor->size(), 0.0f, 10.0f, tensor->mutable_data<
float>(), &context);
// BenchOp: builds a single operator definition, feeds it random inputs,
// runs warm-up and measured iterations, and returns the accumulated time divided
// by the number of measured runs.
// NOTE(review): most of the parameter list and the timing statements sit on
// lines that are missing from this listing; comments below cover only what is
// visible.
27 double BenchOp(
const std::string& typ,
// Engine selection: "MOBILE" when `transposed` is true, otherwise "NNPACK".
42 const char* engine = transposed ?
"MOBILE" :
"NNPACK";
44 caffe2::OperatorDef def1;
45 def1.set_name(
"test");
47 def1.set_engine(engine);
// Kernel size and stride are given per axis; the same `stride` value is used
// for both height and width.
53 def1.add_arg()->CopyFrom(caffe2::MakeArgument(
"kernel_h", kH));
54 def1.add_arg()->CopyFrom(caffe2::MakeArgument(
"kernel_w", kW));
55 def1.add_arg()->CopyFrom(caffe2::MakeArgument(
"stride_h", stride));
56 def1.add_arg()->CopyFrom(caffe2::MakeArgument(
"stride_w", stride));
// All four paddings are fixed to zero for this benchmark.
57 def1.add_arg()->CopyFrom(caffe2::MakeArgument(
"pad_t", 0));
58 def1.add_arg()->CopyFrom(caffe2::MakeArgument(
"pad_l", 0));
59 def1.add_arg()->CopyFrom(caffe2::MakeArgument(
"pad_b", 0));
60 def1.add_arg()->CopyFrom(caffe2::MakeArgument(
"pad_r", 0));
61 def1.add_arg()->CopyFrom(caffe2::MakeArgument(
"convolution_transform_strategy", std::string(
"PRECOMPUTE")));
// Input blob "X" has shape {1, inputC, inH, inW}.
63 AddNoiseInput(std::vector<caffe2::TIndex>{1, inputC, inH, inW},
"X", ws);
// Two alternative shapes for the weight blob "W" follow. The condition that
// selects between them (original lines 64/66) is not shown; presumably it is
// `transposed`, with the {inputC, outputC, ...} layout used for ConvTranspose.
65 AddNoiseInput(std::vector<caffe2::TIndex>{inputC, outputC, kH, kW},
"W", ws);
67 AddNoiseInput(std::vector<caffe2::TIndex>{outputC, inputC, kH, kW},
"W", ws);
// Bias blob "B" has one entry per output channel.
69 AddNoiseInput(std::vector<caffe2::TIndex>{outputC},
"B", ws);
71 std::unique_ptr<caffe2::OperatorBase> op1(CreateOperator(def1, ws));
// Iteration counts are derived from `one_iteration` (defined on a line that is
// not shown; presumably the measured duration of a single run in ms), so the
// measured phase targets about 1000 time units and the warm-up about 200.
// Both counts are clamped to at least 1.
81 int target_iterations = std::max((
int)(1000 / one_iteration), 1);
82 int warmup_iterations = std::max((
int)(200 / one_iteration), 1);
// Warm-up loop (body not shown).
85 for (
int i = 0; i < warmup_iterations; i++) {
91 int runs = target_iterations;
// Measured loop (body not shown).
92 for (
int i = 0; i < runs; i++) {
// Format string of the result log line; it reports the time as ms/iter.
99 "%s(%d -> %d, %dx%d - %dx%d - %s) took: %.4f ms/iter\n",
// Average time per measured run.
109 return double(total_t) / runs;
// BenchGLConvolution<T>: builds a two-operator net (CopyToOpenGL followed by
// OpenGLConv or OpenGLConvTranspose), instantiates the operators, runs warm-up
// and measured iterations, and logs the average time per iteration.
// NOTE(review): parts of the parameter list, the use of template parameter T,
// the loop bodies and the return statement are on lines missing from this
// listing.
112 template <
typename T>
113 static double BenchGLConvolution(
int input_channels,
// Tiling factors are computed for ceil(input_channels / 4); presumably four
// channels are packed per texel (confirm against gl_tiling_utils.h).
123 int tile_x = 1, tile_y = 1;
124 caffe2::squareFactors((input_channels + 3) / 4, tile_x, tile_y);
126 gl_log(GL_LOG,
"Input Tiles Factors: %d, %d\n", tile_x, tile_y);
// Shape of the CPU-side input blob "X_cpu".
134 std::vector<caffe2::TIndex>{1, input_channels, input_height, input_width},
"X_cpu", ws);
// Two alternative weight shapes; the selecting condition and the blob name are
// not shown (presumably chosen by `transposed`).
137 std::vector<caffe2::TIndex>{input_channels, output_channels, kernel_height, kernel_width},
142 std::vector<caffe2::TIndex>{output_channels, input_channels, kernel_height, kernel_width},
// Bias blob "b" has one entry per output channel.
146 AddNoiseInput(std::vector<caffe2::TIndex>{output_channels},
"b", ws);
148 caffe2::NetDef netdef;
// First operator: CopyToOpenGL converts "X_cpu" into "X_gl" and receives the
// tile_x / tile_y arguments (their values are set on lines not shown).
150 auto& op = *(netdef.add_op());
151 op.set_type(
"CopyToOpenGL");
152 op.add_input(
"X_cpu");
153 op.add_output(
"X_gl");
155 auto& arg = *(op.add_arg());
156 arg.set_name(
"tile_x");
160 auto& arg = *(op.add_arg());
161 arg.set_name(
"tile_y");
// Second operator: the convolution itself, transposed or regular, reading
// "X_gl" and writing "Y_gl".
167 auto& op = *(netdef.add_op());
168 op.set_type(transposed ?
"OpenGLConvTranspose" :
"OpenGLConv");
169 op.add_input(
"X_gl");
175 auto& arg = *(op.add_arg());
176 arg.set_name(
"order");
// Only kernel_height is passed as the "kernel" argument in the lines shown;
// kernel_width is visible only in the weight shapes above.
180 auto& arg = *(op.add_arg());
181 arg.set_name(
"kernel");
182 arg.set_i(kernel_height);
// The name of this argument is set on a missing line; its value is
// input_padding, so it is presumably the padding argument.
185 auto& arg = *(op.add_arg());
187 arg.set_i(input_padding);
190 auto& arg = *(op.add_arg());
191 arg.set_name(
"stride");
192 arg.set_i(input_stride);
195 auto& arg = *(op.add_arg());
196 arg.set_name(
"is_last");
199 op.add_output(
"Y_gl");
// One operator instance is created per op definition in the net.
202 std::vector<std::unique_ptr<caffe2::OperatorBase>> ops;
204 for (
auto& op : netdef.op()) {
205 ops.push_back(CreateOperator(op, ws));
// Iteration counts follow the same scheme as in BenchOp: derived from
// `one_iteration` (defined on a line not shown) and clamped to at least 1.
224 int target_iterations = std::max((
int)(1000 / one_iteration), 1);
225 int warmup_iterations = std::max((
int)(200 / one_iteration), 1);
// Warm-up loop (body not shown).
228 for (
int i = 0; i < warmup_iterations; i++) {
235 int runs = target_iterations;
// Measured loop (body not shown).
236 for (
int i = 0; i < runs; i++) {
// Average elapsed milliseconds per measured run.
241 const double gpuIterTime = double(timer.
MilliSeconds()) / runs;
// Format string and first argument of the result log line.
244 "%s(%d -> %d, %dx%d - %dx%d - OpenGL) took: %.4f ms/iter\n",
245 transposed ?
"ConvTranspose" :
"Conv",
// TestGLConvolution: sweeps spatial sizes, channel counts and kernel sizes,
// benchmarks each configuration on the OpenGL path and the CPU path, and
// reports throughput for both plus the CPU/GPU time ratio.
// NOTE(review): the workspace declaration, the `stride` definition and parts
// of the calls are on lines missing from this listing.
257 void TestGLConvolution() {
// Sets the thread pool's minimum work size to 0.
259 ws.GetThreadPool()->setMinWorkSize(0);
// Benchmark grid: square inputs of these sizes, these channel counts, and 3x3
// kernels only.
270 std::vector<int> sizes({14, 26, 52, 104, 208});
274 std::vector<int> channels({32, 64, 128, 192, 256, 384, 512});
276 std::vector<int> kernels({3});
// Only the non-transposed convolution is benchmarked.
278 bool transposed =
false;
282 for (
const auto& space : sizes) {
283 for (
const auto& input_channel : channels) {
// Output channel count always equals the input channel count.
284 int output_channel = input_channel;
286 for (
const auto& kernel : kernels) {
// The GPU benchmark is instantiated with float16_t; padding is passed as 0.
287 const double gpuIterTime = BenchGLConvolution<float16_t>(
288 input_channel, output_channel, kernel, kernel, space, space, 0, stride, transposed, &ws);
289 const double cpuIterTime = BenchOp(transposed ?
"ConvTranspose" :
"Conv",
// Operation count: 2 * Cin * Cout * k * k * out_h * out_w, where the output
// side is `space` for a 1x1 kernel and `space - 2` otherwise. The `space - 2`
// term matches a 3x3 kernel with zero padding and unit stride; the value of
// `stride` is not visible here.
299 const double flops = double(input_channel) * output_channel * kernel * kernel *
300 (kernel == 1 ? space : space - 2) * (kernel == 1 ? space : space - 2) * 2;
// With iteration times in milliseconds, flops / time / 1E6 is GFLOPS.
// The final argument is the CPU time divided by the GPU time.
303 "Conv: X: %ix%i \tC: %i -> %i\tK: %ix%i\t16b GPU GFLOPS: %.2f\t32b CPU GFLOPS:" 312 flops / gpuIterTime / 1E6,
313 flops / cpuIterTime / 1E6,
314 cpuIterTime / gpuIterTime);
Blob is a general container that hosts a typed pointer.
Blob * CreateBlob(const string &name)
Creates a blob of the given name.
void Start()
Starts a timer.
The CPU Context, representing the bare minimum of what a Context class in Caffe2 should implement...
Workspace is a class that holds all the related objects created during runtime: (1) all blobs...
float MilliSeconds()
Returns the elapsed time in milliseconds.
T * GetMutable(bool *is_new_object=nullptr)
Gets a mutable pointer to the stored object.
A simple timer object for measuring time.