Caffe2 - C++ API
A deep learning, cross-platform ML framework
net_gl.cc
#include "caffe2/mobile/contrib/arm-compute/core/net_gl.h"
#include "caffe2/mobile/contrib/arm-compute/core/context.h"
#include "caffe2/core/net.h"

#include <algorithm>
#include <iomanip>
#include <iostream>
#include <set>
#include <sstream>
#include <unordered_map>
#include <unordered_set>

#include "caffe2/core/operator.h"
#include "caffe2/core/static_tracepoint.h"
#include "caffe2/core/timer.h"
#include "caffe2/proto/caffe2.pb.h"
#include "caffe2/utils/proto_utils.h"

namespace caffe2 {

GLNet::GLNet(
    const std::shared_ptr<const NetDef>& net_def,
    Workspace* ws)
    : NetBase(net_def, ws) {
  ws_ = ws;
  VLOG(1) << "Constructing GLNet " << net_def->name();
  const bool net_def_has_device_option = net_def->has_device_option();
  // Initialize the operators
  for (int idx = 0; idx < net_def->op_size(); ++idx) {
    const auto& operator_def = net_def->op(idx);
    VLOG(1) << "Creating operator " << operator_def.name() << ": "
            << operator_def.type();
    output_blobs_.push_back(operator_def.output(0));
    // Record which operators run on the OPENGL device; TEST_Benchmark uses
    // this to synchronize GPU outputs when timing individual operators.
    if (operator_def.has_device_option() &&
        operator_def.device_option().device_type() == OPENGL) {
      opengl_device_.push_back(true);
    } else {
      opengl_device_.push_back(false);
    }

    std::unique_ptr<OperatorBase> op{nullptr};
    if (!operator_def.has_device_option() && net_def_has_device_option) {
      // In the case that the operator def does not specify a device option
      // but the net def has a default option, we copy the device option over
      // to the operator def.
      OperatorDef temp_def(operator_def);
      temp_def.mutable_device_option()->CopyFrom(net_def->device_option());
      op = CreateOperator(temp_def, ws, idx);
    } else {
      op = CreateOperator(operator_def, ws, idx);
      op->set_debug_def(
          std::shared_ptr<const OperatorDef>{net_def, &(net_def->op(idx))});
    }
    operators_.emplace_back(std::move(op));
  }
}

bool GLNet::Run() {
  StartAllObservers();
  if (first_run_) {
    first_run_ = false;
    // On the first call, run the OpenGL operators once up front so the GPU
    // kernels are set up before the regular pass below.
    for (auto& op : operators_) {
      if (op->device_option().device_type() == OPENGL) {
        op->Run();
      }
    }
  }
  VLOG(1) << "Running net " << name_;
  for (auto& op : operators_) {
    bool res = op->Run();
    if (!res) {
      LOG(ERROR) << "Operator failed: " << ProtoDebugString(op->debug_def());
      return false;
    }
  }
  StopAllObservers();
  return true;
}

bool GLNet::RunAsync() {
  return Run();
}

namespace {
// Comparator used to sort per-operator-type timings in decreasing order.
template <typename A, typename B>
bool PairLargerThan(const std::pair<A, B>& x, const std::pair<A, B>& y) {
  return x.second > y.second;
}
}

vector<float> GLNet::TEST_Benchmark(
    const int warmup_runs,
    const int main_runs,
    const bool run_individual) {
  std::cout << "Starting benchmark." << std::endl;
  std::cout << "Running warmup runs." << std::endl;
  CAFFE_ENFORCE(
      warmup_runs >= 0,
      "Number of warm up runs should be non negative, provided ",
      warmup_runs,
      ".");
  for (int i = 0; i < warmup_runs; ++i) {
    CAFFE_ENFORCE(Run(), "Warmup run ", i, " has failed.");
  }

  auto last_blob = output_blobs_[output_blobs_.size() - 1];
  Blob *gpu_out_blob = ws_->GetBlob(last_blob);
  auto &g_ = gpu_out_blob->Get<GLTensor<half>>();
  // Enforce gpu execution
  g_.sync();

  std::cout << "Main runs." << std::endl;
  CAFFE_ENFORCE(
      main_runs >= 0,
      "Number of main runs should be non negative, provided ",
      main_runs,
      ".");
  Timer timer;
  for (int i = 0; i < main_runs; ++i) {
    CAFFE_ENFORCE(Run(), "Main run ", i, " has failed.");
  }
  // Wait for the GPU to finish so the measured time covers actual execution.
  g_.sync();

  auto millis = timer.MilliSeconds();
  std::cout << "[C2DEBUG] Main run finished. Milliseconds per iter: "
            << millis / main_runs
            << ". Iters per second: " << 1000.0 * main_runs / millis
            << std::endl;

  vector<float> time_per_op(operators_.size(), 0);
  vector<uint64_t> flops_per_op(operators_.size(), 0);
  CaffeMap<string, float> time_per_op_type;
  if (run_individual) {
    for (int i = 0; i < main_runs; ++i) {
      for (auto& op : operators_) {
        op->ResetEvent();
      }
      int idx = 0;
      for (auto& op : operators_) {
        const string& op_type = op->debug_def().type();
        if (i == 0) { // Gather flops on the first run.
          auto* schema = OpSchemaRegistry::Schema(op_type);
          if (schema && schema->HasCostInferenceFunction()) {
            vector<TensorShape> shapes = op->InputTensorShapes();
            flops_per_op[idx] =
                schema->InferCost(op->debug_def(), shapes).flops;
          }
        }
        timer.Start();
        CAFFE_ENFORCE(
            op->Run(),
            "operator ",
            op->debug_def().name(),
            "(",
            op_type,
            ") has failed.");
        // For OpenGL operators, sync the output tensor so the elapsed time
        // includes the GPU work, not just kernel dispatch.
        if (opengl_device_[idx]) {
          Blob *gpu_out_blob = ws_->GetBlob(output_blobs_[idx]);
          auto &g_ = gpu_out_blob->Get<GLTensor<half>>();
          g_.sync();
        }
        float spent = timer.MilliSeconds();
        time_per_op[idx] += spent;
        time_per_op_type[op_type] += spent;
        ++idx;
      }
    }

    int idx = 0;
    for (auto& op : operators_) {
      const string& op_type = op->debug_def().type();
      const string& print_name =
          (op->debug_def().name().size()
               ? op->debug_def().name()
               : (op->debug_def().output_size() ? op->debug_def().output(0)
                                                : "NO_OUTPUT"));
      std::stringstream flops_str;
      if (flops_per_op[idx]) {
        flops_str << " ("
                  << to_string(1.0e-6 * flops_per_op[idx] / time_per_op[idx])
                  << " GFLOPS)";
      }
      std::cout << "[C2DEBUG] Operator #" << idx << " (" << print_name << ", "
                << op_type << ") " << time_per_op[idx] / main_runs
                << " ms/iter" << flops_str.str() << std::endl;
      ++idx;
    }
    std::cout << "[C2DEBUG] Time per operator type:" << std::endl;
    // Sort by decreasing time spent.
    std::vector<std::pair<string, float>> time_per_op_type_vec(
        time_per_op_type.begin(), time_per_op_type.end());
    std::sort(
        time_per_op_type_vec.begin(),
        time_per_op_type_vec.end(),
        PairLargerThan<string, float>);
    for (const auto& item : time_per_op_type_vec) {
      std::cout << "[C2DEBUG] " << std::setw(15) << std::setfill(' ')
                << item.second / main_runs << " " << item.first << std::endl;
    }
  }
  // We will reuse time_per_op to return the result of BenchmarkNet.
  for (int i = 0; i < time_per_op.size(); ++i) {
    time_per_op[i] /= main_runs;
  }
  time_per_op.insert(time_per_op.begin(), millis / main_runs);
  return time_per_op;
}

REGISTER_NET(opengl, GLNet);

} // namespace caffe2
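
Usage sketch. REGISTER_NET(opengl, GLNet) registers this net implementation under the type string "opengl", so a caller can drive it through the regular Workspace/NetBase API and invoke TEST_Benchmark on it. The snippet below is a minimal, hypothetical example of doing that; it is not part of net_gl.cc. The operator type "Relu" and the blob names "X"/"Y" are placeholders, and the input blob still has to be filled with real data (using whatever upload/copy operators this build provides) before the net can actually run. In the returned vector, index 0 holds the overall milliseconds per iteration and the remaining entries hold per-operator milliseconds per iteration (they stay zero unless run_individual is true).

// Hypothetical driver code (not part of net_gl.cc). Assumes the ARM Compute
// OpenGL backend is built in and that an OpenGL "Relu" operator is registered.
#include <vector>

#include "caffe2/core/net.h"
#include "caffe2/core/workspace.h"
#include "caffe2/proto/caffe2.pb.h"

void BenchmarkGLNetSketch() {
  caffe2::NetDef net_def;
  net_def.set_name("gl_example");
  net_def.set_type("opengl"); // resolved via REGISTER_NET(opengl, GLNet)
  // Net-level device option; operators that do not set their own option
  // inherit it in the GLNet constructor.
  net_def.mutable_device_option()->set_device_type(caffe2::OPENGL);

  auto* relu = net_def.add_op();
  relu->set_type("Relu"); // placeholder operator type
  relu->add_input("X");
  relu->add_output("Y");

  caffe2::Workspace ws;
  // Input blobs must exist before the net is constructed; filling "X" with
  // real data is still required before the benchmark can run.
  ws.CreateBlob("X");
  caffe2::NetBase* net = ws.CreateNet(net_def);

  // Index 0: overall ms per iteration. Indices 1..N: per-operator ms per
  // iteration, populated only when run_individual is true.
  std::vector<float> stats = net->TEST_Benchmark(
      /*warmup_runs=*/2, /*main_runs=*/10, /*run_individual=*/true);
}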