1 #include "caffe2/mobile/contrib/arm-compute/core/net_gl.h" 2 #include "caffe2/mobile/contrib/arm-compute/core/context.h" 3 #include "caffe2/core/net.h" 7 #include <unordered_map> 8 #include <unordered_set> 10 #include "caffe2/core/operator.h" 11 #include "caffe2/core/static_tracepoint.h" 12 #include "caffe2/core/timer.h" 13 #include "caffe2/proto/caffe2.pb.h" 14 #include "caffe2/utils/proto_utils.h" 19 const std::shared_ptr<const NetDef>& net_def,
// Constructor fragment for the GL net (presumably GLNet::GLNet -- the line
// carrying the function name, the `ws` parameter declaration and several
// body lines are absent from this listing). It forwards net_def and ws to
// NetBase, then creates one operator per entry of net_def->op(), filling
// output_blobs_, opengl_device_ and operators_ in the same order.
21 : NetBase(net_def, ws) {
23 VLOG(1) <<
"Constructing GLNet " << net_def->name();
// Cached once: whether the net itself carries a device option that can be
// copied onto operators lacking one (see the branch further down).
24 const bool net_def_has_device_option = net_def->has_device_option();
// Visit every operator definition of the net in declaration order.
26 for (
int idx = 0; idx < net_def->op_size(); ++idx) {
27 const auto& operator_def = net_def->op(idx);
28 VLOG(1) <<
"Creating operator " << operator_def.name() <<
": " 29 << operator_def.type();
// Remember the name of the operator's first output; TEST_Benchmark later
// looks these names up in the workspace.
// NOTE(review): output(0) is read with no visible output_size() check --
// confirm that every operator reaching this net has at least one output.
30 output_blobs_.push_back(operator_def.output(0));
// opengl_device_ gets `true` only when the operator itself declares a device
// option whose type is OPENGL, `false` otherwise (the `else` line is missing
// from this listing). A net-level device option is not consulted here.
31 if (operator_def.has_device_option() && operator_def.device_option().device_type() == OPENGL) {
32 opengl_device_.push_back(
true);
34 opengl_device_.push_back(
false);
37 std::unique_ptr<OperatorBase> op{
nullptr};
// The operator has no device option of its own but the net has one: build
// the operator from a copy of its definition carrying the net's device option.
38 if (!operator_def.has_device_option() && net_def_has_device_option) {
42 OperatorDef temp_def(operator_def);
43 temp_def.mutable_device_option()->CopyFrom(net_def->device_option());
44 op = CreateOperator(temp_def, ws, idx);
// Presumably the `else` branch (its line is not in this listing): create the
// operator straight from the unmodified definition.
46 op = CreateOperator(operator_def, ws, idx);
// Aliasing shared_ptr: shares ownership with net_def while pointing at the
// idx-th OperatorDef inside it. The call receiving this argument is on a
// line that is missing from this listing.
48 std::shared_ptr<const OperatorDef>{net_def, &(net_def->op(idx))});
// Ownership of the freshly created operator moves into operators_.
50 operators_.emplace_back(std::move(op));
// Fragment of a run routine (presumably GLNet::Run -- its signature and
// several body lines are absent from this listing). Two passes over
// operators_ are visible.
// First pass: singles out operators whose device option type is OPENGL; what
// is done with them is on lines that are not part of this listing.
58 for (
auto& op: operators_) {
59 if (op->device_option().device_type() == OPENGL) {
64 VLOG(1) <<
"Running net " << name_;
// Second pass: visits every operator; the visible statement logs the
// operator's debug definition at ERROR level. The condition guarding that
// log is not visible here.
65 for (
auto& op : operators_) {
68 LOG(ERROR) <<
"Operator failed: " << ProtoDebugString(op->debug_def());
// GLNet::RunAsync: takes no arguments and returns bool. Only the signature
// line survives in this listing, so the body's behavior cannot be stated here.
76 bool GLNet::RunAsync() {
/// Compares two pairs by their `second` member only.
///
/// @tparam A  type of the pairs' `first` member (ignored by the comparison)
/// @tparam B  type of the pairs' `second` member; must support `operator>`
/// @param x   left-hand pair
/// @param y   right-hand pair
/// @return    true when `x.second` is strictly greater than `y.second`;
///            false otherwise, including when the two are equal. Used as a
///            sort comparator this yields descending order by `second`.
template <typename A, typename B>
bool PairLargerThan(const std::pair<A, B>& x, const std::pair<A, B>& y) {
  return x.second > y.second;
}
// GLNet::TEST_Benchmark(warmup_runs, main_runs, run_individual): benchmarks
// the net and prints timings to std::cout. Many lines of this function
// (its name line, timer handling, enforce calls, the operator index
// bookkeeping and the return statement) are absent from this listing; the
// comments below describe only what is visible.
//   warmup_runs    - number of untimed Run() calls made first
//   main_runs      - number of Run() calls used for the whole-net figure
//   run_individual - when true, per-operator and per-operator-type
//                    statistics are gathered and printed as well
// The vector built at the end holds the whole-net milliseconds per iteration
// at index 0 followed by the per-operator averages (presumably this is the
// return value -- the return line is not visible).
88 const int warmup_runs,
90 const bool run_individual) {
91 std::cout <<
"Starting benchmark." << std::endl;
92 std::cout <<
"Running warmup runs." << std::endl;
95 "Number of warm up runs should be non negative, provided ",
// Warm-up phase: every Run() must succeed, otherwise CAFFE_ENFORCE fires.
98 for (
int i = 0; i < warmup_runs; ++i) {
99 CAFFE_ENFORCE(Run(),
"Warmup run ", i,
" has failed.");
// Look up the blob named by the last recorded operator output.
// NOTE(review): when output_blobs_ is empty, size() - 1 wraps around and the
// index is out of range -- confirm a net with no operators cannot get here.
102 auto last_blob = output_blobs_[output_blobs_.size() - 1];
103 Blob *gpu_out_blob = ws_->GetBlob(last_blob);
108 std::cout <<
"Main runs." << std::endl;
111 "Number of main runs should be non negative, provided ",
// Timed phase for the whole net; a failing Run() aborts via CAFFE_ENFORCE.
115 for (
int i = 0; i < main_runs; ++i) {
116 CAFFE_ENFORCE(Run(),
"Main run ", i,
" has failed.");
// NOTE(review): the message above only demands a non-negative main_runs, so
// main_runs == 0 would make the expressions below divide by zero -- confirm
// whether zero is meant to be accepted.
121 std::cout <<
"[C2DEBUG] Main run finished. Milliseconds per iter: " 122 << millis / main_runs
123 <<
". Iters per second: " << 1000.0 * main_runs / millis << std::endl;
// Per-operator accumulators, indexed like operators_, plus a per-type total.
125 vector<float> time_per_op(operators_.size(), 0);
126 vector<uint64_t> flops_per_op(operators_.size(), 0);
127 CaffeMap<string, float> time_per_op_type;
128 if (run_individual) {
129 for (
int i = 0; i < main_runs; ++i) {
130 for (
auto& op : operators_) {
134 for (
auto& op : operators_) {
135 const string& op_type = op->debug_def().type();
// Cost inference is only attempted when a schema is registered for the
// operator type and that schema provides a cost inference function.
137 auto* schema = OpSchemaRegistry::Schema(op_type);
138 if (schema && schema->HasCostInferenceFunction()) {
139 vector<TensorShape> shapes = op->InputTensorShapes();
141 schema->InferCost(op->debug_def(), shapes).flops;
148 op->debug_def().name(),
// For operators flagged as OPENGL in the constructor, the operator's first
// output blob is fetched from the workspace; what is done with it is on
// lines missing from this listing.
152 if (opengl_device_[idx]) {
153 Blob *gpu_out_blob = ws_->GetBlob(output_blobs_[idx]);
// `spent` (declared on a missing line) is added to both the per-operator and
// the per-type totals.
158 time_per_op[idx] += spent;
159 time_per_op_type[op_type] += spent;
// Reporting pass: one line per operator.
165 for (
auto& op : operators_) {
166 const string& op_type = op->debug_def().type();
// Display name: the operator's own name when non-empty, else its first
// output when it has one; the final fallback is on a missing line.
167 const string& print_name =
168 (op->debug_def().name().size()
169 ? op->debug_def().name()
170 : (op->debug_def().output_size() ? op->debug_def().output(0)
172 std::stringstream flops_str;
// A throughput figure is appended only when a flop count was recorded.
173 if (flops_per_op[idx]) {
175 << to_string(1.0e-6 * flops_per_op[idx] / time_per_op[idx])
178 std::cout <<
"[C2DEBUG] Operator #" << idx <<
" (" << print_name <<
", " << op_type
179 <<
") " << time_per_op[idx] / main_runs <<
" ms/iter" 180 << flops_str.str() << std::endl;
183 std::cout <<
"[C2DEBUG] Time per operator type:" << std::endl;
// Copy the per-type totals into a vector so they can be ordered with
// PairLargerThan, i.e. largest accumulated time first (the call taking these
// three arguments is on a missing line -- presumably a sort).
185 std::vector<std::pair<string, float>> time_per_op_type_vec(
186 time_per_op_type.begin(), time_per_op_type.end());
188 time_per_op_type_vec.begin(),
189 time_per_op_type_vec.end(),
190 PairLargerThan<string, float>);
191 for (
const auto& item : time_per_op_type_vec) {
192 std::cout <<
"[C2DEBUG] " << std::setw(15) << std::setfill(
' ') << item.second / main_runs
193 <<
" " << item.first << std::endl;
// Turn accumulated per-operator totals into per-iteration averages.
// NOTE(review): `int i` is compared against the unsigned size() here.
197 for (
int i = 0; i < time_per_op.size(); ++i) {
198 time_per_op[i] /= main_runs;
// Slot 0 of the result carries the whole-net milliseconds per iteration.
200 time_per_op.insert(time_per_op.begin(), millis / main_runs);
// Registers GLNet under the key `opengl` via the REGISTER_NET macro
// (presumably so that a net whose type is "opengl" is built as a GLNet --
// the macro's definition is not part of this listing).
204 REGISTER_NET(opengl,
GLNet);
Blob is a general container that hosts a typed pointer.
void Start()
Starts a timer.
vector< float > TEST_Benchmark(const int warmup_runs, const int main_runs, const bool run_individual) override
Benchmarks a network.
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...
float MilliSeconds()
Returns the elapsed time in milliseconds.
const T & Get() const
Gets the const reference of the stored object.
A simple timer object for measuring time.