Caffe2 - C++ API
A deep learning, cross-platform ML framework
conv_op.cc
#include <iostream>
#include <mutex> // for std::once_flag / std::call_once

#include "caffe2/core/common.h"
#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#include "caffe2/operators/conv_op_shared.h"
#include "caffe2/operators/conv_pool_op_base.h"

#include "caffe2/utils/math.h"
#include "caffe2/utils/threadpool/pthreadpool_impl.h"
#include "nnpack.h"

CAFFE2_DEFINE_bool(caffe2_profile_nnpack, false, "");

namespace caffe2 {

void initNNPACK() {
  static std::once_flag once;
  std::call_once(once, []() {
    enum nnp_status nnpack_status = nnp_initialize();
    CAFFE_ENFORCE(
        nnpack_status == nnp_status_success, "NNPack is not supported here!");
  });
}
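
// Note: initNNPACK() may be called from every Run() invocation; the
// std::call_once above guarantees that nnp_initialize() executes exactly
// once per process. If the host CPU is not supported, nnp_initialize()
// returns a status other than nnp_status_success and the CAFFE_ENFORCE
// fails with the message above instead of silently falling back to
// another engine.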

// Definitions

class NNPACKConvOp final : public ConvPoolOpBase<CPUContext> {
 public:
  NNPACKConvOp(const OperatorDef& operator_def, Workspace* ws)
      : ConvPoolOpBase<CPUContext>(operator_def, ws),
        algorithm_(getConvolutionAlgorithm()),
        transformStrategy_(getConvolutionTransformStrategy()),
        ws_(ws) {
    OPERATOR_NEEDS_FEATURE(
        this->order_ == StorageOrder::NCHW,
        "NNPack only supports NCHW order. Please consider adding a "
        "TransposeOp with axes=[0, 3, 1, 2] before the NNPack Conv.");
    OPERATOR_NEEDS_FEATURE(
        pad_t() < kernel_h(), "NNPACK only supports pad < kernel size");
    OPERATOR_NEEDS_FEATURE(
        pad_b() < kernel_h(), "NNPACK only supports pad < kernel size");
    OPERATOR_NEEDS_FEATURE(
        pad_l() < kernel_w(), "NNPACK only supports pad < kernel size");
    OPERATOR_NEEDS_FEATURE(
        pad_r() < kernel_w(), "NNPACK only supports pad < kernel size");

    createSharedBuffer<CPUContext>(ws);
  }

  bool RunOnDeviceWithOrderNCHW() override;

 private:
  nnp_convolution_algorithm getConvolutionAlgorithm() const;
  nnp_convolution_transform_strategy getConvolutionTransformStrategy() const;

  const nnp_convolution_algorithm algorithm_;
  // Modified after precomputing the kernels. State transitions are:
  // - precompute -> (first call to Run()) -> reuse (on successful precompute)
  //                                       -> compute (on failed precompute)
  // - compute
  nnp_convolution_transform_strategy transformStrategy_;
  Workspace* ws_;
  // Per-group transformed filters
  std::vector<TensorCPU*> transformedFilters_;
  // Zero-filled bias for convolutions without bias. This is needed because
  // the NNPACK interface always expects a convolution with bias.
  std::vector<float> dummyBias_;
};

// Implementations

nnp_convolution_algorithm NNPACKConvOp::getConvolutionAlgorithm() const {
  if (!OperatorBase::HasSingleArgumentOfType<std::string>("algo")) {
    // No preference is stated. The best heuristics on mobile devices differ
    // from NNPACK's defaults, as Winograd tends to be a lot faster there.
    // Use Winograd if the convolution is 3x3 with dilation 1 and stride 1.
    if (kernel_h() == 3 && kernel_w() == 3 && dilation_h() == 1 &&
        dilation_w() == 1 && stride_h() == 1 && stride_w() == 1) {
      return nnp_convolution_algorithm_wt8x8;
    }
    return nnp_convolution_algorithm_auto;
  }

  // Otherwise, there is a preference.
  auto algo = OperatorBase::GetSingleArgument<std::string>("algo", "AUTO");
  if (algo == "AUTO") {
    return nnp_convolution_algorithm_auto;
  }
  if (algo == "WINOGRAD") {
    return nnp_convolution_algorithm_wt8x8;
  }
  if (algo == "WINOGRAD_FP16") {
    return nnp_convolution_algorithm_wt8x8_fp16;
  }
  if (algo == "FT16") {
    return nnp_convolution_algorithm_ft16x16;
  }
  if (algo == "FT8") {
    return nnp_convolution_algorithm_ft8x8;
  }
  if (algo == "IMPLICIT_GEMM") {
    return nnp_convolution_algorithm_implicit_gemm;
  }
  if (algo == "DIRECT") {
    return nnp_convolution_algorithm_direct;
  }
  // Unrecognized strings fall back to AUTO.
  return nnp_convolution_algorithm_auto;
}
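
// Example: an operator that requests Winograd explicitly would carry an
// "algo" argument in its definition (illustrative snippet, not from this
// file):
//
//   op {
//     type: "Conv"
//     engine: "NNPACK"
//     arg { name: "algo" s: "WINOGRAD" }
//   }
//
// Forcing "WINOGRAD" for kernels other than 3x3 should be rejected by
// NNPACK itself at run time, since the wt8x8 transform is specific to 3x3
// kernels; "AUTO" lets NNPACK choose.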

nnp_convolution_transform_strategy
NNPACKConvOp::getConvolutionTransformStrategy() const {
  auto kts = OperatorBase::GetSingleArgument<std::string>(
      "convolution_transform_strategy", "COMPUTE");
  if (kts == "PRECOMPUTE") {
    return nnp_convolution_transform_strategy_precompute;
  }
  // Default to computing each time.
  return nnp_convolution_transform_strategy_compute;
}
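
// Setting convolution_transform_strategy to "PRECOMPUTE" trades memory for
// speed: the filter transforms are computed once on the first Run() and
// cached in workspace blobs (see transformedFilters_ above), which pays off
// when the same weights are reused across many inference calls. Any other
// value re-transforms the filters on every call. Illustrative argument:
//
//   arg { name: "convolution_transform_strategy" s: "PRECOMPUTE" }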

bool NNPACKConvOp::RunOnDeviceWithOrderNCHW() {
  auto& X = Input(0);
  auto& filter = Input(1);
  auto* Y = Output(0);
  CAFFE_ENFORCE(X.ndim() == 4, "Input tensor must be 4D (NCHW)");
  const int N = X.dim32(0), C = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
  CAFFE_ENFORCE(filter.ndim() == 4, "Filter tensor must be 4D (MCHW)");
  const int M = filter.dim32(0);
  CAFFE_ENFORCE(C % this->group_ == 0, "");
  CAFFE_ENFORCE(M % this->group_ == 0, "");
  CAFFE_ENFORCE(filter.dim32(1) == C / this->group_, "");
  CAFFE_ENFORCE(filter.dim32(2) == kernel_h(), "");
  CAFFE_ENFORCE(filter.dim32(3) == kernel_w(), "");
  ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, filter.dim32(0));
  const int oH = Y->dim32(2), oW = Y->dim32(3);

  const float* biasData = nullptr;
  if (InputSize() == 3) {
    /* Convolution with bias */
    auto& bias = Input(2);
    CAFFE_ENFORCE(bias.ndim() == 1, "");
    CAFFE_ENFORCE(bias.dim32(0) == M, "");
    biasData = bias.template data<float>();
  } else {
    /* The NNPACK interface requires a bias. Use a dummy zero-filled vector. */
    if (dummyBias_.size() != M) {
      dummyBias_.resize(M);
    }
    biasData = dummyBias_.data();
  }

  const size_t batch_size = X.dim32(0);
  const size_t input_channels = X.dim32(1);
  const size_t output_channels = Y->dim32(1);
  const nnp_size input_size = {.width = static_cast<size_t>(X.dim32(3)),
                               .height = static_cast<size_t>(X.dim32(2))};
  // filter is MCHW
  const nnp_size kernel_size = {.width = static_cast<size_t>(filter.dim32(3)),
                                .height = static_cast<size_t>(filter.dim32(2))};
  // pad is tblr
  const nnp_padding padding = {.top = static_cast<size_t>(pad_t()),
                               .right = static_cast<size_t>(pad_r()),
                               .bottom = static_cast<size_t>(pad_b()),
                               .left = static_cast<size_t>(pad_l())};

  const nnp_size output_subsample = {.width = static_cast<size_t>(stride_w()),
                                     .height = static_cast<size_t>(stride_h())};
  initNNPACK();
  pthreadpool pool(ws_->GetThreadPool());

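  // runWithSharedBuffer() hands the lambda a scratch Tensor shared by all
  // convolutions created with createSharedBuffer() (see conv_op_shared.h),
  // serializing access via a workspace-wide mutex so concurrent runs do not
  // race on the same buffer.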
  runWithSharedBuffer<CPUContext>(ws_, [&](Tensor<CPUContext>* buffer) {
    if (transformStrategy_ == nnp_convolution_transform_strategy_precompute) {
      transformedFilters_.resize(group_);

      // First, query the size of the transformed filters by passing null
      // buffers; NNPACK reports the required size in transformedFilterSize.
      size_t transformedFilterSize = 0;
      nnp_status status = nnp_convolution_inference(
          algorithm_,
          nnp_convolution_transform_strategy_precompute,
          C / group_,
          M / group_,
          input_size,
          padding,
          kernel_size,
          output_subsample,
          nullptr /* input */,
          nullptr /* filters */,
          nullptr /* bias */,
          nullptr /* output */,
          nullptr /* workspace buffer = transformed filter */,
          &transformedFilterSize,
          nnp_activation_identity,
          nullptr /* activation parameter */,
          &pool,
          nullptr /* profile */);
      if (status == nnp_status_success) {
        /* For these convolution parameters, filter transforms can be
         * pre-computed. */

        /* Division with rounding up, in case the size is not a multiple of
         * sizeof(float) */
        const size_t transformedFilterElements =
            (transformedFilterSize + sizeof(float) - 1) / sizeof(float);

        for (auto g = 0; g < group_; g++) {
          transformedFilters_[g] =
              ws_->CreateBlob(
                     debug_def().name() + "_transformed_" + to_string(g))
                  ->GetMutable<TensorCPU>();
          transformedFilters_[g]->Resize(transformedFilterElements);

          status = nnp_convolution_inference(
              algorithm_,
              nnp_convolution_transform_strategy_precompute,
              C / group_,
              M / group_,
              input_size,
              padding,
              kernel_size,
              output_subsample,
              nullptr /* input */,
              filter.template data<float>() + filter.size() / group_ * g,
              nullptr /* bias */,
              nullptr /* output */,
              static_cast<void*>(
                  transformedFilters_[g]->template mutable_data<float>()),
              &transformedFilterSize,
              nnp_activation_identity,
              nullptr /* activation parameter */,
              &pool,
              nullptr /* profile */);
          CAFFE_ENFORCE(
              nnp_status_success == status,
              "NNPACK convolution filter pre-transformation returned an error");
        }

        /*
         * Now we have precomputed all the filter transformations. Switch to
         * the reuse strategy to avoid redoing the transformation on the next
         * iteration.
         */
        CAFFE_ENFORCE_EQ(transformedFilters_.size(), group_);
        transformStrategy_ = nnp_convolution_transform_strategy_reuse;
      } else {
        LOG(WARNING) << "Failed to query workspace size to precompute "
                        "kernels, falling back to the re-compute strategy";
        transformStrategy_ = nnp_convolution_transform_strategy_compute;
      }

      // Enforce that, when we leave this block, we have transitioned out of
      // the precompute state.
      CAFFE_ENFORCE(
          transformStrategy_ != nnp_convolution_transform_strategy_precompute);
    }

    CAFFE_ENFORCE(
        transformStrategy_ == nnp_convolution_transform_strategy_reuse ||
        transformStrategy_ == nnp_convolution_transform_strategy_compute);
    for (auto n = 0; n < N; ++n) {
      for (auto g = 0; g < group_; ++g) {
        nnp_profile profile;
        size_t workspaceSize = buffer->nbytes();
        if (workspaceSize == 0) {
          /* Allocate some memory to ensure the buffer pointer is not NULL.
           * This simplifies further logic. */
          buffer->Resize(1);
          workspaceSize = buffer->nbytes();
        }
        // First attempt: run with whatever scratch space the shared buffer
        // currently holds.
        nnp_status status = nnp_convolution_inference(
            algorithm_,
            transformStrategy_,
            C / group_,
            M / group_,
            input_size,
            padding,
            kernel_size,
            output_subsample,
            X.template data<float>() + n * C * H * W + g * H * W * (C / group_),
            transformStrategy_ == nnp_convolution_transform_strategy_reuse
                ? transformedFilters_[g]->template data<float>()
                : filter.template data<float>() + filter.size() / group_ * g,
            biasData + M / group_ * g,
            Y->template mutable_data<float>() + n * oH * oW * M +
                g * oH * oW * (M / group_),
            static_cast<void*>(buffer->template mutable_data<float>()),
            &workspaceSize,
            nnp_activation_identity,
            nullptr /* activation parameter */,
            &pool,
            FLAGS_caffe2_profile_nnpack ? &profile : nullptr);
        if (status == nnp_status_insufficient_buffer) {
          /* Query the required workspace size, grow the buffer, and retry. */
          status = nnp_convolution_inference(
              algorithm_,
              transformStrategy_,
              C / group_,
              M / group_,
              input_size,
              padding,
              kernel_size,
              output_subsample,
              nullptr /* input */,
              nullptr /* filters */,
              nullptr /* bias */,
              nullptr /* output */,
              nullptr /* workspace buffer */,
              &workspaceSize,
              nnp_activation_identity,
              nullptr /* activation parameter */,
              &pool,
              nullptr /* profile */);
          if (status == nnp_status_success) {
            /* Division with rounding up, in case the size is not a multiple
             * of sizeof(float) */
            const size_t workspace_elements =
                (workspaceSize + sizeof(float) - 1) / sizeof(float);
            buffer->Resize(workspace_elements);

            /* Try convolution_inference again. If it fails this time, it is
             * fatal. */
            status = nnp_convolution_inference(
                algorithm_,
                transformStrategy_,
                C / group_,
                M / group_,
                input_size,
                padding,
                kernel_size,
                output_subsample,
                X.template data<float>() + n * C * H * W +
                    g * H * W * (C / group_),
                transformStrategy_ == nnp_convolution_transform_strategy_reuse
                    ? transformedFilters_[g]->template data<float>()
                    : filter.template data<float>() +
                        filter.size() / group_ * g,
                biasData + M / group_ * g,
                Y->template mutable_data<float>() + n * oH * oW * M +
                    g * oH * oW * (M / group_),
                static_cast<void*>(buffer->template mutable_data<float>()),
                &workspaceSize,
                nnp_activation_identity,
                nullptr /* activation parameter */,
                &pool,
                FLAGS_caffe2_profile_nnpack ? &profile : nullptr);
          }
        }

        VLOG(1) << "NNPACK buffer size: " << buffer->nbytes();
        CAFFE_ENFORCE(
            nnp_status_success == status,
            "NNPACK convolution computation returned an error");
        if (FLAGS_caffe2_profile_nnpack) {
          // Renamed from "buffer" to avoid shadowing the shared workspace
          // buffer captured above.
          char logBuffer[1024];
          const double gmacs =
              double(
                  Y->dim32(2) * Y->dim32(3) * Y->dim32(1) * X.dim32(1) *
                  kernel_size.width * kernel_size.height / group_ / group_) /
              1.0E9;
          const double gflops = 2 * gmacs / profile.total;
          auto ret = snprintf(
              logBuffer,
              sizeof(logBuffer),
              "H: %3zu, W: %3zu, iC: %3zu, oC: %3zu, K: %1zu, S: %1zu, "
              "P: %1zu, GMACs: %4.2f, totalT: %6.3f, inputT: %6.3f, "
              "kernelT: %6.3f, blockT: %6.3f, outputT: %6.3f, GFLOPS: %6.3f",
              size_t(X.dim(2)),
              size_t(X.dim(3)),
              size_t(X.dim(1)),
              size_t(Y->dim(1)),
              size_t(kernel_size.width),
              size_t(output_subsample.width),
              size_t(padding.top),
              gmacs,
              profile.total * 1E3,
              profile.input_transform * 1E3,
              profile.kernel_transform * 1E3,
              profile.block_multiplication * 1E3,
              profile.output_transform * 1E3,
              gflops);
          CAFFE_ENFORCE(ret > 0);
          std::cout << logBuffer << std::endl;
        }
      }
    }
  });
  return true;
}

REGISTER_CPU_OPERATOR_WITH_ENGINE(Conv, NNPACK, NNPACKConvOp);

} // namespace caffe2
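
A minimal sketch of how this engine might be exercised from C++ client code. The blob names (X, W, b, Y), the shapes, and the helper function are illustrative assumptions, not part of the file above; the engine name NNPACK and the "algo" / "convolution_transform_strategy" arguments come from the registration and argument parsing shown in conv_op.cc.

#include <algorithm>

#include "caffe2/core/operator.h"
#include "caffe2/core/workspace.h"

namespace caffe2 {

// Hypothetical helper: build and run one NNPACK-backed Conv operator.
void RunNNPACKConvExample() {
  Workspace ws;

  // Assumed shapes: input NCHW = (1, 8, 32, 32), filter MCHW = (16, 8, 3, 3).
  auto* X = ws.CreateBlob("X")->GetMutable<TensorCPU>();
  X->Resize(1, 8, 32, 32);
  std::fill(X->mutable_data<float>(), X->mutable_data<float>() + X->size(), 1.f);

  auto* W = ws.CreateBlob("W")->GetMutable<TensorCPU>();
  W->Resize(16, 8, 3, 3);
  std::fill(W->mutable_data<float>(), W->mutable_data<float>() + W->size(), 0.1f);

  auto* b = ws.CreateBlob("b")->GetMutable<TensorCPU>();
  b->Resize(16);
  std::fill(b->mutable_data<float>(), b->mutable_data<float>() + b->size(), 0.f);

  OperatorDef def;
  def.set_type("Conv");
  def.set_engine("NNPACK"); // dispatches to NNPACKConvOp via the registration above
  def.add_input("X");
  def.add_input("W");
  def.add_input("b");
  def.add_output("Y");

  // 3x3 kernel, stride 1, no padding; NCHW is the only order NNPACK supports.
  auto* kernel = def.add_arg();
  kernel->set_name("kernel");
  kernel->set_i(3);

  // Optional: force Winograd and cache the filter transforms across runs.
  auto* algo = def.add_arg();
  algo->set_name("algo");
  algo->set_s("WINOGRAD");
  auto* strategy = def.add_arg();
  strategy->set_name("convolution_transform_strategy");
  strategy->set_s("PRECOMPUTE");

  auto op = CreateOperator(def, &ws);
  CAFFE_ENFORCE(op);
  CAFFE_ENFORCE(op->Run());
  // The blob "Y" now holds a (1, 16, 30, 30) output tensor.
}

} // namespace caffe2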