4 #include "caffe2/core/common.h" 7 #include "caffe2/core/context.h" 8 #include "caffe2/core/logging.h" 9 #include "caffe2/core/operator.h" 10 #include "caffe2/operators/conv_op_shared.h" 11 #include "caffe2/operators/conv_pool_op_base.h" 13 #include "caffe2/utils/math.h" 14 #include "caffe2/utils/threadpool/pthreadpool_impl.h" 17 CAFFE2_DEFINE_bool(caffe2_profile_nnpack,
// Boolean flag caffe2_profile_nnpack: the default value is false and the help text is empty.
// RunOnDeviceWithOrderNCHW reads it as FLAGS_caffe2_profile_nnpack to decide whether a
// profile struct is handed to NNPACK and whether per-call timing is printed.
false,
"");
// Fragment of the NNPACK initialization routine. The enclosing function header and the
// macro that consumes the status check are on lines missing from this listing.
// The static std::once_flag together with std::call_once makes the lambda body run only once.
21 static std::once_flag once;
22 std::call_once(once, []() {
// nnp_initialize() returns an nnp_status describing whether NNPACK can be used.
23 enum nnp_status nnpack_status = nnp_initialize();
// Condition and message of a check whose macro name is not visible here
// (presumably an enforce-style macro - confirm against the original file).
25 nnpack_status == nnp_status_success,
"NNPack is not supported here!");
// Constructor fragment (the signature and earlier initializers are not visible in this listing).
// The algorithm and the kernel-transform strategy are both resolved once, at construction,
// by the two getter methods defined later in this file.
37 algorithm_(getConvolutionAlgorithm()),
38 transformStrategy_(getConvolutionTransformStrategy()),
// Feature check: only NCHW storage order is accepted.
40 OPERATOR_NEEDS_FEATURE(
41 this->order_ == StorageOrder::NCHW,
42 "NNPack only supports NCHW order. Please consider add \ 43 TransposeOp with axes=[0, 3, 1, 2] before NNPack Conv.");
// Feature checks: each padding amount must be strictly smaller than the kernel extent
// along the same axis (top and bottom against kernel_h, left and right against kernel_w).
44 OPERATOR_NEEDS_FEATURE(
45 pad_t() < kernel_h(),
"NNPACK only supports pad < kernel size");
46 OPERATOR_NEEDS_FEATURE(
47 pad_b() < kernel_h(),
"NNPACK only supports pad < kernel size");
48 OPERATOR_NEEDS_FEATURE(
49 pad_l() < kernel_w(),
"NNPACK only supports pad < kernel size");
50 OPERATOR_NEEDS_FEATURE(
51 pad_r() < kernel_w(),
"NNPACK only supports pad < kernel size");
// NOTE(review): createSharedBuffer is declared outside this listing; presumably it sets up
// the scratch buffer that RunOnDeviceWithOrderNCHW later uses as NNPACK workspace - confirm.
53 createSharedBuffer<CPUContext>(ws);
// Runs the convolution for NCHW input; the definition appears later in this file.
56 bool RunOnDeviceWithOrderNCHW()
override;
// Helpers that translate operator arguments into NNPACK enums; both are called from the
// constructor initializer list.
59 nnp_convolution_algorithm getConvolutionAlgorithm()
const;
60 nnp_convolution_transform_strategy getConvolutionTransformStrategy()
const;
// Algorithm selected at construction; const, so it never changes afterwards.
62 const nnp_convolution_algorithm algorithm_;
// Deliberately not const: RunOnDeviceWithOrderNCHW reassigns it to the reuse strategy or
// to the compute strategy.
67 nnp_convolution_transform_strategy transformStrategy_;
// Per-group tensors holding pre-transformed filters; resized to group_ entries in the
// precompute path of RunOnDeviceWithOrderNCHW.
70 std::vector<TensorCPU*> transformedFilters_;
// Substitute bias storage, resized to the output channel count M in
// RunOnDeviceWithOrderNCHW (presumably used when no bias input is given - confirm).
73 std::vector<float> dummyBias_;
// Selects the NNPACK convolution algorithm from the algo operator argument.
// NOTE(review): this listing omits several original lines (closing braces and at least two
// conditions), so the comments below describe only what is visible.
80 nnp_convolution_algorithm NNPACKConvOp::getConvolutionAlgorithm()
const {
// Branch taken when no string-valued algo argument was supplied.
81 if (!OperatorBase::HasSingleArgumentOfType<std::string>(
"algo")) {
// A 3x3 kernel with unit dilation and unit stride selects wt8x8, the same value the
// WINOGRAD string maps to further down.
86 if (kernel_h() == 3 && kernel_w() == 3 && dilation_h() == 1 &&
87 dilation_w() == 1 && stride_h() == 1 && stride_w() == 1) {
89 return nnp_convolution_algorithm_wt8x8;
// Any other geometry leaves the choice to NNPACK.
92 return nnp_convolution_algorithm_auto;
// An explicit preference was given; AUTO is passed as the default value of the lookup.
96 auto algo = OperatorBase::GetSingleArgument<std::string>(
"algo",
"AUTO");
// NOTE(review): the condition guarding this return (listing line 97) is missing here;
// presumably a comparison of algo against the AUTO string - confirm.
98 return nnp_convolution_algorithm_auto;
100 if (algo ==
"WINOGRAD") {
101 return nnp_convolution_algorithm_wt8x8;
103 if (algo ==
"WINOGRAD_FP16") {
104 return nnp_convolution_algorithm_wt8x8_fp16;
106 if (algo ==
"FT16") {
107 return nnp_convolution_algorithm_ft16x16;
// NOTE(review): the condition guarding this return (listing line 109) is missing here;
// by analogy with FT16 it presumably tests for an FT8 string - confirm.
110 return nnp_convolution_algorithm_ft8x8;
112 if (algo ==
"IMPLICIT_GEMM") {
113 return nnp_convolution_algorithm_implicit_gemm;
115 if (algo ==
"DIRECT") {
116 return nnp_convolution_algorithm_direct;
// Unrecognized strings fall back to automatic selection.
118 return nnp_convolution_algorithm_auto;
121 nnp_convolution_transform_strategy
122 NNPACKConvOp::getConvolutionTransformStrategy()
const {
123 auto kts = OperatorBase::GetSingleArgument<std::string>(
124 "convolution_transform_strategy",
"COMPUTE");
125 if (kts ==
"PRECOMPUTE") {
126 return nnp_convolution_transform_strategy_precompute;
129 return nnp_convolution_transform_strategy_compute;
// Runs the convolution through nnp_convolution_inference, one call per
// (batch item, group) pair, optionally with pre-transformed filters.
// NOTE(review): this listing omits many original lines. The stray leading integers are
// listing line numbers, and gaps between them mark missing code (closing braces, the
// declarations of X, Y, buffer and profile, and most arguments of the NNPACK calls).
// The comments below describe only what is visible.
132 bool NNPACKConvOp::RunOnDeviceWithOrderNCHW() {
// Input 1 is the filter tensor. X and Y used below are declared on lines not shown.
134 auto& filter = Input(1);
// Shape validation: X and filter must both be 4-D.
136 CAFFE_ENFORCE(X.ndim() == 4,
"Input dim should be 4");
// X dimensions 0..3 are read as N, C, H, W.
137 const int N = X.dim32(0), C = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
138 CAFFE_ENFORCE(filter.ndim() == 4,
"");
// M is the first filter dimension; it is used below as the output channel count.
139 const int M = filter.dim32(0);
// Both channel counts must divide evenly into group_ groups, and the filter must have
// C / group_ input channels and the configured kernel height and width.
140 CAFFE_ENFORCE(C % this->group_ == 0,
"");
141 CAFFE_ENFORCE(M % this->group_ == 0,
"");
142 CAFFE_ENFORCE(filter.dim32(1) == C / this->group_,
"");
143 CAFFE_ENFORCE(filter.dim32(2) == kernel_h(),
"");
144 CAFFE_ENFORCE(filter.dim32(3) == kernel_w(),
"");
// Output spatial size, read from Y dimensions 2 and 3.
146 const int oH = Y->dim32(2), oW = Y->dim32(3);
148 const float* biasData = NULL;
// With three inputs, input 2 is the bias: it must be 1-D with M elements.
149 if (InputSize() == 3) {
151 auto& bias = Input(2);
152 CAFFE_ENFORCE(bias.ndim() == 1,
"");
153 CAFFE_ENFORCE(bias.dim32(0) == M,
"");
154 biasData = bias.template data<float>();
// Presumably the no-bias branch (the else line is not visible): dummyBias_ is resized to
// M floats and used as the bias pointer. Elements added by resize are value-initialized
// to 0.0f.
// NOTE(review): size() is unsigned while M is int, so this comparison mixes signedness.
157 if (dummyBias_.size() != M) {
158 dummyBias_.resize(M);
160 biasData = dummyBias_.data();
// Sizes and NNPACK descriptor structs built from the tensor shapes and operator settings.
163 const size_t batch_size = X.dim32(0);
164 const size_t input_channels = X.dim32(1);
165 const size_t output_channels = Y->dim32(1);
166 const nnp_size input_size = {.width =
static_cast<size_t>(X.dim32(3)),
167 .height = static_cast<size_t>(X.dim32(2))};
169 const nnp_size kernel_size = {.width =
static_cast<size_t>(filter.dim32(3)),
170 .height = static_cast<size_t>(filter.dim32(2))};
172 const nnp_padding padding = {.top =
static_cast<size_t>(pad_t()),
173 .right = static_cast<size_t>(pad_r()),
174 .bottom = static_cast<size_t>(pad_b()),
175 .left = static_cast<size_t>(pad_l())};
// The stride is passed to NNPACK as the output subsampling factor.
177 const nnp_size output_subsample = {.width =
static_cast<size_t>(stride_w()),
178 .height = static_cast<size_t>(stride_h())};
// Precompute path: transform the filters once and keep one transformed tensor per group.
183 if (transformStrategy_ == nnp_convolution_transform_strategy_precompute) {
184 transformedFilters_.resize(group_);
// First call with the precompute strategy; transformedFilterSize is passed by address and
// is used afterwards as a byte count (presumably a size query - most arguments are not
// visible here).
186 size_t transformedFilterSize = 0;
187 nnp_status status = nnp_convolution_inference(
189 nnp_convolution_transform_strategy_precompute,
201 &transformedFilterSize,
202 nnp_activation_identity,
206 if (status == nnp_status_success) {
// Round the byte count up to a whole number of floats.
212 const size_t transformedFilterElements =
213 (transformedFilterSize +
sizeof(float) - 1) /
sizeof(float);
// Per group: obtain a tensor whose name is built from the operator name plus
// _transformed_ and the group index (the call producing it is on missing lines),
// size it, then run the precompute transform on that group's slice of the filter.
215 for (
auto g = 0; g < group_; g++) {
216 transformedFilters_[g] =
218 debug_def().name() +
"_transformed_" + to_string(g))
220 transformedFilters_[g]->Resize(transformedFilterElements);
222 status = nnp_convolution_inference(
224 nnp_convolution_transform_strategy_precompute,
// Filter slice for group g: filter.size() / group_ elements per group.
232 filter.template data<float>() + filter.size() / group_ * g,
236 transformedFilters_[g]->template mutable_data<float>()),
237 &transformedFilterSize,
238 nnp_activation_identity,
// Condition and message of a check whose macro line is not visible.
243 nnp_status_success == status,
244 "NNPACK convolution filter pre-transformation return error");
// After a successful precompute, switch to the reuse strategy so later calls take the
// stored transformed filters.
252 if (transformStrategy_ ==
253 nnp_convolution_transform_strategy_precompute) {
254 CAFFE_ENFORCE_EQ(transformedFilters_.size(), group_);
255 transformStrategy_ = nnp_convolution_transform_strategy_reuse;
// Failure path (the branch structure is not visible): log the message and fall back to
// the compute strategy.
259 <<
"Failed to query workspace size to precompute kernels, falling back to re-compute strategy";
260 transformStrategy_ = nnp_convolution_transform_strategy_compute;
// Tails of two checks (macro names not visible): the strategy is no longer precompute,
// and it is either reuse or compute.
266 transformStrategy_ != nnp_convolution_transform_strategy_precompute);
270 transformStrategy_ == nnp_convolution_transform_strategy_reuse ||
271 transformStrategy_ == nnp_convolution_transform_strategy_compute);
// NOTE(review): N is declared a second time here; the nested scope that makes this legal
// is not visible in this listing.
272 const auto N = X.dim32(0);
// One NNPACK call per batch item n and group g.
273 for (
auto n = 0; n < N; ++n) {
274 for (
auto g = 0; g < group_; ++g) {
// Current workspace size in bytes. The statements executed when it is zero are on
// missing lines; afterwards the size is read again.
276 size_t workspaceSize = buffer->nbytes();
277 if (workspaceSize == 0) {
281 workspaceSize = buffer->nbytes();
283 nnp_status status = nnp_convolution_inference(
// Input pointer: skip n whole images (C * H * W floats each), then g groups of
// C / group_ channels.
292 X.template data<float>() + n * C * H * W + g * H * W * (C / group_),
// Kernel pointer: the stored transformed filter under the reuse strategy, otherwise the
// raw filter slice for group g.
293 transformStrategy_ == nnp_convolution_transform_strategy_reuse
294 ? transformedFilters_[g]->template data<float>()
295 : filter.template data<float>() + filter.size() / group_ * g,
// Bias and output pointers are offset by M / group_ channels per group.
296 biasData + M / group_ * g,
297 Y->template mutable_data<float>() + n * oH * oW * M +
298 g * oH * oW * (M / group_),
// The buffer storage is handed to NNPACK as an untyped workspace pointer.
299 static_cast<void*
>(buffer->template mutable_data<float>()),
301 nnp_activation_identity,
// A profile struct is passed only when the profiling flag is set.
304 FLAGS_caffe2_profile_nnpack ? &profile :
nullptr);
// The workspace was too small: make another call (arguments mostly not visible,
// presumably a size query), and on success grow the buffer to workspaceSize bytes
// rounded up to whole floats.
305 if (status == nnp_status_insufficient_buffer) {
307 status = nnp_convolution_inference(
322 nnp_activation_identity,
326 if (status == nnp_status_success) {
329 const size_t workspace_elements =
330 (workspaceSize +
sizeof(float) - 1) /
sizeof(float);
331 buffer->Resize(workspace_elements);
// Repeat the computation with the same input, kernel, bias and output pointers as the
// first attempt.
335 status = nnp_convolution_inference(
344 X.template data<float>() + n * C * H * W +
345 g * H * W * (C / group_),
346 transformStrategy_ == nnp_convolution_transform_strategy_reuse
347 ? transformedFilters_[g]->template data<float>()
348 : filter.template data<float>() +
349 filter.size() / group_ * g,
350 biasData + M / group_ * g,
351 Y->template mutable_data<float>() + n * oH * oW * M +
352 g * oH * oW * (M / group_),
353 static_cast<void*
>(buffer->template mutable_data<float>()),
355 nnp_activation_identity,
358 FLAGS_caffe2_profile_nnpack ? &profile :
nullptr);
362 VLOG(1) <<
"NNPACK buffer size: " << buffer->nbytes();
// Condition and message of the final status check (macro line not visible).
364 nnp_status_success == status,
365 "NNPACK convolution computation returned error");
// Optional profiling report, enabled by the caffe2_profile_nnpack flag.
366 if (FLAGS_caffe2_profile_nnpack) {
// Visible part of the gmacs expression: output H * W * channels times input channels
// times kernel area, divided by group_ twice; the final divisor is on a missing line.
370 Y->dim32(2) * Y->dim32(3) * Y->dim32(1) * X.dim32(1) *
371 kernel_size.width * kernel_size.height / group_ / group_) /
// Two floating point operations per multiply-accumulate, divided by profile.total.
373 const double gflops = 2 * gmacs / profile.total;
// Format string and some of the arguments of a formatting call whose name and destination
// are on missing lines; the profile timings are scaled by 1E3.
377 "H: %3zu, W: %3zu, iC: %3zu, oC: %3zu, K: %1zu, S: %1zu, P: %1zu, GMACs: " 378 "%4.2f, totalT: %6.3f, inputT: %6.3f, " 379 "kernelT: %6.3f, blockT: %6.3f, outputT: %6.3f, GFLOPS: %6.3f",
384 size_t(kernel_size.width),
385 size_t(output_subsample.width),
389 profile.input_transform * 1E3,
390 profile.kernel_transform * 1E3,
391 profile.block_multiplication * 1E3,
392 profile.output_transform * 1E3,
// ret is declared on a missing line (presumably the return value of the formatting call).
394 CAFFE_ENFORCE(ret > 0);
// NOTE(review): buffer is dereferenced with -> earlier in this function but streamed
// directly here; presumably a different, locally declared text buffer shadows it on a
// missing line - confirm. std::endl also flushes stdout on every profiled call.
395 std::cout << buffer << std::endl;
// Registers NNPACKConvOp as a CPU implementation of the Conv operator under the NNPACK
// engine name.
403 REGISTER_CPU_OPERATOR_WITH_ENGINE(Conv, NNPACK,
NNPACKConvOp);
Workspace is a class that holds all the related objects created during runtime: (1) all blobs...
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...