3 #ifndef CAFFE2_OPERATORS_CONV_TRANSPOSE_MOBILE_OP_IMPL_H_ 4 #define CAFFE2_OPERATORS_CONV_TRANSPOSE_MOBILE_OP_IMPL_H_ 6 #include "caffe2/core/common.h" 9 #error "mobile build state not defined" 14 #include "caffe2/core/logging.h" 15 #include "caffe2/operators/conv_op_shared.h" 16 #include "caffe2/operators/conv_transpose_op_mobile.h" 17 #include "caffe2/utils/cpu_neon.h" 18 #include "caffe2/utils/fixed_divisor.h" 19 #include "caffe2/utils/math.h" 21 CAFFE2_DECLARE_bool(caffe2_force_shared_col_buffer);
// runTileContiguous: computes one horizontal tile (one output row band) of the
// transposed convolution. Visible structure: a GEMM producing a column buffer,
// followed by a col2im-style scatter-add of that buffer into a column-blocked
// Y scratch layout, with a NEON-vectorized accumulation inner loop.
// NOTE(review): this chunk is truncated by extraction -- many original lines
// (GEMM arguments, loop braces) are missing; comments describe visible code only.
25 template <
typename T,
typename Context>
26 void runTileContiguous(
// One column-buffer row per (input channel, kernel y, kernel x) triple.
48 auto kernelDataSize = C * kernelH * kernelW;
49 auto currentTileStart = tileSize * tileId;
// GEMM: filter x input tile -> column buffer (arguments elided in this view).
52 math::GemmEx<T, Context>(
61 Xdata + currentTileStart,
// Y scratch is laid out as strideW column blocks per row, each colBlockSize
// wide; reinterleaveRows() later restores the natural pixel order.
78 int colBlockSize = (W + kernelW / strideW);
79 int numColBlocks = strideW;
// Scatter-add each column-buffer row into its (channel, row, block) slot.
81 for (
int c = 0; c < kernelDataSize; ++c) {
82 int w_offset = c % kernelW;
83 int h_offset = (c / kernelW) % kernelH;
84 int c_im = c / kernelH / kernelW;
// Output row this kernel element contributes to; skip if out of bounds.
89 int rowY = tileId * strideH - padT + h_offset;
92 if (!math::is_a_ge_zero_and_a_lt_b(rowY, outputH)) {
// Left padding is fixed at zero in this layout.
97 constexpr
int kPadL = 0;
98 int colOffsetStart = -kPadL + w_offset;
// Decompose the starting column into (block index, offset within block).
99 int colBlockY = colOffsetStart % strideW;
104 int colWithinBlockOffsetY = colOffsetStart / strideW;
107 int colY = colBlockY * colBlockSize + colWithinBlockOffsetY;
111 int offsetY = rowY * colBlockSize * numColBlocks + colY;
113 T* colBufferPointer = colBufferData + c * tileSize;
115 Ydata + c_im * outputH * (colBlockSize * numColBlocks) + offsetY;
// NEON fast path: accumulate 16 floats (4 x float32x4_t) per iteration.
121 constexpr
int kUnroll = (
sizeof(float32x4_t) /
sizeof(
float)) * 4;
122 int limit = (tileSize / kUnroll) * kUnroll;
124 for (; b < limit; b += kUnroll) {
125 float32x4_t cb0 = vld1q_f32(colBufferPointer + 0);
126 float32x4_t cb1 = vld1q_f32(colBufferPointer + 4);
127 float32x4_t cb2 = vld1q_f32(colBufferPointer + 8);
128 float32x4_t cb3 = vld1q_f32(colBufferPointer + 12);
130 float32x4_t y0 = vld1q_f32(yPointer + 0);
131 float32x4_t y1 = vld1q_f32(yPointer + 4);
132 float32x4_t y2 = vld1q_f32(yPointer + 8);
133 float32x4_t y3 = vld1q_f32(yPointer + 12);
135 y0 = vaddq_f32(y0, cb0);
136 y1 = vaddq_f32(y1, cb1);
137 y2 = vaddq_f32(y2, cb2);
138 y3 = vaddq_f32(y3, cb3);
140 vst1q_f32(yPointer + 0, y0);
141 vst1q_f32(yPointer + 4, y1);
142 vst1q_f32(yPointer + 8, y2);
143 vst1q_f32(yPointer + 12, y3);
145 colBufferPointer += kUnroll;
// Medium path: one float32x4_t (4 floats) per iteration.
151 constexpr
int kUnroll = (
sizeof(float32x4_t) /
sizeof(
float));
152 int limit = (tileSize / kUnroll) * kUnroll;
154 for (; b < limit; b += kUnroll) {
155 float32x4_t cb0 = vld1q_f32(colBufferPointer);
156 float32x4_t y0 = vld1q_f32(yPointer);
158 y0 = vaddq_f32(y0, cb0);
160 vst1q_f32(yPointer, y0);
162 colBufferPointer += kUnroll;
// Scalar tail: remaining (tileSize % 4) elements.
169 for (; b < tileSize; ++b) {
170 *yPointer += *colBufferPointer;
// StoreInterleaved<T, N>: stores N vector registers (or N scalars) to memory
// with their lanes interleaved -- the de-column-blocking primitive used by
// reinterleaveRows for stride N. Primary template is intentionally empty;
// only float with N = 1..4 is supported.
// NOTE(review): the specialization store bodies (the vst{2,3,4}q_f32 calls,
// scalar stores, closing braces) and the `template <>` header lines are
// missing from this extraction view.
177 template <
typename T,
int N>
178 struct StoreInterleaved {};
// N = 1: plain contiguous store, no interleaving needed.
181 struct StoreInterleaved<float, 1> {
183 inline static void store(
float* p, float32x4_t v[1]) {
188 inline static void store(
float* p,
float v[1]) {
// N = 2: pack two vectors into a float32x4x2_t for an interleaved store.
194 struct StoreInterleaved<float, 2> {
196 inline static void store(
float* p, float32x4_t v[2]) {
197 float32x4x2_t x = {{v[0], v[1]}};
202 inline static void store(
float* p,
float v[2]) {
// N = 3: pack three vectors into a float32x4x3_t.
209 struct StoreInterleaved<float, 3> {
211 inline static void store(
float* p, float32x4_t v[3]) {
212 float32x4x3_t x = {{v[0], v[1], v[2]}};
217 inline static void store(
float* p,
float v[3]) {
// N = 4: pack four vectors into a float32x4x4_t.
225 struct StoreInterleaved<float, 4> {
227 inline static void store(
float* p, float32x4_t v[4]) {
228 float32x4x4_t x = {{v[0], v[1], v[2], v[3]}};
233 inline static void store(
float* p,
float v[4]) {
// reinterleaveRows<kStrideW>: converts one (channel, row) of the
// column-blocked intermediate layout produced by runTileContiguous back into
// the natural output pixel order, adding the per-channel bias along the way.
// kStrideW is a compile-time constant so StoreInterleaved<float, kStrideW>
// picks the matching NEON interleaved store.
// NOTE(review): extraction dropped several lines (loop prologues, closing
// braces, the tail-loop interiors); comments cover only visible code.
241 template <
int kStr
ideW>
242 void reinterleaveRows(
// Width of one column block, and the output width before right adjustment.
259 int colBlockSize = inputW + kernelW / kStrideW;
260 int noAdjOutputW = (inputW - 1) * kStrideW + kernelW;
// Advance src/dst to the (c, h) row being processed.
262 int point = c * outputH + h;
263 src += point * colBlockSize * kStrideW;
264 dst += point * outputW;
// Per-channel bias (0 when no bias input); broadcast into a vector.
266 float b = bias ? bias[c] : 0;
268 float32x4_t biasV = vdupq_n_f32(b);
// Vector path: 8 input columns (2 x float32x4_t) per iteration.
273 constexpr
int kUnroll = (
sizeof(float32x4_t) /
sizeof(
float)) * 2;
274 int limit = ((inputW - 1) / kUnroll) * kUnroll;
276 for (; w < limit; w += kUnroll) {
// Load the same 8 columns from each of the kStrideW column blocks.
278 float32x4_t v0[kStrideW];
279 float32x4_t v1[kStrideW];
281 for (
int i = 0; i < kStrideW; ++i) {
282 v0[i] = vld1q_f32(src + i * colBlockSize);
283 v1[i] = vld1q_f32(src + i * colBlockSize + 4);
// Add bias to every lane.
287 for (
int i = 0; i < kStrideW; ++i) {
288 v0[i] = vaddq_f32(v0[i], biasV);
289 v1[i] = vaddq_f32(v1[i], biasV);
// Interleaved store emits kStrideW consecutive output pixels per lane.
293 StoreInterleaved<float, kStrideW>::store(dst + 0 * kStrideW, v0);
294 StoreInterleaved<float, kStrideW>::store(dst + 4 * kStrideW, v1);
297 dst += kUnroll * kStrideW;
// Scalar path for remaining full input columns (all but the last).
302 for (; w < inputW - 1; ++w) {
305 for (
int i = 0; i < kStrideW; ++i) {
306 v[i] = src[i * colBlockSize];
310 for (
int i = 0; i < kStrideW; ++i) {
315 StoreInterleaved<float, kStrideW>::store(dst, v);
// Tail: the last input column contributes only up to noAdjOutputW outputs.
323 int outputPoint = (inputW - 1) * kStrideW;
328 while (outputPoint < noAdjOutputW) {
329 float v = src[block * colBlockSize];
// presumably wraps once all kStrideW blocks are consumed -- body elided here.
335 if (block >= kStrideW) {
// Right adjustment: fill remaining output columns (body not visible).
343 for (; outputPoint < outputW; ++outputPoint) {
// reinterleaveMultithreaded<N>: runs reinterleaveRows<N> over every
// (channel, row) tile of the output using the thread pool. Each flat tile id
// is decomposed into (c, h) with a FixedDivisor to avoid per-tile division.
// NOTE(review): the REINTERLEAVE macro expansion and most lambda arguments
// are missing from this extraction view.
349 template <
int N,
typename T,
typename Context>
350 void reinterleaveMultithreaded(
// One tile per output (channel, row) pair.
363 size_t totalTiles = (size_t)outputC * outputH;
364 FixedDivisor<int> divOutputH(outputH);
366 #define REINTERLEAVE(N) \ 368 reinterleaveRows<N>( \ 383 std::function<void(int, size_t)> fnReinterleave = [&](
int threadId,
387 divOutputH.divMod((
int)tileId, c, h);
// Dispatch all tiles onto the pool.
394 pool->run(fnReinterleave, totalTiles);
// SumMultiple<N>: element-wise accumulates N source buffers into `acc`
// (acc[i] += toSum[0][i] + ... + toSum[N-1][i]), NEON-vectorized 4 floats at
// a time with a scalar tail. The aligned load/store intrinsics imply the
// buffers are expected to be 16-byte aligned.
// NOTE(review): the `template <int N> struct SumMultiple` header, the
// scalar-tail store lines, and closing braces are missing from this view.
400 static void sumInto(
float* acc,
float** toSum,
size_t size);
// N = 1: acc += toSum[0].
404 struct SumMultiple<1> {
405 static void sumInto(
float* acc,
float** toSum,
size_t size) {
406 constexpr
int kUnroll = (
sizeof(float32x4_t) /
sizeof(
float));
407 int limit = (size / kUnroll) * kUnroll;
409 auto toSum0 = toSum[0];
// Vectorized body: 4 floats per iteration.
412 for (; i < limit; i += kUnroll) {
413 float32x4_t v0 = vld1q_f32_aligned(acc + i);
414 float32x4_t v1 = vld1q_f32_aligned(toSum0 + i);
416 v0 = vaddq_f32(v0, v1);
418 vst1q_f32_aligned(acc + i, v0);
// Scalar tail for the size % 4 leftovers.
421 for (; i < size; ++i) {
423 float v1 = toSum0[i];
// N = 2: acc += toSum[0] + toSum[1].
433 struct SumMultiple<2> {
434 static void sumInto(
float* acc,
float** toSum,
size_t size) {
435 constexpr
int kUnroll = (
sizeof(float32x4_t) /
sizeof(
float));
436 int limit = (size / kUnroll) * kUnroll;
438 auto toSum0 = toSum[0];
439 auto toSum1 = toSum[1];
442 for (; i < limit; i += kUnroll) {
443 float32x4_t v0 = vld1q_f32_aligned(acc + i);
444 float32x4_t v1 = vld1q_f32_aligned(toSum0 + i);
445 float32x4_t v2 = vld1q_f32_aligned(toSum1 + i);
447 v0 = vaddq_f32(v0, v1);
448 v0 = vaddq_f32(v0, v2);
450 vst1q_f32_aligned(acc + i, v0);
453 for (; i < size; ++i) {
455 float v1 = toSum0[i];
456 float v2 = toSum1[i];
// N = 3: acc += toSum[0] + toSum[1] + toSum[2]. The adds are paired
// (v0+v1, v2+v3) so the two additions can issue independently before the
// final combine.
467 struct SumMultiple<3> {
468 static void sumInto(
float* acc,
float** toSum,
size_t size) {
469 constexpr
int kUnroll = (
sizeof(float32x4_t) /
sizeof(
float));
470 int limit = (size / kUnroll) * kUnroll;
472 auto toSum0 = toSum[0];
473 auto toSum1 = toSum[1];
474 auto toSum2 = toSum[2];
477 for (; i < limit; i += kUnroll) {
478 float32x4_t v0 = vld1q_f32_aligned(acc + i);
479 float32x4_t v1 = vld1q_f32_aligned(toSum0 + i);
480 float32x4_t v2 = vld1q_f32_aligned(toSum1 + i);
481 float32x4_t v3 = vld1q_f32_aligned(toSum2 + i);
483 v0 = vaddq_f32(v0, v1);
484 v2 = vaddq_f32(v2, v3);
485 v0 = vaddq_f32(v0, v2);
487 vst1q_f32_aligned(acc + i, v0);
490 for (; i < size; ++i) {
492 float v1 = toSum0[i];
493 float v2 = toSum1[i];
494 float v3 = toSum2[i];
// sumInto: dispatcher that accumulates every buffer in `toSum` into `acc`.
// Counts 1-3 use the NEON-specialized SumMultiple<N>; larger counts fall
// back to Eigen array addition, one pass per source buffer.
507 void sumInto(
float* acc, std::vector<float*>& toSum,
size_t size) {
509 if (toSum.size() == 1) {
510 SumMultiple<1>::sumInto(acc, toSum.data(), size);
512 }
else if (toSum.size() == 2) {
513 SumMultiple<2>::sumInto(acc, toSum.data(), size);
515 }
else if (toSum.size() == 3) {
516 SumMultiple<3>::sumInto(acc, toSum.data(), size);
// Generic fallback: Eigen vectorized accumulation over all buffers.
522 EigenVectorArrayMap<float> accT(acc, size);
524 for (
auto p : toSum) {
525 accT += ConstEigenVectorArrayMap<float>(p, size);
// RunOnDeviceWithOrderNCHW: NCHW entry point for the mobile transposed
// convolution. Visible flow: validate filter/bias shapes, size the output,
// then per image run tile-wise GEMM + scatter-add on the thread pool into
// per-thread scratch Y buffers, reduce the partial sums with sumInto, and
// finally reinterleave (specialized per stride 1..4) into the real output,
// adding bias.
// NOTE(review): extraction dropped many lines (CAFFE_ENFORCE prefixes, the
// shared-buffer else branch, macro bodies, closing braces); comments
// describe only what is visible.
529 template <
typename T,
class Context>
530 bool ConvTransposeMobileOp<T, Context>::RunOnDeviceWithOrderNCHW() {
532 auto& filter = Input(FILTER);
// Input is NCHW: N images, M input channels, H x W spatial.
534 const int N = X.dim32(0), M = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
535 CAFFE_ENFORCE(filter.ndim() == 4,
"filter must be 4D tensor");
// Transposed-conv filter layout: dim 0 matches input channels M,
// dim 1 is the output channel count C, dims 2/3 are the kernel.
537 filter.dim32(0) == M,
538 "filter number must be equal to input channel number");
539 const int C = filter.dim32(1);
541 filter.dim32(2) == this->kernel_h(),
542 "filter height must be equal to kernel height");
544 filter.dim32(3) == this->kernel_w(),
545 "filter width must be equal to kernel width");
// Optional third input: 1-D bias of length C.
546 if (InputSize() == 3) {
547 auto& bias = Input(BIAS);
548 CAFFE_ENFORCE(bias.ndim() == 1,
"bias must be 1D tensor");
551 "bias dimension must be equal to output channel number");
554 ConvTransposeUnpoolBase<Context>::SetOutputSize(X, Y, C);
556 const int outputH = Y->dim32(2);
557 const int outputW = Y->dim32(3);
558 const int outputPlaneSize = outputH * outputW;
559 const int outputBatchElementSize = Y->dim32(1) * outputPlaneSize;
561 auto Xdata = X.template data<T>();
562 auto Ydata = Y->template mutable_data<T>();
564 auto pool = ws_->GetThreadPool();
565 auto numThreads = pool->getNumThreads();
// Per-thread scratch sizing: each thread owns a column-blocked Y buffer
// (rounded up to a multiple of 4 floats, matching the aligned NEON loads
// in SumMultiple) plus a col buffer for the GEMM output.
570 size_t colBlockSize = W + this->kernel_w() / this->stride_w();
571 size_t threadYBufferSize = C * outputH * colBlockSize * this->stride_w();
573 size_t threadYBufferSizeAligned =
574 ((C * outputH * colBlockSize * this->stride_w() + 3) / 4) * 4;
575 size_t threadColBufferSize = C * this->kernel_h() * this->kernel_w() * W;
// Worker: compute one tile into this thread's private Y/col buffers.
578 auto runLocalTile = [&](TensorCPU* threadBuffer,
581 auto localYData = threadBuffer->template mutable_data<T>() +
582 threadId * threadYBufferSizeAligned;
584 auto localColBufferData = threadBuffer->template mutable_data<T>() +
585 numThreads * threadYBufferSizeAligned + threadId * threadColBufferSize;
587 runTileContiguous<T, Context>(
601 filter.template data<T>(),
// All per-thread scratch lives in one shared tensor: Y buffers first,
// then col buffers.
609 threadBuffer->Resize(
610 numThreads * threadYBufferSizeAligned +
611 numThreads * threadColBufferSize);
// Buffers of threads 1..numThreads-1 will be reduced into thread 0's.
613 std::vector<T*> toSum(numThreads - 1);
614 for (
int i = 1; i < numThreads; ++i) {
615 toSum[i - 1] = threadBuffer->template mutable_data<T>() +
616 i * threadYBufferSizeAligned;
// Process one image at a time.
619 for (
auto image_id = 0; image_id < N; ++image_id) {
// Zero all per-thread Y buffers before accumulation.
624 math::Set<T, Context>(
625 numThreads * threadYBufferSizeAligned,
627 threadBuffer->template mutable_data<T>(),
// Run all tiles of this image on the pool.
634 [&](
int threadId,
int tileId) {
635 runLocalTile(threadBuffer, threadId, tileId);
// Reduce the per-thread partial sums into thread 0's buffer.
643 threadBuffer->template mutable_data<T>(), toSum, threadYBufferSize);
// De-column-block into Ydata with bias; only strides 1..4 are handled here.
648 #define REINTERLEAVE(N) \ 650 reinterleaveMultithreaded<N, T, Context>( \ 651 threadBuffer->template mutable_data<T>(), \ 652 InputSize() == 3 ? Input(BIAS).template data<T>() : nullptr, \ 664 if (this->stride_w() == 1) {
666 }
else if (this->stride_w() == 2) {
668 }
else if (this->stride_w() == 3) {
670 }
else if (this->stride_w() == 4) {
// Advance to the next image's output.
677 Ydata += Y->size() / Y->dim32(0);
// Optionally run with the workspace-shared scratch buffer.
680 if (FLAGS_caffe2_force_shared_col_buffer || shared_buffer_) {
681 runWithSharedBuffer<Context>(ws_, f);
// NHWC layout is not supported by the mobile transposed-conv op; callers
// must use NCHW. Always throws.
689 template <
typename T,
class Context>
690 bool ConvTransposeMobileOp<T, Context>::RunOnDeviceWithOrderNHWC() {
691 CAFFE_THROW(
"Not implemented.");
696 #endif // CAFFE2_MOBILE 698 #endif // CAFFE2_OPERATORS_CONV_TRANSPOSE_MOBILE_OP_IMPL_H_ A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...