Caffe2 - C++ API
A deep learning, cross-platform ML framework
conv_transpose_op_mobile_impl.h
1 // conv_transpose_op_mobile_impl.h is the templated implementation of the
2 // conv_transpose_op_mobile.h file.
3 #ifndef CAFFE2_OPERATORS_CONV_TRANSPOSE_MOBILE_OP_IMPL_H_
4 #define CAFFE2_OPERATORS_CONV_TRANSPOSE_MOBILE_OP_IMPL_H_
5 
6 #include "caffe2/core/common.h"
7 
8 #ifndef CAFFE2_MOBILE
9 #error "mobile build state not defined"
10 #endif
11 
12 #if CAFFE2_MOBILE
13 
14 #include "caffe2/core/logging.h"
15 #include "caffe2/operators/conv_op_shared.h"
16 #include "caffe2/operators/conv_transpose_op_mobile.h"
17 #include "caffe2/utils/cpu_neon.h"
18 #include "caffe2/utils/fixed_divisor.h"
19 #include "caffe2/utils/math.h"
20 
21 CAFFE2_DECLARE_bool(caffe2_force_shared_col_buffer);
22 
23 namespace caffe2 {
24 
25 template <typename T, typename Context>
26 void runTileContiguous(
27  int tileId,
28  int N,
29  int M,
30  int H,
31  int W,
32  int outputH,
33  int outputW,
34  int C,
35  int kernelH,
36  int kernelW,
37  int strideH,
38  int strideW,
39  int padT,
40  const T* filterData,
41  const T* Xdata,
42  T* colBufferData,
43  T* Ydata,
44  Context* context) {
45  // The tile size is exactly the length of a single row
46  int tileSize = W;
47 
48  auto kernelDataSize = C * kernelH * kernelW;
49  auto currentTileStart = tileSize * tileId;
50 
51  // gemm tile
52  math::GemmEx<T, Context>(
53  CblasTrans,
54  CblasNoTrans,
55  kernelDataSize,
56  tileSize,
57  M,
58  1,
59  filterData,
60  kernelDataSize,
61  Xdata + currentTileStart,
62  H * W,
63  0,
64  colBufferData,
65  tileSize,
66  context);
67 
68  // col2im tile
69  // We assume that there is no padding in the columns (padL and padR
70  // == 0).
71  // FIXME: it is actually possible for us to handle padding, figure
72  // out how to adjust the bounds
73 
74  // We write into Y in a de-interleaved fashion; in other words,
75  // every column (mod strideW) == 0 together in one block,
76  // every column (mod strideW) == 1 in another,
77  // ... and so on.
78  int colBlockSize = (W + kernelW / strideW);
79  int numColBlocks = strideW;
80 
81  for (int c = 0; c < kernelDataSize; ++c) {
82  int w_offset = c % kernelW;
83  int h_offset = (c / kernelW) % kernelH;
84  int c_im = c / kernelH / kernelW;
85 
86  // Each row is a separate tile that we handle. First determine the
87  // row into which we are writing the output.
88  // We can properly handle padding for the rows.
89  int rowY = tileId * strideH - padT + h_offset;
90 
91  // If this row is out of bounds, then skip it
92  if (!math::is_a_ge_zero_and_a_lt_b(rowY, outputH)) {
93  continue;
94  }
95 
96  // FIXME: we don't actually handle a dynamic padL > 0
97  constexpr int kPadL = 0;
98  int colOffsetStart = -kPadL + w_offset;
99  int colBlockY = colOffsetStart % strideW;
100 
101  // However, within a block we may not start writing at offset
102  // 0. The offset at which we begin writing is determined by
103  // colOffsetStart
104  int colWithinBlockOffsetY = colOffsetStart / strideW;
105 
106  // So, this is where we begin reading/writing in Y
107  int colY = colBlockY * colBlockSize + colWithinBlockOffsetY;
108 
109  // This is the complete offset into Y from the start
110  // Each row has strideW blocks of size colBlockSize
111  int offsetY = rowY * colBlockSize * numColBlocks + colY;
112 
113  T* colBufferPointer = colBufferData + c * tileSize;
114  T* yPointer =
115  Ydata + c_im * outputH * (colBlockSize * numColBlocks) + offsetY;
116 
117  int b = 0;
118 #ifdef __ARM_NEON__
119  // We vectorize the loop within the row
120  {
121  constexpr int kUnroll = (sizeof(float32x4_t) / sizeof(float)) * 4;
122  int limit = (tileSize / kUnroll) * kUnroll;
123 
124  for (; b < limit; b += kUnroll) {
125  float32x4_t cb0 = vld1q_f32(colBufferPointer + 0);
126  float32x4_t cb1 = vld1q_f32(colBufferPointer + 4);
127  float32x4_t cb2 = vld1q_f32(colBufferPointer + 8);
128  float32x4_t cb3 = vld1q_f32(colBufferPointer + 12);
129 
130  float32x4_t y0 = vld1q_f32(yPointer + 0);
131  float32x4_t y1 = vld1q_f32(yPointer + 4);
132  float32x4_t y2 = vld1q_f32(yPointer + 8);
133  float32x4_t y3 = vld1q_f32(yPointer + 12);
134 
135  y0 = vaddq_f32(y0, cb0);
136  y1 = vaddq_f32(y1, cb1);
137  y2 = vaddq_f32(y2, cb2);
138  y3 = vaddq_f32(y3, cb3);
139 
140  vst1q_f32(yPointer + 0, y0);
141  vst1q_f32(yPointer + 4, y1);
142  vst1q_f32(yPointer + 8, y2);
143  vst1q_f32(yPointer + 12, y3);
144 
145  colBufferPointer += kUnroll;
146  yPointer += kUnroll;
147  }
148  }
149 
150  {
151  constexpr int kUnroll = (sizeof(float32x4_t) / sizeof(float));
152  int limit = (tileSize / kUnroll) * kUnroll;
153 
154  for (; b < limit; b += kUnroll) {
155  float32x4_t cb0 = vld1q_f32(colBufferPointer);
156  float32x4_t y0 = vld1q_f32(yPointer);
157 
158  y0 = vaddq_f32(y0, cb0);
159 
160  vst1q_f32(yPointer, y0);
161 
162  colBufferPointer += kUnroll;
163  yPointer += kUnroll;
164  }
165  }
166 #endif
167 
168  // Handle un-vectorizable epilogue
169  for (; b < tileSize; ++b) {
170  *yPointer += *colBufferPointer;
171  ++yPointer;
172  ++colBufferPointer;
173  }
174  }
175 }
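The col2im step above writes each output row in a de-interleaved layout: all columns with the same (w mod strideW) are grouped into one contiguous block of colBlockSize entries, giving numColBlocks == strideW blocks per row. A minimal standalone sketch (our own helper and sizes, not part of this file) that mirrors the index arithmetic used above:

// Sketch only: maps a final output column to its position in the
// de-interleaved row buffer produced by runTileContiguous.
#include <cstdio>

int deinterleavedIndex(int col, int strideW, int colBlockSize) {
  int block = col % strideW;   // which (w mod strideW) block the column lives in
  int offset = col / strideW;  // position inside that block
  return block * colBlockSize + offset;
}

int main() {
  const int strideW = 2, kernelW = 3, W = 4;       // hypothetical sizes
  const int colBlockSize = W + kernelW / strideW;  // 5, as computed above
  for (int col = 0; col < strideW * colBlockSize; ++col) {
    std::printf("output col %d -> row-buffer index %d\n",
                col, deinterleavedIndex(col, strideW, colBlockSize));
  }
  return 0;
}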
176 
177 template <typename T, int N>
178 struct StoreInterleaved {};
179 
180 template <>
181 struct StoreInterleaved<float, 1> {
182 #ifdef __ARM_NEON__
183  inline static void store(float* p, float32x4_t v[1]) {
184  vst1q_f32(p, v[0]);
185  }
186 #endif
187 
188  inline static void store(float* p, float v[1]) {
189  p[0] = v[0];
190  }
191 };
192 
193 template <>
194 struct StoreInterleaved<float, 2> {
195 #ifdef __ARM_NEON__
196  inline static void store(float* p, float32x4_t v[2]) {
197  float32x4x2_t x = {{v[0], v[1]}};
198  vst2q_f32(p, x);
199  }
200 #endif
201 
202  inline static void store(float* p, float v[2]) {
203  p[0] = v[0];
204  p[1] = v[1];
205  }
206 };
207 
208 template <>
209 struct StoreInterleaved<float, 3> {
210 #ifdef __ARM_NEON__
211  inline static void store(float* p, float32x4_t v[3]) {
212  float32x4x3_t x = {{v[0], v[1], v[2]}};
213  vst3q_f32(p, x);
214  }
215 #endif
216 
217  inline static void store(float* p, float v[3]) {
218  p[0] = v[0];
219  p[1] = v[1];
220  p[2] = v[2];
221  }
222 };
223 
224 template <>
225 struct StoreInterleaved<float, 4> {
226 #ifdef __ARM_NEON__
227  inline static void store(float* p, float32x4_t v[4]) {
228  float32x4x4_t x = {{v[0], v[1], v[2], v[3]}};
229  vst4q_f32(p, x);
230  }
231 #endif
232 
233  inline static void store(float* p, float v[4]) {
234  p[0] = v[0];
235  p[1] = v[1];
236  p[2] = v[2];
237  p[3] = v[3];
238  }
239 };
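StoreInterleaved<float, N> writes N values per output pixel in interleaved order; on the NEON path, vst2q_f32/vst3q_f32/vst4q_f32 take N four-lane vectors and interleave their lanes in memory. A scalar sketch of what the N == 2 NEON store produces (our own example, not part of this file):

#include <cstdio>

// Scalar equivalent of the NEON StoreInterleaved<float, 2>::store:
// lanes of a and b are written alternately, as a0 b0 a1 b1 a2 b2 a3 b3.
void storeInterleaved2Scalar(float* p, const float a[4], const float b[4]) {
  for (int i = 0; i < 4; ++i) {
    p[2 * i + 0] = a[i];
    p[2 * i + 1] = b[i];
  }
}

int main() {
  float a[4] = {0, 2, 4, 6};  // block of columns with w % 2 == 0
  float b[4] = {1, 3, 5, 7};  // block of columns with w % 2 == 1
  float out[8];
  storeInterleaved2Scalar(out, a, b);
  for (float v : out) std::printf("%g ", v);  // prints 0 1 2 3 4 5 6 7
  std::printf("\n");
  return 0;
}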
240 
241 template <int kStrideW>
242 void reinterleaveRows(
243  const float* src,
244  const float* bias,
245  int c,
246  int h,
247  float* dst,
248  int outputC,
249  int outputH,
250  int outputW,
251  int inputW,
252  int kernelW,
253  int strideW,
254  int adjH) {
255  // Each row in src is of the form:
256  // [w mod strideW == 0 elements]...[w mod strideW == strideW - 1
257  // elements]
258  // We need to re-interleave the values and write them in the output
259  int colBlockSize = inputW + kernelW / kStrideW;
260  int noAdjOutputW = (inputW - 1) * kStrideW + kernelW;
261 
262  int point = c * outputH + h;
263  src += point * colBlockSize * kStrideW;
264  dst += point * outputW;
265 
266  float b = bias ? bias[c] : 0;
267 #ifdef __ARM_NEON__
268  float32x4_t biasV = vdupq_n_f32(b);
269 #endif
270 
271  int w = 0;
272 #ifdef __ARM_NEON__
273  constexpr int kUnroll = (sizeof(float32x4_t) / sizeof(float)) * 2;
274  int limit = ((inputW - 1) / kUnroll) * kUnroll;
275 
276  for (; w < limit; w += kUnroll) {
277  // We need to interleave in terms of kStrideW units
278  float32x4_t v0[kStrideW];
279  float32x4_t v1[kStrideW];
280 
281  for (int i = 0; i < kStrideW; ++i) {
282  v0[i] = vld1q_f32(src + i * colBlockSize);
283  v1[i] = vld1q_f32(src + i * colBlockSize + 4);
284  }
285 
286  // add per-channel bias
287  for (int i = 0; i < kStrideW; ++i) {
288  v0[i] = vaddq_f32(v0[i], biasV);
289  v1[i] = vaddq_f32(v1[i], biasV);
290  }
291 
292  // Write interleaved into the output
293  StoreInterleaved<float, kStrideW>::store(dst + 0 * kStrideW, v0);
294  StoreInterleaved<float, kStrideW>::store(dst + 4 * kStrideW, v1);
295 
296  src += kUnroll;
297  dst += kUnroll * kStrideW;
298  }
299 #endif
300 
301  // Handle non-vectorizable remainder
302  for (; w < inputW - 1; ++w) {
303  float v[kStrideW];
304 
305  for (int i = 0; i < kStrideW; ++i) {
306  v[i] = src[i * colBlockSize];
307  }
308 
309  // add per-channel bias
310  for (int i = 0; i < kStrideW; ++i) {
311  v[i] += b;
312  }
313 
314  // Write interleaved into the output
315  StoreInterleaved<float, kStrideW>::store(dst, v);
316 
317  src += 1;
318  dst += kStrideW;
319  }
320 
321  // We have handled output columns 0 .. (inputW - 1) * strideW - 1 so far.
322  // Handle the remainder
323  int outputPoint = (inputW - 1) * kStrideW;
324  int block = 0;
325 
326  // Output width may include adjustment into which we don't
327  // write; ignore it
328  while (outputPoint < noAdjOutputW) {
329  float v = src[block * colBlockSize];
330  dst[0] = v + b;
331  ++outputPoint;
332  dst += 1;
333 
334  ++block;
335  if (block >= kStrideW) {
336  block = 0;
337  src += 1;
338  }
339  }
340 
341  // The remainder of the buffer, consisting of just the `adj` region, must
342  // still have the bias added
343  for (; outputPoint < outputW; ++outputPoint) {
344  dst[0] = b;
345  dst += 1;
346  }
347 }
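As a worked example of the bounds used in reinterleaveRows (our numbers; adjW here stands for the width adjustment that SetOutputSize folds into outputW, it is not a parameter of this function): with inputW == 4, kernelW == 3, strideW == 2 we get noAdjOutputW == (4 - 1) * 2 + 3 == 9, so the interleaving loops cover columns 0..5, the trailing while-loop covers columns 6..8, and any columns from 9 up to outputW receive only the bias. A small sketch:

#include <cstdio>

int main() {
  // Hypothetical sizes; adjW is assumed to be the extra width added by the
  // output-size adjustment.
  const int inputW = 4, kernelW = 3, strideW = 2, adjW = 1;
  const int noAdjOutputW = (inputW - 1) * strideW + kernelW;  // 9
  const int outputW = noAdjOutputW + adjW;                    // 10
  for (int col = 0; col < outputW; ++col) {
    const char* handledBy = col < (inputW - 1) * strideW
        ? "main interleaving loops"
        : col < noAdjOutputW ? "trailing while-loop" : "bias-only adj region";
    std::printf("output col %d: %s\n", col, handledBy);
  }
  return 0;
}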
348 
349 template <int N, typename T, typename Context>
350 void reinterleaveMultithreaded(
351  const T* y0,
352  const T* bias_data,
353  T* y,
354  int outputC,
355  int outputH,
356  int outputW,
357  int inputW,
358  int kernelW,
359  int strideW,
360  int adjH,
361  ThreadPool* pool) {
362  // # channels times height
363  size_t totalTiles = (size_t)outputC * outputH;
364  FixedDivisor<int> divOutputH(outputH);
365 
366 #define REINTERLEAVE(N) \
367  do { \
368  reinterleaveRows<N>( \
369  y0, \
370  bias_data, \
371  c, \
372  h, \
373  y, \
374  outputC, \
375  outputH, \
376  outputW, \
377  inputW, \
378  kernelW, \
379  strideW, \
380  adjH); \
381  } while (false)
382 
383  std::function<void(int, size_t)> fnReinterleave = [&](int threadId,
384  size_t tileId) {
385  int h;
386  int c;
387  divOutputH.divMod((int)tileId, c, h);
388 
389  REINTERLEAVE(N);
390  };
391 
392 #undef REINTERLEAVE
393 
394  pool->run(fnReinterleave, totalTiles);
395 }
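reinterleaveMultithreaded flattens the (channel, output row) pairs into a single tile index and lets FixedDivisor recover the pair inside each task. A sketch of that mapping (ours, using plain division instead of FixedDivisor):

#include <cstdio>

int main() {
  const int outputC = 2, outputH = 3;  // hypothetical sizes
  for (int tileId = 0; tileId < outputC * outputH; ++tileId) {
    int c = tileId / outputH;  // what divOutputH.divMod returns as the quotient
    int h = tileId % outputH;  // ... and as the remainder
    std::printf("tile %d -> channel %d, row %d\n", tileId, c, h);
  }
  return 0;
}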
396 
397 #ifdef __ARM_NEON__
398 template <int N>
399 struct SumMultiple {
400  static void sumInto(float* acc, float** toSum, size_t size);
401 };
402 
403 template <>
404 struct SumMultiple<1> {
405  static void sumInto(float* acc, float** toSum, size_t size) {
406  constexpr int kUnroll = (sizeof(float32x4_t) / sizeof(float));
407  int limit = (size / kUnroll) * kUnroll;
408 
409  auto toSum0 = toSum[0];
410 
411  size_t i = 0;
412  for (; i < limit; i += kUnroll) {
413  float32x4_t v0 = vld1q_f32_aligned(acc + i);
414  float32x4_t v1 = vld1q_f32_aligned(toSum0 + i);
415 
416  v0 = vaddq_f32(v0, v1);
417 
418  vst1q_f32_aligned(acc + i, v0);
419  }
420 
421  for (; i < size; ++i) {
422  float v0 = acc[i];
423  float v1 = toSum0[i];
424 
425  v0 += v1;
426 
427  acc[i] = v0;
428  }
429  }
430 };
431 
432 template <>
433 struct SumMultiple<2> {
434  static void sumInto(float* acc, float** toSum, size_t size) {
435  constexpr int kUnroll = (sizeof(float32x4_t) / sizeof(float));
436  int limit = (size / kUnroll) * kUnroll;
437 
438  auto toSum0 = toSum[0];
439  auto toSum1 = toSum[1];
440 
441  size_t i = 0;
442  for (; i < limit; i += kUnroll) {
443  float32x4_t v0 = vld1q_f32_aligned(acc + i);
444  float32x4_t v1 = vld1q_f32_aligned(toSum0 + i);
445  float32x4_t v2 = vld1q_f32_aligned(toSum1 + i);
446 
447  v0 = vaddq_f32(v0, v1);
448  v0 = vaddq_f32(v0, v2);
449 
450  vst1q_f32_aligned(acc + i, v0);
451  }
452 
453  for (; i < size; ++i) {
454  float v0 = acc[i];
455  float v1 = toSum0[i];
456  float v2 = toSum1[i];
457 
458  v0 += v1;
459  v0 += v2;
460 
461  acc[i] = v0;
462  }
463  }
464 };
465 
466 template <>
467 struct SumMultiple<3> {
468  static void sumInto(float* acc, float** toSum, size_t size) {
469  constexpr int kUnroll = (sizeof(float32x4_t) / sizeof(float));
470  int limit = (size / kUnroll) * kUnroll;
471 
472  auto toSum0 = toSum[0];
473  auto toSum1 = toSum[1];
474  auto toSum2 = toSum[2];
475 
476  size_t i = 0;
477  for (; i < limit; i += kUnroll) {
478  float32x4_t v0 = vld1q_f32_aligned(acc + i);
479  float32x4_t v1 = vld1q_f32_aligned(toSum0 + i);
480  float32x4_t v2 = vld1q_f32_aligned(toSum1 + i);
481  float32x4_t v3 = vld1q_f32_aligned(toSum2 + i);
482 
483  v0 = vaddq_f32(v0, v1);
484  v2 = vaddq_f32(v2, v3);
485  v0 = vaddq_f32(v0, v2);
486 
487  vst1q_f32_aligned(acc + i, v0);
488  }
489 
490  for (; i < size; ++i) {
491  float v0 = acc[i];
492  float v1 = toSum0[i];
493  float v2 = toSum1[i];
494  float v3 = toSum2[i];
495 
496  v0 += v1;
497  v2 += v3;
498  v0 += v2;
499 
500  acc[i] = v0;
501  }
502  }
503 };
504 #endif
505 
506 // Performs acc[i] += sum_j toSum_j[i] pointwise
507 void sumInto(float* acc, std::vector<float*>& toSum, size_t size) {
508 #ifdef __ARM_NEON__
509  if (toSum.size() == 1) {
510  SumMultiple<1>::sumInto(acc, toSum.data(), size);
511  return;
512  } else if (toSum.size() == 2) {
513  SumMultiple<2>::sumInto(acc, toSum.data(), size);
514  return;
515  } else if (toSum.size() == 3) {
516  SumMultiple<3>::sumInto(acc, toSum.data(), size);
517  return;
518  }
519 #endif
520 
521  // Otherwise, use fallback implementation
522  EigenVectorArrayMap<float> accT(acc, size);
523 
524  for (auto p : toSum) {
525  accT += ConstEigenVectorArrayMap<float>(p, size);
526  }
527 }
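Independent of the NEON specializations and the Eigen fallback, sumInto computes a plain pointwise accumulation. A reference sketch of the same behaviour (ours, for illustration only):

#include <cstddef>
#include <cstdio>
#include <vector>

// acc[i] += sum over j of toSum[j][i], for i in [0, size).
void sumIntoReference(float* acc, const std::vector<const float*>& toSum,
                      size_t size) {
  for (size_t i = 0; i < size; ++i) {
    for (const float* p : toSum) {
      acc[i] += p[i];
    }
  }
}

int main() {
  float acc[4] = {1, 1, 1, 1};
  float b0[4] = {1, 2, 3, 4};
  float b1[4] = {10, 20, 30, 40};
  std::vector<const float*> toSum = {b0, b1};
  sumIntoReference(acc, toSum, 4);
  for (float v : acc) std::printf("%g ", v);  // 12 23 34 45
  std::printf("\n");
  return 0;
}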
528 
529 template <typename T, class Context>
530 bool ConvTransposeMobileOp<T, Context>::RunOnDeviceWithOrderNCHW() {
531  const Tensor<Context>& X = Input(INPUT);
532  auto& filter = Input(FILTER);
533  Tensor<Context>* Y = Output(0);
534  const int N = X.dim32(0), M = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
535  CAFFE_ENFORCE(filter.ndim() == 4, "filter must be 4D tensor");
536  CAFFE_ENFORCE(
537  filter.dim32(0) == M,
538  "filter number must be equal to input channel number");
539  const int C = filter.dim32(1);
540  CAFFE_ENFORCE(
541  filter.dim32(2) == this->kernel_h(),
542  "filter height must be equal to kernel height");
543  CAFFE_ENFORCE(
544  filter.dim32(3) == this->kernel_w(),
545  "filter width must be equal to kernel width");
546  if (InputSize() == 3) {
547  auto& bias = Input(BIAS);
548  CAFFE_ENFORCE(bias.ndim() == 1, "bias must be 1D tensor");
549  CAFFE_ENFORCE(
550  bias.dim32(0) == C,
551  "bias dimension must be equal to output channel number");
552  }
553 
554  ConvTransposeUnpoolBase<Context>::SetOutputSize(X, Y, C);
555 
556  const int outputH = Y->dim32(2);
557  const int outputW = Y->dim32(3);
558  const int outputPlaneSize = outputH * outputW;
559  const int outputBatchElementSize = Y->dim32(1) * outputPlaneSize;
560 
561  auto Xdata = X.template data<T>();
562  auto Ydata = Y->template mutable_data<T>();
563 
564  auto pool = ws_->GetThreadPool();
565  auto numThreads = pool->getNumThreads();
566 
567  // Initialize per-thread buffers for output
568  // Each thread (including the main thread) accumulates into its own
569  // slice of this buffer; the slices are then summed and re-interleaved into Y
570  size_t colBlockSize = W + this->kernel_w() / this->stride_w();
571  size_t threadYBufferSize = C * outputH * colBlockSize * this->stride_w();
572  // Require 16 byte alignment, so 4-element alignment as these are floats.
573  size_t threadYBufferSizeAligned =
574  ((C * outputH * colBlockSize * this->stride_w() + 3) / 4) * 4;
575  size_t threadColBufferSize = C * this->kernel_h() * this->kernel_w() * W;
576 
577  // Work around GCC 4.9 bug when this is declared inside the inner lambda.
578  auto runLocalTile = [&](TensorCPU* threadBuffer,
579  int threadId,
580  size_t tileId) {
581  auto localYData = threadBuffer->template mutable_data<T>() +
582  threadId * threadYBufferSizeAligned;
583 
584  auto localColBufferData = threadBuffer->template mutable_data<T>() +
585  numThreads * threadYBufferSizeAligned + threadId * threadColBufferSize;
586 
587  runTileContiguous<T, Context>(
588  tileId,
589  N,
590  M,
591  H,
592  W,
593  outputH,
594  outputW,
595  C,
596  this->kernel_h(),
597  this->kernel_w(),
598  this->stride_h(),
599  this->stride_w(),
600  this->pad_t(),
601  filter.template data<T>(),
602  Xdata,
603  localColBufferData,
604  localYData,
605  &context_);
606  };
607 
608  auto f = [&](Tensor<Context>* threadBuffer) {
609  threadBuffer->Resize(
610  numThreads * threadYBufferSizeAligned +
611  numThreads * threadColBufferSize);
612  // Group together thread buffers for accumulation
613  std::vector<T*> toSum(numThreads - 1);
614  for (int i = 1; i < numThreads; ++i) {
615  toSum[i - 1] = threadBuffer->template mutable_data<T>() +
616  i * threadYBufferSizeAligned;
617  }
618 
619  for (auto image_id = 0; image_id < N; ++image_id) {
620  // Each time through, we have to reset all per-thread output
621  // buffers, since the output buffer is only per-batch element
622  // The column buffers are overwritten by the matrix multiplication
623  // each time, so we need not clear them out each round
624  math::Set<T, Context>(
625  numThreads * threadYBufferSizeAligned,
626  0,
627  threadBuffer->template mutable_data<T>(),
628  &context_);
629 
630  // Run tiled gemm and col2im in our threadpool; all of these tiles
631  // are guaranteed to be full tiles
632  // Each tile handles a single row of the input
633  pool->run(
634  [&](int threadId, int tileId) {
635  runLocalTile(threadBuffer, threadId, tileId);
636  },
637  H);
638 
639  // Accumulate the per-thread results into the first (main thread)
640  // buffer; the combined buffer is then re-interleaved into the
641  // output Y below
642  sumInto(
643  threadBuffer->template mutable_data<T>(), toSum, threadYBufferSize);
644 
645 // The first thread buffer (y0 in reinterleaveMultithreaded) now contains the
646 // final output, but in de-interleaved form. We re-interleave it to produce
647 // the final form in Y; this also adds the per-channel bias.
648 #define REINTERLEAVE(N) \
649  do { \
650  reinterleaveMultithreaded<N, T, Context>( \
651  threadBuffer->template mutable_data<T>(), \
652  InputSize() == 3 ? Input(BIAS).template data<T>() : nullptr, \
653  Ydata, \
654  Y->dim32(1), \
655  Y->dim32(2), \
656  Y->dim32(3), \
657  W, \
658  this->kernel_w(), \
659  this->stride_w(), \
660  this->adj_h(), \
661  pool); \
662  } while (false)
663 
664  if (this->stride_w() == 1) {
665  REINTERLEAVE(1);
666  } else if (this->stride_w() == 2) {
667  REINTERLEAVE(2);
668  } else if (this->stride_w() == 3) {
669  REINTERLEAVE(3);
670  } else if (this->stride_w() == 4) {
671  REINTERLEAVE(4);
672  }
673 
674 #undef REINTERLEAVE
675 
676  Xdata += M * H * W;
677  Ydata += Y->size() / Y->dim32(0);
678  }
679  };
680  if (FLAGS_caffe2_force_shared_col_buffer || shared_buffer_) {
681  runWithSharedBuffer<Context>(ws_, f);
682  } else {
683  f(&threadBuffer_);
684  }
685 
686  return true;
687 }
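The single threadBuffer allocated in the lambda above is carved into numThreads Y accumulation slices followed by numThreads column-buffer slices. A sketch of the offset arithmetic used by runLocalTile (our own helper with hypothetical names and sizes):

#include <cstddef>
#include <cstdio>

struct ThreadBufferOffsets {
  size_t yOffset;    // start of this thread's Y accumulation slice
  size_t colOffset;  // start of this thread's column buffer slice
};

// Mirrors the pointer arithmetic in runLocalTile: all Y slices come first,
// then all column buffers.
ThreadBufferOffsets bufferOffsetsFor(int threadId,
                                     int numThreads,
                                     size_t threadYBufferSizeAligned,
                                     size_t threadColBufferSize) {
  ThreadBufferOffsets o;
  o.yOffset = static_cast<size_t>(threadId) * threadYBufferSizeAligned;
  o.colOffset = static_cast<size_t>(numThreads) * threadYBufferSizeAligned +
                static_cast<size_t>(threadId) * threadColBufferSize;
  return o;
}

int main() {
  const int numThreads = 4;                  // hypothetical
  const size_t ySize = 128, colSize = 256;   // hypothetical slice sizes
  for (int t = 0; t < numThreads; ++t) {
    ThreadBufferOffsets o = bufferOffsetsFor(t, numThreads, ySize, colSize);
    std::printf("thread %d: Y slice at %zu, col buffer at %zu\n",
                t, o.yOffset, o.colOffset);
  }
  return 0;
}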
688 
689 template <typename T, class Context>
690 bool ConvTransposeMobileOp<T, Context>::RunOnDeviceWithOrderNHWC() {
691  CAFFE_THROW("Not implemented.");
692 }
693 
694 } // namespace caffe2
695 
696 #endif // CAFFE2_MOBILE
697 
698 #endif // CAFFE2_OPERATORS_CONV_TRANSPOSE_MOBILE_OP_IMPL_H_