Caffe2 - C++ API
A deep learning, cross-platform ML framework
math_cpu.cc
1 // Implements the math functions for CPU.
2 // The implementation in this file allows us to route the underlying numerical
3 // computation library to different backends. Notably:
4 // (1) For all BLAS-related functions, one can explicitly request a BLAS backend
5 // such as MKL, OpenBLAS, or ATLAS. To see the set of supported backends
6 // currently provided, check //third_party/blas/.
7 // (2) If one chooses to link against MKL, we utilize MKL's vector math library
8 // (VML) for a few functions such as Exp and Log.
9 // (3) Fallback implementations are provided in Eigen for cross-platform
10 // support. Since Eigen is a header-only library and supports a number of
11 // platforms, it allows one to quickly port Caffe2 to different platforms
12 // where BLAS may not be present.
13 
14 #include <algorithm>
15 #include <atomic>
16 #include <chrono>
17 #include <cstring>
18 #include <numeric>
19 #include <random>
20 #include <unordered_set>
21 #include <vector>
22 
23 #include "caffe2/utils/math.h"
24 #include "caffe2/utils/cpu_neon.h"
25 #include "caffe2/core/context.h"
26 #include "Eigen/Core"
27 #include "Eigen/Dense"
28 
29 #ifdef CAFFE2_USE_MKL
30 #include <mkl.h>
31 #endif // CAFFE2_USE_MKL
32 
33 #ifdef CAFFE2_USE_HPTT
34 #include <hptt.h>
35 #endif // CAFFE2_USE_HPTT
36 
37 #if defined(_MSC_VER)
38 #include <process.h>
39 #endif
40 
41 namespace caffe2 {
42 namespace math {
43 
45 // BLAS alternatives.
46 // Depending on whether we have specified an external BLAS library or not, we
47 // will delegate the Caffe2 math functions that are BLAS-related to either the
48 // CBLAS call or the Eigen implementation.
50 #ifdef CAFFE2_USE_EIGEN_FOR_BLAS
51 
52 // Caffe2 gemm provides a simpler interface to the gemm functions, with the
53 // limitation that the data has to be contiguous in memory.
54 //
55 // The gemm call implements the following operation:
56 //
57 // C = alpha * op(A) * op(B) + beta * C
58 //
59 // where op(A) has size M x K, op(B) has size K x N, and C has size M x N. A, B,
60 // and C are matrices and alpha and beta are scalars. Note that the most common
61 // use case of gemm will involve setting alpha to 1 and beta to 0.
62 //
63 // op(A) and op(B) represent the transformations that are applied to A and B
64 // before the matrix multiply: op(A) is equal to A when TransA is CblasNoTrans
65 // and to A^T (its transpose) when TransA is CblasTrans, and likewise op(B) is
66 // determined by TransB.
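//
// Illustrative usage sketch (not part of this file; assumes the trailing
// math_type argument takes its default): with alpha = 1 and beta = 0, the
// call below computes C = A * B for two row-major 2 x 2 matrices.
//
//   float A[4] = {1, 2, 3, 4};
//   float B[4] = {5, 6, 7, 8};
//   float C[4] = {0, 0, 0, 0};
//   CPUContext ctx;
//   math::Gemm<float, CPUContext>(
//       CblasNoTrans, CblasNoTrans, 2, 2, 2, 1.0f, A, B, 0.0f, C, &ctx);
//   // C now holds {19, 22, 43, 50}, the row-major product A * B.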
67 template <>
68 void Gemm<float, CPUContext>(
69  const CBLAS_TRANSPOSE TransA,
70  const CBLAS_TRANSPOSE TransB,
71  const int M,
72  const int N,
73  const int K,
74  const float alpha,
75  const float* A,
76  const float* B,
77  const float beta,
78  float* C,
79  CPUContext* context,
80  TensorProto::DataType math_type) {
81  auto C_mat = EigenMatrixMap<float>(C, N, M);
82  if (beta == 0) {
83  C_mat.setZero();
84  } else {
85  C_mat *= beta;
86  }
87  switch (TransA) {
88  case CblasNoTrans: {
89  switch (TransB) {
90  case CblasNoTrans:
91  C_mat.noalias() += alpha * (
92  ConstEigenMatrixMap<float>(B, N, K) *
93  ConstEigenMatrixMap<float>(A, K, M));
94  return;
95  case CblasTrans:
96  C_mat.noalias() += alpha * (
97  ConstEigenMatrixMap<float>(B, K, N).transpose() *
98  ConstEigenMatrixMap<float>(A, K, M));
99  return;
100  default:
101  LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for TransB";
102  }
103  }
104  case CblasTrans: {
105  switch (TransB) {
106  case CblasNoTrans:
107  C_mat.noalias() += alpha * (
108  ConstEigenMatrixMap<float>(B, N, K) *
109  ConstEigenMatrixMap<float>(A, M, K).transpose());
110  return;
111  case CblasTrans:
112  C_mat.noalias() += alpha * (
113  ConstEigenMatrixMap<float>(B, K, N).transpose() *
114  ConstEigenMatrixMap<float>(A, M, K).transpose());
115  return;
116  default:
117  LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for TransB";
118  }
119  }
120  default:
121  LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for TransA";
122  }
123 }
124 
125 template <>
126 void GemmEx<float, CPUContext>(
127  const CBLAS_TRANSPOSE TransA,
128  const CBLAS_TRANSPOSE TransB,
129  const int M,
130  const int N,
131  const int K,
132  const float alpha,
133  const float* A,
134  const int lda,
135  const float* B,
136  const int ldb,
137  const float beta,
138  float* C,
139  const int ldc,
140  CPUContext*) {
141  using OuterStride = Eigen::OuterStride<Eigen::Dynamic>;
142  using StridedMap = Eigen::Map<Eigen::MatrixXf, 0, OuterStride>;
143  using ConstStridedMap = Eigen::Map<const Eigen::MatrixXf, 0, OuterStride>;
144  auto C_mat = StridedMap(C, N, M, OuterStride(ldc));
145  if (beta == 0) {
146  C_mat.setZero();
147  } else {
148  C_mat *= beta;
149  }
150  switch (TransA) {
151  case CblasNoTrans: {
152  switch (TransB) {
153  case CblasNoTrans:
154  C_mat.noalias() +=
155  alpha * (ConstStridedMap(B, N, K, OuterStride(ldb)) *
156  ConstStridedMap(A, K, M, OuterStride(lda)));
157  return;
158  case CblasTrans:
159  C_mat.noalias() +=
160  alpha * (ConstStridedMap(B, K, N, OuterStride(ldb)).transpose() *
161  ConstStridedMap(A, K, M, OuterStride(lda)));
162  return;
163  default:
164  LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for TransB";
165  }
166  }
167  case CblasTrans: {
168  switch (TransB) {
169  case CblasNoTrans:
170  C_mat.noalias() +=
171  alpha * (ConstStridedMap(B, N, K, OuterStride(ldb)) *
172  ConstStridedMap(A, M, K, OuterStride(lda)).transpose());
173  return;
174  case CblasTrans:
175  C_mat.noalias() +=
176  alpha * (ConstStridedMap(B, K, N, OuterStride(ldb)).transpose() *
177  ConstStridedMap(A, M, K, OuterStride(lda)).transpose());
178  return;
179  default:
180  LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for TransB";
181  }
182  }
183  default:
184  LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for TransA";
185  }
186 }
187 
188 template <>
189 void Gemv<float, CPUContext>(
190  const CBLAS_TRANSPOSE TransA,
191  const int M,
192  const int N,
193  const float alpha,
194  const float* A,
195  const float* x,
196  const float beta,
197  float* y,
198  CPUContext* context,
199  TensorProto::DataType math_type) {
200  EigenVectorMap<float> y_vec(y, TransA == CblasNoTrans ? M : N);
201  if (beta == 0) {
202  // In Caffe2 we often do lazy initialization, which may leave NaNs in the
203  // float values. As a result, if beta is 0, we explicitly call setZero.
204  y_vec.setZero();
205  } else {
206  y_vec *= beta;
207  }
208  switch (TransA) {
209  case CblasNoTrans: {
210  y_vec.noalias() += alpha * (
211  ConstEigenMatrixMap<float>(A, N, M).transpose() *
212  ConstEigenVectorMap<float>(x, N));
213  return;
214  }
215  case CblasTrans: {
216  y_vec.noalias() += alpha * (
217  ConstEigenMatrixMap<float>(A, N, M) *
218  ConstEigenVectorMap<float>(x, M));
219  return;
220  }
221  default:
222  LOG(FATAL) << "Gemv float found an unexpected CBLAS_TRANSPOSE input.";
223  }
224 }
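//
// Illustrative usage sketch (not part of this file): Gemv computes
// y = alpha * op(A) * x + beta * y, where A is M x N and stored row-major.
// With TransA == CblasNoTrans, op(A) = A, x has length N and y has length M.
//
//   float A[6] = {1, 2, 3, 4, 5, 6}; // 2 x 3, row-major
//   float x[3] = {1, 1, 1};
//   float y[2] = {0, 0};
//   CPUContext ctx;
//   math::Gemv<float, CPUContext>(CblasNoTrans, 2, 3, 1.0f, A, x, 0.0f, y, &ctx);
//   // y now holds {6, 15}, the row sums of A.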
225 
226 #define CAFFE2_SPECIALIZED_SCALE(T) \
227  template <> \
228  void Scale<T, CPUContext>( \
229  const int n, const float alpha, const T* x, T* y, CPUContext* context) { \
230  EigenVectorMap<T>(y, n) = ConstEigenVectorMap<T>(x, n) * alpha; \
231  } \
232  template <> \
233  void Scale<T, CPUContext>( \
234  const int n, \
235  const float* alpha, \
236  const T* x, \
237  T* y, \
238  CPUContext* context) { \
239  EigenVectorMap<T>(y, n) = ConstEigenVectorMap<T>(x, n) * (*alpha); \
240  }
241 CAFFE2_SPECIALIZED_SCALE(float)
242 #undef CAFFE2_SPECIALIZED_SCALE
243 
244 #define CAFFE2_SPECIALIZED_DOT(T) \
245 template<> \
246 void Dot<T, CPUContext>( \
247  const int N, const T* a, const T* b, T* y, \
248  CPUContext* context) { \
249  *y = ConstEigenVectorMap<T>(a, N).dot(ConstEigenVectorMap<T>(b, N)); \
250 }
251 CAFFE2_SPECIALIZED_DOT(float)
252 #undef CAFFE2_SPECIALIZED_DOT
253 
254 #define CAFFE2_SPECIALIZED_AXPY(T) \
255  template <> \
256  void Axpy<T, CPUContext>( \
257  const int N, const T alpha, const T* x, T* Y, CPUContext* context) { \
258  EigenVectorMap<T>(Y, N) += ConstEigenVectorMap<T>(x, N) * alpha; \
259  } \
260  template <> \
261  void Axpy<T, CPUContext>( \
262  const int N, const T* alpha, const T* x, T* Y, CPUContext* context) { \
263  EigenVectorMap<T>(Y, N) += ConstEigenVectorMap<T>(x, N) * (*alpha); \
264  }
265 CAFFE2_SPECIALIZED_AXPY(float)
266 #undef CAFFE2_SPECIALIZED_AXPY
267 
268 #define CAFFE2_SPECIALIZED_AXPBY(T) \
269 template <> \
270 void Axpby<T, CPUContext>(const int N, const T alpha, const T* x, \
271  const T beta, T* y, CPUContext* context) { \
272  EigenVectorMap<T> y_vec(y, N); \
273  y_vec = y_vec * beta + ConstEigenVectorMap<T>(x, N) * alpha; \
274 }
275 CAFFE2_SPECIALIZED_AXPBY(float)
276 #undef CAFFE2_SPECIALIZED_AXPBY
277 
278 #else // CAFFE2_USE_EIGEN_FOR_BLAS
279 
280 template <>
281 void Gemm<float, CPUContext>(
282  const CBLAS_TRANSPOSE TransA,
283  const CBLAS_TRANSPOSE TransB,
284  const int M,
285  const int N,
286  const int K,
287  const float alpha,
288  const float* A,
289  const float* B,
290  const float beta,
291  float* C,
292  CPUContext* /*context*/,
293  TensorProto::DataType /*math_type*/) {
294  int lda = (TransA == CblasNoTrans) ? K : M;
295  int ldb = (TransB == CblasNoTrans) ? N : K;
296  cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, ldb,
297  beta, C, N);
298 }
299 
300 template <>
301 void GemmEx<float, CPUContext>(
302  const CBLAS_TRANSPOSE TransA,
303  const CBLAS_TRANSPOSE TransB,
304  const int M,
305  const int N,
306  const int K,
307  const float alpha,
308  const float* A,
309  const int lda,
310  const float* B,
311  const int ldb,
312  const float beta,
313  float* C,
314  const int ldc,
315  CPUContext* /*context*/) {
316  cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, ldb,
317  beta, C, ldc);
318 }
319 
320 template <>
321 void Gemv<float, CPUContext>(
322  const CBLAS_TRANSPOSE TransA,
323  const int M,
324  const int N,
325  const float alpha,
326  const float* A,
327  const float* x,
328  const float beta,
329  float* y,
330  CPUContext* /*context*/,
331  TensorProto::DataType /*math_type*/) {
332  cblas_sgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1);
333 }
334 
335 #define CAFFE2_SPECIALIZED_SCALE(T, prefix) \
336  template <> \
337  void Scale<T, CPUContext>( \
338  const int n, const float alpha, const T* x, T* y, CPUContext*) { \
339  if (y != x) \
340  cblas_##prefix##copy(n, x, 1, y, 1); \
341  cblas_##prefix##scal(n, static_cast<float>(alpha), y, 1); \
342  } \
343  template <> \
344  void Scale<T, CPUContext>( \
345  const int n, const float* alpha, const T* x, T* y, CPUContext*) { \
346  if (y != x) \
347  cblas_##prefix##copy(n, x, 1, y, 1); \
348  cblas_##prefix##scal(n, static_cast<float>(*alpha), y, 1); \
349  }
350 CAFFE2_SPECIALIZED_SCALE(float, s)
351 #undef CAFFE2_SPECIALIZED_SCALE
352 
353 #define CAFFE2_SPECIALIZED_DOT(T, prefix) \
354  template <> \
355  void Dot<T, CPUContext>( \
356  const int N, const T* a, const T* b, T* y, CPUContext*) { \
357  *y = cblas_##prefix##dot(N, a, 1, b, 1); \
358  }
359 CAFFE2_SPECIALIZED_DOT(float, s)
360 #undef CAFFE2_SPECIALIZED_DOT
361 
362 #define CAFFE2_SPECIALIZED_AXPY(T, prefix) \
363  template <> \
364  void Axpy<T, CPUContext>( \
365  const int N, const T alpha, const T* x, T* y, CPUContext*) { \
366  cblas_##prefix##axpy(N, alpha, x, 1, y, 1); \
367  } \
368  template <> \
369  void Axpy<T, CPUContext>( \
370  const int N, const T* alpha, const T* x, T* y, CPUContext*) { \
371  cblas_##prefix##axpy(N, *alpha, x, 1, y, 1); \
372  }
373 CAFFE2_SPECIALIZED_AXPY(float, s)
374 #undef CAFFE2_SPECIALIZED_AXPY
375 
376 // cblas_[sd]axpby is not a standard BLAS function, and if MKL is not present,
377 // we will need to implement it.
378 #ifdef CAFFE2_USE_MKL
379 #define CAFFE2_SPECIALIZED_AXPBY(T, prefix) \
380  template <> \
381  void Axpby<T, CPUContext>( \
382  const int N, \
383  const T alpha, \
384  const T* x, \
385  const T beta, \
386  T* y, \
387  CPUContext*) { \
388  cblas_##prefix##axpby(N, alpha, x, 1, beta, y, 1); \
389  }
390 #else // CAFFE2_USE_MKL
391 #define CAFFE2_SPECIALIZED_AXPBY(T, prefix) \
392  template <> \
393  void Axpby<T, CPUContext>( \
394  const int N, \
395  const T alpha, \
396  const T* x, \
397  const T beta, \
398  T* y, \
399  CPUContext*) { \
400  cblas_##prefix##scal(N, beta, y, 1); \
401  cblas_##prefix##axpy(N, alpha, x, 1, y, 1); \
402  }
403 #endif // CAFFE2_USE_MKL
404 CAFFE2_SPECIALIZED_AXPBY(float, s)
405 #undef CAFFE2_SPECIALIZED_AXPBY
406 
407 #endif // CAFFE2_USE_EIGEN_FOR_BLAS
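// Semantics of the level-1 helpers above, shared by both branches of the
// preceding #ifdef (illustrative summary, not part of this file):
//
//   Scale(n, alpha, x, y, ctx);       // y[i] = alpha * x[i]
//   Dot(n, a, b, &result, ctx);       // result = sum_i a[i] * b[i]
//   Axpy(n, alpha, x, y, ctx);        // y[i] += alpha * x[i]
//   Axpby(n, alpha, x, beta, y, ctx); // y[i] = alpha * x[i] + beta * y[i]
//
// For example, with x = {1, 2}, y = {10, 20}, alpha = 2 and beta = 1,
// Axpby leaves y = {12, 24}.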
408 
409 template <>
410 void GemmBatched<float, CPUContext>(
411  const CBLAS_TRANSPOSE TransA,
412  const CBLAS_TRANSPOSE TransB,
413  const int batch_size,
414  const int M,
415  const int N,
416  const int K,
417  const float alpha,
418  const float* A,
419  const float* B,
420  const float beta,
421  float* C,
422  CPUContext* context,
423  Tensor<CPUContext>*, /* scratch */
424  TensorProto::DataType /* math_type */) {
425  const int a_stride = M * K;
426  const int b_stride = K * N;
427  const int c_stride = M * N;
428 
429 #ifdef CAFFE2_USE_MKL
430  (void)context;
431 
432  const int lda = (TransA == CblasNoTrans) ? K : M;
433  const int ldb = (TransB == CblasNoTrans) ? N : K;
434  std::vector<const float*> a_array(batch_size, nullptr);
435  std::vector<const float*> b_array(batch_size, nullptr);
436  std::vector<float*> c_array(batch_size, nullptr);
437  for (int i = 0; i < batch_size; ++i) {
438  a_array[i] = A + a_stride * i;
439  b_array[i] = B + b_stride * i;
440  c_array[i] = C + c_stride * i;
441  }
442  cblas_sgemm_batch(
443  CblasRowMajor,
444  &TransA,
445  &TransB,
446  &M,
447  &N,
448  &K,
449  &alpha,
450  a_array.data(),
451  &lda,
452  b_array.data(),
453  &ldb,
454  &beta,
455  c_array.data(),
456  &N, // ldc_array
457  1,
458  &batch_size);
459 #else // CAFFE2_USE_MKL
460  // loop over matrices in the batch
461  for (int i = 0; i < batch_size; ++i) {
462  math::Gemm<float, CPUContext>(
463  TransA,
464  TransB,
465  M,
466  N,
467  K,
468  alpha,
469  A + a_stride * i,
470  B + b_stride * i,
471  beta,
472  C + c_stride * i,
473  context);
474  }
475 #endif
476 }
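//
// Illustrative note (not part of this file): GemmBatched interprets A, B and C
// as contiguous stacks of batch_size matrices, laid out back to back with
// strides of M * K, K * N and M * N elements respectively, and performs one
// independent Gemm per batch index, i.e. conceptually
//
//   for (int i = 0; i < batch_size; ++i) {
//     C_i = alpha * op(A_i) * op(B_i) + beta * C_i;
//   }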
477 
479 // MKL VML alternatives.
480 // Depending on whether we are using MKL, we will delegate the Caffe2 math
481 // functions that are VML-related to either the VML call or the Eigen
482 // implementation. If the compiler flags (such as AVX) are set correctly for
483 // your CPU architecture, Eigen usually delivers throughput comparable to that
484 // of the VML functions.
486 #ifdef CAFFE2_USE_MKL
487 
488 #define DELEGATE_SIMPLE_UNARY_FUNCTION(T, Funcname, OriginalFunc, ...) \
489  template <> \
490  void Funcname<T, CPUContext>(const int N, const T* x, T* y, CPUContext*) { \
491  OriginalFunc(N, x, y, ##__VA_ARGS__); \
492  }
493 DELEGATE_SIMPLE_UNARY_FUNCTION(
494  float,
495  Exp,
496  vmsExp,
497  VML_HA | VML_FTZDAZ_OFF | VML_ERRMODE_IGNORE)
498 DELEGATE_SIMPLE_UNARY_FUNCTION(
499  double,
500  Exp,
501  vmdExp,
502  VML_HA | VML_FTZDAZ_OFF | VML_ERRMODE_IGNORE)
503 DELEGATE_SIMPLE_UNARY_FUNCTION(float, Log, vsLn)
504 DELEGATE_SIMPLE_UNARY_FUNCTION(double, Log, vdLn)
505 DELEGATE_SIMPLE_UNARY_FUNCTION(float, Cos, vsCos)
506 DELEGATE_SIMPLE_UNARY_FUNCTION(double, Cos, vdCos)
507 DELEGATE_SIMPLE_UNARY_FUNCTION(float, Sin, vsSin)
508 DELEGATE_SIMPLE_UNARY_FUNCTION(double, Sin, vdSin)
509 DELEGATE_SIMPLE_UNARY_FUNCTION(float, Abs, vsAbs)
510 DELEGATE_SIMPLE_UNARY_FUNCTION(double, Abs, vdAbs)
511 DELEGATE_SIMPLE_UNARY_FUNCTION(float, Sqrt, vsSqrt)
512 DELEGATE_SIMPLE_UNARY_FUNCTION(double, Sqrt, vdSqrt)
513 DELEGATE_SIMPLE_UNARY_FUNCTION(float, InvSqrt, vsInvSqrt)
514 DELEGATE_SIMPLE_UNARY_FUNCTION(double, InvSqrt, vdInvSqrt)
515 DELEGATE_SIMPLE_UNARY_FUNCTION(float, Sqr, vsSqr)
516 DELEGATE_SIMPLE_UNARY_FUNCTION(double, Sqr, vdSqr)
517 #undef DELEGATE_SIMPLE_UNARY_FUNCTION
518 
519 #define DELEGATE_SINCOS_FUNCTION(T, OriginalFunc) \
520  template <> \
521  void SinCos<T, CPUContext>( \
522  const int N, const T* a, T* ys, T* yc, CPUContext*) { \
523  OriginalFunc(N, a, ys, yc); \
524  }
525 DELEGATE_SINCOS_FUNCTION(float, vsSinCos)
526 DELEGATE_SINCOS_FUNCTION(double, vdSinCos)
527 #undef DELEGATE_SINCOS_FUNCTION
528 
529 #define DELEGATE_POWX_FUNCTION(T, OriginalFunc) \
530  template <> \
531  void Powx<T, CPUContext>(const int N, const T* a, T b, T* y, CPUContext*) { \
532  OriginalFunc(N, a, b, y); \
533  }
534 DELEGATE_POWX_FUNCTION(float, vsPowx)
535 DELEGATE_POWX_FUNCTION(double, vdPowx)
536 #undef DELEGATE_POWX_FUNCTION
537 
538 #define DELEGATE_SIMPLE_BINARY_FUNCTION(T, Funcname, OriginalFunc) \
539  template <> \
540  void Funcname<T, CPUContext>( \
541  const int N, const T* a, const T* b, T* y, CPUContext*) { \
542  OriginalFunc(N, a, b, y); \
543  }
544 DELEGATE_SIMPLE_BINARY_FUNCTION(float, Add, vsAdd)
545 DELEGATE_SIMPLE_BINARY_FUNCTION(double, Add, vdAdd)
546 DELEGATE_SIMPLE_BINARY_FUNCTION(float, Sub, vsSub)
547 DELEGATE_SIMPLE_BINARY_FUNCTION(double, Sub, vdSub)
548 DELEGATE_SIMPLE_BINARY_FUNCTION(float, Mul, vsMul)
549 DELEGATE_SIMPLE_BINARY_FUNCTION(double, Mul, vdMul)
550 DELEGATE_SIMPLE_BINARY_FUNCTION(float, Div, vsDiv)
551 DELEGATE_SIMPLE_BINARY_FUNCTION(double, Div, vdDiv)
552 #undef DELEGATE_SIMPLE_BINARY_FUNCTION
553 
554 #else // CAFFE2_USE_MKL
555 
556 #define DELEGATE_SIMPLE_UNARY_FUNCTION(T, Funcname, expr) \
557  template <> \
558  void Funcname<T, CPUContext>(const int N, const T* x, T* y, CPUContext*) { \
559  EigenVectorMap<T>(y, N) = ConstEigenVectorMap<T>(x, N).array().expr(); \
560  }
561 DELEGATE_SIMPLE_UNARY_FUNCTION(float, Exp, exp)
562 DELEGATE_SIMPLE_UNARY_FUNCTION(float, Log, log)
563 DELEGATE_SIMPLE_UNARY_FUNCTION(float, Cos, cos)
564 DELEGATE_SIMPLE_UNARY_FUNCTION(float, Sin, sin)
565 DELEGATE_SIMPLE_UNARY_FUNCTION(float, Abs, abs)
566 DELEGATE_SIMPLE_UNARY_FUNCTION(float, Sqrt, sqrt)
567 DELEGATE_SIMPLE_UNARY_FUNCTION(float, InvSqrt, rsqrt)
568 DELEGATE_SIMPLE_UNARY_FUNCTION(float, Sqr, square)
569 #undef DELEGATE_SIMPLE_UNARY_FUNCTION
570 
571 #define DELEGATE_SINCOS_FUNCTION(T) \
572  template <> \
573  void SinCos<T, CPUContext>( \
574  const int N, const T* x, T* ys, T* yc, CPUContext*) { \
575  EigenVectorMap<T>(ys, N) = ConstEigenVectorMap<T>(x, N).array().sin(); \
576  EigenVectorMap<T>(yc, N) = ConstEigenVectorMap<T>(x, N).array().cos(); \
577  }
578 DELEGATE_SINCOS_FUNCTION(float)
579 DELEGATE_SINCOS_FUNCTION(double)
580 #undef DELEGATE_SINCOS_FUNCTION
581 
582 #define DELEGATE_POWX_FUNCTION(T) \
583  template <> \
584  void Powx<T, CPUContext>(const int N, const T* a, T b, T* y, CPUContext*) { \
585  EigenVectorMap<T>(y, N) = ConstEigenVectorMap<T>(a, N).array().pow(b); \
586  }
587 DELEGATE_POWX_FUNCTION(float)
588 #undef DELEGATE_POWX_FUNCTION
589 
590 #endif // CAFFE2_USE_MKL
591 
592 
593 #define EIGEN_SIMPLE_BINARY_FUNCTION(T, Funcname, expr) \
594 template <> \
595 void Funcname<T, CPUContext>( \
596  const int N, const T* a, const T* b, T* y, \
597  CPUContext*) { \
598  EigenVectorMap<T>(y, N) = \
599  ConstEigenVectorMap<T>(a, N).array() expr \
600  ConstEigenVectorMap<T>(b, N).array(); \
601 }
602 
603 #ifdef CAFFE2_USE_MKL
604 
605 #define DEFINE_SIMPLE_BINARY_FUNCTION(Funcname, expr) \
606 EIGEN_SIMPLE_BINARY_FUNCTION(int32_t, Funcname, expr) \
607 EIGEN_SIMPLE_BINARY_FUNCTION(int64_t, Funcname, expr)
608 
609 #else
610 
611 #define DEFINE_SIMPLE_BINARY_FUNCTION(Funcname, expr) \
612 EIGEN_SIMPLE_BINARY_FUNCTION(float, Funcname, expr) \
613 EIGEN_SIMPLE_BINARY_FUNCTION(int32_t, Funcname, expr) \
614 EIGEN_SIMPLE_BINARY_FUNCTION(int64_t, Funcname, expr)
615 
616 #endif
617 
618 DEFINE_SIMPLE_BINARY_FUNCTION(Add, +)
619 DEFINE_SIMPLE_BINARY_FUNCTION(Sub, -)
620 DEFINE_SIMPLE_BINARY_FUNCTION(Mul, *)
621 DEFINE_SIMPLE_BINARY_FUNCTION(Div, /)
622 
623 #undef EIGEN_SIMPLE_BINARY_FUNCTION
624 #undef DEFINE_SIMPLE_BINARY_FUNCTION
625 
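// Illustrative usage sketch for the element-wise helpers defined above (not
// part of this file); the same calls work whether they resolve to MKL VML or
// to the Eigen fallbacks:
//
//   float a[3] = {1.0f, 2.0f, 3.0f};
//   float b[3] = {4.0f, 5.0f, 6.0f};
//   float y[3];
//   CPUContext ctx;
//   math::Exp<float, CPUContext>(3, a, y, &ctx);    // y[i] = exp(a[i])
//   math::Add<float, CPUContext>(3, a, b, y, &ctx); // y[i] = a[i] + b[i]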
626 
628 // Common math functions used in Caffe2 that do not have a BLAS or MKL
629 // equivalent. For all of these functions, we simply implement them either via
630 // Eigen or via custom code.
632 
633 #define CAFFE2_SPECIALIZED_REDUCEMIN(T) \
634  template <> \
635  void ReduceMin<T, CPUContext>( \
636  const int N, \
637  const T* x, \
638  T* y, \
639  Tensor<CPUContext>* /*scratch_ptr*/, \
640  CPUContext* /*context*/) { \
641  *y = *std::min_element(x, x + N); \
642  }
643 CAFFE2_SPECIALIZED_REDUCEMIN(float)
644 #undef CAFFE2_SPECIALIZED_REDUCEMIN
645 
646 #define CAFFE2_SPECIALIZED_REDUCEMAX(T) \
647  template <> \
648  void ReduceMax<T, CPUContext>( \
649  const int N, \
650  const T* x, \
651  T* y, \
652  Tensor<CPUContext>* /*scratch_ptr*/, \
653  CPUContext* /*context*/) { \
654  *y = *std::max_element(x, x + N); \
655  }
656 CAFFE2_SPECIALIZED_REDUCEMAX(float)
657 CAFFE2_SPECIALIZED_REDUCEMAX(int32_t)
658 CAFFE2_SPECIALIZED_REDUCEMAX(int64_t)
659 
660 #undef CAFFE2_SPECIALIZED_REDUCEMAX
661 
662 #define CAFFE2_SPECIALIZED_ROWWISEMAX(T) \
663  template <> \
664  void RowwiseMax<T, CPUContext>( \
665  const int N, const int D, const T* x, T* y, CPUContext*) { \
666  EigenVectorMap<T>(y, N) = \
667  ConstEigenMatrixMap<T>(x, D, N).colwise().maxCoeff(); \
668  }
669 CAFFE2_SPECIALIZED_ROWWISEMAX(float)
670 #undef CAFFE2_SPECIALIZED_ROWWISEMAX
671 
672 #define CAFFE2_SPECIALIZED_COLWISEMAX(T) \
673  template <> \
674  void ColwiseMax<T, CPUContext>( \
675  const int N, const int D, const T* x, T* y, CPUContext*) { \
676  EigenVectorMap<T>(y, D) = \
677  ConstEigenMatrixMap<T>(x, D, N).rowwise().maxCoeff(); \
678  }
679 CAFFE2_SPECIALIZED_COLWISEMAX(float)
680 #undef CAFFE2_SPECIALIZED_COLWISEMAX
681 
682 #define CAFFE2_SPECIALIZED_ELEMWISEMAX(T) \
683  template <> \
684  void ElemwiseMax<T, CPUContext>( \
685  const int N, const T* x, const T* y, T* z, CPUContext* /*context*/) { \
686  std::transform(x, x + N, y, z, [](const T& x_i, const T& y_i) { \
687  return std::max(x_i, y_i); \
688  }); \
689  }
690 CAFFE2_SPECIALIZED_ELEMWISEMAX(float)
691 #undef CAFFE2_SPECIALIZED_ELEMWISEMAX
692 
693 #define CAFFE2_SPECIALIZED_MAXIMUM(T) \
694  template <> \
695  void Maximum<T, CPUContext>( \
696  const int N, const float alpha, const T* x, T* y, CPUContext* context) { \
697  std::transform( \
698  x, x + N, y, [&alpha](const T& x_i) { return std::max(x_i, alpha); }); \
699  }
700 CAFFE2_SPECIALIZED_MAXIMUM(float)
701 #undef CAFFE2_SPECIALIZED_MAXIMUM
702 
703 // AddToRow and AddToCol add the corresponding row/col vector b to the matrix a
704 // of shape M x N. The actual implementation uses Eigen, which is column-major,
705 // hence the row/column swap in the code below.
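//
// Illustrative example (not part of this file): for a row-major 2 x 3 matrix
// a = {1, 2, 3, 4, 5, 6} and a length-3 row vector b = {10, 20, 30},
// AddToRow(2, 3, a, b, y, ctx) broadcasts b across the rows and yields
// y = {11, 22, 33, 14, 25, 36}; the ToCol variants broadcast a length-M
// vector across the columns instead.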
706 #define DELEGATE_BROADCAST_BINARY_FUNCTION(T, Funcname, expr) \
707  template <> \
708  void Funcname##ToRow<T, CPUContext>( \
709  const int M, const int N, const T* a, const T* b, T* y, CPUContext*) { \
710  EigenArrayMap<T>(y, N, M) = ConstEigenArrayMap<T>(a, N, M).colwise() \
711  expr ConstEigenVectorArrayMap<T>(b, N); \
712  } \
713  /* inplace versions */ \
714  template <> \
715  void Funcname##ToRow<T, CPUContext>( \
716  const int M, const int N, const T* x, T* y, CPUContext*) { \
717  EigenArrayMap<T>(y, N, M).colwise() expr## = \
718  ConstEigenVectorArrayMap<T>(x, N); \
719  } \
720  template <> \
721  void Funcname##ToCol<T, CPUContext>( \
722  const int M, const int N, const T* x, T* y, CPUContext*) { \
723  EigenArrayMap<T>(y, N, M).rowwise() expr## = \
724  ConstEigenVectorArrayMap<T>(x, M).transpose(); \
725  }
726 
727 #define DEFINE_BROADCAST_BINARY_FUNCTION(name, op) \
728  DELEGATE_BROADCAST_BINARY_FUNCTION(int32_t, name, op) \
729  DELEGATE_BROADCAST_BINARY_FUNCTION(int64_t, name, op) \
730  DELEGATE_BROADCAST_BINARY_FUNCTION(float, name, op) \
731 
732 DEFINE_BROADCAST_BINARY_FUNCTION(Add, +)
733 DEFINE_BROADCAST_BINARY_FUNCTION(Sub, -)
734 DEFINE_BROADCAST_BINARY_FUNCTION(Mul, *)
735 DEFINE_BROADCAST_BINARY_FUNCTION(Div, /)
736 
737 #undef DEFINE_BROADCAST_BINARY_FUNCTION
738 #undef DELEGATE_BROADCAST_BINARY_FUNCTION
739 
740 #define CAFFE2_SPECIALIZED_SET(T) \
741  template <> \
742  void Set<T, CPUContext>(const size_t N, const T alpha, T* Y, CPUContext*) { \
743  if (alpha == (T)0) { \
744  if (Y != nullptr) { \
745  memset(Y, 0, N * sizeof(T)); \
746  } \
747  } else { \
748  EigenVectorMap<T>(Y, N).setConstant(alpha); \
749  } \
750  }
751 
752 CAFFE2_SPECIALIZED_SET(float);
753 CAFFE2_SPECIALIZED_SET(double);
754 CAFFE2_SPECIALIZED_SET(int8_t);
755 CAFFE2_SPECIALIZED_SET(int16_t);
756 CAFFE2_SPECIALIZED_SET(int);
757 CAFFE2_SPECIALIZED_SET(int64_t);
758 CAFFE2_SPECIALIZED_SET(bool);
759 CAFFE2_SPECIALIZED_SET(char);
760 CAFFE2_SPECIALIZED_SET(uint8_t);
761 CAFFE2_SPECIALIZED_SET(uint16_t);
762 #undef CAFFE2_SPECIALIZED_SET
763 
764 #define CAFFE2_INSTANTIATE_BINARY_OP(name, op, T) \
765  template <> \
766  void name<T, CPUContext>( \
767  const int n, const T* a, const T* b, bool* y, CPUContext*) { \
768  for (int i = 0; i < n; ++i) { \
769  y[i] = a[i] op b[i]; \
770  } \
771  } \
772  template <> \
773  void name##ToRow<T, CPUContext>( \
774  const int m, \
775  const int n, \
776  const T* a, \
777  const T* b, \
778  bool* y, \
779  CPUContext*) { \
780  for (int i = 0; i < n * m; ++i) { \
781  y[i] = a[i] op b[i % n]; \
782  } \
783  }
784 
785 #define CAFFE2_DEFINE_BINARY_OP(name, op) \
786  CAFFE2_INSTANTIATE_BINARY_OP(name, op, float) \
787  CAFFE2_INSTANTIATE_BINARY_OP(name, op, int32_t) \
788  CAFFE2_INSTANTIATE_BINARY_OP(name, op, int64_t)
789 
790 CAFFE2_DEFINE_BINARY_OP(LT, <);
791 CAFFE2_DEFINE_BINARY_OP(LE, <=);
792 CAFFE2_DEFINE_BINARY_OP(GT, >);
793 CAFFE2_DEFINE_BINARY_OP(GE, >=);
794 
795 CAFFE2_INSTANTIATE_BINARY_OP(Or, |, bool);
796 CAFFE2_INSTANTIATE_BINARY_OP(And, &, bool);
797 CAFFE2_INSTANTIATE_BINARY_OP(Xor, ^, bool);
798 
799 template <>
800 void Not<bool, CPUContext>(
801  const int n,
802  const bool* x,
803  bool* y,
804  CPUContext* /*context*/) {
805  for (int i = 0; i < n; ++i) {
806  y[i] = !x[i];
807  }
808 }
809 
810 #undef CAFFE2_DEFINE_BINARY_OP
811 #undef CAFFE2_INSTANTIATE_BINARY_OP
812 
813 #define CAFFE2_SPECIALIZED_CPU_ADD_STRIPED_BATCH(T) \
814  template <> \
815  void AddStripedBatch( \
816  const int N, \
817  const T* first, \
818  T* y, \
819  const int stripe, \
820  const int batch, \
821  CPUContext* context) { \
822  for (int j = 0; j < batch; j++) { \
823  Add<T, CPUContext>(N, first + j * stripe, y, y, context); \
824  } \
825  }
826 
827 CAFFE2_SPECIALIZED_CPU_ADD_STRIPED_BATCH(float);
828 #undef CAFFE2_SPECIALIZED_CPU_ADD_STRIPED_BATCH
829 
830 template <>
831 void RandUniform<float, CPUContext>(
832  const size_t n,
833  const float a,
834  const float b,
835  float* r,
836  CPUContext* context) {
837  std::uniform_real_distribution<float> distribution(a, b);
838  for (auto i = 0; i < n; ++i) {
839  r[i] = distribution(context->RandGenerator());
840  }
841 }
842 
843 template <>
844 void RandUniform<int, CPUContext>(
845  const size_t n,
846  const int a,
847  const int b,
848  int* r,
849  CPUContext* context) {
850  std::uniform_int_distribution<int> distribution(a, b);
851  for (auto i = 0; i < n; ++i) {
852  r[i] = distribution(context->RandGenerator());
853  }
854 }
855 
856 #define CAFFE2_SPECIALIZED_RAND_UNIFORM_UNIQUE(T) \
857  template <> \
858  void RandUniformUnique<T, CPUContext>( \
859  const size_t n, \
860  const T a, \
861  const T b, \
862  T* r, \
863  const size_t m, \
864  const T* avoid, \
865  CPUContext* context) { \
866  CAFFE_ENFORCE_LE( \
867  n, b - a - m + 1, "Cannot satisfy the unique requirement"); \
868  std::unordered_set<T> avoid_set(n); \
869  if (m) { \
870  avoid_set.insert(avoid, avoid + m); \
871  CAFFE_ENFORCE_EQ(m, avoid_set.size(), "Avoid should be unique"); \
872  } \
873  std::uniform_int_distribution<T> distribution(a, b); \
874  T v = 0; \
875  for (size_t i = 0; i < n; ++i) { \
876  do { \
877  v = distribution(context->RandGenerator()); \
878  } while (avoid_set.count(v)); \
879  r[i] = v; \
880  avoid_set.insert(v); \
881  } \
882  }
883 
884 CAFFE2_SPECIALIZED_RAND_UNIFORM_UNIQUE(int32_t);
885 CAFFE2_SPECIALIZED_RAND_UNIFORM_UNIQUE(int64_t);
886 #undef CAFFE2_SPECIALIZED_RAND_UNIFORM_UNIQUE
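// Illustrative usage sketch (not part of this file): draw 3 distinct integers
// uniformly from [0, 9], never returning 4 or 7.
//
//   int32_t avoid[2] = {4, 7};
//   int32_t r[3];
//   CPUContext ctx;
//   math::RandUniformUnique<int32_t, CPUContext>(3, 0, 9, r, 2, avoid, &ctx);
//   // r now holds three unique values drawn from {0, 1, 2, 3, 5, 6, 8, 9}.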
887 
888 template <>
889 void RandGaussian<float, CPUContext>(
890  const size_t n,
891  const float mean,
892  const float std,
893  float* r,
894  CPUContext* context) {
895  std::normal_distribution<float> distribution(mean, std);
896  for (auto i = 0; i < n; ++i) {
897  r[i] = distribution(context->RandGenerator());
898  }
899 }
900 
901 #define CAFFE2_SPECIALIZED_SUM(T) \
902  template <> \
903  void Sum<T, CPUContext>( \
904  const int N, \
905  const T* x, \
906  T* y, \
907  CPUContext* /* unused */, \
908  Tensor<CPUContext>* /* unused */) { \
909  *y = ConstEigenVectorMap<T>(x, N).sum(); \
910  }
911 
912 CAFFE2_SPECIALIZED_SUM(float);
913 CAFFE2_SPECIALIZED_SUM(int32_t);
914 CAFFE2_SPECIALIZED_SUM(int64_t);
915 
916 #undef CAFFE2_SPECIALIZED_SUM
917 
918 template <>
919 void SumSqr<float, CPUContext>(
920  const int N,
921  const float* x,
922  float* y,
923  CPUContext* /*context*/ /* unused */,
924  Tensor<CPUContext>* /*scratch_ptr*/ /* unused */) {
925  *y = ConstEigenVectorMap<float>(x, N).squaredNorm();
926 }
927 
928 template <>
929 void Select<float, CPUContext>(
930  const int N,
931  const int D,
932  const float* x,
933  const int* idx,
934  float* y,
935  CPUContext* /*context*/) {
936  for (int i = 0; i < N; ++i) {
937  DCHECK_LT(idx[i], D);
938  y[i] = x[i * D + idx[i]];
939  }
940 }
941 // Ported from Caffe 1.
942 template <>
943 void Im2colNd<float, CPUContext, StorageOrder::NCHW>(
944  const float* data_img,
945  const int* im_shape,
946  const int* col_shape,
947  const int /* img_size*/,
948  const int /* col_size*/,
949  const int* kernel_shape,
950  const int* stride,
951  const int* dilation,
952  const int* pad,
953  const int N,
954  float* data_col,
955  CPUContext* /* context */,
956  bool accumulate_output) {
957  int kernel_size = 1;
958  for (int i = 0; i < N; ++i) {
959  kernel_size *= kernel_shape[i];
960  }
961  const int channels_col = col_shape[0];
962  vector<int> d_offset(N, 0);
963  vector<int> d_iter(N, 0);
964  for (int c_col = 0; c_col < channels_col; ++c_col) {
965  // Loop over spatial axes in reverse order to compute a per-axis offset.
966  int offset = c_col;
967  for (int d_i = N - 1; d_i >= 0; --d_i) {
968  if (d_i < N - 1) {
969  offset /= kernel_shape[d_i + 1];
970  }
971  d_offset[d_i] = offset % kernel_shape[d_i];
972  }
973  for (bool incremented = true; incremented;) {
974  // Loop over spatial axes in forward order to compute the indices in the
975  // image and column, and whether the index lies in the padding.
976  int index_col = c_col;
977  int index_im = c_col / kernel_size;
978  bool is_padding = false;
979  for (int d_i = 0; d_i < N; ++d_i) {
980  const int d = d_iter[d_i];
981  const int d_im =
982  d * stride[d_i] - pad[d_i] + d_offset[d_i] * dilation[d_i];
983  is_padding |= d_im < 0 || d_im >= im_shape[d_i + 1];
984  index_col *= col_shape[d_i + 1];
985  index_col += d;
986  index_im *= im_shape[d_i + 1];
987  index_im += d_im;
988  }
989  if (!accumulate_output) {
990  if (is_padding) {
991  data_col[index_col] = 0;
992  } else {
993  data_col[index_col] = data_img[index_im];
994  }
995  } else if (!is_padding) { // col2im
996  data_col[index_im] += data_img[index_col];
997  }
998  // Loop over spatial axes in reverse order to choose an index,
999  // like counting.
1000  incremented = false;
1001  for (int d_i = N - 1; d_i >= 0; --d_i) {
1002  const int d_max = col_shape[d_i + 1];
1003  DCHECK_LT(d_iter[d_i], d_max);
1004  if (d_iter[d_i] == d_max - 1) {
1005  d_iter[d_i] = 0;
1006  } else { // d_iter[d_i] < d_max - 1
1007  ++d_iter[d_i];
1008  incremented = true;
1009  break;
1010  }
1011  }
1012  } // for (incremented)
1013  } // for (int c_col = 0; c_col < channels_col; ++c_col)
1014 }
1015 
1016 template <>
1017 void Col2imNd<float, CPUContext, StorageOrder::NCHW>(
1018  const float* data_col,
1019  const int* img_shape,
1020  const int* col_shape,
1021  const int img_size,
1022  const int col_size,
1023  const int* kernel_shape,
1024  const int* stride,
1025  const int* dilation,
1026  const int* pad,
1027  const int N,
1028  float* data_img,
1029  CPUContext* context) {
1030  Set<float, CPUContext>(img_size, 0, data_img, context);
1031  Im2colNd<float, CPUContext, StorageOrder::NCHW>(
1032  data_col,
1033  img_shape,
1034  col_shape,
1035  img_size,
1036  col_size,
1037  kernel_shape,
1038  stride,
1039  dilation,
1040  pad,
1041  N,
1042  data_img,
1043  context,
1044  true);
1045 }
1046 
1047 template <>
1048 void Im2col<float, CPUContext, StorageOrder::NCHW>(
1049  const float* data_im,
1050  const int channels,
1051  const int height,
1052  const int width,
1053  const int kernel_h,
1054  const int kernel_w,
1055  const int dilation_h,
1056  const int dilation_w,
1057  const int pad_t,
1058  const int pad_l,
1059  const int pad_b,
1060  const int pad_r,
1061  const int stride_h,
1062  const int stride_w,
1063  float* data_col,
1064  CPUContext* /*context*/) {
1065  const int output_h =
1066  (height + pad_b + pad_t - (dilation_h * (kernel_h - 1) + 1)) / stride_h +
1067  1;
1068  const int output_w =
1069  (width + pad_l + pad_r - (dilation_w * (kernel_w - 1) + 1)) / stride_w +
1070  1;
1071 
1072  // Fast path for zero padding and no dilation
1073  // From Torch, THNN_(unfolded_copy)
1074  if (dilation_h == 1 && dilation_w == 1 && pad_l == 0 && pad_r == 0 &&
1075  pad_t == 0 && pad_b == 0) {
1076  for (auto k = 0; k < channels * kernel_h * kernel_w; k++) {
1077  const auto nip = k / (kernel_h * kernel_w);
1078  const auto rest = k % (kernel_h * kernel_w);
1079  const auto kh = rest / kernel_w;
1080  const auto kw = rest % kernel_w;
1081  auto* dst = data_col + nip * (kernel_h * kernel_w * output_h * output_w) +
1082  kh * (kernel_w * output_h * output_w) + kw * (output_h * output_w);
1083  const auto* src = data_im + nip * (height * width);
1084  for (auto y = 0; y < output_h; y++) {
1085  const auto iy = y * stride_h + kh;
1086  const auto ix = kw;
1087  if (stride_w == 1) {
1088  memcpy(
1089  dst + (y * output_w),
1090  src + (iy * width + ix),
1091  sizeof(float) * output_w);
1092  } else {
1093  for (auto x = 0; x < output_w; x++) {
1094  memcpy(
1095  dst + (y * output_w + x),
1096  src + (iy * width + ix + x * stride_w),
1097  sizeof(float));
1098  }
1099  }
1100  }
1101  }
1102  return;
1103  }
1104 
1105  // Fast path for equal padding
1106  if (pad_l == pad_r && pad_t == pad_b) {
1107  // From Intel, https://github.com/BVLC/caffe/pull/3536
1108  const int pad_h = pad_t;
1109  const int pad_w = pad_l;
1110  const int channel_size = height * width;
1111  for (int channel = channels; channel--; data_im += channel_size) {
1112  for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) {
1113  for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) {
1114  int input_row = -pad_h + kernel_row * dilation_h;
1115  for (int output_rows = output_h; output_rows; output_rows--) {
1116  if (!is_a_ge_zero_and_a_lt_b(input_row, height)) {
1117  for (int output_cols = output_w; output_cols; output_cols--) {
1118  *(data_col++) = 0;
1119  }
1120  } else {
1121  int input_col = -pad_w + kernel_col * dilation_w;
1122  for (int output_col = output_w; output_col; output_col--) {
1123  if (is_a_ge_zero_and_a_lt_b(input_col, width)) {
1124  *(data_col++) = data_im[input_row * width + input_col];
1125  } else {
1126  *(data_col++) = 0;
1127  }
1128  input_col += stride_w;
1129  }
1130  }
1131  input_row += stride_h;
1132  }
1133  }
1134  }
1135  }
1136  return;
1137  }
1138 
1139  // Baseline
1140  const int dkernel_h = dilation_h * (kernel_h - 1) + 1;
1141  const int dkernel_w = dilation_w * (kernel_w - 1) + 1;
1142 
1143  int height_col = (height + pad_t + pad_b - dkernel_h) / stride_h + 1;
1144  int width_col = (width + pad_l + pad_r - dkernel_w) / stride_w + 1;
1145 
1146  int channels_col = channels * kernel_h * kernel_w;
1147  for (int c = 0; c < channels_col; ++c) {
1148  int w_offset = c % kernel_w;
1149  int h_offset = (c / kernel_w) % kernel_h;
1150  int c_im = c / kernel_h / kernel_w;
1151  for (int h = 0; h < height_col; ++h) {
1152  for (int w = 0; w < width_col; ++w) {
1153  int h_pad = h * stride_h - pad_t + h_offset * dilation_h;
1154  int w_pad = w * stride_w - pad_l + w_offset * dilation_w;
1155  if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
1156  data_col[(c * height_col + h) * width_col + w] =
1157  data_im[(c_im * height + h_pad) * width + w_pad];
1158  else
1159  data_col[(c * height_col + h) * width_col + w] = 0;
1160  }
1161  }
1162  }
1163 }
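//
// Illustrative shape example (not part of this file): for a single-channel
// 3 x 3 image, a 2 x 2 kernel, stride 1, no padding and no dilation,
// output_h = output_w = 2, so data_col is a
// (channels * kernel_h * kernel_w) x (output_h * output_w) = 4 x 4 matrix in
// which each column holds the image patch seen by one output position; a
// convolution then reduces to a Gemm of the filter matrix with data_col.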
1164 
1165 template <>
1166 void Im2col<float, CPUContext, StorageOrder::NHWC>(
1167  const float* data_im,
1168  const int channels,
1169  const int height,
1170  const int width,
1171  const int kernel_h,
1172  const int kernel_w,
1173  const int dilation_h,
1174  const int dilation_w,
1175  const int pad_t,
1176  const int pad_l,
1177  const int pad_b,
1178  const int pad_r,
1179  const int stride_h,
1180  const int stride_w,
1181  float* data_col,
1182  CPUContext* /*context*/) {
1183  const int dkernel_h = dilation_h * (kernel_h - 1) + 1;
1184  const int dkernel_w = dilation_w * (kernel_w - 1) + 1;
1185 
1186  int height_col = (height + pad_t + pad_b - dkernel_h) / stride_h + 1;
1187  int width_col = (width + pad_l + pad_r - dkernel_w) / stride_w + 1;
1188 
1189  int h_pad = -pad_t;
1190  for (int h = 0; h < height_col; ++h) {
1191  int w_pad = -pad_l;
1192  for (int w = 0; w < width_col; ++w) {
1193  for (int ih = h_pad; ih < h_pad + dkernel_h; ih += dilation_h) {
1194  for (int iw = w_pad; iw < w_pad + dkernel_w; iw += dilation_w) {
1195  if (ih >= 0 && ih < height && iw >= 0 && iw < width) {
1196  memcpy(data_col, data_im + (ih * width + iw) * channels,
1197  sizeof(float) * channels);
1198  } else {
1199  // Out-of-image locations are simply padded with zeros.
1200  memset(data_col, 0, sizeof(float) * channels);
1201  }
1202  data_col += channels;
1203  }
1204  }
1205  w_pad += stride_w;
1206  }
1207  h_pad += stride_h;
1208  }
1209 }
1210 
1211 template <>
1212 void Col2im<float, CPUContext, StorageOrder::NCHW>(
1213  const float* data_col,
1214  const int channels,
1215  const int height,
1216  const int width,
1217  const int kernel_h,
1218  const int kernel_w,
1219  const int dilation_h,
1220  const int dilation_w,
1221  const int pad_t,
1222  const int pad_l,
1223  const int pad_b,
1224  const int pad_r,
1225  const int stride_h,
1226  const int stride_w,
1227  float* data_im,
1228  CPUContext* context) {
1229  const int output_h =
1230  (height + pad_b + pad_t - (dilation_h * (kernel_h - 1) + 1)) / stride_h +
1231  1;
1232  const int output_w =
1233  (width + pad_l + pad_r - (dilation_w * (kernel_w - 1) + 1)) / stride_w +
1234  1;
1235 
1236  Set<float, CPUContext>(height * width * channels, 0, data_im, context);
1237 
1238  // Fast path for zero padding and no dilation
1239  // From Torch, modified THNN_(unfolded_acc)
1240  if (dilation_h == 1 && dilation_w == 1 && pad_l == 0 && pad_r == 0 &&
1241  pad_t == 0 && pad_b == 0) {
1242  for (auto k = 0; k < channels * kernel_h * kernel_w; k++) {
1243  const auto nip = k / (kernel_h * kernel_w);
1244  const auto rest = k % (kernel_h * kernel_w);
1245  const auto kh = rest / kernel_w;
1246  const auto kw = rest % kernel_w;
1247  const auto* dst = data_col +
1248  nip * (kernel_h * kernel_w * output_h * output_w) +
1249  kh * (kernel_w * output_h * output_w) + kw * (output_h * output_w);
1250  auto* src = data_im + nip * (height * width);
1251  for (auto y = 0; y < output_h; y++) {
1252  const auto iy = y * stride_h + kh;
1253  const auto ix = kw;
1254  if (stride_w == 1) {
1255  auto offsrc = src + (iy * width + ix);
1256  const auto offdst = dst + (y * output_w);
1257  for (auto i = 0; i < output_w; ++i) {
1258  offsrc[i] += offdst[i];
1259  }
1260  } else {
1261  for (auto x = 0; x < output_w; x++) {
1262  auto offsrc = src + (iy * width + ix + x * stride_w);
1263  const auto offdst = dst + (y * output_w + x);
1264  *offsrc += *offdst;
1265  }
1266  }
1267  }
1268  }
1269  return;
1270  }
1271 
1272  // Fast path for equal padding
1273  if (pad_l == pad_r && pad_t == pad_b) {
1274  // From Intel, https://github.com/BVLC/caffe/pull/3536
1275  const int pad_h = pad_t;
1276  const int pad_w = pad_l;
1277  const int channel_size = height * width;
1278  for (int channel = channels; channel--; data_im += channel_size) {
1279  for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) {
1280  for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) {
1281  int input_row = -pad_h + kernel_row * dilation_h;
1282  for (int output_rows = output_h; output_rows; output_rows--) {
1283  if (!is_a_ge_zero_and_a_lt_b(input_row, height)) {
1284  data_col += output_w;
1285  } else {
1286  int input_col = -pad_w + kernel_col * dilation_w;
1287  for (int output_col = output_w; output_col; output_col--) {
1288  if (is_a_ge_zero_and_a_lt_b(input_col, width)) {
1289  data_im[input_row * width + input_col] += *data_col;
1290  }
1291  data_col++;
1292  input_col += stride_w;
1293  }
1294  }
1295  input_row += stride_h;
1296  }
1297  }
1298  }
1299  }
1300  return;
1301  }
1302 
1303  // Fallback
1304  const int dkernel_h = dilation_h * (kernel_h - 1) + 1;
1305  const int dkernel_w = dilation_w * (kernel_w - 1) + 1;
1306 
1307  int height_col = (height + pad_t + pad_b - dkernel_h) / stride_h + 1;
1308  int width_col = (width + pad_l + pad_r - dkernel_w) / stride_w + 1;
1309  int channels_col = channels * kernel_h * kernel_w;
1310  for (int c = 0; c < channels_col; ++c) {
1311  int w_offset = c % kernel_w;
1312  int h_offset = (c / kernel_w) % kernel_h;
1313  int c_im = c / kernel_h / kernel_w;
1314  for (int h = 0; h < height_col; ++h) {
1315  for (int w = 0; w < width_col; ++w) {
1316  int h_pad = h * stride_h - pad_t + h_offset * dilation_h;
1317  int w_pad = w * stride_w - pad_l + w_offset * dilation_w;
1318  if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width) {
1319  data_im[(c_im * height + h_pad) * width + w_pad] +=
1320  data_col[(c * height_col + h) * width_col + w];
1321  }
1322  }
1323  }
1324  }
1325 }
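//
// Illustrative note (not part of this file): Col2im is the accumulating
// inverse of Im2col. Every entry of data_col is added back to the image
// location it was copied from, so a pixel covered by k overlapping patches
// receives the sum of its k column entries; e.g. with a 3 x 3 image, a 2 x 2
// kernel, stride 1 and no padding, the centre pixel is covered by all four
// patches and accumulates four values. This is the gradient of Im2col with
// respect to the image, which is why it appears in the backward pass of
// convolution.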
1326 
1327 template <>
1328 void Col2im<float, CPUContext, StorageOrder::NHWC>(
1329  const float* data_col,
1330  const int channels,
1331  const int height,
1332  const int width,
1333  const int kernel_h,
1334  const int kernel_w,
1335  const int dilation_h,
1336  const int dilation_w,
1337  const int pad_t,
1338  const int pad_l,
1339  const int pad_b,
1340  const int pad_r,
1341  const int stride_h,
1342  const int stride_w,
1343  float* data_im,
1344  CPUContext* context) {
1345  const int dkernel_h = dilation_h * (kernel_h - 1) + 1;
1346  const int dkernel_w = dilation_w * (kernel_w - 1) + 1;
1347 
1348  Set<float, CPUContext>(height * width * channels, 0, data_im, context);
1349  int height_col = (height + pad_t + pad_b - dkernel_h) / stride_h + 1;
1350  int width_col = (width + pad_l + pad_r - dkernel_w) / stride_w + 1;
1351  int h_pad = -pad_t;
1352  for (int h = 0; h < height_col; ++h) {
1353  int w_pad = -pad_l;
1354  for (int w = 0; w < width_col; ++w) {
1355  for (int ih = h_pad; ih < h_pad + dkernel_h; ih += dilation_h) {
1356  for (int iw = w_pad; iw < w_pad + dkernel_w; iw += dilation_w) {
1357  if (ih >= 0 && ih < height && iw >= 0 && iw < width) {
1358  auto* data_im_patch = data_im + (ih * width + iw) * channels;
1359  Add<float, CPUContext>(
1360  channels, data_im_patch, data_col, data_im_patch, context);
1361  }
1362  data_col += channels;
1363  }
1364  }
1365  w_pad += stride_w;
1366  }
1367  h_pad += stride_h;
1368  }
1369 }
1370 
1371 template <>
1372 void BiasCHW<float, CPUContext>(
1373  const float* bias,
1374  const int bias_channels,
1375  const int image_size,
1376  float* image,
1377  CPUContext* /*context*/) {
1378  // Sum the per-channel bias into every image plane
1379  for (int c = 0; c < bias_channels; ++c) {
1380  float b = bias[c];
1381 
1382 #ifdef __ARM_NEON__
1383  float32x4_t vBias = vdupq_n_f32(b);
1384 
1385  // We give alignment hints for additional speed, so handle the
1386  // non-vectorizable prologue separately
1387  constexpr int kVecSizeInFloat = sizeof(float32x4_t) / sizeof(float);
1388 
1389  // FIXME: if input < kVecSizeInFloat, can't vectorize at all
1390 
1391  int prologue =
1392  kVecSizeInFloat -
1393  // remainder in floats
1394  (((uintptr_t) image) % (sizeof(float32x4_t))) / sizeof(float);
1395 
1396  int i = 0;
1397  // Prologue loop
1398  for (; i < prologue; ++i) {
1399  image[i] += b;
1400  }
1401 
1402  // The loop is manually unrolled by 8
1403  constexpr int kUnroll = 8;
1404  constexpr int kFloatsPerLoop = kUnroll * kVecSizeInFloat;
1405 
1406  int remainder = image_size - prologue;
1407  int vectorizable = prologue + (remainder / kFloatsPerLoop) * kFloatsPerLoop;
1408 
1409  // Vectorizable body
1410  for (; i < vectorizable; i += kFloatsPerLoop) {
1411  // Manually unrolled
1412  float32x4_t v0 = vld1q_f32_aligned(image + i + 0);
1413  float32x4_t v1 = vld1q_f32_aligned(image + i + 4);
1414  float32x4_t v2 = vld1q_f32_aligned(image + i + 8);
1415  float32x4_t v3 = vld1q_f32_aligned(image + i + 12);
1416  float32x4_t v4 = vld1q_f32_aligned(image + i + 16);
1417  float32x4_t v5 = vld1q_f32_aligned(image + i + 20);
1418  float32x4_t v6 = vld1q_f32_aligned(image + i + 24);
1419  float32x4_t v7 = vld1q_f32_aligned(image + i + 28);
1420 
1421  v0 = vaddq_f32(v0, vBias);
1422  v1 = vaddq_f32(v1, vBias);
1423  v2 = vaddq_f32(v2, vBias);
1424  v3 = vaddq_f32(v3, vBias);
1425  v4 = vaddq_f32(v4, vBias);
1426  v5 = vaddq_f32(v5, vBias);
1427  v6 = vaddq_f32(v6, vBias);
1428  v7 = vaddq_f32(v7, vBias);
1429 
1430  vst1q_f32_aligned(image + i + 0, v0);
1431  vst1q_f32_aligned(image + i + 4, v1);
1432  vst1q_f32_aligned(image + i + 8, v2);
1433  vst1q_f32_aligned(image + i + 12, v3);
1434  vst1q_f32_aligned(image + i + 16, v4);
1435  vst1q_f32_aligned(image + i + 20, v5);
1436  vst1q_f32_aligned(image + i + 24, v6);
1437  vst1q_f32_aligned(image + i + 28, v7);
1438  }
1439 
1440  // Non-vectorizable epilogue
1441  for (; i < image_size; ++i) {
1442  image[i] += b;
1443  }
1444 #else
1445  // Non-NEON CPU implementation
1446  for (int i = 0; i < image_size; ++i) {
1447  image[i] += b;
1448  }
1449 #endif // __ARM_NEON__
1450 
1451  image += image_size;
1452  }
1453 }
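//
// Illustrative example (not part of this file): with bias_channels = 2,
// image_size = 4, bias = {1, 10} and image = {0, 0, 0, 0, 0, 0, 0, 0}
// (CHW layout, flattened), BiasCHW leaves image = {1, 1, 1, 1, 10, 10, 10, 10}:
// each channel's spatial plane is shifted by its own scalar bias.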
1454 
1455 template <>
1456 void CopyMatrix<CPUContext>(
1457  const size_t itemsize,
1458  const int M,
1459  const int N,
1460  const void* A,
1461  const int lda,
1462  void* B,
1463  const int ldb,
1464  CPUContext* /*context*/,
1465  TypeMeta::TypedCopy copy) {
1466  if (A == nullptr || B == nullptr) {
1467  return;
1468  }
1469  if (lda == N && ldb == N) {
1470  // can coalesce into a single memcpy of size M * N
1471  if (copy) {
1472  copy(static_cast<const char*>(A), static_cast<char*>(B), N * M);
1473  } else {
1474  memcpy(
1475  static_cast<char*>(B), static_cast<const char*>(A), itemsize * N * M);
1476  }
1477  return;
1478  }
1479 
1480  for (int i = 0; i < M; ++i) {
1481  if (copy) {
1482  copy(
1483  static_cast<const char*>(A) + lda * i * itemsize,
1484  static_cast<char*>(B) + ldb * i * itemsize,
1485  N);
1486  } else {
1487  memcpy(
1488  static_cast<char*>(B) + ldb * i * itemsize,
1489  static_cast<const char*>(A) + lda * i * itemsize,
1490  itemsize * N);
1491  }
1492  }
1493 }
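//
// Illustrative usage sketch (not part of this file; assumes the trailing
// TypedCopy argument takes its default of nullptr): copy a 2 x 3 float block
// out of a row-major 4 x 5 source matrix into a tightly packed destination.
//
//   float src[4 * 5];  // leading dimension 5
//   float dst[2 * 3];  // leading dimension 3
//   CPUContext ctx;
//   math::CopyMatrix<CPUContext>(sizeof(float), 2, 3, src, 5, dst, 3, &ctx);
//   // Copies two rows of three floats, stepping 5 floats in src and 3 in dst.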
1494 
1495 #define CAFFE2_SPECIALIZED_COPYVECTOR(T) \
1496  template <> \
1497  void CopyVector<T, CPUContext>( \
1498  const int N, const T* src, T* dst, CPUContext* /*context*/) { \
1499  if (src != dst && N > 0) { \
1500  memcpy(dst, src, sizeof(T) * N); \
1501  } \
1502  }
1503 CAFFE2_SPECIALIZED_COPYVECTOR(float)
1504 #undef CAFFE2_SPECIALIZED_COPYVECTOR
1505 
1506 namespace {
1507 
1508 #ifdef CAFFE2_USE_HPTT
1509 
1510 bool TryTransposeWithHPTT(
1511  const int num_axes,
1512  const int* dims,
1513  const int* axes,
1514  const float* X,
1515  float* Y) {
1516  std::vector<int> axes_cm(num_axes);
1517  std::vector<int> dims_cm(num_axes);
1518 
1519  // Convert row-major index to column-major.
1520  const auto cm_fn = [num_axes](const int i) { return num_axes - i - 1; };
1521  for (int i = 0; i < num_axes; ++i) {
1522  axes_cm[i] = cm_fn(axes[cm_fn(i)]);
1523  dims_cm[i] = dims[cm_fn(i)];
1524  }
1525  auto plan = hptt::create_plan(
1526  axes_cm.data(),
1527  num_axes,
1528  1.0,
1529  X,
1530  dims_cm.data(),
1531  nullptr,
1532  0.0,
1533  Y,
1534  nullptr,
1535  hptt::ESTIMATE,
1536  1);
1537  if (plan == nullptr) {
1538  return false;
1539  }
1540  plan->execute();
1541  return true;
1542 }
1543 
1544 #endif // CAFFE2_USE_HPTT
1545 
1546 std::vector<int>
1547 ComputeXStrides(const int num_axes, const int* dims, const int* axes) {
1548  std::vector<int> x_strides(num_axes);
1549  std::vector<int> buff(num_axes);
1550  int cur_stride = 1;
1551  for (int i = num_axes - 1; i >= 0; --i) {
1552  buff[i] = cur_stride;
1553  cur_stride *= dims[i];
1554  }
1555  for (int i = 0; i < num_axes; ++i) {
1556  x_strides[i] = buff[axes[i]];
1557  }
1558  return x_strides;
1559 }
1560 
1561 void IncreaseIndex(const int* dims, std::vector<int>* index) {
1562  for (int i = index->size() - 1; i >= 0; --i) {
1563  ++index->at(i);
1564  if (index->at(i) >= dims[i]) {
1565  index->at(i) -= dims[i];
1566  } else {
1567  break;
1568  }
1569  }
1570 }
1571 
1572 template <typename T>
1573 void TransposeCPU(
1574  const int num_axes,
1575  const int* x_dims,
1576  const int* y_dims,
1577  const int* axes,
1578  const int data_size,
1579  const T* X,
1580  T* Y) {
1581  // Measure amount of contiguous data we can copy at once
1582  int block_size = 1;
1583  int num_shared_idxs = 0;
1584  for (int i = num_axes - 1; i >= 0 && axes[i] == i; --i) {
1585  block_size *= y_dims[i];
1586  ++num_shared_idxs;
1587  }
1588 
1589  if (num_axes < 2 || num_shared_idxs == num_axes) {
1590  memcpy(Y, X, data_size * sizeof(T));
1591  return;
1592  }
1593 
1594  const int itr_axes = num_axes - num_shared_idxs;
1595  const std::vector<int> x_strides = ComputeXStrides(itr_axes, x_dims, axes);
1596  std::vector<int> index_digits(itr_axes, 0);
1597  const int num_blocks = data_size / block_size;
1598  for (int y_index = 0; y_index < num_blocks; ++y_index) {
1599  const int x_index = std::inner_product(
1600  x_strides.cbegin(), x_strides.cend(), index_digits.cbegin(), 0);
1601  if (block_size == 1) {
1602  Y[y_index] = X[x_index];
1603  } else {
1604  memcpy(
1605  Y + block_size * y_index,
1606  X + block_size * x_index,
1607  block_size * sizeof(T));
1608  }
1609  IncreaseIndex(y_dims, &index_digits);
1610  }
1611 }
1612 
1613 } // namespace
1614 
1615 template <>
1616 void Transpose<float, CPUContext>(
1617  const int num_axes,
1618  const int* x_dims,
1619  const int* y_dims,
1620  const int* axes,
1621  const int data_size,
1622  const float* X,
1623  float* Y,
1624  CPUContext* /* context */) {
1625 #ifdef CAFFE2_USE_HPTT
1626  if (TryTransposeWithHPTT(num_axes, x_dims, axes, X, Y)) {
1627  return;
1628  }
1629 #endif // CAFFE2_USE_HPTT
1630  TransposeCPU(num_axes, x_dims, y_dims, axes, data_size, X, Y);
1631 }
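//
// Illustrative usage sketch (not part of this file): permute a 2 x 3 row-major
// array into its 3 x 2 transpose. Here y_dims[i] = x_dims[axes[i]].
//
//   const int x_dims[2] = {2, 3};
//   const int y_dims[2] = {3, 2};
//   const int axes[2] = {1, 0};
//   float X[6] = {1, 2, 3, 4, 5, 6};
//   float Y[6];
//   CPUContext ctx;
//   math::Transpose<float, CPUContext>(2, x_dims, y_dims, axes, 6, X, Y, &ctx);
//   // Y now holds {1, 4, 2, 5, 3, 6}.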
1632 
1633 #define CAFFE2_SPECIALIZED_TRANSPOSE(T) \
1634  template <> \
1635  void Transpose<T, CPUContext>( \
1636  const int num_axes, \
1637  const int* x_dims, \
1638  const int* y_dims, \
1639  const int* axes, \
1640  const int data_size, \
1641  const T* X, \
1642  T* Y, \
1643  CPUContext* /* context */) { \
1644  TransposeCPU(num_axes, x_dims, y_dims, axes, data_size, X, Y); \
1645  }
1646 CAFFE2_SPECIALIZED_TRANSPOSE(double)
1647 CAFFE2_SPECIALIZED_TRANSPOSE(int)
1648 CAFFE2_SPECIALIZED_TRANSPOSE(long)
1649 #undef CAFFE2_SPECIALIZED_TRANSPOSE
1650 
1651 } // namespace math
1652 } // namespace caffe2