// NOTE(review): garbled extraction — original file line numbers are fused into
// the text and interior lines (the switch/case scaffolding on TransA/TransB,
// closing braces) are missing. Comments annotate the visible fragments only.
20 #include <unordered_set> 23 #include "caffe2/utils/math.h" 24 #include "caffe2/utils/cpu_neon.h" 25 #include "caffe2/core/context.h" 27 #include "Eigen/Dense" 31 #endif // CAFFE2_USE_MKL 33 #ifdef CAFFE2_USE_HPTT 35 #endif // CAFFE2_USE_HPTT 50 #ifdef CAFFE2_USE_EIGEN_FOR_BLAS 68 void Gemm<float, CPUContext>(
// Gemm (Eigen path): accumulates alpha * op(A) * op(B) into C for row-major
// M x K / K x N operands. The row-major M x N output C is viewed as a
// column-major N x M Eigen map, so each case multiplies the B-view by the
// A-view (i.e. the transposed product) into that map. Scaling of C by beta
// is not visible in this fragment — presumably applied before these cases.
69 const CBLAS_TRANSPOSE TransA,
70 const CBLAS_TRANSPOSE TransB,
80 TensorProto::DataType math_type) {
81 auto C_mat = EigenMatrixMap<float>(C, N, M);
// Case TransA == CblasNoTrans, TransB == CblasNoTrans.
91 C_mat.noalias() += alpha * (
92 ConstEigenMatrixMap<float>(B, N, K) *
93 ConstEigenMatrixMap<float>(A, K, M));
// Case TransA == CblasNoTrans, TransB == CblasTrans: row-major N x K B is
// mapped as K x N column-major, then transposed.
96 C_mat.noalias() += alpha * (
97 ConstEigenMatrixMap<float>(B, K, N).transpose() *
98 ConstEigenMatrixMap<float>(A, K, M));
// default for inner switch on TransB.
101 LOG(FATAL) <<
"Unexpected CBLAS_TRANSPOSE for TransB";
// Cases with TransA == CblasTrans: row-major K x M A mapped as M x K
// column-major, then transposed.
107 C_mat.noalias() += alpha * (
108 ConstEigenMatrixMap<float>(B, N, K) *
109 ConstEigenMatrixMap<float>(A, M, K).transpose());
112 C_mat.noalias() += alpha * (
113 ConstEigenMatrixMap<float>(B, K, N).transpose() *
114 ConstEigenMatrixMap<float>(A, M, K).transpose());
117 LOG(FATAL) <<
"Unexpected CBLAS_TRANSPOSE for TransB";
// default for outer switch on TransA.
121 LOG(FATAL) <<
"Unexpected CBLAS_TRANSPOSE for TransA";
// GemmEx (Eigen path): same contract as Gemm but with caller-supplied leading
// dimensions lda/ldb/ldc, expressed via Eigen::OuterStride maps. The four
// alpha * (...) fragments correspond to the four TransA/TransB combinations;
// the enclosing switch/case lines are missing from this extraction.
126 void GemmEx<float, CPUContext>(
127 const CBLAS_TRANSPOSE TransA,
128 const CBLAS_TRANSPOSE TransB,
// Strided map aliases: a dynamic outer stride lets a sub-matrix of a larger
// row-major buffer be viewed as a column-major Eigen matrix.
141 using OuterStride = Eigen::OuterStride<Eigen::Dynamic>;
142 using StridedMap = Eigen::Map<Eigen::MatrixXf, 0, OuterStride>;
143 using ConstStridedMap = Eigen::Map<const Eigen::MatrixXf, 0, OuterStride>;
144 auto C_mat = StridedMap(C, N, M, OuterStride(ldc));
// NoTrans / NoTrans.
155 alpha * (ConstStridedMap(B, N, K, OuterStride(ldb)) *
156 ConstStridedMap(A, K, M, OuterStride(lda)));
// NoTrans A, Trans B.
160 alpha * (ConstStridedMap(B, K, N, OuterStride(ldb)).transpose() *
161 ConstStridedMap(A, K, M, OuterStride(lda)));
164 LOG(FATAL) <<
"Unexpected CBLAS_TRANSPOSE for TransB";
// Trans A, NoTrans B.
171 alpha * (ConstStridedMap(B, N, K, OuterStride(ldb)) *
172 ConstStridedMap(A, M, K, OuterStride(lda)).transpose());
// Trans A, Trans B.
176 alpha * (ConstStridedMap(B, K, N, OuterStride(ldb)).transpose() *
177 ConstStridedMap(A, M, K, OuterStride(lda)).transpose());
180 LOG(FATAL) <<
"Unexpected CBLAS_TRANSPOSE for TransB";
184 LOG(FATAL) <<
"Unexpected CBLAS_TRANSPOSE for TransA";
// Gemv (Eigen path): y += alpha * op(A) * x for a row-major M x N matrix A.
// The output length depends on TransA (M if NoTrans, N if Trans); beta
// scaling of y is not visible in this fragment.
189 void Gemv<float, CPUContext>(
190 const CBLAS_TRANSPOSE TransA,
199 TensorProto::DataType math_type) {
200 EigenVectorMap<float> y_vec(y, TransA == CblasNoTrans ? M : N);
// NoTrans: row-major A viewed as column-major N x M, so transpose gives
// the M x N product with an N-vector.
210 y_vec.noalias() += alpha * (
211 ConstEigenMatrixMap<float>(A, N, M).transpose() *
212 ConstEigenVectorMap<float>(x, N));
// Trans: the N x M column-major view times an M-vector.
216 y_vec.noalias() += alpha * (
217 ConstEigenMatrixMap<float>(A, N, M) *
218 ConstEigenVectorMap<float>(x, M));
222 LOG(FATAL) <<
"Gemv float found an unexpected CBLAS_TRANSPOSE input.";
// Eigen-backed BLAS-1 specializations (float only): Scale (y = alpha * x,
// with alpha by value or by pointer), Dot, Axpy (y += alpha * x, both alpha
// forms), and Axpby (y = alpha * x + beta * y).
226 #define CAFFE2_SPECIALIZED_SCALE(T) \ 228 void Scale<T, CPUContext>( \ 229 const int n, const float alpha, const T* x, T* y, CPUContext* context) { \ 230 EigenVectorMap<T>(y, n) = ConstEigenVectorMap<T>(x, n) * alpha; \ 233 void Scale<T, CPUContext>( \ 235 const float* alpha, \ 238 CPUContext* context) { \ 239 EigenVectorMap<T>(y, n) = ConstEigenVectorMap<T>(x, n) * (*alpha); \ 241 CAFFE2_SPECIALIZED_SCALE(
float)
242 #undef CAFFE2_SPECIALIZED_SCALE 244 #define CAFFE2_SPECIALIZED_DOT(T) \ 246 void Dot<T, CPUContext>( \ 247 const int N, const T* a, const T* b, T* y, \ 248 CPUContext* context) { \ 249 *y = ConstEigenVectorMap<T>(a, N).dot(ConstEigenVectorMap<T>(b, N)); \ 251 CAFFE2_SPECIALIZED_DOT(
float)
252 #undef CAFFE2_SPECIALIZED_DOT 254 #define CAFFE2_SPECIALIZED_AXPY(T) \ 256 void Axpy<T, CPUContext>( \ 257 const int N, const T alpha, const T* x, T* Y, CPUContext* context) { \ 258 EigenVectorMap<T>(Y, N) += ConstEigenVectorMap<T>(x, N) * alpha; \ 261 void Axpy<T, CPUContext>( \ 262 const int N, const T* alpha, const T* x, T* Y, CPUContext* context) { \ 263 EigenVectorMap<T>(Y, N) += ConstEigenVectorMap<T>(x, N) * (*alpha); \ 265 CAFFE2_SPECIALIZED_AXPY(
float)
266 #undef CAFFE2_SPECIALIZED_AXPY 268 #define CAFFE2_SPECIALIZED_AXPBY(T) \ 270 void Axpby<T, CPUContext>(const int N, const T alpha, const T* x, \ 271 const T beta, T* y, CPUContext* context) { \ 272 EigenVectorMap<T> y_vec(y, N); \ 273 y_vec = y_vec * beta + ConstEigenVectorMap<T>(x, N) * alpha; \ 275 CAFFE2_SPECIALIZED_AXPBY(
float)
// CBLAS branch (when not using Eigen for BLAS): the same entry points are
// forwarded to cblas_* calls. Leading dimensions are derived from the
// transpose flags for row-major storage.
276 #undef CAFFE2_SPECIALIZED_AXPBY 278 #else // CAFFE2_USE_EIGEN_FOR_BLAS 281 void Gemm<float, CPUContext>(
282 const CBLAS_TRANSPOSE TransA,
283 const CBLAS_TRANSPOSE TransB,
293 TensorProto::DataType ) {
// Row-major lda/ldb: width of the stored matrix, not of op(.).
294 int lda = (TransA == CblasNoTrans) ? K : M;
295 int ldb = (TransB == CblasNoTrans) ? N : K;
296 cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, ldb,
// GemmEx: caller supplies lda/ldb/ldc explicitly (parameters elided in this
// extraction).
301 void GemmEx<float, CPUContext>(
302 const CBLAS_TRANSPOSE TransA,
303 const CBLAS_TRANSPOSE TransB,
316 cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, ldb,
// Gemv: y = alpha * op(A) * x + beta * y, unit strides.
321 void Gemv<float, CPUContext>(
322 const CBLAS_TRANSPOSE TransA,
331 TensorProto::DataType ) {
332 cblas_sgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1);
// CBLAS-backed BLAS-1 specializations. Scale is copy-then-scal so x is left
// untouched; Axpby uses MKL's fused cblas_saxpby when available, otherwise
// scal + axpy.
335 #define CAFFE2_SPECIALIZED_SCALE(T, prefix) \ 337 void Scale<T, CPUContext>( \ 338 const int n, const float alpha, const T* x, T* y, CPUContext*) { \ 340 cblas_##prefix##copy(n, x, 1, y, 1); \ 341 cblas_##prefix##scal(n, static_cast<float>(alpha), y, 1); \ 344 void Scale<T, CPUContext>( \ 345 const int n, const float* alpha, const T* x, T* y, CPUContext*) { \ 347 cblas_##prefix##copy(n, x, 1, y, 1); \ 348 cblas_##prefix##scal(n, static_cast<float>(*alpha), y, 1); \ 350 CAFFE2_SPECIALIZED_SCALE(
float, s)
351 #undef CAFFE2_SPECIALIZED_SCALE 353 #define CAFFE2_SPECIALIZED_DOT(T, prefix) \ 355 void Dot<T, CPUContext>( \ 356 const int N, const T* a, const T* b, T* y, CPUContext*) { \ 357 *y = cblas_##prefix##dot(N, a, 1, b, 1); \ 359 CAFFE2_SPECIALIZED_DOT(
float, s)
360 #undef CAFFE2_SPECIALIZED_DOT 362 #define CAFFE2_SPECIALIZED_AXPY(T, prefix) \ 364 void Axpy<T, CPUContext>( \ 365 const int N, const T alpha, const T* x, T* y, CPUContext*) { \ 366 cblas_##prefix##axpy(N, alpha, x, 1, y, 1); \ 369 void Axpy<T, CPUContext>( \ 370 const int N, const T* alpha, const T* x, T* y, CPUContext*) { \ 371 cblas_##prefix##axpy(N, *alpha, x, 1, y, 1); \ 373 CAFFE2_SPECIALIZED_AXPY(
float, s)
374 #undef CAFFE2_SPECIALIZED_AXPY 378 #ifdef CAFFE2_USE_MKL 379 #define CAFFE2_SPECIALIZED_AXPBY(T, prefix) \ 381 void Axpby<T, CPUContext>( \ 388 cblas_##prefix##axpby(N, alpha, x, 1, beta, y, 1); \ 390 #else // CAFFE2_USE_MKL 391 #define CAFFE2_SPECIALIZED_AXPBY(T, prefix) \ 393 void Axpby<T, CPUContext>( \ 400 cblas_##prefix##scal(N, beta, y, 1); \ 401 cblas_##prefix##axpy(N, alpha, x, 1, y, 1); \ 403 #endif // CAFFE2_USE_MKL 404 CAFFE2_SPECIALIZED_AXPBY(
float, s)
// GemmBatched: batch_size independent GEMMs over contiguously strided
// A/B/C slabs. With MKL it builds pointer arrays (presumably for
// cblas_sgemm_batch — the call itself is elided here); otherwise it loops
// over per-batch math::Gemm calls.
405 #undef CAFFE2_SPECIALIZED_AXPBY 407 #endif // CAFFE2_USE_EIGEN_FOR_BLAS 410 void GemmBatched<float, CPUContext>(
411 const CBLAS_TRANSPOSE TransA,
412 const CBLAS_TRANSPOSE TransB,
413 const int batch_size,
424 TensorProto::DataType ) {
// Per-batch offsets into the packed operand buffers.
425 const int a_stride = M * K;
426 const int b_stride = K * N;
427 const int c_stride = M * N;
429 #ifdef CAFFE2_USE_MKL 432 const int lda = (TransA == CblasNoTrans) ? K : M;
433 const int ldb = (TransB == CblasNoTrans) ? N : K;
434 std::vector<const float*> a_array(batch_size,
nullptr);
435 std::vector<const float*> b_array(batch_size,
nullptr);
436 std::vector<float*> c_array(batch_size,
nullptr);
437 for (
int i = 0; i < batch_size; ++i) {
438 a_array[i] = A + a_stride * i;
439 b_array[i] = B + b_stride * i;
440 c_array[i] = C + c_stride * i;
// Non-MKL fallback: one Gemm per batch element.
459 #else // CAFFE2_USE_MKL 461 for (
int i = 0; i < batch_size; ++i) {
462 math::Gemm<float, CPUContext>(
// Elementwise math delegation. With MKL, each unary/binary function forwards
// to the corresponding VML routine (vsExp/vdLn/...), with Exp passing the
// VML_HA accuracy / FTZ-DAZ-off / ignore-errors mode flags. Without MKL,
// the same names are implemented with Eigen array expressions.
486 #ifdef CAFFE2_USE_MKL 488 #define DELEGATE_SIMPLE_UNARY_FUNCTION(T, Funcname, OriginalFunc, ...) \ 490 void Funcname<T, CPUContext>(const int N, const T* x, T* y, CPUContext*) { \ 491 OriginalFunc(N, x, y, ##__VA_ARGS__); \ 493 DELEGATE_SIMPLE_UNARY_FUNCTION(
497 VML_HA | VML_FTZDAZ_OFF | VML_ERRMODE_IGNORE)
498 DELEGATE_SIMPLE_UNARY_FUNCTION(
502 VML_HA | VML_FTZDAZ_OFF | VML_ERRMODE_IGNORE)
503 DELEGATE_SIMPLE_UNARY_FUNCTION(
float, Log, vsLn)
504 DELEGATE_SIMPLE_UNARY_FUNCTION(
double, Log, vdLn)
505 DELEGATE_SIMPLE_UNARY_FUNCTION(
float, Cos, vsCos)
506 DELEGATE_SIMPLE_UNARY_FUNCTION(
double, Cos, vdCos)
507 DELEGATE_SIMPLE_UNARY_FUNCTION(
float, Sin, vsSin)
508 DELEGATE_SIMPLE_UNARY_FUNCTION(
double, Sin, vdSin)
509 DELEGATE_SIMPLE_UNARY_FUNCTION(
float, Abs, vsAbs)
510 DELEGATE_SIMPLE_UNARY_FUNCTION(
double, Abs, vdAbs)
511 DELEGATE_SIMPLE_UNARY_FUNCTION(
float, Sqrt, vsSqrt)
512 DELEGATE_SIMPLE_UNARY_FUNCTION(
double, Sqrt, vdSqrt)
513 DELEGATE_SIMPLE_UNARY_FUNCTION(
float, InvSqrt, vsInvSqrt)
514 DELEGATE_SIMPLE_UNARY_FUNCTION(
double, InvSqrt, vdInvSqrt)
515 DELEGATE_SIMPLE_UNARY_FUNCTION(
float, Sqr, vsSqr)
516 DELEGATE_SIMPLE_UNARY_FUNCTION(
double, Sqr, vdSqr)
517 #undef DELEGATE_SIMPLE_UNARY_FUNCTION 519 #define DELEGATE_SINCOS_FUNCTION(T, OriginalFunc) \ 521 void SinCos<T, CPUContext>( \ 522 const int N, const T* a, T* ys, T* yc, CPUContext*) { \ 523 OriginalFunc(N, a, ys, yc); \ 525 DELEGATE_SINCOS_FUNCTION(
float, vsSinCos)
526 DELEGATE_SINCOS_FUNCTION(
double, vdSinCos)
527 #undef DELEGATE_SINCOS_FUNCTION 529 #define DELEGATE_POWX_FUNCTION(T, OriginalFunc) \ 531 void Powx<T, CPUContext>(const int N, const T* a, T b, T* y, CPUContext*) { \ 532 OriginalFunc(N, a, b, y); \ 534 DELEGATE_POWX_FUNCTION(
float, vsPowx)
535 DELEGATE_POWX_FUNCTION(
double, vdPowx)
536 #undef DELEGATE_POWX_FUNCTION 538 #define DELEGATE_SIMPLE_BINARY_FUNCTION(T, Funcname, OriginalFunc) \ 540 void Funcname<T, CPUContext>( \ 541 const int N, const T* a, const T* b, T* y, CPUContext*) { \ 542 OriginalFunc(N, a, b, y); \ 544 DELEGATE_SIMPLE_BINARY_FUNCTION(
float, Add, vsAdd)
545 DELEGATE_SIMPLE_BINARY_FUNCTION(
double, Add, vdAdd)
546 DELEGATE_SIMPLE_BINARY_FUNCTION(
float, Sub, vsSub)
547 DELEGATE_SIMPLE_BINARY_FUNCTION(
double, Sub, vdSub)
548 DELEGATE_SIMPLE_BINARY_FUNCTION(
float, Mul, vsMul)
549 DELEGATE_SIMPLE_BINARY_FUNCTION(
double, Mul, vdMul)
550 DELEGATE_SIMPLE_BINARY_FUNCTION(
float, Div, vsDiv)
551 DELEGATE_SIMPLE_BINARY_FUNCTION(
double, Div, vdDiv)
// Non-MKL fallback: Eigen array-expression implementations (float only for
// the unary set; SinCos also gets a double instantiation).
552 #undef DELEGATE_SIMPLE_BINARY_FUNCTION 554 #else // CAFFE2_USE_MKL 556 #define DELEGATE_SIMPLE_UNARY_FUNCTION(T, Funcname, expr) \ 558 void Funcname<T, CPUContext>(const int N, const T* x, T* y, CPUContext*) { \ 559 EigenVectorMap<T>(y, N) = ConstEigenVectorMap<T>(x, N).array().expr(); \ 561 DELEGATE_SIMPLE_UNARY_FUNCTION(
float, Exp, exp)
562 DELEGATE_SIMPLE_UNARY_FUNCTION(
float, Log, log)
563 DELEGATE_SIMPLE_UNARY_FUNCTION(
float, Cos, cos)
564 DELEGATE_SIMPLE_UNARY_FUNCTION(
float, Sin, sin)
565 DELEGATE_SIMPLE_UNARY_FUNCTION(
float, Abs, abs)
566 DELEGATE_SIMPLE_UNARY_FUNCTION(
float, Sqrt, sqrt)
567 DELEGATE_SIMPLE_UNARY_FUNCTION(
float, InvSqrt, rsqrt)
568 DELEGATE_SIMPLE_UNARY_FUNCTION(
float, Sqr, square)
569 #undef DELEGATE_SIMPLE_UNARY_FUNCTION 571 #define DELEGATE_SINCOS_FUNCTION(T) \ 573 void SinCos<T, CPUContext>( \ 574 const int N, const T* x, T* ys, T* yc, CPUContext*) { \ 575 EigenVectorMap<T>(ys, N) = ConstEigenVectorMap<T>(x, N).array().sin(); \ 576 EigenVectorMap<T>(yc, N) = ConstEigenVectorMap<T>(x, N).array().cos(); \ 578 DELEGATE_SINCOS_FUNCTION(
float)
579 DELEGATE_SINCOS_FUNCTION(
double)
580 #undef DELEGATE_SINCOS_FUNCTION 582 #define DELEGATE_POWX_FUNCTION(T) \ 584 void Powx<T, CPUContext>(const int N, const T* a, T b, T* y, CPUContext*) { \ 585 EigenVectorMap<T>(y, N) = ConstEigenVectorMap<T>(a, N).array().pow(b); \ 587 DELEGATE_POWX_FUNCTION(
float)
// Eigen-based binary ops. Integer types always go through Eigen; float does
// too only when MKL did not already provide it above.
588 #undef DELEGATE_POWX_FUNCTION 590 #endif // CAFFE2_USE_MKL 593 #define EIGEN_SIMPLE_BINARY_FUNCTION(T, Funcname, expr) \ 595 void Funcname<T, CPUContext>( \ 596 const int N, const T* a, const T* b, T* y, \ 598 EigenVectorMap<T>(y, N) = \ 599 ConstEigenVectorMap<T>(a, N).array() expr \ 600 ConstEigenVectorMap<T>(b, N).array(); \ 603 #ifdef CAFFE2_USE_MKL 605 #define DEFINE_SIMPLE_BINARY_FUNCTION(Funcname, expr) \ 606 EIGEN_SIMPLE_BINARY_FUNCTION(int32_t, Funcname, expr) \ 607 EIGEN_SIMPLE_BINARY_FUNCTION(int64_t, Funcname, expr) 611 #define DEFINE_SIMPLE_BINARY_FUNCTION(Funcname, expr) \ 612 EIGEN_SIMPLE_BINARY_FUNCTION(float, Funcname, expr) \ 613 EIGEN_SIMPLE_BINARY_FUNCTION(int32_t, Funcname, expr) \ 614 EIGEN_SIMPLE_BINARY_FUNCTION(int64_t, Funcname, expr) 618 DEFINE_SIMPLE_BINARY_FUNCTION(Add, +)
619 DEFINE_SIMPLE_BINARY_FUNCTION(Sub, -)
620 DEFINE_SIMPLE_BINARY_FUNCTION(Mul, *)
621 DEFINE_SIMPLE_BINARY_FUNCTION(Div, /)
// Reductions. ReduceMin/ReduceMax take a scratch Tensor parameter (unused on
// CPU, hence the anonymous argument) and use std::min_element/max_element.
// Rowwise/ColwiseMax view the row-major N x D input as a column-major D x N
// Eigen matrix, so colwise() reduces over D (per row of the logical matrix)
// and rowwise() over N (per column).
623 #undef EIGEN_SIMPLE_BINARY_FUNCTION 624 #undef DEFINE_FLOAT_BINARY_FUNCTION 633 #define CAFFE2_SPECIALIZED_REDUCEMIN(T) \ 635 void ReduceMin<T, CPUContext>( \ 639 Tensor<CPUContext>* , \ 641 *y = *std::min_element(x, x + N); \ 643 CAFFE2_SPECIALIZED_REDUCEMIN(
float)
644 #undef CAFFE2_SPECIALIZED_REDUCEMIN 646 #define CAFFE2_SPECIALIZED_REDUCEMAX(T) \ 648 void ReduceMax<T, CPUContext>( \ 652 Tensor<CPUContext>* , \ 654 *y = *std::max_element(x, x + N); \ 656 CAFFE2_SPECIALIZED_REDUCEMAX(
float)
657 CAFFE2_SPECIALIZED_REDUCEMAX(int32_t)
658 CAFFE2_SPECIALIZED_REDUCEMAX(int64_t)
660 #undef CAFFE2_SPECIALIZED_REDUCEMAX 662 #define CAFFE2_SPECIALIZED_ROWWISEMAX(T) \ 664 void RowwiseMax<T, CPUContext>( \ 665 const int N, const int D, const T* x, T* y, CPUContext*) { \ 666 EigenVectorMap<T>(y, N) = \ 667 ConstEigenMatrixMap<T>(x, D, N).colwise().maxCoeff(); \ 669 CAFFE2_SPECIALIZED_ROWWISEMAX(
float)
670 #undef CAFFE2_SPECIALIZED_ROWWISEMAX 672 #define CAFFE2_SPECIALIZED_COLWISEMAX(T) \ 674 void ColwiseMax<T, CPUContext>( \ 675 const int N, const int D, const T* x, T* y, CPUContext*) { \ 676 EigenVectorMap<T>(y, D) = \ 677 ConstEigenMatrixMap<T>(x, D, N).rowwise().maxCoeff(); \ 679 CAFFE2_SPECIALIZED_COLWISEMAX(
float)
// ElemwiseMax: z[i] = max(x[i], y[i]); Maximum: y[i] = max(x[i], alpha).
680 #undef CAFFE2_SPECIALIZED_COLWISEMAX 682 #define CAFFE2_SPECIALIZED_ELEMWISEMAX(T) \ 684 void ElemwiseMax<T, CPUContext>( \ 685 const int N, const T* x, const T* y, T* z, CPUContext* ) { \ 686 std::transform(x, x + N, y, z, [](const T& x_i, const T& y_i) { \ 687 return std::max(x_i, y_i); \ 690 CAFFE2_SPECIALIZED_ELEMWISEMAX(
float)
691 #undef CAFFE2_SPECIALIZED_ELEMWISEMAX 693 #define CAFFE2_SPECIALIZED_MAXIMUM(T) \ 695 void Maximum<T, CPUContext>( \ 696 const int N, const float alpha, const T* x, T* y, CPUContext* context) { \ 698 x, x + N, y, [&alpha](const T& x_i) { return std::max(x_i, alpha); }); \ 700 CAFFE2_SPECIALIZED_MAXIMUM(
float)
// Broadcast binary ops: <op>ToRow broadcasts a length-N row vector across an
// M x N matrix; <op>ToCol broadcasts a length-M column vector. Instantiated
// for int32_t, int64_t and float.
701 #undef CAFFE2_SPECIALIZED_MAXIMUM 706 #define DELEGATE_BROADCAST_BINARY_FUNCTION(T, Funcname, expr) \ 708 void Funcname##ToRow<T, CPUContext>( \ 709 const int M, const int N, const T* a, const T* b, T* y, CPUContext*) { \ 710 EigenArrayMap<T>(y, N, M) = ConstEigenArrayMap<T>(a, N, M).colwise() \ 711 expr ConstEigenVectorArrayMap<T>(b, N); \ 715 void Funcname##ToRow<T, CPUContext>( \ 716 const int M, const int N, const T* x, T* y, CPUContext*) { \ 717 EigenArrayMap<T>(y, N, M).colwise() expr## = \ 718 ConstEigenVectorArrayMap<T>(x, N); \ 721 void Funcname##ToCol<T, CPUContext>( \ 722 const int M, const int N, const T* x, T* y, CPUContext*) { \ 723 EigenArrayMap<T>(y, N, M).rowwise() expr## = \ 724 ConstEigenVectorArrayMap<T>(x, M).transpose(); \ 727 #define DEFINE_BROADCAST_BINARY_FUNCTION(name, op) \ 728 DELEGATE_BROADCAST_BINARY_FUNCTION(int32_t, name, op) \ 729 DELEGATE_BROADCAST_BINARY_FUNCTION(int64_t, name, op) \ 730 DELEGATE_BROADCAST_BINARY_FUNCTION(float, name, op) \ 732 DEFINE_BROADCAST_BINARY_FUNCTION(Add, +)
733 DEFINE_BROADCAST_BINARY_FUNCTION(Sub, -)
734 DEFINE_BROADCAST_BINARY_FUNCTION(Mul, *)
735 DEFINE_BROADCAST_BINARY_FUNCTION(Div, /)
// Set<T>: fill Y with alpha. Fast path uses memset for alpha == 0 (with a
// null-pointer guard); otherwise an Eigen setConstant. Instantiated for all
// common scalar types below.
737 #undef DEFINE_BROADCAST_BINARY_FUNCTION 738 #undef DELEGATE_BROADCAST_BINARY_FUNCTION 740 #define CAFFE2_SPECIALIZED_SET(T) \ 742 void Set<T, CPUContext>(const size_t N, const T alpha, T* Y, CPUContext*) { \ 743 if (alpha == (T)0) { \ 744 if (Y != nullptr) { \ 745 memset(Y, 0, N * sizeof(T)); \ 748 EigenVectorMap<T>(Y, N).setConstant(alpha); \ 752 CAFFE2_SPECIALIZED_SET(
float);
753 CAFFE2_SPECIALIZED_SET(
double);
754 CAFFE2_SPECIALIZED_SET(int8_t);
755 CAFFE2_SPECIALIZED_SET(int16_t);
756 CAFFE2_SPECIALIZED_SET(
int);
757 CAFFE2_SPECIALIZED_SET(int64_t);
758 CAFFE2_SPECIALIZED_SET(
bool);
759 CAFFE2_SPECIALIZED_SET(
char);
760 CAFFE2_SPECIALIZED_SET(uint8_t);
761 CAFFE2_SPECIALIZED_SET(uint16_t);
// Elementwise comparisons producing bool output (y[i] = a[i] op b[i]) plus a
// row-broadcast variant (b indexed modulo n). LT/LE/GT/GE are defined for
// float/int32_t/int64_t; Or/And/Xor only for bool.
762 #undef CAFFE2_SPECIALIZED_SET 764 #define CAFFE2_INSTANTIATE_BINARY_OP(name, op, T) \ 766 void name<T, CPUContext>( \ 767 const int n, const T* a, const T* b, bool* y, CPUContext*) { \ 768 for (int i = 0; i < n; ++i) { \ 769 y[i] = a[i] op b[i]; \ 773 void name##ToRow<T, CPUContext>( \ 780 for (int i = 0; i < n * m; ++i) { \ 781 y[i] = a[i] op b[i % n]; \ 785 #define CAFFE2_DEFINE_BINARY_OP(name, op) \ 786 CAFFE2_INSTANTIATE_BINARY_OP(name, op, float) \ 787 CAFFE2_INSTANTIATE_BINARY_OP(name, op, int32_t) \ 788 CAFFE2_INSTANTIATE_BINARY_OP(name, op, int64_t) 790 CAFFE2_DEFINE_BINARY_OP(LT, <);
791 CAFFE2_DEFINE_BINARY_OP(LE, <=);
792 CAFFE2_DEFINE_BINARY_OP(GT, >);
793 CAFFE2_DEFINE_BINARY_OP(GE, >=);
795 CAFFE2_INSTANTIATE_BINARY_OP(Or, |,
bool);
796 CAFFE2_INSTANTIATE_BINARY_OP(And, &,
bool);
797 CAFFE2_INSTANTIATE_BINARY_OP(Xor, ^,
bool);
// Not<bool>: elementwise logical negation (loop body elided in extraction).
800 void Not<bool, CPUContext>(
805 for (
int i = 0; i < n; ++i) {
// AddStripedBatch: y += sum over batch of strided slices of `first`.
810 #undef CAFFE2_DEFINE_BINARY_OP 811 #undef CAFFE2_INSTANTIATE_BINARY_OP 813 #define CAFFE2_SPECIALIZED_CPU_ADD_STRIPED_BATCH(T) \ 815 void AddStripedBatch( \ 821 CPUContext* context) { \ 822 for (int j = 0; j < batch; j++) { \ 823 Add<T, CPUContext>(N, first + j * stripe, y, y, context); \ 827 CAFFE2_SPECIALIZED_CPU_ADD_STRIPED_BATCH(
float);
// Random fills: each draws n samples from a <random> distribution using the
// context's generator, so results are reproducible per CPUContext seed.
828 #undef CAFFE2_SPECIALIZED_CPU_ADD_STRIPED_BATCH 831 void RandUniform<float, CPUContext>(
836 CPUContext* context) {
837 std::uniform_real_distribution<float> distribution(a, b);
838 for (
auto i = 0; i < n; ++i) {
839 r[i] = distribution(context->RandGenerator());
// Integer variant: uniform_int_distribution is inclusive of both bounds.
844 void RandUniform<int, CPUContext>(
849 CPUContext* context) {
850 std::uniform_int_distribution<int> distribution(a, b);
851 for (
auto i = 0; i < n; ++i) {
852 r[i] = distribution(context->RandGenerator());
// RandUniformUnique: n distinct values in [a, b] avoiding the m values in
// `avoid`; enforces feasibility up front, then rejection-samples against a
// growing hash set.
856 #define CAFFE2_SPECIALIZED_RAND_UNIFORM_UNIQUE(T) \ 858 void RandUniformUnique<T, CPUContext>( \ 865 CPUContext* context) { \ 867 n, b - a - m + 1, "Cannot satisfy the unique requirement"); \ 868 std::unordered_set<T> avoid_set(n); \ 870 avoid_set.insert(avoid, avoid + m); \ 871 CAFFE_ENFORCE_EQ(m, avoid_set.size(), "Avoid should be unique"); \ 873 std::uniform_int_distribution<T> distribution(a, b); \ 875 for (size_t i = 0; i < n; ++i) { \ 877 v = distribution(context->RandGenerator()); \ 878 } while (avoid_set.count(v)); \ 880 avoid_set.insert(v); \ 884 CAFFE2_SPECIALIZED_RAND_UNIFORM_UNIQUE(int32_t);
885 CAFFE2_SPECIALIZED_RAND_UNIFORM_UNIQUE(int64_t);
886 #undef CAFFE2_SPECIALIZED_RAND_UNIFORM_UNIQUE 889 void RandGaussian<float, CPUContext>(
894 CPUContext* context) {
895 std::normal_distribution<float> distribution(mean, std);
896 for (
auto i = 0; i < n; ++i) {
897 r[i] = distribution(context->RandGenerator());
// Sum / SumSqr reduce via Eigen; the anonymous Tensor parameter is a GPU-API
// scratch buffer that the CPU path ignores.
901 #define CAFFE2_SPECIALIZED_SUM(T) \ 903 void Sum<T, CPUContext>( \ 908 Tensor<CPUContext>* ) { \ 909 *y = ConstEigenVectorMap<T>(x, N).sum(); \ 912 CAFFE2_SPECIALIZED_SUM(
float);
913 CAFFE2_SPECIALIZED_SUM(int32_t);
914 CAFFE2_SPECIALIZED_SUM(int64_t);
916 #undef CAFFE2_SPECIALIZED_SUM 919 void SumSqr<float, CPUContext>(
924 Tensor<CPUContext>* ) {
925 *y = ConstEigenVectorMap<float>(x, N).squaredNorm();
// Select: per row i of an N x D matrix, pick column idx[i] into y[i].
929 void Select<float, CPUContext>(
936 for (
int i = 0; i < N; ++i) {
937 DCHECK_LT(idx[i], D);
938 y[i] = x[i * D + idx[i]];
// Im2colNd (NCHW, N spatial dims): walks every kernel-column channel c_col
// and every output position (d_iter), computing the linearized column index
// and image index in lockstep. When accumulate_output is false it scatters
// image -> column (zero for padded positions); when true the roles of the
// index pair are swapped and it accumulates column -> image, which is how
// Col2imNd below reuses this routine. Several loop bodies/braces are elided
// in this extraction.
943 void Im2colNd<float, CPUContext, StorageOrder::NCHW>(
944 const float* data_img,
946 const int* col_shape,
949 const int* kernel_shape,
956 bool accumulate_output) {
// Total kernel volume across the N spatial dims.
958 for (
int i = 0; i < N; ++i) {
959 kernel_size *= kernel_shape[i];
961 const int channels_col = col_shape[0];
962 vector<int> d_offset(N, 0);
963 vector<int> d_iter(N, 0);
964 for (
int c_col = 0; c_col < channels_col; ++c_col) {
// Decompose c_col into per-dimension kernel offsets (mixed-radix digits).
967 for (
int d_i = N - 1; d_i >= 0; --d_i) {
969 offset /= kernel_shape[d_i + 1];
971 d_offset[d_i] = offset % kernel_shape[d_i];
// Odometer-style iteration over all output spatial positions.
973 for (
bool incremented =
true; incremented;) {
976 int index_col = c_col;
977 int index_im = c_col / kernel_size;
978 bool is_padding =
false;
979 for (
int d_i = 0; d_i < N; ++d_i) {
980 const int d = d_iter[d_i];
// d_im = d * stride - pad + kernel_offset * dilation for this dim.
982 d * stride[d_i] - pad[d_i] + d_offset[d_i] * dilation[d_i];
983 is_padding |= d_im < 0 || d_im >= im_shape[d_i + 1];
984 index_col *= col_shape[d_i + 1];
986 index_im *= im_shape[d_i + 1];
989 if (!accumulate_output) {
991 data_col[index_col] = 0;
993 data_col[index_col] = data_img[index_im];
995 }
else if (!is_padding) {
// Accumulate path: note data_col/data_img swap roles (col -> image).
996 data_col[index_im] += data_img[index_col];
// Advance the odometer; carry when a digit reaches its maximum.
1000 incremented =
false;
1001 for (
int d_i = N - 1; d_i >= 0; --d_i) {
1002 const int d_max = col_shape[d_i + 1];
1003 DCHECK_LT(d_iter[d_i], d_max);
1004 if (d_iter[d_i] == d_max - 1) {
// Col2imNd: zero the image, then run Im2colNd in accumulate mode to fold
// column data back into the image.
1017 void Col2imNd<float, CPUContext, StorageOrder::NCHW>(
1018 const float* data_col,
1019 const int* img_shape,
1020 const int* col_shape,
1023 const int* kernel_shape,
1025 const int* dilation,
1029 CPUContext* context) {
1030 Set<float, CPUContext>(img_size, 0, data_img, context);
1031 Im2colNd<float, CPUContext, StorageOrder::NCHW>(
// Im2col (2D, NCHW): three code paths in decreasing specialization —
// (1) no dilation and no padding: per-(channel, kh, kw) row copies, using
//     memcpy when stride_w == 1;
// (2) symmetric padding (pad_l == pad_r, pad_t == pad_b): Caffe-style nested
//     loops writing data_col sequentially, zero-filling padded positions;
// (3) fully general fallback with per-element bounds checks.
// Branch/else scaffolding between the paths is elided in this extraction.
1048 void Im2col<float, CPUContext, StorageOrder::NCHW>(
1049 const float* data_im,
1055 const int dilation_h,
1056 const int dilation_w,
// Standard conv output size formula with dilation.
1065 const int output_h =
1066 (height + pad_b + pad_t - (dilation_h * (kernel_h - 1) + 1)) / stride_h +
1068 const int output_w =
1069 (width + pad_l + pad_r - (dilation_w * (kernel_w - 1) + 1)) / stride_w +
// Path 1: dense, unpadded, undilated fast path.
1074 if (dilation_h == 1 && dilation_w == 1 && pad_l == 0 && pad_r == 0 &&
1075 pad_t == 0 && pad_b == 0) {
1076 for (
auto k = 0; k < channels * kernel_h * kernel_w; k++) {
1077 const auto nip = k / (kernel_h * kernel_w);
1078 const auto rest = k % (kernel_h * kernel_w);
1079 const auto kh = rest / kernel_w;
1080 const auto kw = rest % kernel_w;
1081 auto* dst = data_col + nip * (kernel_h * kernel_w * output_h * output_w) +
1082 kh * (kernel_w * output_h * output_w) + kw * (output_h * output_w);
1083 const auto* src = data_im + nip * (height * width);
1084 for (
auto y = 0; y < output_h; y++) {
1085 const auto iy = y * stride_h + kh;
// Contiguous row: one memcpy per output row.
1087 if (stride_w == 1) {
1089 dst + (y * output_w),
1090 src + (iy * width + ix),
1091 sizeof(
float) * output_w);
1093 for (
auto x = 0; x < output_w; x++) {
1095 dst + (y * output_w + x),
1096 src + (iy * width + ix + x * stride_w),
// Path 2: symmetric-padding fast path (sequential data_col writes).
1106 if (pad_l == pad_r && pad_t == pad_b) {
1108 const int pad_h = pad_t;
1109 const int pad_w = pad_l;
1110 const int channel_size = height * width;
1111 for (
int channel = channels; channel--; data_im += channel_size) {
1112 for (
int kernel_row = 0; kernel_row < kernel_h; kernel_row++) {
1113 for (
int kernel_col = 0; kernel_col < kernel_w; kernel_col++) {
1114 int input_row = -pad_h + kernel_row * dilation_h;
1115 for (
int output_rows = output_h; output_rows; output_rows--) {
// Whole row out of bounds: zero-fill path (fill body elided here).
1116 if (!is_a_ge_zero_and_a_lt_b(input_row, height)) {
1117 for (
int output_cols = output_w; output_cols; output_cols--) {
1121 int input_col = -pad_w + kernel_col * dilation_w;
1122 for (
int output_col = output_w; output_col; output_col--) {
1123 if (is_a_ge_zero_and_a_lt_b(input_col, width)) {
1124 *(data_col++) = data_im[input_row * width + input_col];
1128 input_col += stride_w;
1131 input_row += stride_h;
// Path 3: general fallback, bounds-checked per element.
1140 const int dkernel_h = dilation_h * (kernel_h - 1) + 1;
1141 const int dkernel_w = dilation_w * (kernel_w - 1) + 1;
1143 int height_col = (height + pad_t + pad_b - dkernel_h) / stride_h + 1;
1144 int width_col = (width + pad_l + pad_r - dkernel_w) / stride_w + 1;
1146 int channels_col = channels * kernel_h * kernel_w;
1147 for (
int c = 0; c < channels_col; ++c) {
1148 int w_offset = c % kernel_w;
1149 int h_offset = (c / kernel_w) % kernel_h;
1150 int c_im = c / kernel_h / kernel_w;
1151 for (
int h = 0; h < height_col; ++h) {
1152 for (
int w = 0; w < width_col; ++w) {
1153 int h_pad = h * stride_h - pad_t + h_offset * dilation_h;
1154 int w_pad = w * stride_w - pad_l + w_offset * dilation_w;
1155 if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
1156 data_col[(c * height_col + h) * width_col + w] =
1157 data_im[(c_im * height + h_pad) * width + w_pad];
1159 data_col[(c * height_col + h) * width_col + w] = 0;
// Im2col (2D, NHWC): channels are innermost, so each in-bounds kernel tap
// copies a contiguous `channels`-length pixel with memcpy; padded taps are
// zero-filled with memset. data_col advances by `channels` per tap.
// h_pad/w_pad initialization lines are elided in this extraction.
1166 void Im2col<float, CPUContext, StorageOrder::NHWC>(
1167 const float* data_im,
1173 const int dilation_h,
1174 const int dilation_w,
// Effective (dilated) kernel extents and output spatial size.
1183 const int dkernel_h = dilation_h * (kernel_h - 1) + 1;
1184 const int dkernel_w = dilation_w * (kernel_w - 1) + 1;
1186 int height_col = (height + pad_t + pad_b - dkernel_h) / stride_h + 1;
1187 int width_col = (width + pad_l + pad_r - dkernel_w) / stride_w + 1;
1190 for (
int h = 0; h < height_col; ++h) {
1192 for (
int w = 0; w < width_col; ++w) {
1193 for (
int ih = h_pad; ih < h_pad + dkernel_h; ih += dilation_h) {
1194 for (
int iw = w_pad; iw < w_pad + dkernel_w; iw += dilation_w) {
1195 if (ih >= 0 && ih < height && iw >= 0 && iw < width) {
1196 memcpy(data_col, data_im + (ih * width + iw) * channels,
1197 sizeof(
float) * channels);
// Out-of-bounds tap: write zeros.
1200 memset(data_col, 0,
sizeof(
float) * channels);
1202 data_col += channels;
// Col2im (2D, NCHW): inverse scatter of Im2col — zeroes the image, then
// accumulates every column entry back into its source pixel. Mirrors the
// three Im2col paths: (1) no-pad/no-dilation with a fused accumulate loop,
// (2) symmetric padding with sequential data_col reads, (3) general
// bounds-checked fallback. Branch scaffolding is elided in this extraction.
1212 void Col2im<float, CPUContext, StorageOrder::NCHW>(
1213 const float* data_col,
1219 const int dilation_h,
1220 const int dilation_w,
1228 CPUContext* context) {
1229 const int output_h =
1230 (height + pad_b + pad_t - (dilation_h * (kernel_h - 1) + 1)) / stride_h +
1232 const int output_w =
1233 (width + pad_l + pad_r - (dilation_w * (kernel_w - 1) + 1)) / stride_w +
// Accumulation target must start from zero.
1236 Set<float, CPUContext>(height * width * channels, 0, data_im, context);
// Path 1: dense fast path; note src is the image (written) and dst the
// column buffer (read) — inverse of Im2col's naming.
1240 if (dilation_h == 1 && dilation_w == 1 && pad_l == 0 && pad_r == 0 &&
1241 pad_t == 0 && pad_b == 0) {
1242 for (
auto k = 0; k < channels * kernel_h * kernel_w; k++) {
1243 const auto nip = k / (kernel_h * kernel_w);
1244 const auto rest = k % (kernel_h * kernel_w);
1245 const auto kh = rest / kernel_w;
1246 const auto kw = rest % kernel_w;
1247 const auto* dst = data_col +
1248 nip * (kernel_h * kernel_w * output_h * output_w) +
1249 kh * (kernel_w * output_h * output_w) + kw * (output_h * output_w);
1250 auto* src = data_im + nip * (height * width);
1251 for (
auto y = 0; y < output_h; y++) {
1252 const auto iy = y * stride_h + kh;
1254 if (stride_w == 1) {
1255 auto offsrc = src + (iy * width + ix);
1256 const auto offdst = dst + (y * output_w);
1257 for (
auto i = 0; i < output_w; ++i) {
1258 offsrc[i] += offdst[i];
1261 for (
auto x = 0; x < output_w; x++) {
1262 auto offsrc = src + (iy * width + ix + x * stride_w);
1263 const auto offdst = dst + (y * output_w + x);
// Path 2: symmetric-padding path, reading data_col sequentially.
1273 if (pad_l == pad_r && pad_t == pad_b) {
1275 const int pad_h = pad_t;
1276 const int pad_w = pad_l;
1277 const int channel_size = height * width;
1278 for (
int channel = channels; channel--; data_im += channel_size) {
1279 for (
int kernel_row = 0; kernel_row < kernel_h; kernel_row++) {
1280 for (
int kernel_col = 0; kernel_col < kernel_w; kernel_col++) {
1281 int input_row = -pad_h + kernel_row * dilation_h;
1282 for (
int output_rows = output_h; output_rows; output_rows--) {
// Entire row padded: skip the corresponding column entries.
1283 if (!is_a_ge_zero_and_a_lt_b(input_row, height)) {
1284 data_col += output_w;
1286 int input_col = -pad_w + kernel_col * dilation_w;
1287 for (
int output_col = output_w; output_col; output_col--) {
1288 if (is_a_ge_zero_and_a_lt_b(input_col, width)) {
1289 data_im[input_row * width + input_col] += *data_col;
1292 input_col += stride_w;
1295 input_row += stride_h;
// Path 3: general fallback.
1304 const int dkernel_h = dilation_h * (kernel_h - 1) + 1;
1305 const int dkernel_w = dilation_w * (kernel_w - 1) + 1;
1307 int height_col = (height + pad_t + pad_b - dkernel_h) / stride_h + 1;
1308 int width_col = (width + pad_l + pad_r - dkernel_w) / stride_w + 1;
1309 int channels_col = channels * kernel_h * kernel_w;
1310 for (
int c = 0; c < channels_col; ++c) {
1311 int w_offset = c % kernel_w;
1312 int h_offset = (c / kernel_w) % kernel_h;
1313 int c_im = c / kernel_h / kernel_w;
1314 for (
int h = 0; h < height_col; ++h) {
1315 for (
int w = 0; w < width_col; ++w) {
1316 int h_pad = h * stride_h - pad_t + h_offset * dilation_h;
1317 int w_pad = w * stride_w - pad_l + w_offset * dilation_w;
1318 if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width) {
1319 data_im[(c_im * height + h_pad) * width + w_pad] +=
1320 data_col[(c * height_col + h) * width_col + w];
// Col2im (2D, NHWC): zero the image, then for every output position and
// every in-bounds kernel tap, vector-add the `channels`-length column pixel
// into the image pixel via Add<float>. Out-of-bounds taps are skipped but
// data_col still advances, keeping the read cursor in sync with Im2col.
// h_pad/w_pad initialization lines are elided in this extraction.
1328 void Col2im<float, CPUContext, StorageOrder::NHWC>(
1329 const float* data_col,
1335 const int dilation_h,
1336 const int dilation_w,
1344 CPUContext* context) {
1345 const int dkernel_h = dilation_h * (kernel_h - 1) + 1;
1346 const int dkernel_w = dilation_w * (kernel_w - 1) + 1;
1348 Set<float, CPUContext>(height * width * channels, 0, data_im, context);
1349 int height_col = (height + pad_t + pad_b - dkernel_h) / stride_h + 1;
1350 int width_col = (width + pad_l + pad_r - dkernel_w) / stride_w + 1;
1352 for (
int h = 0; h < height_col; ++h) {
1354 for (
int w = 0; w < width_col; ++w) {
1355 for (
int ih = h_pad; ih < h_pad + dkernel_h; ih += dilation_h) {
1356 for (
int iw = w_pad; iw < w_pad + dkernel_w; iw += dilation_w) {
1357 if (ih >= 0 && ih < height && iw >= 0 && iw < width) {
1358 auto* data_im_patch = data_im + (ih * width + iw) * channels;
// In-place accumulate: patch += column pixel.
1359 Add<float, CPUContext>(
1360 channels, data_im_patch, data_col, data_im_patch, context);
1362 data_col += channels;
// BiasCHW: adds a per-channel bias to a CHW image. On ARM NEON, each channel
// is processed as: scalar prologue up to 16-byte alignment, an 8x-unrolled
// aligned float32x4 main loop (32 floats per iteration), then a scalar
// epilogue. The non-NEON fallback is a plain scalar loop. Several scalar
// statements are elided in this extraction.
1372 void BiasCHW<float, CPUContext>(
1374 const int bias_channels,
1375 const int image_size,
1379 for (
int c = 0; c < bias_channels; ++c) {
// Broadcast this channel's bias into a NEON lane vector.
1383 float32x4_t vBias = vdupq_n_f32(b);
1387 constexpr
int kVecSizeInFloat =
sizeof(float32x4_t) /
sizeof(
float);
// Number of leading floats needed to reach float32x4_t alignment.
1394 (((uintptr_t) image) % (
sizeof(float32x4_t))) /
sizeof(
float);
1398 for (; i < prologue; ++i) {
1403 constexpr
int kUnroll = 8;
1404 constexpr
int kFloatsPerLoop = kUnroll * kVecSizeInFloat;
1406 int remainder = image_size - prologue;
1407 int vectorizable = prologue + (remainder / kFloatsPerLoop) * kFloatsPerLoop;
// Main loop: 8 aligned loads, 8 adds, 8 aligned stores per iteration.
1410 for (; i < vectorizable; i += kFloatsPerLoop) {
1412 float32x4_t v0 = vld1q_f32_aligned(image + i + 0);
1413 float32x4_t v1 = vld1q_f32_aligned(image + i + 4);
1414 float32x4_t v2 = vld1q_f32_aligned(image + i + 8);
1415 float32x4_t v3 = vld1q_f32_aligned(image + i + 12);
1416 float32x4_t v4 = vld1q_f32_aligned(image + i + 16);
1417 float32x4_t v5 = vld1q_f32_aligned(image + i + 20);
1418 float32x4_t v6 = vld1q_f32_aligned(image + i + 24);
1419 float32x4_t v7 = vld1q_f32_aligned(image + i + 28);
1421 v0 = vaddq_f32(v0, vBias);
1422 v1 = vaddq_f32(v1, vBias);
1423 v2 = vaddq_f32(v2, vBias);
1424 v3 = vaddq_f32(v3, vBias);
1425 v4 = vaddq_f32(v4, vBias);
1426 v5 = vaddq_f32(v5, vBias);
1427 v6 = vaddq_f32(v6, vBias);
1428 v7 = vaddq_f32(v7, vBias);
1430 vst1q_f32_aligned(image + i + 0, v0);
1431 vst1q_f32_aligned(image + i + 4, v1);
1432 vst1q_f32_aligned(image + i + 8, v2);
1433 vst1q_f32_aligned(image + i + 12, v3);
1434 vst1q_f32_aligned(image + i + 16, v4);
1435 vst1q_f32_aligned(image + i + 20, v5);
1436 vst1q_f32_aligned(image + i + 24, v6);
1437 vst1q_f32_aligned(image + i + 28, v7);
// Scalar epilogue for the tail.
1441 for (; i < image_size; ++i) {
// Non-NEON fallback: plain scalar loop over the channel.
1446 for (
int i = 0; i < image_size; ++i) {
1449 #endif // __ARM_NEON__ 1451 image += image_size;
// CopyMatrix: copies an M x N matrix of opaque items (itemsize bytes each)
// from A (leading dim lda) to B (leading dim ldb). Uses the optional typed
// `copy` functor when provided (for non-POD types), else raw byte copies.
// Contiguous case (lda == ldb == N) is one bulk copy; otherwise row by row.
1456 void CopyMatrix<CPUContext>(
1457 const size_t itemsize,
1465 TypeMeta::TypedCopy copy) {
// Nothing to do when either side is null.
1466 if (A ==
nullptr || B ==
nullptr) {
1469 if (lda == N && ldb == N) {
// Note argument order differs: TypedCopy is (src, dst, count) while the raw
// byte path below is (dst, src, bytes).
1472 copy(static_cast<const char*>(A), static_cast<char*>(B), N * M);
1475 static_cast<char*>(B), static_cast<const char*>(A), itemsize * N * M);
// Strided case: one row at a time.
1480 for (
int i = 0; i < M; ++i) {
1483 static_cast<const char*>(A) + lda * i * itemsize,
1484 static_cast<char*>(B) + ldb * i * itemsize,
1488 static_cast<char*>(B) + ldb * i * itemsize,
1489 static_cast<const char*>(A) + lda * i * itemsize,
// CopyVector: memcpy unless src == dst or N == 0.
1495 #define CAFFE2_SPECIALIZED_COPYVECTOR(T) \ 1497 void CopyVector<T, CPUContext>( \ 1498 const int N, const T* src, T* dst, CPUContext* ) { \ 1499 if (src != dst && N > 0) { \ 1500 memcpy(dst, src, sizeof(T) * N); \ 1503 CAFFE2_SPECIALIZED_COPYVECTOR(
float)
// TryTransposeWithHPTT: attempts the transpose via the HPTT library, which
// is column-major; dims and axes are mirrored (index i -> num_axes - 1 - i)
// to convert from this file's row-major convention. Returns false (fallback
// to TransposeCPU) when no plan can be created; plan execution is elided in
// this extraction.
1504 #undef CAFFE2_SPECIALIZED_COPYVECTOR 1508 #ifdef CAFFE2_USE_HPTT 1510 bool TryTransposeWithHPTT(
1516 std::vector<int> axes_cm(num_axes);
1517 std::vector<int> dims_cm(num_axes);
// cm_fn mirrors an index between row-major and column-major order.
1520 const auto cm_fn = [num_axes](
const int i) {
return num_axes - i - 1; };
1521 for (
int i = 0; i < num_axes; ++i) {
1522 axes_cm[i] = cm_fn(axes[cm_fn(i)]);
1523 dims_cm[i] = dims[cm_fn(i)];
1525 auto plan = hptt::create_plan(
1537 if (plan ==
nullptr) {
// ComputeXStrides: row-major strides of the input, permuted by `axes` so
// that x_strides[i] is the input stride corresponding to output dim i.
// Return statement and cur_stride initialization are elided here.
1544 #endif // CAFFE2_USE_HPTT 1547 ComputeXStrides(
const int num_axes,
const int* dims,
const int* axes) {
1548 std::vector<int> x_strides(num_axes);
1549 std::vector<int> buff(num_axes);
// buff[i] = product of dims[i+1..], i.e. the row-major stride of dim i.
1551 for (
int i = num_axes - 1; i >= 0; --i) {
1552 buff[i] = cur_stride;
1553 cur_stride *= dims[i];
1555 for (
int i = 0; i < num_axes; ++i) {
1556 x_strides[i] = buff[axes[i]];
// IncreaseIndex: increment a mixed-radix index vector (last digit fastest),
// carrying into the next digit when a digit reaches dims[i].
1561 void IncreaseIndex(
const int* dims, std::vector<int>* index) {
1562 for (
int i = index->size() - 1; i >= 0; --i) {
1564 if (index->at(i) >= dims[i]) {
1565 index->at(i) -= dims[i];
// TransposeCPU<T>: generic N-d transpose. Trailing axes that map to
// themselves (axes[i] == i) form a contiguous block that can be moved with
// a single memcpy per block; the remaining leading axes are iterated with a
// mixed-radix counter (index_digits), using ComputeXStrides to locate the
// source block for each destination block. Some declarations (block_size,
// early return) are elided in this extraction.
1572 template <
typename T>
1578 const int data_size,
// Count trailing identity axes; they define the copyable block.
1583 int num_shared_idxs = 0;
1584 for (
int i = num_axes - 1; i >= 0 && axes[i] == i; --i) {
1585 block_size *= y_dims[i];
// Degenerate cases: nothing to permute — one bulk copy.
1589 if (num_axes < 2 || num_shared_idxs == num_axes) {
1590 memcpy(Y, X, data_size *
sizeof(T));
1594 const int itr_axes = num_axes - num_shared_idxs;
1595 const std::vector<int> x_strides = ComputeXStrides(itr_axes, x_dims, axes);
1596 std::vector<int> index_digits(itr_axes, 0);
1597 const int num_blocks = data_size / block_size;
1598 for (
int y_index = 0; y_index < num_blocks; ++y_index) {
// Source block offset = dot(strides, current index digits).
1599 const int x_index = std::inner_product(
1600 x_strides.cbegin(), x_strides.cend(), index_digits.cbegin(), 0);
1601 if (block_size == 1) {
1602 Y[y_index] = X[x_index];
1605 Y + block_size * y_index,
1606 X + block_size * x_index,
1607 block_size *
sizeof(T));
1609 IncreaseIndex(y_dims, &index_digits);
// Transpose<float>: tries the HPTT fast path first (when compiled in), then
// falls back to the generic TransposeCPU. The double/int/long
// specializations below always use TransposeCPU.
1616 void Transpose<float, CPUContext>(
1621 const int data_size,
1625 #ifdef CAFFE2_USE_HPTT 1626 if (TryTransposeWithHPTT(num_axes, x_dims, axes, X, Y)) {
1629 #endif // CAFFE2_USE_HPTT 1630 TransposeCPU(num_axes, x_dims, y_dims, axes, data_size, X, Y);
1633 #define CAFFE2_SPECIALIZED_TRANSPOSE(T) \ 1635 void Transpose<T, CPUContext>( \ 1636 const int num_axes, \ 1637 const int* x_dims, \ 1638 const int* y_dims, \ 1640 const int data_size, \ 1644 TransposeCPU(num_axes, x_dims, y_dims, axes, data_size, X, Y); \ 1646 CAFFE2_SPECIALIZED_TRANSPOSE(
double)
1647 CAFFE2_SPECIALIZED_TRANSPOSE(
int)
1648 CAFFE2_SPECIALIZED_TRANSPOSE(
long)
1649 #undef CAFFE2_SPECIALIZED_TRANSPOSE
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...