Caffe2 - C++ API
A deep learning, cross platform ML framework
common.h
1 // Common utilities for writing performance kernels and easy dispatching of
2 // different backends.
3 /*
4 The general workflow is as follows. Say we want to
5 implement a function called void foo(int a, float b).
6 
7 In foo.h, do:
8  void foo(int a, float b);
9 
10 In foo_avx2.cc, do:
11  void foo__avx2(int a, float b) {
12  [actual avx2 implementation]
13  }
14 
15 In foo_avx.cc, do:
16  void foo__avx(int a, float b) {
17  [actual avx implementation]
18  }
19 
20 In foo.cc, do:
21  // The base implementation should *always* be provided.
22  void foo__base(int a, float b) {
23  [base, possibly slow implementation]
24  }
25  void foo(int a, float b) {
26  // You should always order things by their preference, faster
27  // implementations earlier in the function.
28  AVX2_DO(foo, a, b);
29  AVX_DO(foo, a, b);
30  BASE_DO(foo, a, b);
31  }
32 
33 */
34 // Details: this functionality basically covers the cases for both build time
35 // and run time architecture support.
36 //
37 // During build time:
38 // The build system should provide flags CAFFE2_PERF_WITH_AVX2 and
39 // CAFFE2_PERF_WITH_AVX that correspond to the __AVX2__ and __AVX__ flags
40 // the compiler provides. Note that we do not use the compiler flags but
41 // rely on the build system flags, because the common files (like foo.cc
42 // above) will always be built without __AVX__ and __AVX2__.
43 // During run time:
44 // We use cpuid to identify CPU support and run the proper functions.
45 
46 #pragma once
47 
48 // DO macros: these should be used in your entry function, similar to foo()
49 // above, that routes implementations based on CPU capability.
50 // BASE_DO: unconditionally returns funcname##__base(args...). The expansion already ends in ';'. Nothing after it in the entry function runs, so list it last.
51 #define BASE_DO(funcname, ...) return funcname##__base(__VA_ARGS__);
52 // AVX2 dispatch macros; the real definitions are compiled in only when the build defines CAFFE2_PERF_WITH_AVX2.
53 #ifdef CAFFE2_PERF_WITH_AVX2
54 #define AVX2_DO(funcname, ...) /* returns funcname##__avx2(args...) when GetCpuId().avx2() is true; otherwise execution continues past the macro */ \
55  decltype(funcname##__base) funcname##__avx2; /* declares funcname##__avx2 with the same type as funcname##__base, so funcname##__base must be visible here */ \
56  if (GetCpuId().avx2()) { \
57  return funcname##__avx2(__VA_ARGS__); \
58  }
59 #define AVX2_FMA_DO(funcname, ...) /* same shape as AVX2_DO, but requires both avx2() and fma() and calls funcname##__avx2_fma */ \
60  decltype(funcname##__base) funcname##__avx2_fma; \
61  if (GetCpuId().avx2() && GetCpuId().fma()) { \
62  return funcname##__avx2_fma(__VA_ARGS__); \
63  }
64 #else // CAFFE2_PERF_WITH_AVX2 not defined: both macros expand to nothing, so the entry function continues with whatever follows.
65 #define AVX2_DO(funcname, ...)
66 #define AVX2_FMA_DO(funcname, ...)
67 #endif // CAFFE2_PERF_WITH_AVX2
68 // AVX dispatch macros; the real definitions are compiled in only when the build defines CAFFE2_PERF_WITH_AVX.
69 #ifdef CAFFE2_PERF_WITH_AVX
70 #define AVX_DO(funcname, ...) /* returns funcname##__avx(args...) when GetCpuId().avx() is true; otherwise execution continues past the macro */ \
71  decltype(funcname##__base) funcname##__avx; /* declares funcname##__avx with the same type as funcname##__base, so funcname##__base must be visible here */ \
72  if (GetCpuId().avx()) { \
73  return funcname##__avx(__VA_ARGS__); \
74  }
75 #define AVX_F16C_DO(funcname, ...) /* same shape as AVX_DO, but requires both avx() and f16c() and calls funcname##__avx_f16c */ \
76  decltype(funcname##__base) funcname##__avx_f16c; \
77  if (GetCpuId().avx() && GetCpuId().f16c()) { \
78  return funcname##__avx_f16c(__VA_ARGS__); \
79  }
80 #else // CAFFE2_PERF_WITH_AVX not defined: both macros expand to nothing, so the entry function continues with whatever follows.
81 #define AVX_DO(funcname, ...)
82 #define AVX_F16C_DO(funcname, ...)
83 #endif // CAFFE2_PERF_WITH_AVX