Caffe2 - C++ API
A deep learning, cross-platform ML framework
conv_op_eigen.cc
1 #include "caffe2/core/context.h"
2 #include "caffe2/core/operator.h"
3 #include "caffe2/operators/conv_pool_op_base.h"
4 
5 #include "Eigen/Core"
6 
7 #if !EIGEN_VERSION_AT_LEAST(3, 3, 0)
8 #error "Caffe2 requires Eigen to be at least 3.3.0.";
9 #endif
10 
11 #include "unsupported/Eigen/CXX11/Tensor"
12 
13 namespace caffe2 {
14 
15 template <typename T>
16 class EigenConvOp final : public ConvPoolOpBase<CPUContext> {
17  public:
18  USE_CONV_POOL_BASE_FUNCTIONS(CPUContext);
19  EigenConvOp(const OperatorDef& operator_def, Workspace* ws)
20  : ConvPoolOpBase<CPUContext>(operator_def, ws) {
21  OPERATOR_NEEDS_FEATURE(group_ == 1, "Group convolution not supported yet.");
22  }
23  ~EigenConvOp() {}
24 
25  bool RunOnDeviceWithOrderNCHW() override;
26  bool RunOnDeviceWithOrderNHWC() override;
27 
28  private:
29  INPUT_TAGS(INPUT, FILTER, BIAS);
30 };
31 
32 // The NCHW implementation: we do explicit transposes before and after, which
33 // are not ideal but provides a compatible path instead of throwing the error.
34 template <typename T>
36  auto& X = Input(INPUT);
37  auto& filter = Input(FILTER);
38  auto* Y = Output(0);
39  const int N = X.dim32(0), C = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
40  CAFFE_ENFORCE(4 == filter.ndim());
41  const int M = filter.dim32(0);
42  CAFFE_ENFORCE(filter.dim32(1) == C);
43  CAFFE_ENFORCE(filter.dim32(2) == kernel_h());
44  CAFFE_ENFORCE(filter.dim32(3) == kernel_w());
45  ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, filter.dim32(0));
46  Eigen::array<TIndex, 4> kernel_shuffles
47  { {TIndex(2), TIndex(3), TIndex(1), TIndex(0)} };
48  Eigen::array<TIndex, 4> input_shuffles
49  { {TIndex(0), TIndex(2), TIndex(3), TIndex(1)} };
50 
51  Eigen::Tensor<T, 4, Eigen::RowMajor> filter_tensor =
52  Eigen::TensorMap<Eigen::Tensor<T, 4, Eigen::RowMajor>>(
53  const_cast<T*>(filter.template data<T>()),
54  M,
55  C,
56  kernel_h(),
57  kernel_w())
58  .shuffle(kernel_shuffles);
59  Eigen::Tensor<T, 4, Eigen::RowMajor> X_tensor =
60  Eigen::TensorMap<Eigen::Tensor<T, 4, Eigen::RowMajor>>(
61  const_cast<T*>(X.template data<T>()), N, C, H, W)
62  .shuffle(input_shuffles);
63 
64  // For Eigen, the definition of row and col actually correspond to width
65  // and height instead of the other way round, so notice how we pass the
66  // stride, pad and dilation values.
67  typedef typename Eigen::internal::traits<
68  Eigen::Tensor<T, 4, Eigen::RowMajor>>::Index TensorIndex;
69  Eigen::array<Eigen::IndexPair<TensorIndex>, 1> contract_dims;
70  contract_dims[0] = Eigen::IndexPair<TensorIndex>(1, 0);
71 
72  Eigen::DSizes<TensorIndex, 2> pre_contract_dims;
73  pre_contract_dims[1] = kernel_h() * kernel_w() * C;
74  pre_contract_dims[0] = Y->size() / M;
75 
76  Eigen::DSizes<TensorIndex, 2> kernel_dims;
77  kernel_dims[0] = kernel_h() * kernel_w() * C;
78  kernel_dims[1] = M;
79 
80  Eigen::array<TensorIndex, 4> bcast_dims;
81  bcast_dims[0] = N;
82  bcast_dims[1] = Y->dim32(1);
83  bcast_dims[2] = Y->dim32(2);
84  bcast_dims[3] = 1;
85 
86  Eigen::Tensor<T, 4, Eigen::RowMajor> Y_tensor(
87  Y->dim32(0), Y->dim32(2), Y->dim32(3), Y->dim32(1));
88  Y_tensor = X_tensor
89  .extract_image_patches(
90  kernel_w(),
91  kernel_h(),
92  stride_w(),
93  stride_h(),
94  dilation_w(),
95  dilation_h(),
96  1,
97  1,
98  pad_l(),
99  pad_r(),
100  pad_t(),
101  pad_b(),
102  0)
103  .reshape(pre_contract_dims)
104  .contract(filter_tensor.reshape(kernel_dims), contract_dims)
105  .reshape(Y_tensor.dimensions());
106  if (InputSize() == 3) {
107  auto& bias = Input(BIAS);
108  CAFFE_ENFORCE(1 == bias.ndim());
109  CAFFE_ENFORCE(bias.dim32(0) == M);
110  // It seems that the bias broadcast is still slower so let's do the
111  // following for now.
112  EigenArrayMap<T> Y_arr(
113  Y_tensor.data(), static_cast<TIndex>(M), Y->size() / M);
114  ConstEigenVectorArrayMap<T> bias_arr(bias.template data<T>(), M);
115  Y_arr = Y_arr.colwise() + bias_arr;
116  }
117 
118  // Do a last transpose.
119  Eigen::array<TIndex, 4> output_shuffles
120  { {TIndex(0), TIndex(3), TIndex(1), TIndex(2) } };
121 
122  Eigen::TensorMap<Eigen::Tensor<T, 4, Eigen::RowMajor>>(
123  Y->template mutable_data<T>(), N, M, Y->dim32(2), Y->dim32(3)) =
124  Y_tensor.shuffle(output_shuffles);
125  return true;
126 }
127 
128 template <typename T>
130  auto& X = Input(INPUT);
131  auto& filter = Input(FILTER);
132  auto* Y = Output(0);
133  const int N = X.dim32(0), H = X.dim32(1), W = X.dim32(2), C = X.dim32(3);
134  CAFFE_ENFORCE(4 == filter.ndim());
135  const int M = filter.dim32(0);
136  CAFFE_ENFORCE(filter.dim32(1) == kernel_h());
137  CAFFE_ENFORCE(filter.dim32(2) == kernel_w());
138  CAFFE_ENFORCE(filter.dim32(3) == C);
139  ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, filter.dim32(0));
140  // Eigen expects filter to be of shape (kernel_h, kernel_w, C, M) for
141  // optimization purposes, so we will create a temp one.
142  Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic> temp_filter(
143  M, kernel_h() * kernel_w() * C);
144  temp_filter = ConstEigenArrayMap<T>(
145  filter.template data<T>(), kernel_h() * kernel_w() * C, M)
146  .transpose();
147 
148  // Create tensor maps, and call spatial convolution.
149  // TODO(jiayq): right now we const cast away the const pointer, but we will
150  // need to figure out how to properly do a const tensormap.
151  Eigen::TensorMap<Eigen::Tensor<T, 4, Eigen::RowMajor>> X_tensor(
152  const_cast<T*>(X.template data<T>()), N, H, W, C);
153  Eigen::TensorMap<Eigen::Tensor<T, 4, Eigen::RowMajor>> Y_tensor(
154  Y->template mutable_data<T>(), N, Y->dim32(1), Y->dim32(2), M);
155  Eigen::TensorMap<Eigen::Tensor<T, 4, Eigen::RowMajor>> filter_tensor(
156  const_cast<T*>(temp_filter.data()), kernel_h(), kernel_w(), C, M);
157 
158  // For Eigen, the definition of row and col actually correspond to width
159  // and height instead of the other way round, so notice how we pass the
160  // stride, pad and dilation values.
161  typedef typename Eigen::internal::traits<
162  Eigen::Tensor<T, 4, Eigen::RowMajor>>::Index TensorIndex;
163  Eigen::array<Eigen::IndexPair<TensorIndex>, 1> contract_dims;
164  contract_dims[0] = Eigen::IndexPair<TensorIndex>(1, 0);
165 
166  Eigen::DSizes<TensorIndex, 2> pre_contract_dims;
167  pre_contract_dims[1] = kernel_h() * kernel_w() * C;
168  pre_contract_dims[0] = Y->size() / M;
169 
170  Eigen::DSizes<TensorIndex, 2> kernel_dims;
171  kernel_dims[0] = kernel_h() * kernel_w() * C;
172  kernel_dims[1] = M;
173 
174  Eigen::array<TensorIndex, 4> bcast_dims;
175  bcast_dims[0] = N;
176  bcast_dims[1] = Y->dim32(1);
177  bcast_dims[2] = Y->dim32(2);
178  bcast_dims[3] = 1;
179 
180  Y_tensor = X_tensor
181  .extract_image_patches(
182  kernel_w(),
183  kernel_h(),
184  stride_w(),
185  stride_h(),
186  dilation_w(),
187  dilation_h(),
188  1,
189  1,
190  pad_l(),
191  pad_r(),
192  pad_t(),
193  pad_b(),
194  0)
195  .reshape(pre_contract_dims)
196  .contract(filter_tensor.reshape(kernel_dims), contract_dims)
197  .reshape(Y_tensor.dimensions());
198 
199  if (InputSize() == 3) {
200  auto& bias = Input(BIAS);
201  CAFFE_ENFORCE(1 == bias.ndim());
202  CAFFE_ENFORCE(bias.dim32(0) == M);
203  Eigen::TensorMap<Eigen::Tensor<T, 4, Eigen::RowMajor>> bias_tensor(
204  const_cast<T*>(bias.template data<T>()), 1, 1, 1, M);
205  // It seems that the bias broadcast is still slower so let's do the
206  // following for now.
207  EigenArrayMap<T> Y_arr(
208  Y->template mutable_data<T>(), static_cast<TIndex>(M), Y->size() / M);
209  ConstEigenVectorArrayMap<T> bias_arr(bias.template data<T>(), M);
210  Y_arr = Y_arr.colwise() + bias_arr;
211  }
212  return true;
213 }
214 
215 REGISTER_CPU_OPERATOR_WITH_ENGINE(Conv, EIGEN, EigenConvOp<float>);
216 REGISTER_CPU_OPERATOR_WITH_ENGINE(Conv1D, EIGEN, EigenConvOp<float>);
217 REGISTER_CPU_OPERATOR_WITH_ENGINE(Conv2D, EIGEN, EigenConvOp<float>);
218 REGISTER_CPU_OPERATOR_WITH_ENGINE(Conv3D, EIGEN, EigenConvOp<float>);
219 
220 } // namespace caffe2
The CPU Context, representing the bare minimum of what a Context class in Caffe2 should implement...
Definition: context.h:66
Workspace is a class that holds all the related objects created during runtime: (1) all blobs...
Definition: workspace.h:47
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...