Caffe2 - C++ API
A deep learning, cross-platform ML framework
ulp.cc
#include "ulp.h"
#include "caffe2/operators/conv_pool_op_base.h"
#include "ulp_neon.h"

namespace caffe2 {

void uniformQuantize2b1b(const TensorCPU& X,
                         const std::vector<std::unique_ptr<TensorCPU>>& XQ,
                         float offset,
                         float inter_center_distance) {
  CAFFE_ENFORCE_GT(X.ndim(), 1);
  const auto N = X.size_to_dim(X.ndim() - 1);
  auto C = X.size() / N;
  const auto QC = divRoundUp(C, 8);
  auto XQs = X.dims();
  XQs[X.ndim() - 1] = QC;
  CAFFE_ENFORCE_EQ(XQ.size(), k2b1bXBits);
  for (auto i = 0; i < k2b1bXBits; ++i) {
    XQ[i]->Resize(XQs);
  }
  const float* Xdata = X.data<float>();
  std::array<uint8_t*, k2b1bXBits> XQdata;
  for (auto i = 0; i < k2b1bXBits; ++i) {
    XQdata[i] = XQ[i]->mutable_data<uint8_t>();
  }
  for (auto n = 0; n < N; ++n) {
    for (auto qc = 0; qc < QC; ++qc) {
      // Compute the block in X.
      std::array<uint8_t, k2b1bXBits> p = {{0, 0}};
      for (auto b = 0; b < 8; ++b) {
        const auto c = qc * 8 + b;
        if (c < C) {
          float v = Xdata[qc * 8 + b + C * n];
          if (v < offset) {
            // Zeroed already.
          } else if (v < offset + inter_center_distance) {
            p[0] |= 1 << b;
          } else if (v < offset + 2 * inter_center_distance) {
            p[1] |= 1 << b;
          } else {
            p[0] |= 1 << b;
            p[1] |= 1 << b;
          }
        }
      }
      for (auto i = 0; i < k2b1bXBits; ++i) {
        XQdata[i][qc + QC * n] = p[i];
      }
    }
  }
}
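
// [Illustrative sketch, not part of ulp.cc] The branching above maps a float
// to a 2-bit code q in {0, 1, 2, 3}, stored as two bit-planes: XQ[0] holds the
// low bit and XQ[1] the high bit of each code. With the arguments passed by
// run2b1bConvGeneric below (offset = 0.5, inter_center_distance = 1.0):
//   v < 0.5 -> q = 0 (00),  0.5 <= v < 1.5 -> q = 1 (01),
//   1.5 <= v < 2.5 -> q = 2 (10),  v >= 2.5 -> q = 3 (11).
inline uint8_t encode2b1b(float v, float offset, float inter_center_distance) {
  if (v < offset) {
    return 0;
  } else if (v < offset + inter_center_distance) {
    return 1;
  } else if (v < offset + 2 * inter_center_distance) {
    return 2;
  }
  return 3;
}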

void qconv(const ConvArgs& args,
           const TensorCPU& X,
           const TensorCPU& W,
           const TensorCPU* b,
           TensorCPU* Y) {
  const auto N = X.dim32(0);
  const auto IH = X.dim32(1);
  const auto IW = X.dim32(2);
  const auto KH = W.dim32(1);
  const auto KW = W.dim32(2);
  const auto KC = W.dim32(3);
  Y->Resize(X.dim32(0),
            (X.dim32(1) - KH + args.pad_t + args.pad_b) / args.stride_h + 1,
            (X.dim32(2) - KW + args.pad_l + args.pad_r) / args.stride_w + 1,
            W.dim32(0));
  const auto OH = Y->dim32(1);
  const auto OW = Y->dim32(2);
  const auto OC = Y->dim32(3);

  CAFFE_ENFORCE_EQ(W.dim32(3), X.dim32(3));

  const auto* Xdata = X.data<uint8_t>();
  const auto* Wdata = W.data<uint8_t>();
  auto* Ydata = Y->mutable_data<float>();
  for (size_t n = 0; n < N; ++n) {
    for (size_t oh = 0; oh < OH; ++oh) {
      for (size_t ow = 0; ow < OW; ++ow) {
        for (size_t oc = 0; oc < OC; ++oc) {
          float acc = 0.0;
          for (size_t kh = 0; kh < KH; ++kh) {
            const int32_t ih = (int32_t)kh + (int32_t)args.stride_h * oh - (int32_t)args.pad_t;
            for (size_t kw = 0; kw < KW; ++kw) {
              const int32_t iw = (int32_t)kw + (int32_t)args.stride_w * ow - (int32_t)args.pad_l;
              for (size_t kc = 0; kc < KC; ++kc) {
                const uint8_t w = Wdata[kc + KC * kw + KC * KW * kh + KC * KW * KH * oc];
                // Use unsigned integer math to avoid multiple comparisons (>= H, < 0).
                if ((size_t)ih >= (size_t)IH || (size_t)iw >= (size_t)IW) {
                  acc += __builtin_popcount(0 ^ w);
                } else {
                  const uint8_t x =
                      Xdata[kc + KC * (size_t)iw + KC * IW * (size_t)ih + n * KC * IW * IH];
                  acc += __builtin_popcount(x ^ w);
                }
              }
            }
          }
          Ydata[oc + OC * ow + OC * OW * oh + n * OC * OW * OH] =
              KW * KH * KC * 8 - 2 * acc + (b ? b->data<float>()[oc] : 0.0);
        }
      }
    }
  }
}
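
// [Illustrative sketch, not part of ulp.cc] qconv uses the standard
// XNOR-net identity: if the bits of x and w encode {-1, +1} as {0, 1}, then
// over K bits
//   dot(x, w) = (#matching bits) - (#differing bits) = K - 2 * popcount(x ^ w),
// which is why Ydata is written as KW * KH * KC * 8 - 2 * acc above. Padded
// positions contribute popcount(0 ^ w), i.e. they behave as all -1 inputs.
inline int dotPlusMinusOne(uint8_t x, uint8_t w) {
  return 8 - 2 * __builtin_popcount(x ^ w);
}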

void qpad_zero(const ConvArgs& args, const TensorCPU& X, TensorCPU* Y) {
  CAFFE_ENFORCE_EQ(args.stride_h, 1);
  CAFFE_ENFORCE_EQ(args.stride_w, 1);
  const auto* Xdata = X.data<uint8_t>();
  Y->Resize(X.dim32(0),
            X.dim32(1) + args.pad_t + args.pad_b,
            X.dim32(2) + args.pad_l + args.pad_r,
            X.dim32(3));
  auto* Ydata = Y->mutable_data<uint8_t>();
  // Zero the whole padded output before copying X into its interior.
  ::memset(Ydata, 0, Y->nbytes());
  const auto C = Y->dim32(3);
  const auto XrowSize = X.dim32(3) * X.dim32(2);
  const auto YrowSize = Y->dim32(3) * Y->dim32(2);
  math::CopyMatrix<CPUContext>(1,
                               X.dim32(1),
                               XrowSize,
                               Xdata,
                               XrowSize,
                               Ydata + C * args.pad_l + YrowSize * args.pad_t,
                               YrowSize,
                               nullptr);
}

void signQuantize(const TensorCPU& X, TensorCPU* XQ) {
  CAFFE_ENFORCE_GT(X.ndim(), 1);
  const auto N = X.size_to_dim(X.ndim() - 1);
  auto C = X.size() / N;
  const auto QC = divRoundUp(C, 8);
  auto XQs = X.dims();
  XQs[X.ndim() - 1] = QC;
  XQ->Resize(XQs);
  const float* Xdata = X.data<float>();
  uint8_t* XQdata = XQ->mutable_data<uint8_t>();
  for (auto n = 0; n < N; ++n) {
    for (auto qc = 0; qc < QC; ++qc) {
      // Compute the block in X.
      uint8_t p = 0;
      for (auto b = 0; b < 8; ++b) {
        const auto c = qc * 8 + b;
        if (c < C) {
          p |= (Xdata[c + C * n] > 0) << b;
        }
      }
      XQdata[qc + QC * n] = p;
    }
  }
}

void filterNormalization11(const TensorCPU& WQ, TensorCPU* WQN) {
  const auto F = WQ.dim32(0);
  // In our NEON kernel we read up to TileSize, so align allocation to TileSize elements.
  WQN->Resize(divRoundUp(F, kGEMMTileSize) * kGEMMTileSize);
  const auto WQs = WQ.size() / F;
  const auto WQbits = 8 * WQs;
  const auto* WQdata = WQ.data<uint8_t>();
  auto* WQNdata = WQN->mutable_data<float>();
  for (auto f = 0; f < F; ++f) {
    int32_t bitSum = 0;
    for (auto j = 0; j < WQs; ++j) {
      bitSum += __builtin_popcount(WQdata[f * WQs + j]);
    }
    DCHECK_LE(bitSum, WQbits);
    WQNdata[f] = 2 * bitSum - WQbits;
  }
}
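
// [Illustrative sketch, not part of ulp.cc] filterNormalization11 recovers,
// per filter f, the sum of its {-1, +1} weight values from the packed sign
// bits: with bitSum ones among WQbits bits, the sum is
//   (+1) * bitSum + (-1) * (WQbits - bitSum) = 2 * bitSum - WQbits,
// which is exactly the value stored in WQNdata[f]. For example, the byte
// 0b01101001 has four ones and four zeros, so its eight +/-1 values sum to
// 2 * 4 - 8 = 0.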

void filterNormalizationL1(const TensorCPU& W, TensorCPU* WL1) {
  const auto F = W.dim32(0);
  WL1->Resize(F);
  const auto Ws = W.size() / F;
  const auto* Wdata = W.data<float>();
  auto* WL1data = WL1->mutable_data<float>();
  for (auto f = 0; f < F; ++f) {
    double l1sum = 0.0;
    for (auto j = 0; j < Ws; ++j) {
      l1sum += std::abs(Wdata[f * Ws + j]);
    }
    WL1data[f] = l1sum / Ws;
  }
}

void qim2col(const ConvArgs& args, const TensorCPU& XQ, const TensorCPU& WQ, TensorCPU* XQcol) {
  // TODO: pass pre-resized output?
  // TODO: handle strides?

  CAFFE_ENFORCE_EQ(XQ.dim32(3), WQ.dim32(3));
  const size_t N = XQ.dim32(0);
  const size_t IH = XQ.dim32(1);
  const size_t IW = XQ.dim32(2);
  const size_t KH = WQ.dim32(1);
  const size_t KW = WQ.dim32(2);
  const size_t KC = WQ.dim32(3);

  XQcol->Resize(XQ.dim32(0),
                (XQ.dim32(1) - KH + args.pad_t + args.pad_b) / args.stride_h + 1,
                (XQ.dim32(2) - KW + args.pad_l + args.pad_r) / args.stride_w + 1,
                KH * KW * KC);

  if (args.pad_l == 0 && args.pad_r == 0 && args.pad_b == 0 && args.pad_t == 0 &&
      args.stride_h == 1 && args.stride_w == 1 && KH == 1 && KW == 1) {
    CAFFE_ENFORCE_EQ(XQ.size(), XQcol->size());
    XQcol->ShareExternalPointer(const_cast<uint8_t*>(XQ.data<uint8_t>()), XQ.size());
    return;
  }
  const size_t OH = XQcol->dim32(1);
  const size_t OW = XQcol->dim32(2);

  const uint8_t* XQdata = XQ.data<uint8_t>();
  uint8_t* XQcoldata = XQcol->mutable_data<uint8_t>();
  for (size_t n = 0; n < N; ++n) {
    for (size_t oh = 0; oh < OH; ++oh) {
      int32_t h_pad = (int32_t)(args.stride_h * oh) - (int32_t)args.pad_t;
      for (size_t ow = 0; ow < OW; ++ow) {
        int32_t w_pad = (int32_t)(args.stride_w * ow) - (int32_t)args.pad_l;
        for (size_t kh = 0; kh < KH; ++kh) {
          int32_t ih = (int32_t)kh + h_pad;
          if ((size_t)ih < (size_t)IH && (size_t)w_pad < (size_t)IW &&
              (size_t)((int32_t)w_pad + (int32_t)KW) < (size_t)IW) {
            // We can do a larger memcpy, of size KW * KC.
            size_t off = kh * KW * KC + ow * KH * KW * KC + oh * KH * KW * KC * OW +
                n * KH * KW * KC * OW * OH;
            std::memcpy(&XQcoldata[off],
                        &XQdata[((int32_t)w_pad) * KC + ih * IW * KC + n * IW * KC * IH],
                        KW * KC);
          } else {
            for (size_t kw = 0; kw < KW; ++kw) {
              int32_t iw = (int32_t)kw + w_pad;
              // Use unsigned integer math to avoid multiple comparisons (>= H, < 0).
              size_t off = kw * KC + kh * KW * KC + ow * KH * KW * KC + oh * KH * KW * KC * OW +
                  n * KH * KW * KC * OW * OH;
              if ((size_t)ih < (size_t)IH && (size_t)iw < (size_t)IW) {
                std::memcpy(
                    &XQcoldata[off], &XQdata[iw * KC + ih * IW * KC + n * KC * IW * IH], KC);
              } else {
                // This should be simply padded with zero.
                std::memset(&XQcoldata[off], 0, KC);
              }
            }
          }
        }
      }
    }
  }
}
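
// [Illustrative sketch, not part of ulp.cc] The column buffer produced above
// is laid out NHWC-style as [N, OH, OW, KH * KW * KC]: patch element
// (kh, kw, kc) for output position (n, oh, ow) lands at the offset computed
// below, matching the off expressions in both branches. The fast path at the
// top of qim2col works because a 1x1 kernel with unit stride and no padding
// makes this an identity layout, so the input buffer can simply be aliased.
inline size_t qim2colOffset(size_t n, size_t oh, size_t ow,
                            size_t kh, size_t kw, size_t kc,
                            size_t OH, size_t OW,
                            size_t KH, size_t KW, size_t KC) {
  return kc + KC * kw + KC * KW * kh + KH * KW * KC * (ow + OW * (oh + OH * n));
}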

std::unique_ptr<QConvState> create2b1bConvState(Workspace* ws,
                                                const TensorCPU& W,
                                                const TensorCPU* b) {
  auto state = caffe2::make_unique<QConvState>();
  state->XQs.resize(k2b1bXBits);
  state->YQs.resize(k2b1bXBits);
  for (auto i = 0; i < k2b1bXBits; ++i) {
    state->XQs[i] = caffe2::make_unique<TensorCPU>();
    state->YQs[i] = caffe2::make_unique<TensorCPU>();
  }
  state->WQ = caffe2::make_unique<TensorCPU>();
  state->WQN = caffe2::make_unique<TensorCPU>();
  state->WQL1Norm = caffe2::make_unique<TensorCPU>();
  state->scratch = caffe2::make_unique<TensorCPU>();
  state->scratchColBuffer = caffe2::make_unique<TensorCPU>();

  signQuantize(W, state->WQ.get());
  filterNormalization11(*(state->WQ), state->WQN.get());
  filterNormalizationL1(W, state->WQL1Norm.get());
  // TODO: incorporate center distance normalization.
  // Since inputs to convs are [0, 1, 2, 3], instead of [0, x, 2 * x, ...],
  // we can just uniformly rescale the outputs by x, i.e.,
  // for (auto i = 0; i < r->WQL1Norm.size(); ++i) {
  //   r->WQL1Norm.mutable_data<float>()[i] *= center_distance;
  // }
  state->parallelFor = [ws](size_t range, std::function<void(size_t)> f) {
#if CAFFE2_MOBILE
    ws->GetThreadPool()->run([&](int, size_t v) { f(v); }, range);
#else
    for (size_t v = 0; v < range; ++v) {
      f(v);
    }
#endif
  };
  if (b) {
    state->bias = caffe2::make_unique<TensorCPU>(*b);
  }
  return state;
}

void run2b1bConvGeneric(QConvState* state, const ConvArgs& args, const TensorCPU& X, TensorCPU* Y) {
#ifdef __ARM_NEON__
  if (run2b1bConvNeon(state, args, X, Y)) {
    return;
  }
#endif
  uniformQuantize2b1b(X, state->XQs, 0.5, 1.0);
  for (auto i = 0; i < k2b1bXBits; ++i) {
    qconv(args, *(state->XQs[i]), *(state->WQ), nullptr, state->YQs[i].get());
  }
  Y->ResizeLike(*(state->YQs[0]));
  const auto F = state->WQ->dim(0);
  const auto N = Y->size() / F;
  run2b1bUnification(state,
                     N,
                     F,
                     state->WQN->data<float>(),
                     state->YQs[0]->data<float>(),
                     state->YQs[1]->data<float>(),
                     F,
                     Y->mutable_data<float>(),
                     F,
                     state->bias ? state->bias->data<float>() : nullptr);
}

void run2b1bUnification(QConvState* state,
                        size_t N,
                        size_t C,
                        const float* WQNVdata,
                        const float* YQs0Vdata,
                        const float* YQs1Vdata,
                        size_t YQstride,
                        float* Ydata,
                        size_t Ystride,
                        const float* bias) {
  ConstEigenVectorArrayMap<float> WQNV(WQNVdata, C);

  for (size_t j = 0; j < N; ++j) {
    ConstEigenVectorArrayMap<float> YQs0V(YQs0Vdata + YQstride * j, C);
    ConstEigenVectorArrayMap<float> YQs1V(YQs1Vdata + YQstride * j, C);
    EigenVectorArrayMap<float> YNV(Ydata + Ystride * j, C);
    if (bias) {
      ConstEigenVectorArrayMap<float> BV(bias, C);
      YNV = (std::pow<float>(2, k2b1bXBits) - 1) / 2 * WQNV + std::pow<float>(2, -1) * YQs0V +
          std::pow<float>(2, 0) * YQs1V + BV;
    } else {
      YNV = (std::pow<float>(2, k2b1bXBits) - 1) / 2 * WQNV + std::pow<float>(2, -1) * YQs0V +
          std::pow<float>(2, 0) * YQs1V;
    }
  }
}
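
// [Illustrative sketch, not part of ulp.cc] Why these coefficients: the 2-bit
// activation code is q = x0 + 2 * x1 with plane bits x_i in {0, 1}, while each
// per-plane qconv output is a +/-1 dot product, YQ_i = sum_j s(x_ij) * w_j
// with s(0) = -1, s(1) = +1 and w_j in {-1, +1}. Since x_ij = (s(x_ij) + 1) / 2,
//   sum_j x_ij * w_j = (YQ_i + WQN) / 2,   where WQN = sum_j w_j.
// Hence sum_j q_j * w_j = (YQ_0 + WQN) / 2 + (YQ_1 + WQN)
//                       = 0.5 * YQ_0 + 1.0 * YQ_1 + 1.5 * WQN,
// and 1.5 = (2^k2b1bXBits - 1) / 2, matching the expression above.
inline float unify2b1bScalar(float yq0, float yq1, float wqn, float bias) {
  return 1.5f * wqn + 0.5f * yq0 + 1.0f * yq1 + bias;
}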

class QConvOp final : public ConvPoolOpBase<CPUContext> {
 public:
  QConvOp(const OperatorDef& operator_def, Workspace* ws)
      : ConvPoolOpBase<CPUContext>(operator_def, ws), ws_(ws) {
    OPERATOR_NEEDS_FEATURE(this->order_ == StorageOrder::NHWC, "QConvOp only supports NHWC order");
    OPERATOR_NEEDS_FEATURE(this->dilation_h() == 1, "QConvOp does not support dilation");
    OPERATOR_NEEDS_FEATURE(this->dilation_w() == 1, "QConvOp does not support dilation");
    OPERATOR_NEEDS_FEATURE(this->group_ == 1, "QConvOp does not support group convolution");
  }

  bool RunOnDeviceWithOrderNHWC() override {
    auto& X = Input(0);
    auto& filter = Input(1);
    const auto* bias = InputSize() == 3 ? &Input(2) : nullptr;
    auto* Y = Output(0);

    // TODO: Support multiple quantization methods instead of assuming 2b1b.
    if (!state_) {
      state_ = create2b1bConvState(ws_, filter, bias);
    }
    ConvArgs args;
    args.pad_l = this->pad_l();
    args.pad_t = this->pad_t();
    args.pad_b = this->pad_b();
    args.pad_r = this->pad_r();
    args.stride_h = this->stride_h();
    args.stride_w = this->stride_w();
    run2b1bConvGeneric(state_.get(), args, X, Y);
    return true;
  }

 private:
  std::unique_ptr<QConvState> state_;
  Workspace* ws_;
};

REGISTER_CPU_OPERATOR(QConv, QConvOp);

} // namespace caffe2