Caffe2 - C++ API
A deep learning, cross platform ML framework
stylizer_ops.cc
1 #include "caffe2/core/operator.h"
2 #include "caffe2/utils/cpu_neon.h"
3 #include "caffe2/utils/math.h"
4 
5 namespace caffe2 {
6 
7 #ifdef __ARM_NEON__
8 namespace {
9 
10 //
11 // ARM Neon code utilities
12 //
13 
14 inline float32x4_t to_v4_f32(uint16x4_t v) {
15  return vcvtq_f32_u32(vmovl_u16(v));
16 }
17 
18 inline float32x4x4_t to_f32_v4_x4(uint8x16_t v) {
19  float32x4x4_t out;
20 
21  uint16x8_t lo_u16 = vmovl_u8(vget_low_u8(v));
22 
23  out.val[0] = to_v4_f32(vget_low_u16(lo_u16));
24  out.val[1] = to_v4_f32(vget_high_u16(lo_u16));
25 
26  uint16x8_t hi_u16 = vmovl_u8(vget_high_u8(v));
27 
28  out.val[2] = to_v4_f32(vget_low_u16(hi_u16));
29  out.val[3] = to_v4_f32(vget_high_u16(hi_u16));
30 
31  return out;
32 }
33 
34 inline void clamp(float32x4_t& v) {
35  v = vmaxq_f32(v, vdupq_n_f32(0));
36  v = vminq_f32(v, vdupq_n_f32((float)std::numeric_limits<uint8_t>::max()));
37 }
38 
39 inline void addMeanAndClamp(float32x4_t& v, float mean) {
40  v = vaddq_f32(v, vdupq_n_f32(mean));
41  clamp(v);
42 }
43 
44 inline uint8x8_t convertNarrowAndPack(float32x4_t v0, float32x4_t v1) {
45  uint16x4_t u16_0 = vmovn_u32(vcvtq_u32_f32(v0));
46  uint16x4_t u16_1 = vmovn_u32(vcvtq_u32_f32(v1));
47  uint16x8_t u16_01 = vcombine_u16(u16_0, u16_1);
48  return vmovn_u16(u16_01);
49 }
50 
51 } // unnamed namespace
52 #endif // __ARM_NEON__
53 
54 class PackedInt8BGRANHWCToNCHWCStylizerPreprocessOp
55  : public Operator<CPUContext> {
56  public:
57  // Expect this many channels as input
58  static constexpr int kInputChannels = 4;
59 
60  // Expect this many channels as output
61  static constexpr int kOutputChannels = 3;
62 
63  // We read this much noise per vectorized cycle
64  static constexpr int kNeonNoiseReadSize = kOutputChannels * 16;
65 
66  USE_OPERATOR_FUNCTIONS(CPUContext);
67  PackedInt8BGRANHWCToNCHWCStylizerPreprocessOp(
68  const OperatorDef& operator_def,
69  Workspace* ws)
70  : Operator<CPUContext>(operator_def, ws), ws_(ws) {}
71 
72  bool RunOnDevice() {
73  const auto& X = Input(0);
74  const auto& mean = Input(1);
75  auto* Y = Output(0);
76  auto* noiseBlob = ws_->CreateBlob("__CAFFE2_STYLIZER_NOISE__");
77  auto defaultNoiseSize = OperatorBase::GetSingleArgument<int>(
78  "noise_size", 491 /* prime to avoid artifacts */);
79 
80  if (!noiseBlob->IsType<TensorCPU>()) {
81  // Initialize random noise on first use.
82  // Cache it to maintain temporal consistency.
83  auto* t = noiseBlob->template GetMutable<TensorCPU>();
84 
85 #ifdef __ARM_NEON__
86  // Noise space is larger for vectorized code due to the
87  // vectorized load
88  initNoiseCPUNeon(t, defaultNoiseSize);
89 #else
90  initNoiseCPU(t, defaultNoiseSize);
91 #endif
92  }
93  const auto& noise = noiseBlob->template Get<TensorCPU>();
94  CAFFE_ENFORCE(noise.size() >= defaultNoiseSize);
95 
96  CAFFE_ENFORCE(X.ndim() == 4);
97  const int N = X.dim32(0), H = X.dim32(1), W = X.dim32(2), C = X.dim32(3);
98  // Assume BGR or BGRA
99  CAFFE_ENFORCE(mean.size() == kOutputChannels);
100 
101  CAFFE_ENFORCE(C == kInputChannels);
102  Y->Resize(N, kOutputChannels, H, W);
103 
104  runBatch(
105  N,
106  C,
107  H,
108  W,
109  defaultNoiseSize,
110  X.data<uint8_t>(),
111  mean.data<float>(),
112  noise.data<float>(),
113  Y->mutable_data<float>());
114 
115  return true;
116  }
117 
118 #ifndef __ARM_NEON__
119  void initNoiseCPU(Tensor<CPUContext>* noise, int size) {
120  noise->Resize(size);
121 
122  math::RandGaussian<float, CPUContext>(
123  size,
124  0.0,
125  OperatorBase::GetSingleArgument<float>("noise_std", 10.0),
126  noise->template mutable_data<float>(),
127  &context_);
128  }
129 #endif // !__ARM_NEON__
130 
131 #ifdef __ARM_NEON__
132  void initNoiseCPUNeon(Tensor<CPUContext>* noise, int size) {
133  // For ARM NEON, the inner loop reads noise in multiples of
134  // kNeonNoiseReadSize, so round up to a multiple of that size and
135  // over-allocate so vectorized reads never run past the buffer end
136  size = math::roundUp(size, kNeonNoiseReadSize) + size;
137  noise->Resize(size);
138 
139  math::RandGaussian<float, CPUContext>(
140  size,
141  0.0,
142  OperatorBase::GetSingleArgument<float>("noise_std", 10.0),
143  noise->template mutable_data<float>(),
144  &context_);
145  }
146 #endif // __ARM_NEON__
147 
148  void runBatch(
149  int N,
150  int /*C*/,
151  int H,
152  int W,
153  int noiseCycle,
154  const uint8_t* input,
155  const float* meanChannel,
156  const float* noise,
157  float* output) {
158  int planeSize = H * W;
159 
160  for (int n = 0; n < N; ++n) {
161  auto curInput = input + n * kInputChannels * planeSize;
162  auto curOutput = output + n * kOutputChannels * planeSize;
163 
164 #ifdef __ARM_NEON__
165  runCPUNeon(H, W, noiseCycle, curInput, meanChannel, noise, curOutput);
166 #else
167  runCPU(H, W, noiseCycle, curInput, meanChannel, noise, curOutput);
168 #endif // __ARM_NEON__
169  }
170  }
171 
172 #ifndef __ARM_NEON__
173  void runCPU(
174  int H,
175  int W,
176  int noiseCycle,
177  const uint8_t* input,
178  const float* meanChannel,
179  const float* noise,
180  float* output) {
181  int planeSize = H * W;
182  int noiseOffset = 0;
183 
184  for (int point = 0; point < planeSize; ++point) {
185  for (int c = 0; c < kOutputChannels; ++c) {
186  float v = (float)input[point * kInputChannels + c];
187  output[c * planeSize + point] = v - meanChannel[c] + noise[noiseOffset];
188 
189  if (++noiseOffset >= noiseCycle) {
190  noiseOffset = 0;
191  }
192  }
193  }
194  }
195 #endif // !__ARM_NEON__
196 
197 #ifdef __ARM_NEON__
198  void runCPUNeon(
199  int H,
200  int W,
201  int noiseCycle,
202  const uint8_t* input,
203  const float* meanChannel,
204  const float* noise,
205  float* output) {
206  // Vectorized load parameters:
207 
208  // Loop unroll factor
209  // FIXME: this doesn't actually unroll; clang has per-loop unroll
210  // pragmas but GCC does not
211  constexpr int kUnroll = 1;
212 
213  // How much data we load for each inner loop
214  constexpr int kInnerLoadSize = sizeof(uint8x16x4_t);
215 
216  // What we write out
217  constexpr int kInnerStoreSize = sizeof(float32x4_t);
218 
219  // We load 16 pixels at a time, with 4 channels each
220  constexpr int kLoadPixels = kInnerLoadSize / kInputChannels;
221  static_assert(kLoadPixels == 16, "unexpected");
222 
223  // How many pixels we load per loop
224  constexpr int kLoadPixelsPerLoop = kLoadPixels * kUnroll;
225 
226  // We need at least this much noise each loop through
227  CAFFE_ENFORCE_GE(noiseCycle, kOutputChannels * kLoadPixelsPerLoop);
228 
229  int noiseUsed = 0;
230  const float* curNoise = noise;
231 
232  float mean[kOutputChannels] = {
233  meanChannel[0], meanChannel[1], meanChannel[2]};
234  int planeSize = H * W;
235 
236  // Vectorized portion
237  int point = 0;
238 
239  // If the slice is not aligned, then we have to use the
240  // un-vectorized version
241  bool isAligned = isPointerAligned(input, kInnerLoadSize) &&
242  isPointerAligned(output, kInnerStoreSize) &&
243  // Because we are writing to output at offsets of planeSize,
244  // planeSize has to be an even multiple of kInnerStoreSize
245  (planeSize % kInnerStoreSize == 0);
246 
247  // What portion the vectorized loop will handle
248  int limit =
249  isAligned ? (planeSize / kLoadPixelsPerLoop) * kLoadPixelsPerLoop : 0;
250 
251  for (; point < limit; point += kLoadPixelsPerLoop) {
252  // Unroll load/update/store by kUnroll
253  for (int j = 0; j < kUnroll; ++j) {
254  // We load 16 pixels x 4 channels at a time
255  const uint8_t* inputAligned = (const uint8_t*)__builtin_assume_aligned(
256  input + (point + j * kLoadPixels) * kInputChannels,
257  sizeof(uint8x16x4_t));
258  uint8x16x4_t loadV = vld4q_u8(inputAligned);
259 
260  // The compiler doesn't want to unroll this when we put it in a
261  // loop, and in GCC there's no per-loop unroll pragma, so we do
262  // it manually.
263  // This seems to involve no register spillage, crossing fingers
264  // that it remains that way.
265  {
266  constexpr int kChannel = 0;
267  float32x4_t noise0 = vld1q_f32(curNoise + j * 48 + 0);
268  float32x4_t noise1 = vld1q_f32(curNoise + j * 48 + 4);
269  float32x4_t noise2 = vld1q_f32(curNoise + j * 48 + 8);
270  float32x4_t noise3 = vld1q_f32(curNoise + j * 48 + 12);
271 
272  float32x4x4_t outV = to_f32_v4_x4(loadV.val[kChannel]);
273  float32x4_t meanV = vdupq_n_f32(mean[kChannel]);
274  outV.val[0] = vsubq_f32(outV.val[0], meanV);
275  outV.val[1] = vsubq_f32(outV.val[1], meanV);
276  outV.val[2] = vsubq_f32(outV.val[2], meanV);
277  outV.val[3] = vsubq_f32(outV.val[3], meanV);
278 
279  outV.val[0] = vaddq_f32(outV.val[0], noise0);
280  outV.val[1] = vaddq_f32(outV.val[1], noise1);
281  outV.val[2] = vaddq_f32(outV.val[2], noise2);
282  outV.val[3] = vaddq_f32(outV.val[3], noise3);
283 
284  float* outputAligned = (float*)__builtin_assume_aligned(
285  &output[kChannel * planeSize + (point + j * kLoadPixels)],
286  sizeof(float32x4_t));
287 
288  vst1q_f32(outputAligned + 0, outV.val[0]);
289  vst1q_f32(outputAligned + 4, outV.val[1]);
290  vst1q_f32(outputAligned + 8, outV.val[2]);
291  vst1q_f32(outputAligned + 12, outV.val[3]);
292  }
293 
294  {
295  constexpr int kChannel = 1;
296  float32x4_t noise0 = vld1q_f32(curNoise + j * 48 + 16);
297  float32x4_t noise1 = vld1q_f32(curNoise + j * 48 + 20);
298  float32x4_t noise2 = vld1q_f32(curNoise + j * 48 + 24);
299  float32x4_t noise3 = vld1q_f32(curNoise + j * 48 + 28);
300 
301  float32x4x4_t outV = to_f32_v4_x4(loadV.val[kChannel]);
302  float32x4_t meanV = vdupq_n_f32(mean[kChannel]);
303  outV.val[0] = vsubq_f32(outV.val[0], meanV);
304  outV.val[1] = vsubq_f32(outV.val[1], meanV);
305  outV.val[2] = vsubq_f32(outV.val[2], meanV);
306  outV.val[3] = vsubq_f32(outV.val[3], meanV);
307 
308  outV.val[0] = vaddq_f32(outV.val[0], noise0);
309  outV.val[1] = vaddq_f32(outV.val[1], noise1);
310  outV.val[2] = vaddq_f32(outV.val[2], noise2);
311  outV.val[3] = vaddq_f32(outV.val[3], noise3);
312 
313  float* outputAligned = (float*)__builtin_assume_aligned(
314  &output[kChannel * planeSize + (point + j * kLoadPixels)],
315  sizeof(float32x4_t));
316 
317  vst1q_f32(outputAligned + 0, outV.val[0]);
318  vst1q_f32(outputAligned + 4, outV.val[1]);
319  vst1q_f32(outputAligned + 8, outV.val[2]);
320  vst1q_f32(outputAligned + 12, outV.val[3]);
321  }
322 
323  {
324  constexpr int kChannel = 2;
325  float32x4_t noise0 = vld1q_f32(curNoise + j * 48 + 32);
326  float32x4_t noise1 = vld1q_f32(curNoise + j * 48 + 36);
327  float32x4_t noise2 = vld1q_f32(curNoise + j * 48 + 40);
328  float32x4_t noise3 = vld1q_f32(curNoise + j * 48 + 44);
329 
330  float32x4x4_t outV = to_f32_v4_x4(loadV.val[kChannel]);
331  float32x4_t meanV = vdupq_n_f32(mean[kChannel]);
332  outV.val[0] = vsubq_f32(outV.val[0], meanV);
333  outV.val[1] = vsubq_f32(outV.val[1], meanV);
334  outV.val[2] = vsubq_f32(outV.val[2], meanV);
335  outV.val[3] = vsubq_f32(outV.val[3], meanV);
336 
337  outV.val[0] = vaddq_f32(outV.val[0], noise0);
338  outV.val[1] = vaddq_f32(outV.val[1], noise1);
339  outV.val[2] = vaddq_f32(outV.val[2], noise2);
340  outV.val[3] = vaddq_f32(outV.val[3], noise3);
341 
342  float* outputAligned = (float*)__builtin_assume_aligned(
343  &output[kChannel * planeSize + (point + j * kLoadPixels)],
344  sizeof(float32x4_t));
345 
346  vst1q_f32(outputAligned + 0, outV.val[0]);
347  vst1q_f32(outputAligned + 4, outV.val[1]);
348  vst1q_f32(outputAligned + 8, outV.val[2]);
349  vst1q_f32(outputAligned + 12, outV.val[3]);
350  }
351  }
352 
353  curNoise += (kLoadPixels * kOutputChannels) * kUnroll;
354  noiseUsed += (kLoadPixels * kOutputChannels) * kUnroll;
355 
356  if (noiseUsed >= noiseCycle) {
357  noiseUsed = 0;
358  curNoise = noise + ((curNoise - noise) % noiseCycle);
359  }
360  }
361 
362  // Epilogue: non-vectorized remainder
363  for (; point < planeSize; ++point) {
364  for (int c = 0; c < kOutputChannels; ++c) {
365  float v = (float)input[point * kInputChannels + c];
366  output[c * planeSize + point] = v - mean[c] + *curNoise++;
367  ++noiseUsed;
368  }
369 
370  if (noiseUsed >= noiseCycle) {
371  noiseUsed = 0;
372  curNoise = noise + ((curNoise - noise) % noiseCycle);
373  }
374  }
375  }
376 #endif // __ARM_NEON__
377 
378  private:
379  Workspace* ws_;
380 };
381 
382 namespace {
383 
384 template <typename T>
385 static inline T clamped_cast(float f) {
386  if (f >= std::numeric_limits<T>::max()) {
387  return std::numeric_limits<T>::max();
388  }
389  if (f <= std::numeric_limits<T>::min()) {
390  return std::numeric_limits<T>::min();
391  }
392  return static_cast<T>(f);
393 }
394 
395 } // unnamed namespace
396 
397 class BRGNCHWCToPackedInt8BGRAStylizerDeprocessOp
398  : public Operator<CPUContext> {
399  public:
400  USE_SIMPLE_CTOR_DTOR(BRGNCHWCToPackedInt8BGRAStylizerDeprocessOp);
401 
402  // Expect this many channels as input
403  static constexpr int kInputChannels = 3;
404 
405  // Expect this many channels as output
406  static constexpr int kOutputChannels = 4;
407 
408  bool RunOnDevice() {
409  const auto& X = Input(0);
410  const auto& mean = Input(1);
411  auto* Y = Output(0);
412  CAFFE_ENFORCE(X.ndim() == 4);
413  const int N = X.dim32(0), C = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
414  // Assume BGR or BGRA
415  CAFFE_ENFORCE(mean.size() == kInputChannels);
416  CAFFE_ENFORCE(C == kInputChannels);
417  // RGB
418  Y->Resize(N, H, W, kOutputChannels);
419 
420  runBatch(
421  N,
422  C,
423  H,
424  W,
425  X.data<float>(),
426  mean.data<float>(),
427  Y->mutable_data<uint8_t>());
428 
429  return true;
430  }
431 
432  void runBatch(
433  int N,
434  int /*C*/,
435  int H,
436  int W,
437  const float* input,
438  const float* meanChannel,
439  uint8_t* output) {
440  int planeSize = H * W;
441 
442  for (int n = 0; n < N; ++n) {
443  auto curInput = input + n * kInputChannels * planeSize;
444  auto curOutput = output + n * kOutputChannels * planeSize;
445 
446 #ifdef __ARM_NEON__
447  runCPUNeon(H, W, curInput, meanChannel, curOutput);
448 #else
449  runCPU(H, W, curInput, meanChannel, curOutput);
450 #endif // __ARM_NEON__
451  }
452  }
453 
454 #ifndef __ARM_NEON__
455  void runCPU(
456  int H,
457  int W,
458  const float* input,
459  const float* meanChannel,
460  uint8_t* output) {
461  int planeSize = H * W;
462 
463  for (int point = 0; point < planeSize; ++point) {
464  for (int c = 0; c < kInputChannels; ++c) {
465  uint8_t v = clamped_cast<uint8_t>(
466  input[c * planeSize + point] + meanChannel[c]);
467  output[point * kOutputChannels + c] = v;
468  }
469 
470  // alpha
471  output[point * kOutputChannels + (kOutputChannels - 1)] =
472  std::numeric_limits<uint8_t>::max();
473  }
474  }
475 #endif // !__ARM_NEON__
476 
477 #ifdef __ARM_NEON__
478  void runCPUNeon(
479  int H,
480  int W,
481  const float* input,
482  const float* meanChannel,
483  uint8_t* output) {
484  // Vectorized load parameters:
485 
486  // We load in chunks of this size
487  constexpr int kLoadUnit = sizeof(float32x4_t);
488  constexpr int kLoadFloats = (sizeof(float32x4_t) / sizeof(float));
489 
490  // We store in chunks of this size
491  constexpr int kStoreUnit = sizeof(uint8x8x4_t);
492 
493  // The vector portion loads this many f32 pixels at a time (8)
494  constexpr int kLoadPixels = 2 * kLoadFloats;
495 
496  float mean[kInputChannels] = {
497  meanChannel[0], meanChannel[1], meanChannel[2]};
498  int planeSize = H * W;
499 
500  // Vectorized portion
501  int point = 0;
502 
503  // If the slice is not aligned, then we have to use the
504  // un-vectorized version
505  bool isAligned = isPointerAligned(input, kLoadUnit) &&
506  isPointerAligned(output, kStoreUnit) &&
507  // Because we are reading from input at offsets of planeSize,
508  // planeSize has to be an even multiple of kLoadUnit
509  (planeSize % kLoadUnit == 0);
510 
511  // What portion the vectorized loop will handle
512  int limit = isAligned ? (planeSize / kLoadPixels) * kLoadPixels : 0;
513 
514  for (; point < limit; point += kLoadPixels) {
515  // Load 8 f32 pixels from each channel; loading 16 involves
516  // register spills it seems
517  float32x4_t inputc0_0 =
518  vld1q_f32_aligned(input + 0 * planeSize + point + 0 * kLoadFloats);
519  float32x4_t inputc0_1 =
520  vld1q_f32_aligned(input + 0 * planeSize + point + 1 * kLoadFloats);
521 
522  float32x4_t inputc1_0 =
523  vld1q_f32_aligned(input + 1 * planeSize + point + 0 * kLoadFloats);
524  float32x4_t inputc1_1 =
525  vld1q_f32_aligned(input + 1 * planeSize + point + 1 * kLoadFloats);
526 
527  float32x4_t inputc2_0 =
528  vld1q_f32_aligned(input + 2 * planeSize + point + 0 * kLoadFloats);
529  float32x4_t inputc2_1 =
530  vld1q_f32_aligned(input + 2 * planeSize + point + 1 * kLoadFloats);
531 
532  addMeanAndClamp(inputc0_0, mean[0]);
533  addMeanAndClamp(inputc0_1, mean[0]);
534  uint8x8_t u8_c0 = convertNarrowAndPack(inputc0_0, inputc0_1);
535 
536  addMeanAndClamp(inputc1_0, mean[1]);
537  addMeanAndClamp(inputc1_1, mean[1]);
538  uint8x8_t u8_c1 = convertNarrowAndPack(inputc1_0, inputc1_1);
539 
540  addMeanAndClamp(inputc2_0, mean[2]);
541  addMeanAndClamp(inputc2_1, mean[2]);
542  uint8x8_t u8_c2 = convertNarrowAndPack(inputc2_0, inputc2_1);
543 
544  // This is the alpha channel
545  uint8x8_t u8_c3 = vdup_n_u8(std::numeric_limits<uint8_t>::max());
546 
547  // We now have 8 bytes of each channel in a separate vector
548  // Write BGRA interleaved to output
549  uint8x8x4_t u8_out = {{ u8_c0, u8_c1, u8_c2, u8_c3 }};
550  vst4_u8_aligned(output + kOutputChannels * point, u8_out);
551  }
552 
553  // Epilogue: non-vectorized remainder
554  for (; point < planeSize; ++point) {
555  for (int c = 0; c < kInputChannels; ++c) {
556  uint8_t v =
557  clamped_cast<uint8_t>(input[c * planeSize + point] + mean[c]);
558  output[point * kOutputChannels + c] = v;
559  }
560 
561  // alpha
562  output[point * kOutputChannels + (kOutputChannels - 1)] =
563  std::numeric_limits<uint8_t>::max();
564  }
565  }
566 #endif // __ARM_NEON__
567 };
568 
569 namespace {
570 
571 REGISTER_CPU_OPERATOR(
572  PackedInt8BGRANHWCToNCHWCStylizerPreprocess,
573  PackedInt8BGRANHWCToNCHWCStylizerPreprocessOp);
574 OPERATOR_SCHEMA(PackedInt8BGRANHWCToNCHWCStylizerPreprocess)
575  .NumInputs(2)
576  .NumOutputs(1);
577 REGISTER_CPU_OPERATOR(
578  BRGNCHWCToPackedInt8BGRAStylizerDeprocess,
579  BRGNCHWCToPackedInt8BGRAStylizerDeprocessOp);
580 OPERATOR_SCHEMA(BRGNCHWCToPackedInt8BGRAStylizerDeprocess)
581  .NumInputs(2)
582  .NumOutputs(1);
583 } // namespace
584 } // namespace caffe2
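
The two operators registered above are intended to bracket a stylizer network: the preprocess op converts packed uint8 BGRA NHWC input into mean-subtracted, noise-augmented float BGR NCHW, and the deprocess op packs float NCHW back into uint8 BGRA NHWC. Below is a minimal usage sketch, not part of stylizer_ops.cc, assuming the standard Caffe2 C++ Workspace/OperatorDef API; the blob names ("data", "mean", "styled_input") and tensor sizes are illustrative only.

// Hedged usage sketch (illustrative; not from stylizer_ops.cc):
// run the preprocess operator on a small BGRA image via a Workspace.
#include <algorithm>

#include "caffe2/core/operator.h"
#include "caffe2/core/workspace.h"

namespace caffe2 {

void RunStylizerPreprocessExample() {
  Workspace ws;

  // Input 0: packed uint8 BGRA image in NHWC layout, here 1 x 8 x 8 x 4.
  auto* data = ws.CreateBlob("data")->GetMutable<TensorCPU>();
  data->Resize(1, 8, 8, 4);
  std::fill(
      data->mutable_data<uint8_t>(),
      data->mutable_data<uint8_t>() + data->size(),
      static_cast<uint8_t>(128));

  // Input 1: per-channel BGR mean (kOutputChannels == 3 floats).
  auto* mean = ws.CreateBlob("mean")->GetMutable<TensorCPU>();
  mean->Resize(3);
  float* m = mean->mutable_data<float>();
  m[0] = 104.f;
  m[1] = 117.f;
  m[2] = 123.f;

  // Build and run the operator. The output "styled_input" is float
  // NCHW (1 x 3 x 8 x 8) with the mean subtracted and noise added.
  OperatorDef def;
  def.set_type("PackedInt8BGRANHWCToNCHWCStylizerPreprocess");
  def.add_input("data");
  def.add_input("mean");
  def.add_output("styled_input");
  auto op = CreateOperator(def, &ws);
  CAFFE_ENFORCE(op);
  CAFFE_ENFORCE(op->Run());

  const auto& Y = ws.GetBlob("styled_input")->Get<TensorCPU>();
  CAFFE_ENFORCE(Y.ndim() == 4);
}

} // namespace caffe2

The deprocess operator is driven the same way with type "BRGNCHWCToPackedInt8BGRAStylizerDeprocess", taking the float NCHW tensor and the same mean as inputs and producing a packed uint8 BGRA NHWC tensor.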