1 #include "caffe2/core/operator.h" 2 #include "caffe2/utils/cpu_neon.h" 3 #include "caffe2/utils/math.h" 14 inline float32x4_t to_v4_f32(uint16x4_t v) {
15 return vcvtq_f32_u32(vmovl_u16(v));
18 inline float32x4x4_t to_f32_v4_x4(uint8x16_t v) {
21 uint16x8_t lo_u16 = vmovl_u8(vget_low_u8(v));
23 out.val[0] = to_v4_f32(vget_low_u16(lo_u16));
24 out.val[1] = to_v4_f32(vget_high_u16(lo_u16));
26 uint16x8_t hi_u16 = vmovl_u8(vget_high_u8(v));
28 out.val[2] = to_v4_f32(vget_low_u16(hi_u16));
29 out.val[3] = to_v4_f32(vget_high_u16(hi_u16));
34 inline void clamp(float32x4_t& v) {
35 v = vmaxq_f32(v, vdupq_n_f32(0));
36 v = vminq_f32(v, vdupq_n_f32((
float)std::numeric_limits<uint8_t>::max()));
39 inline void addMeanAndClamp(float32x4_t& v,
float mean) {
40 v = vaddq_f32(v, vdupq_n_f32(mean));
44 inline uint8x8_t convertNarrowAndPack(float32x4_t v0, float32x4_t v1) {
45 uint16x4_t u16_0 = vmovn_u32(vcvtq_u32_f32(v0));
46 uint16x4_t u16_1 = vmovn_u32(vcvtq_u32_f32(v1));
47 uint16x8_t u16_01 = vcombine_u16(u16_0, u16_1);
48 return vmovn_u16(u16_01);
52 #endif // __ARM_NEON__ 58 static constexpr
int kInputChannels = 4;
61 static constexpr
int kOutputChannels = 3;
64 static constexpr
int kNeonNoiseReadSize = kOutputChannels * 16;
68 const OperatorDef& operator_def,
73 const auto& X = Input(0);
74 const auto& mean = Input(1);
76 auto* noiseBlob = ws_->
CreateBlob(
"__CAFFE2_STYLIZER_NOISE__");
77 auto defaultNoiseSize = OperatorBase::GetSingleArgument<int>(
83 auto* t = noiseBlob->template GetMutable<TensorCPU>();
88 initNoiseCPUNeon(t, defaultNoiseSize);
90 initNoiseCPU(t, defaultNoiseSize);
93 const auto& noise = noiseBlob->template Get<TensorCPU>();
94 CAFFE_ENFORCE(noise.size() >= defaultNoiseSize);
96 CAFFE_ENFORCE(X.ndim() == 4);
97 const int N = X.dim32(0), H = X.dim32(1), W = X.dim32(2), C = X.dim32(3);
99 CAFFE_ENFORCE(mean.size() == kOutputChannels);
101 CAFFE_ENFORCE(C == kInputChannels);
102 Y->Resize(N, kOutputChannels, H, W);
113 Y->mutable_data<
float>());
122 math::RandGaussian<float, CPUContext>(
125 OperatorBase::GetSingleArgument<float>(
"noise_std", 10.0),
126 noise->template mutable_data<float>(),
129 #endif // !__ARM_NEON__ 136 size = math::roundUp(size, kNeonNoiseReadSize) + size;
139 math::RandGaussian<float, CPUContext>(
142 OperatorBase::GetSingleArgument<float>(
"noise_std", 10.0),
143 noise->template mutable_data<float>(),
154 const uint8_t* input,
155 const float* meanChannel,
158 int planeSize = H * W;
160 for (
int n = 0; n < N; ++n) {
161 auto curInput = input + n * kInputChannels * planeSize;
162 auto curOutput = output + n * kOutputChannels * planeSize;
165 runCPUNeon(H, W, noiseCycle, curInput, meanChannel, noise, curOutput);
167 runCPU(H, W, noiseCycle, curInput, meanChannel, noise, curOutput);
168 #endif // __ARM_NEON__ 177 const uint8_t* input,
178 const float* meanChannel,
181 int planeSize = H * W;
185 for (
int c = 0; c < kOutputChannels; ++c) {
186 float v = (float)input[
point * kInputChannels + c];
187 output[c * planeSize +
point] = v - meanChannel[c] + noise[noiseOffset];
189 if (++noiseOffset >= noiseCycle) {
#ifdef __ARM_NEON__
// NEON kernel: same math as runCPU, vectorized 16 pixels at a time. The
// fast path runs only when input/output pointers and the plane size meet
// the alignment requirements; otherwise the scalar epilogue handles
// everything.
void runCPUNeon(
    int H,
    int W,
    int noiseCycle,
    const uint8_t* input,
    const float* meanChannel,
    const float* noise,
    float* output) {
  // Unroll factor for the inner loop.
  constexpr int kUnroll = 1;
  // One interleaved load covers 64 bytes (16 BGRA pixels).
  constexpr int kInnerLoadSize = sizeof(uint8x16x4_t);
  constexpr int kInnerStoreSize = sizeof(float32x4_t);
  constexpr int kLoadPixels = kInnerLoadSize / kInputChannels;
  static_assert(kLoadPixels == 16, "unexpected");
  constexpr int kLoadPixelsPerLoop = kLoadPixels * kUnroll;

  // The noise cycle must cover at least one full vectorized iteration so
  // the wrap check only needs to run once per iteration.
  CAFFE_ENFORCE_GE(noiseCycle, kOutputChannels * kLoadPixelsPerLoop);

  int noiseUsed = 0;
  const float* curNoise = noise;

  float mean[kOutputChannels] = {
      meanChannel[0], meanChannel[1], meanChannel[2]};
  int planeSize = H * W;

  bool isAligned = isPointerAligned(input, kInnerLoadSize) &&
      isPointerAligned(output, kInnerStoreSize) &&
      (planeSize % kInnerStoreSize == 0);

  int point = 0;
  int limit =
      isAligned ? (planeSize / kLoadPixelsPerLoop) * kLoadPixelsPerLoop : 0;

  for (; point < limit; point += kLoadPixelsPerLoop) {
    for (int j = 0; j < kUnroll; ++j) {
      // De-interleaving load of 16 BGRA pixels: loadV.val[c] holds the 16
      // bytes of channel c.
      const uint8_t* inputAligned = (const uint8_t*)__builtin_assume_aligned(
          input + (point + j * kLoadPixels) * kInputChannels,
          sizeof(uint8x16x4_t));
      uint8x16x4_t loadV = vld4q_u8(inputAligned);

      // Each unrolled step consumes 48 noise floats: 16 pixels x 3
      // output channels, at offsets 0 / 16 / 32 per channel.
      {
        constexpr int kChannel = 0;
        float32x4_t noise0 = vld1q_f32(curNoise + j * 48 + 0);
        float32x4_t noise1 = vld1q_f32(curNoise + j * 48 + 4);
        float32x4_t noise2 = vld1q_f32(curNoise + j * 48 + 8);
        float32x4_t noise3 = vld1q_f32(curNoise + j * 48 + 12);

        float32x4x4_t outV = to_f32_v4_x4(loadV.val[kChannel]);
        float32x4_t meanV = vdupq_n_f32(mean[kChannel]);
        outV.val[0] = vsubq_f32(outV.val[0], meanV);
        outV.val[1] = vsubq_f32(outV.val[1], meanV);
        outV.val[2] = vsubq_f32(outV.val[2], meanV);
        outV.val[3] = vsubq_f32(outV.val[3], meanV);

        outV.val[0] = vaddq_f32(outV.val[0], noise0);
        outV.val[1] = vaddq_f32(outV.val[1], noise1);
        outV.val[2] = vaddq_f32(outV.val[2], noise2);
        outV.val[3] = vaddq_f32(outV.val[3], noise3);

        float* outputAligned = (float*)__builtin_assume_aligned(
            &output[kChannel * planeSize + (point + j * kLoadPixels)],
            sizeof(float32x4_t));

        vst1q_f32(outputAligned + 0, outV.val[0]);
        vst1q_f32(outputAligned + 4, outV.val[1]);
        vst1q_f32(outputAligned + 8, outV.val[2]);
        vst1q_f32(outputAligned + 12, outV.val[3]);
      }

      {
        constexpr int kChannel = 1;
        float32x4_t noise0 = vld1q_f32(curNoise + j * 48 + 16);
        float32x4_t noise1 = vld1q_f32(curNoise + j * 48 + 20);
        float32x4_t noise2 = vld1q_f32(curNoise + j * 48 + 24);
        float32x4_t noise3 = vld1q_f32(curNoise + j * 48 + 28);

        float32x4x4_t outV = to_f32_v4_x4(loadV.val[kChannel]);
        float32x4_t meanV = vdupq_n_f32(mean[kChannel]);
        outV.val[0] = vsubq_f32(outV.val[0], meanV);
        outV.val[1] = vsubq_f32(outV.val[1], meanV);
        outV.val[2] = vsubq_f32(outV.val[2], meanV);
        outV.val[3] = vsubq_f32(outV.val[3], meanV);

        outV.val[0] = vaddq_f32(outV.val[0], noise0);
        outV.val[1] = vaddq_f32(outV.val[1], noise1);
        outV.val[2] = vaddq_f32(outV.val[2], noise2);
        outV.val[3] = vaddq_f32(outV.val[3], noise3);

        float* outputAligned = (float*)__builtin_assume_aligned(
            &output[kChannel * planeSize + (point + j * kLoadPixels)],
            sizeof(float32x4_t));

        vst1q_f32(outputAligned + 0, outV.val[0]);
        vst1q_f32(outputAligned + 4, outV.val[1]);
        vst1q_f32(outputAligned + 8, outV.val[2]);
        vst1q_f32(outputAligned + 12, outV.val[3]);
      }

      {
        constexpr int kChannel = 2;
        float32x4_t noise0 = vld1q_f32(curNoise + j * 48 + 32);
        float32x4_t noise1 = vld1q_f32(curNoise + j * 48 + 36);
        float32x4_t noise2 = vld1q_f32(curNoise + j * 48 + 40);
        float32x4_t noise3 = vld1q_f32(curNoise + j * 48 + 44);

        float32x4x4_t outV = to_f32_v4_x4(loadV.val[kChannel]);
        float32x4_t meanV = vdupq_n_f32(mean[kChannel]);
        outV.val[0] = vsubq_f32(outV.val[0], meanV);
        outV.val[1] = vsubq_f32(outV.val[1], meanV);
        outV.val[2] = vsubq_f32(outV.val[2], meanV);
        outV.val[3] = vsubq_f32(outV.val[3], meanV);

        outV.val[0] = vaddq_f32(outV.val[0], noise0);
        outV.val[1] = vaddq_f32(outV.val[1], noise1);
        outV.val[2] = vaddq_f32(outV.val[2], noise2);
        outV.val[3] = vaddq_f32(outV.val[3], noise3);

        float* outputAligned = (float*)__builtin_assume_aligned(
            &output[kChannel * planeSize + (point + j * kLoadPixels)],
            sizeof(float32x4_t));

        vst1q_f32(outputAligned + 0, outV.val[0]);
        vst1q_f32(outputAligned + 4, outV.val[1]);
        vst1q_f32(outputAligned + 8, outV.val[2]);
        vst1q_f32(outputAligned + 12, outV.val[3]);
      }
    }

    curNoise += (kLoadPixels * kOutputChannels) * kUnroll;
    noiseUsed += (kLoadPixels * kOutputChannels) * kUnroll;

    if (noiseUsed >= noiseCycle) {
      noiseUsed = 0;
      curNoise = noise + ((curNoise - noise) % noiseCycle);
    }
  }

  // Scalar epilogue: remainder pixels, or the whole plane when unaligned.
  for (; point < planeSize; ++point) {
    for (int c = 0; c < kOutputChannels; ++c) {
      float v = (float)input[point * kInputChannels + c];
      output[c * planeSize + point] = v - mean[c] + *curNoise++;
      ++noiseUsed;
    }

    if (noiseUsed >= noiseCycle) {
      noiseUsed = 0;
      curNoise = noise + ((curNoise - noise) % noiseCycle);
    }
  }
}
#endif // __ARM_NEON__
// Saturating conversion from float to integral type T: values outside
// [numeric_limits<T>::min(), numeric_limits<T>::max()] clamp to the
// corresponding bound. NaN falls through to static_cast (undefined) —
// callers in this file only pass finite pixel + mean sums.
template <typename T>
static inline T clamped_cast(float f) {
  // Explicit casts make the int-limit-to-float promotion visible and
  // silence implicit-conversion warnings.
  if (f >= static_cast<float>(std::numeric_limits<T>::max())) {
    return std::numeric_limits<T>::max();
  }
  if (f <= static_cast<float>(std::numeric_limits<T>::min())) {
    return std::numeric_limits<T>::min();
  }
  return static_cast<T>(f);
}
403 static constexpr
int kInputChannels = 3;
406 static constexpr
int kOutputChannels = 4;
409 const auto& X = Input(0);
410 const auto& mean = Input(1);
412 CAFFE_ENFORCE(X.ndim() == 4);
413 const int N = X.dim32(0), C = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
415 CAFFE_ENFORCE(mean.size() == kInputChannels);
416 CAFFE_ENFORCE(C == kInputChannels);
418 Y->Resize(N, H, W, kOutputChannels);
427 Y->mutable_data<uint8_t>());
438 const float* meanChannel,
440 int planeSize = H * W;
442 for (
int n = 0; n < N; ++n) {
443 auto curInput = input + n * kInputChannels * planeSize;
444 auto curOutput = output + n * kOutputChannels * planeSize;
447 runCPUNeon(H, W, curInput, meanChannel, curOutput);
449 runCPU(H, W, curInput, meanChannel, curOutput);
450 #endif // __ARM_NEON__ 459 const float* meanChannel,
461 int planeSize = H * W;
464 for (
int c = 0; c < kInputChannels; ++c) {
465 uint8_t v = clamped_cast<uint8_t>(
466 input[c * planeSize +
point] + meanChannel[c]);
467 output[
point * kOutputChannels + c] = v;
471 output[
point * kOutputChannels + (kOutputChannels - 1)] =
472 std::numeric_limits<uint8_t>::max();
475 #endif // !__ARM_NEON__ 482 const float* meanChannel,
487 constexpr
int kLoadUnit =
sizeof(float32x4_t);
488 constexpr
int kLoadFloats = (
sizeof(float32x4_t) /
sizeof(
float));
491 constexpr
int kStoreUnit =
sizeof(uint8x8x4_t);
494 constexpr
int kLoadPixels = 2 * kLoadFloats;
496 float mean[kInputChannels] = {
497 meanChannel[0], meanChannel[1], meanChannel[2]};
498 int planeSize = H * W;
505 bool isAligned = isPointerAligned(input, kLoadUnit) &&
506 isPointerAligned(output, kStoreUnit) &&
509 (planeSize % kLoadUnit == 0);
512 int limit = isAligned ? (planeSize / kLoadPixels) * kLoadPixels : 0;
514 for (; point < limit; point += kLoadPixels) {
517 float32x4_t inputc0_0 =
518 vld1q_f32_aligned(input + 0 * planeSize + point + 0 * kLoadFloats);
519 float32x4_t inputc0_1 =
520 vld1q_f32_aligned(input + 0 * planeSize + point + 1 * kLoadFloats);
522 float32x4_t inputc1_0 =
523 vld1q_f32_aligned(input + 1 * planeSize + point + 0 * kLoadFloats);
524 float32x4_t inputc1_1 =
525 vld1q_f32_aligned(input + 1 * planeSize + point + 1 * kLoadFloats);
527 float32x4_t inputc2_0 =
528 vld1q_f32_aligned(input + 2 * planeSize + point + 0 * kLoadFloats);
529 float32x4_t inputc2_1 =
530 vld1q_f32_aligned(input + 2 * planeSize + point + 1 * kLoadFloats);
532 addMeanAndClamp(inputc0_0, mean[0]);
533 addMeanAndClamp(inputc0_1, mean[0]);
534 uint8x8_t u8_c0 = convertNarrowAndPack(inputc0_0, inputc0_1);
536 addMeanAndClamp(inputc1_0, mean[1]);
537 addMeanAndClamp(inputc1_1, mean[1]);
538 uint8x8_t u8_c1 = convertNarrowAndPack(inputc1_0, inputc1_1);
540 addMeanAndClamp(inputc2_0, mean[2]);
541 addMeanAndClamp(inputc2_1, mean[2]);
542 uint8x8_t u8_c2 = convertNarrowAndPack(inputc2_0, inputc2_1);
545 uint8x8_t u8_c3 = vdup_n_u8(std::numeric_limits<uint8_t>::max());
549 uint8x8x4_t u8_out = {{ u8_c0, u8_c1, u8_c2, u8_c3 }};
550 vst4_u8_aligned(output + kOutputChannels * point, u8_out);
554 for (; point < planeSize; ++point) {
555 for (
int c = 0; c < kInputChannels; ++c) {
557 clamped_cast<uint8_t>(input[c * planeSize + point] + mean[c]);
558 output[point * kOutputChannels + c] = v;
562 output[point * kOutputChannels + (kOutputChannels - 1)] =
563 std::numeric_limits<uint8_t>::max();
566 #endif // __ARM_NEON__ 571 REGISTER_CPU_OPERATOR(
572 PackedInt8BGRANHWCToNCHWCStylizerPreprocess,
574 OPERATOR_SCHEMA(PackedInt8BGRANHWCToNCHWCStylizerPreprocess)
577 REGISTER_CPU_OPERATOR(
578 BRGNCHWCToPackedInt8BGRAStylizerDeprocess,
580 OPERATOR_SCHEMA(BRGNCHWCToPackedInt8BGRAStylizerDeprocess)
Blob * CreateBlob(const string &name)
Creates a blob of the given name.
The CPU Context, representing the bare minimum of what a Context class in Caffe2 should implement...
Workspace is a class that holds all the related objects created during runtime: (1) all blobs...
void Resize(Ts...dim_source)
Resizes a tensor.
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...