#include "DataTransfer.h"

#include <arm_neon.h>
#include "caffe2/core/common.h"

// Loads 16 consecutive uint16 values from `address` and de-interleaves them
// with a stride of four, so that val[k] of the result holds elements
// k, k + 4, k + 8 and k + 12.
//
// The pointer is passed through __builtin_assume_aligned(…, 16): the caller
// promises that `address` is 16-byte aligned, and passing a pointer with
// weaker alignment is undefined behavior.
inline uint16x4x4_t vld4_u16_aligned16(const uint16_t* address) {
  return vld4_u16(static_cast<const uint16_t*>(__builtin_assume_aligned(address, 16)));
}
// Loads four consecutive uint16 values from `address` into one 64-bit vector.
//
// The pointer is passed through __builtin_assume_aligned(…, 8): the caller
// promises that `address` is 8-byte aligned, and passing a pointer with
// weaker alignment is undefined behavior.
inline uint16x4_t vld1_u16_aligned8(const uint16_t* address) {
  return vld1_u16(static_cast<const uint16_t*>(__builtin_assume_aligned(address, 8)));
}
// Stores the four vectors of `data` to `address` interleaved with a stride of
// four (16 uint16 values in total): lane i of data.val[k] is written to
// element 4 * i + k.
//
// The pointer is passed through __builtin_assume_aligned(…, 16): the caller
// promises that `address` is 16-byte aligned, and passing a pointer with
// weaker alignment is undefined behavior.
inline void vst4_u16_aligned16(uint16_t* address, uint16x4x4_t data) {
  vst4_u16(static_cast<uint16_t*>(__builtin_assume_aligned(address, 16)), data);
}
// Stores the four uint16 lanes of `data` to consecutive elements at `address`.
//
// The pointer is passed through __builtin_assume_aligned(…, 8): the caller
// promises that `address` is 8-byte aligned, and passing a pointer with
// weaker alignment is undefined behavior.
inline void vst1_u16_aligned8(uint16_t* address, uint16x4_t data) {
  vst1_u16(static_cast<uint16_t*>(__builtin_assume_aligned(address, 8)), data);
}
// Converts a planar float32 image into pixel-interleaved half floats.
//
// input:       `input_channels` consecutive planes of height * width floats
//              (plane k starts at input + k * height * width).
// output:      `height` rows of pixels; every pixel is four 16-bit half-float
//              values, one per channel slot.
// row_stride:  distance between the starts of two output rows, in pixels
//              (the row pointer advances by row_stride * 4 uint16 elements).
//
// Channel slots beyond `input_channels` are written as zero when width >= 4.
// When width < 4 they hold a copy of the first channel instead, because the
// pixel is seeded with vld1q_dup_f32.
//
// The 4-pixel stores in the main loop go through vst4_u16_aligned16, so each
// output row is assumed to start on a 16-byte boundary -- TODO confirm with
// the callers that allocate `output`.
//
// NOTE(review): this function was restored from a copy with missing lines;
// the pointer advances, the declaration of `nx` and the width >= 4 split were
// inferred from the surrounding code. Verify against the upstream file.
template <int input_channels>
static void interleaveSlice(
    void* output, const float* input, size_t width, size_t height, size_t row_stride) {
  const float* input_r = input;
  const float* input_g = input_r + height * width;
  const float* input_b = input_g + height * width;
  const float* input_a = input_b + height * width;
  uint16_t* output_f16 = static_cast<uint16_t*>(output);
  if (width >= 4) {
    for (size_t y = 0; y < height; y++) {
      size_t nx = width;
      // Main loop: convert and interleave four pixels per iteration.
      while (nx >= 4) {
        const uint16x4_t r = uint16x4_t(vcvt_f16_f32(vld1q_f32(input_r)));
        input_r += 4;
        uint16x4_t g, b, a;
        g = b = a = vdup_n_u16(0);
        if (input_channels >= 2) {
          g = uint16x4_t(vcvt_f16_f32(vld1q_f32(input_g)));
          input_g += 4;
          if (input_channels >= 3) {
            b = uint16x4_t(vcvt_f16_f32(vld1q_f32(input_b)));
            input_b += 4;
            if (input_channels >= 4) {
              a = uint16x4_t(vcvt_f16_f32(vld1q_f32(input_a)));
              input_a += 4;
            }
          }
        }

        const uint16x4x4_t rgba = (uint16x4x4_t){{r, g, b, a}};
        vst4_u16_aligned16(output_f16, rgba);
        output_f16 += 4 * 4;

        nx -= 4;
      }
      if (nx != 0) {
        // Tail of 1..3 pixels: step every pointer back so that the final
        // 4-pixel block ends exactly at the end of the row. The pixels that
        // overlap the previous block are simply recomputed and rewritten.
        output_f16 -= (4 - nx) * 4;
        input_r -= 4 - nx;
        if (input_channels >= 2) {
          input_g -= 4 - nx;
          if (input_channels >= 3) {
            input_b -= 4 - nx;
            if (input_channels >= 4) {
              input_a -= 4 - nx;
            }
          }
        }

        const uint16x4_t r = uint16x4_t(vcvt_f16_f32(vld1q_f32(input_r)));
        input_r += 4;
        uint16x4_t g, b, a;
        g = b = a = vdup_n_u16(0);
        if (input_channels >= 2) {
          g = uint16x4_t(vcvt_f16_f32(vld1q_f32(input_g)));
          input_g += 4;
          if (input_channels >= 3) {
            b = uint16x4_t(vcvt_f16_f32(vld1q_f32(input_b)));
            input_b += 4;
            if (input_channels >= 4) {
              a = uint16x4_t(vcvt_f16_f32(vld1q_f32(input_a)));
              input_a += 4;
            }
          }
        }

        const uint16x4x4_t rgba = (uint16x4x4_t){{r, g, b, a}};
        // The rewind above moves the pointer by (4 - nx) * 8 bytes, which is
        // not a multiple of 16 when nx is 1 or 3. Use the plain store here:
        // promising 16-byte alignment would be undefined behavior.
        vst4_u16(output_f16, rgba);
        output_f16 += 4 * 4;
      }
      // Skip the padding between the end of this row and the next one.
      output_f16 += (row_stride - width) * 4;
    }
  } else {
    // Rows narrower than one vector: assemble and store one pixel at a time.
    for (size_t y = 0; y < height; y++) {
      for (size_t x = 0; x < width; x++) {
        float32x4_t rgba = vld1q_dup_f32(input_r++);
        if (input_channels >= 2) {
          rgba = vld1q_lane_f32(input_g++, rgba, 1);
          if (input_channels >= 3) {
            rgba = vld1q_lane_f32(input_b++, rgba, 2);
            if (input_channels >= 4) {
              rgba = vld1q_lane_f32(input_a++, rgba, 3);
            }
          }
        }
        vst1_u16_aligned8(output_f16, uint16x4_t(vcvt_f16_f32(rgba)));
        output_f16 += 4;
      }
      output_f16 += (row_stride - width) * 4;
    }
  }
}
// Runtime entry point for the planar-float32 to interleaved-half-float
// conversion: forwards to the interleaveSlice<N> instantiation that matches
// `input_channels` (1 to 4). Any other channel count is ignored and `output`
// is left untouched.
//
// NOTE(review): the parameter list between `output` and `input_channels` and
// the case labels were restored from the visible calls; confirm the order
// against the declaration in DataTransfer.h.
void interleaveSlice(
    void* output,
    const float* input,
    size_t width,
    size_t height,
    size_t row_stride,
    uint16_t input_channels) {
  switch (input_channels) {
    case 1:
      interleaveSlice<1>(output, input, width, height, row_stride);
      break;
    case 2:
      interleaveSlice<2>(output, input, width, height, row_stride);
      break;
    case 3:
      interleaveSlice<3>(output, input, width, height, row_stride);
      break;
    case 4:
      interleaveSlice<4>(output, input, width, height, row_stride);
      break;
    default:
      // Unsupported channel count: nothing is converted.
      break;
  }
}
// Converts pixel-interleaved half floats back into planar float32 data.
//
// input:       `height` rows of pixels; every pixel is four 16-bit half-float
//              values, one per channel slot.
// output:      `output_channels` consecutive planes of height * width floats
//              (plane k starts at output + k * height * width). Channel slots
//              beyond `output_channels` are read but not stored.
// row_stride:  distance between the starts of two input rows, in pixels
//              (the row pointer advances by row_stride * 4 uint16 elements).
//
// The 4-pixel loads in the main loop go through vld4_u16_aligned16, so each
// input row is assumed to start on a 16-byte boundary -- TODO confirm with
// the callers that provide `input`.
//
// NOTE(review): this function was restored from a copy with missing lines;
// the pointer advances, the declaration of `nx` and the width >= 4 split were
// inferred from the surrounding code. Verify against the upstream file.
template <int output_channels>
static void deInterleaveSlice(
    float* output, const void* input, size_t width, size_t height, size_t row_stride) {
  float* output_r = output;
  float* output_g = output_r + height * width;
  float* output_b = output_g + height * width;
  float* output_a = output_b + height * width;
  const uint16_t* input_f16 = static_cast<const uint16_t*>(input);
  if (width >= 4) {
    for (size_t y = 0; y < height; y++) {
      size_t nx = width;
      // Main loop: de-interleave and widen four pixels per iteration.
      while (nx >= 4) {
        const uint16x4x4_t rgba = vld4_u16_aligned16(input_f16);
        input_f16 += 4 * 4;
        const float32x4_t r = vcvt_f32_f16(float16x4_t(rgba.val[0]));
        vst1q_f32(output_r, r);
        output_r += 4;
        if (output_channels >= 2) {
          const float32x4_t g = vcvt_f32_f16(float16x4_t(rgba.val[1]));
          vst1q_f32(output_g, g);
          output_g += 4;
          if (output_channels >= 3) {
            const float32x4_t b = vcvt_f32_f16(float16x4_t(rgba.val[2]));
            vst1q_f32(output_b, b);
            output_b += 4;
            if (output_channels >= 4) {
              const float32x4_t a = vcvt_f32_f16(float16x4_t(rgba.val[3]));
              vst1q_f32(output_a, a);
              output_a += 4;
            }
          }
        }

        nx -= 4;
      }
      if (nx != 0) {
        // Tail of 1..3 pixels: step every pointer back so that the final
        // 4-pixel block ends exactly at the end of the row. The pixels that
        // overlap the previous block are simply re-read and rewritten.
        input_f16 -= (4 - nx) * 4;
        output_r -= 4 - nx;
        if (output_channels >= 2) {
          output_g -= 4 - nx;
          if (output_channels >= 3) {
            output_b -= 4 - nx;
            if (output_channels >= 4) {
              output_a -= 4 - nx;
            }
          }
        }

        // The rewind above moves the pointer by (4 - nx) * 8 bytes, which is
        // not a multiple of 16 when nx is 1 or 3. Use the plain load here:
        // promising 16-byte alignment would be undefined behavior.
        const uint16x4x4_t rgba = vld4_u16(input_f16);
        input_f16 += 4 * 4;
        const float32x4_t r = vcvt_f32_f16(float16x4_t(rgba.val[0]));
        vst1q_f32(output_r, r);
        output_r += 4;
        if (output_channels >= 2) {
          const float32x4_t g = vcvt_f32_f16(float16x4_t(rgba.val[1]));
          vst1q_f32(output_g, g);
          output_g += 4;
          if (output_channels >= 3) {
            const float32x4_t b = vcvt_f32_f16(float16x4_t(rgba.val[2]));
            vst1q_f32(output_b, b);
            output_b += 4;
            if (output_channels >= 4) {
              const float32x4_t a = vcvt_f32_f16(float16x4_t(rgba.val[3]));
              vst1q_f32(output_a, a);
              output_a += 4;
            }
          }
        }
      }
      // Skip the padding between the end of this row and the next one.
      input_f16 += (row_stride - width) * 4;
    }
  } else {
    // Rows narrower than one vector: widen and scatter one pixel at a time.
    for (size_t y = 0; y < height; y++) {
      for (size_t x = 0; x < width; x++) {
        const float32x4_t rgba = vcvt_f32_f16(float16x4_t(vld1_u16_aligned8(input_f16)));
        input_f16 += 4;
        vst1q_lane_f32(output_r++, rgba, 0);
        if (output_channels >= 2) {
          vst1q_lane_f32(output_g++, rgba, 1);
          if (output_channels >= 3) {
            vst1q_lane_f32(output_b++, rgba, 2);
            if (output_channels >= 4) {
              vst1q_lane_f32(output_a++, rgba, 3);
            }
          }
        }
      }
      input_f16 += (row_stride - width) * 4;
    }
  }
}
// Runtime entry point for the interleaved-half-float to planar-float32
// conversion: forwards to the deInterleaveSlice<N> instantiation that matches
// `output_channels` (1 to 4). Any other channel count is ignored and `output`
// is left untouched.
//
// NOTE(review): the parameter list between `output` and `output_channels` and
// the case labels were restored from the visible calls; confirm the order
// against the declaration in DataTransfer.h.
void deInterleaveSlice(
    float* output,
    const void* input,
    size_t width,
    size_t height,
    size_t row_stride,
    uint32_t output_channels) {
  switch (output_channels) {
    case 1:
      deInterleaveSlice<1>(output, input, width, height, row_stride);
      break;
    case 2:
      deInterleaveSlice<2>(output, input, width, height, row_stride);
      break;
    case 3:
      deInterleaveSlice<3>(output, input, width, height, row_stride);
      break;
    case 4:
      deInterleaveSlice<4>(output, input, width, height, row_stride);
      break;
    default:
      // Unsupported channel count: nothing is converted.
      break;
  }
}