// Caffe2 - C++ API
// A deep learning, cross platform ML framework
// GLConvolution.cc
1 #include "GLConvolution.h"
2 #include "../core/GLContext.h"
3 #include "../core/ImageAllocator.h"
4 
5 #include "caffe2/core/common.h"
6 #include "caffe2/core/context.h"
7 #include "caffe2/core/timer.h"
8 #include "caffe2/operators/conv_pool_op_base.h"
9 #include "caffe2/operators/conv_transpose_unpool_op_base.h"
10 #include <iostream>
11 #include <vector>
12 
// Lower bound on the number of output-tile kernel batches used by the tiled
// convolution path (applied via std::max when sizing uniform kernel blocks).
#define MaxOutputTileBatchSize 2
14 
15 // MARK: GLSL
// Fragment shader source for GLConvolution. The $(NAME) tokens are template
// placeholders, presumably substituted before the shader is compiled (TODO:
// confirm against the shader-compilation helper). TILED_CONVOLUTION selects
// between the tiled path (channel tiles packed into a single texture) and the
// batched path (one texture per group of 4 channels); TRANSPOSED_CONVOLUTION
// selects deconvolution-style sampling. Kernel weights arrive as half floats
// packed into uvec4 uniform blocks and are unpacked into 4x4 matrices.
const char* GLConvolution::fragment_shader = R"GLSL(#version 300 es
#define TILED_CONVOLUTION $(TILED_CONVOLUTION)
#define TRANSPOSED_CONVOLUTION $(TRANSPOSED_CONVOLUTION)

// batching
#define INPUT_BATCH_SIZE $(INPUT_BATCH_SIZE)
#define OUTPUT_BATCH_SIZE $(OUTPUT_BATCH_SIZE)

// tiling
#define INPUT_TILES $(INPUT_TILES)
#define OUTPUT_TILES $(OUTPUT_TILES)
#define INPUT_TILE_WIDTH $(INPUT_TILE_WIDTH)
#define INPUT_TILE_HEIGHT $(INPUT_TILE_HEIGHT)
#define OUTPUT_TILE_WIDTH $(OUTPUT_TILE_WIDTH)
#define OUTPUT_TILE_HEIGHT $(OUTPUT_TILE_HEIGHT)
#define INPUT_TILE_X $(INPUT_TILE_X)
#define OUTPUT_TILE_X $(OUTPUT_TILE_X)
#define INPUT_TILE_CHUNK_SIZE $(INPUT_TILE_CHUNK_SIZE)
#define OUTPUT_TILE_CHUNK_SIZE $(OUTPUT_TILE_CHUNK_SIZE)
#define OUTPUT_TILE_BATCH_SIZE $(OUTPUT_TILE_BATCH_SIZE)

#define BOUNDS_CHECK_MODE $(BOUNDS_CHECK_MODE)

// common
const ivec2 input_padding = ivec2($(INPUT_PADDING_X), $(INPUT_PADDING_Y));
const ivec2 input_stride = ivec2($(INPUT_STRIDE_X), $(INPUT_STRIDE_Y));
const ivec2 kernel_size = ivec2($(KERNEL_SIZE_X), $(KERNEL_SIZE_Y));

precision mediump float;
precision mediump int;
precision mediump sampler2D;

in highp vec2 v_texCoord;

#define unpackKernel(pk) \
  mat4(vec4(unpackHalf2x16(pk.packed_data[0].x), unpackHalf2x16(pk.packed_data[0].y)), \
       vec4(unpackHalf2x16(pk.packed_data[0].z), unpackHalf2x16(pk.packed_data[0].w)), \
       vec4(unpackHalf2x16(pk.packed_data[1].x), unpackHalf2x16(pk.packed_data[1].y)), \
       vec4(unpackHalf2x16(pk.packed_data[1].z), unpackHalf2x16(pk.packed_data[1].w)))

#if BOUNDS_CHECK_MODE == 0
  #define IN_BOUNDS(p, p0, p1) (true)
#else
  #define IN_BOUNDS(p, p0, p1) (all(greaterThanEqual(p, p0)) && all(lessThan(p, p1)))
#endif

#if TILED_CONVOLUTION
// Tiled convolution
const ivec2 inputTileSize = ivec2(INPUT_TILE_WIDTH, INPUT_TILE_HEIGHT);
const ivec2 outputTileSize = ivec2(OUTPUT_TILE_WIDTH, OUTPUT_TILE_HEIGHT);

uniform ivec2 outputSize;
uniform bool accumulate;
uniform bool fusePRelu;

uniform ivec2 inputTileRange;

TEXTURE_INPUT(inputData[1]);
TEXTURE_INPUT(previousData[1]);

struct packedKernel {
  highp uvec4 packed_data[2];
};

struct kernel {
  packedKernel data[kernel_size.x * kernel_size.y];
};

layout (std140) uniform Kernel_block {
  kernel kernel_data[INPUT_TILE_CHUNK_SIZE * OUTPUT_TILE_CHUNK_SIZE];
} kernel_block[OUTPUT_TILE_BATCH_SIZE];

layout (std140) uniform bias_block {
  highp uvec4 bias[(OUTPUT_TILES + 1) / 2];
};

layout (std140) uniform prelu_scale_block {
  highp uvec4 scale[(OUTPUT_TILES + 1) / 2];
};

TEXTURE_OUTPUT(0, outputData0);

#if TRANSPOSED_CONVOLUTION

#define CONVOLUTION(ib) { \
  ivec2 p0 = (input_padding + input_stride - tileCoord % input_stride) % input_stride; \
  for (int y = p0.y; y < kernel_size.y; y += input_stride.y) { \
    for (int x = p0.x; x < kernel_size.x; x += input_stride.x) { \
      int i = y * kernel_size.x + x; \
      ivec2 idx = tileCoord + ivec2(x, y) - input_padding; \
      if IN_BOUNDS(idx, ivec2(0), inputTileSize * input_stride) { \
        vec4 data = TEXTURE_LOAD(inputData[0], inputTileOffset + idx / input_stride); \
        mediump mat4 k = unpackKernel(kernel_block[ib].kernel_data[kernelIdx].data[i]); \
        sum += k * data; \
      } \
    } \
  } \
}

#else

#define CONVOLUTION(ib) { \
  for (int y = 0, i = 0; y < kernel_size.y; y++) { \
    for (int x = 0; x < kernel_size.x; x++, i++) { \
      ivec2 idx = tileCoord + ivec2(x, y); \
      if IN_BOUNDS(idx, ivec2(0), inputTileSize) { \
        vec4 data = TEXTURE_LOAD(inputData[0], inputTileOffset + idx); \
        mediump mat4 k = unpackKernel(kernel_block[ib].kernel_data[kernelIdx].data[i]); \
        sum += k * data; \
      } \
    } \
  } \
}
#endif // TRANSPOSED_CONVOLUTION

void main() {
  ivec2 inputSize = textureSize(inputData[0], 0);
  ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));

  ivec2 tile = texelCoord / outputTileSize; // 2D output tile idx
  ivec2 tileCoord = texelCoord % outputTileSize; // in-tile coordinates

  int tileNum = OUTPUT_TILE_X * tile.y + tile.x; // 1D output tile idx

#if !TRANSPOSED_CONVOLUTION
  tileCoord = input_stride * tileCoord - input_padding;
#endif

  highp vec4 sum = vec4(0);

  for (int tile_idx = inputTileRange.x; tile_idx < inputTileRange.y; tile_idx++) {
    int inTileX = tile_idx % INPUT_TILE_X;
    int inTileY = tile_idx / INPUT_TILE_X;
    int inTileId = tile_idx % INPUT_TILE_CHUNK_SIZE; // normalized input tile idx, used to index the kernel

    int kernelIdx = OUTPUT_TILE_CHUNK_SIZE * inTileId + tileNum % OUTPUT_TILE_CHUNK_SIZE;
    ivec2 inputTileOffset = ivec2(inTileX, inTileY) * inputTileSize;

    int outputChunkIdx = tileNum / OUTPUT_TILE_CHUNK_SIZE;
    if (outputChunkIdx == 0) {
      CONVOLUTION(0);
    }
#if OUTPUT_TILE_BATCH_SIZE > 1
    else if (outputChunkIdx == 1) {
      CONVOLUTION(1);
    }
#if OUTPUT_TILE_BATCH_SIZE > 2
    else if (outputChunkIdx == 2) {
      CONVOLUTION(2);
    }
#if OUTPUT_TILE_BATCH_SIZE > 3
    else if (outputChunkIdx == 3) {
      CONVOLUTION(3);
    }
#if OUTPUT_TILE_BATCH_SIZE > 4
    else if (outputChunkIdx == 4) {
      CONVOLUTION(4);
    }
#if OUTPUT_TILE_BATCH_SIZE > 5
    else if (outputChunkIdx == 5) {
      CONVOLUTION(5);
    }
#if OUTPUT_TILE_BATCH_SIZE > 6
    else if (outputChunkIdx == 6) {
      CONVOLUTION(6);
    }
#if OUTPUT_TILE_BATCH_SIZE > 7
    else if (outputChunkIdx == 7) {
      CONVOLUTION(7);
    }
#endif
#endif
#endif
#endif
#endif
#endif
#endif
  }

  vec4 biasValue = (tileNum % 2 == 0) ? unpackHalf4x16(bias[tileNum/2].xy) : unpackHalf4x16(bias[tileNum/2].zw);
  vec4 prevData = TEXTURE_LOAD(previousData[0], texelCoord);
  vec4 value = sum + (accumulate ? prevData : biasValue);

  vec4 preluValue = (tileNum % 2 == 0) ? unpackHalf4x16(scale[tileNum/2].xy) : unpackHalf4x16(scale[tileNum/2].zw);

  vec4 o0 = fusePRelu ? mix(value * preluValue, value, vec4(greaterThan(value, vec4(0)))) : value;
  outputData0 = TEXTURE_STORE(o0);
}

#else

// batched convolution

uniform ivec2 outputSize;
uniform bool accumulate;
uniform bool fusePRelu;

TEXTURE_INPUT(inputData[INPUT_BATCH_SIZE]);
TEXTURE_INPUT(previousData[OUTPUT_BATCH_SIZE]);

struct packedKernel {
  highp uvec4 packed_data[2];
};

struct kernel {
  packedKernel data[kernel_size.x * kernel_size.y];
};

layout (std140) uniform Kernel_block {
  kernel kernel_data[OUTPUT_BATCH_SIZE];
} kernel_block[INPUT_BATCH_SIZE];

layout (std140) uniform bias_block {
  highp uvec4 bias[(OUTPUT_BATCH_SIZE + 1) / 2];
};

layout (std140) uniform prelu_scale_block {
  highp uvec4 scale[(OUTPUT_BATCH_SIZE + 1) / 2];
};

TEXTURE_OUTPUT(0, outputData0);
#if OUTPUT_BATCH_SIZE > 1
TEXTURE_OUTPUT(1, outputData1);
#if OUTPUT_BATCH_SIZE > 2
TEXTURE_OUTPUT(2, outputData2);
#if OUTPUT_BATCH_SIZE > 3
TEXTURE_OUTPUT(3, outputData3);
#endif
#endif
#endif

#if TRANSPOSED_CONVOLUTION
#define CONVOLUTION(ib) { \
  ivec2 p0 = (input_padding + input_stride - texelCoord % input_stride) % input_stride; \
  for (int y = p0.y; y < kernel_size.y; y += input_stride.y) { \
    for (int x = p0.x; x < kernel_size.x; x += input_stride.x) { \
      int i = y * kernel_size.x + x; \
      ivec2 idx = texelCoord + ivec2(x, y) - input_padding; \
      if IN_BOUNDS(idx, ivec2(0), inputSize * input_stride) { \
        vec4 data = TEXTURE_LOAD(inputData[ib], idx / input_stride); \
        for (int ob = 0; ob < OUTPUT_BATCH_SIZE; ob++) { \
          mediump mat4 k = unpackKernel(kernel_block[ib].kernel_data[ob].data[i]); \
          sum[ob] += k * data; \
        } \
      } \
    } \
  } \
}

#else

#define CONVOLUTION(ib) { \
  for (int y = 0, i = 0; y < kernel_size.y; y++) { \
    for (int x = 0; x < kernel_size.x; x++, i++) { \
      ivec2 idx = coord + ivec2(x, y); \
      if IN_BOUNDS(idx, ivec2(0), inputSize) { \
        vec4 data = TEXTURE_LOAD(inputData[ib], idx); \
        for (int ob = 0; ob < OUTPUT_BATCH_SIZE; ob++) { \
          mediump mat4 k = unpackKernel(kernel_block[ib].kernel_data[ob].data[i]); \
          sum[ob] += k * data; \
        } \
      } \
    } \
  } \
}

#endif // TRANSPOSED_CONVOLUTION

void main() {
  ivec2 inputSize = textureSize(inputData[0], 0);
  ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));

#if !TRANSPOSED_CONVOLUTION
  ivec2 coord = input_stride * texelCoord - input_padding;
#endif

  highp vec4 sum[OUTPUT_BATCH_SIZE] = vec4[OUTPUT_BATCH_SIZE](vec4(0)
#if OUTPUT_BATCH_SIZE > 1
      , vec4(0)
#if OUTPUT_BATCH_SIZE > 2
      , vec4(0)
#if OUTPUT_BATCH_SIZE > 3
      , vec4(0)
#endif
#endif
#endif
      );

  CONVOLUTION(0);
#if INPUT_BATCH_SIZE > 1
  CONVOLUTION(1);
#if INPUT_BATCH_SIZE > 2
  CONVOLUTION(2);
#if INPUT_BATCH_SIZE > 3
  CONVOLUTION(3);
#if INPUT_BATCH_SIZE > 4
  CONVOLUTION(4);
#if INPUT_BATCH_SIZE > 5
  CONVOLUTION(5);
#if INPUT_BATCH_SIZE > 6
  CONVOLUTION(6);
#if INPUT_BATCH_SIZE > 7
  CONVOLUTION(7);
#endif
#endif
#endif
#endif
#endif
#endif
#endif

  vec4 prev0 = TEXTURE_LOAD(previousData[0], texelCoord);
  vec4 value = sum[0] + (accumulate ? prev0: unpackHalf4x16(bias[0].xy));
  vec4 o0 = fusePRelu ? mix(value * unpackHalf4x16(scale[0].xy), value, vec4(greaterThan(value, vec4(0)))) : value;
  outputData0 = TEXTURE_STORE(o0);
#if OUTPUT_BATCH_SIZE > 1
  vec4 prev1 = TEXTURE_LOAD(previousData[1], texelCoord);
  value = sum[1] + (accumulate ? prev1 : unpackHalf4x16(bias[0].zw));
  vec4 o1 = fusePRelu ? mix(value * unpackHalf4x16(scale[0].zw), value, vec4(greaterThan(value, vec4(0)))) : value;
  outputData1 = TEXTURE_STORE(o1);
#if OUTPUT_BATCH_SIZE > 2
  vec4 prev2 = TEXTURE_LOAD(previousData[2], texelCoord);
  value = sum[2] + (accumulate ? prev2 : unpackHalf4x16(bias[1].xy));
  vec4 o2 = fusePRelu ? mix(value * unpackHalf4x16(scale[1].xy), value, vec4(greaterThan(value, vec4(0)))) : value;
  outputData2 = TEXTURE_STORE(o2);
#if OUTPUT_BATCH_SIZE > 3
  vec4 prev3 = TEXTURE_LOAD(previousData[3], texelCoord);
  value = sum[3] + (accumulate ? prev3: unpackHalf4x16(bias[1].zw));
  vec4 o3 = fusePRelu ? mix(value * unpackHalf4x16(scale[1].zw), value, vec4(greaterThan(value, vec4(0)))) : value;
  outputData3 = TEXTURE_STORE(o3);
#endif
#endif
#endif
}

#endif // TILED_CONVOLUTION

)GLSL";
354 
// Packs one uniform-buffer's worth of convolution weights for the batched
// (non-tiled) shader. `is`/`os` are the first input/output slice of the
// current batch, `ib` the input-slice offset within the batch; each slice
// covers 4 channels. `size` is accepted for symmetry with the attach
// callback but not used here (the caller validates the buffer size).
void GLConvolution::pack_kernel_data_for_bached_conv(
    float16_t* data,
    size_t size,
    int input_channels,
    int output_channels,
    int is,
    int os,
    int ib) {
  // Destination layout: [output batch][ky][kx][input lane][output lane] —
  // one 4x4 half-float matrix per kernel tap. Runtime-sized array typedef
  // (GNU variably-modified type extension).
  typedef float16_t(packedKernel)[output_batch_size][geometry.kernel_size.y]
      [geometry.kernel_size.x][4][4];
  packedKernel& packed_kernel_data = *reinterpret_cast<packedKernel*>(data);

  // Channels actually present in this input slice (last slice may be short).
  const int batch_input_channels = std::min(4, input_channels - 4 * (is + ib));
  for (int ob = 0; ob < output_batch_size; ob++) {
    const int batch_output_channels =
        std::min(4, output_channels - 4 * (os + ob));
    for (int out = 0; out < batch_output_channels; out++) {
      for (int in = 0; in < batch_input_channels; in++) {
        for (int y = 0; y < geometry.kernel_size.y; y++) {
          for (int x = 0; x < geometry.kernel_size.x; x++) {
            // clang-format off
            if (geometry.transposed) {
              // Transposed conv weights are laid out (C_in, C_out, kH, kW)
              // and the kernel is flipped spatially (kH-1-y, kW-1-x).
              typedef float(kernelTensor)[input_channels][output_channels][geometry.kernel_size.y][geometry.kernel_size.x];
              const kernelTensor& kernel_data = *reinterpret_cast<const kernelTensor*>(kernel);
              packed_kernel_data[ob][y][x][in][out] =
                  kernel_data[4 * (is + ib) + in][4 * (os + ob) + out][geometry.kernel_size.y - 1 - y][geometry.kernel_size.x - 1 - x];
            } else {
              // Forward conv weights are laid out (C_out, C_in, kH, kW).
              typedef float(kernelTensor)[output_channels][input_channels][geometry.kernel_size.y][geometry.kernel_size.x];
              const kernelTensor& kernel_data = *reinterpret_cast<const kernelTensor*>(kernel);
              packed_kernel_data[ob][y][x][in][out] = kernel_data[4 * (os + ob) + out][4 * (is + ib) + in][y][x];
            }
            // clang-format on
          }
        }
      }
    }
  }
}
393 
// Packs convolution weights for the tiled shader. The half-open ranges
// [input_tile_range.x, input_tile_range.y) and
// [output_tile_range.x, output_tile_range.y) select which channel tiles
// (4 channels each) this uniform block covers; indices into the destination
// are rebased to the start of each range. `size` is unused here — the caller
// validates the buffer size before invoking this.
void GLConvolution::pack_kernel_data_for_tiled_conv(
    float16_t* data, // destination
    size_t size,
    int input_channels,
    int output_channels,
    point input_tile_range,
    point output_tile_range) {
  // Destination layout: [input tile][output tile][ky][kx][in lane][out lane]
  // (runtime-sized array typedef — GNU variably-modified type extension).
  typedef float16_t(
      packedKernel)[input_tile_chunk_size][output_tile_chunk_size]
      [geometry.kernel_size.y][geometry.kernel_size.x][4][4];
  packedKernel& packed_kernel_data = *reinterpret_cast<packedKernel*>(data);

  for (int it = input_tile_range.x; it < input_tile_range.y; it++) {
    for (int ot = output_tile_range.x; ot < output_tile_range.y; ot++) {
      for (int y = 0; y < geometry.kernel_size.y; y++) {
        for (int x = 0; x < geometry.kernel_size.x; x++) {
          // min(4, ...) clamps the last, possibly partial, channel tile.
          for (int out = 0; out < std::min(4, (output_channels - ot * 4));
               out++) {
            for (int in = 0; in < std::min(4, (input_channels - it * 4));
                 in++) {
              // clang-format off
              if (geometry.transposed) {
                // Transposed conv weights: (C_in, C_out, kH, kW), kernel
                // flipped spatially.
                typedef float(kernelTensor)[input_channels][output_channels][geometry.kernel_size.y][geometry.kernel_size.x];
                const kernelTensor& kernel_data = *reinterpret_cast<const kernelTensor*>(kernel);
                packed_kernel_data[it - input_tile_range.x][ot - output_tile_range.x][y][x][in][out] =
                    kernel_data[4 * it + in] [4 * ot + out][geometry.kernel_size.y - 1 - y][geometry.kernel_size.x - 1 - x];
              } else {
                // Forward conv weights: (C_out, C_in, kH, kW).
                typedef float(kernelTensor)[output_channels][input_channels][geometry.kernel_size.y][geometry.kernel_size.x];
                const kernelTensor& kernel_data = *reinterpret_cast<const kernelTensor*>(kernel);
                packed_kernel_data[it - input_tile_range.x][ot - output_tile_range.x][y][x][in][out] =
                    kernel_data[4 * ot + out][4 * it + in][y][x];
              }
              // clang-format on
            }
          }
        }
      }
    }
  }
}
434 
435 template <typename T>
436 void GLConvolution::convolution(
437  const GLImageVector<T>& input_images,
438  const GLImageVector<T>& output_images) {
439  if (tiling) {
440  run_tiled_conv(input_images, output_images);
441  } else {
442  run_batched_conv(input_images, output_images);
443  }
444 }
445 
// Runs the batched (non-tiled) convolution: for every image, iterates over
// input/output slice batches, binds bias / kernel / PRelu uniform buffers,
// attaches the relevant input and previous-output textures, and issues one
// shader pass per (is, os) pair. Partial sums over input batches are
// accumulated through the previousData textures (accumulate = is != 0).
template <typename T>
void GLConvolution::run_batched_conv(
    const GLImageVector<T>& input_images,
    const GLImageVector<T>& output_images) {
  for (int i = 0; i < input_images.size(); i++) {
    GLImage<T>* input_image = input_images[i];
    GLImage<T>* output_image = output_images[i];
    int input_slices = input_image->slices;
    int output_slices = output_image->slices;

    for (int is = 0; is < input_slices; is += input_batch_size) {
      for (int os = 0; os < output_slices; os += output_batch_size) {
        // Channels covered by this output batch (last batch may be short).
        const int output_channels_per_batch =
            std::min(4 * output_batch_size, geometry.output_channels - 4 * os);

        gl_log(
            GL_VERBOSE,
            "GLConvolution::convolution - is: %d, os: %d\n",
            is,
            os);

        // Note the order of the binding point needs to be the same as in the
        // constructor
        int binding_point = 0;

        // bias
        attach_uniform_buffer<float16_t>(
            bias_block, binding_point++, [&](float16_t* data, size_t size) {
              CAFFE_ENFORCE_GE(
                  size,
                  output_channels_per_batch * sizeof(float16_t),
                  "Bias buffer size too small");
              for (int ob = 0; ob < output_channels_per_batch; ob++) {
                data[ob] = bias[4 * os + ob];
              }
            });

        // kernel weights: one uniform block per input slice in the batch.
        for (int ib = 0; ib < input_batch_size; ib++) {
          attach_uniform_buffer<float16_t>(
              kernel_block[ib],
              binding_point++,
              [&](float16_t* data, size_t size) {
                CAFFE_ENFORCE_EQ(
                    size,
                    4 * (4 * output_batch_size) * geometry.kernel_size.y *
                        geometry.kernel_size.x * sizeof(float16_t),
                    "Kernel size mismatch");
                pack_kernel_data_for_bached_conv(
                    data,
                    size,
                    input_image->channels,
                    output_image->channels,
                    is,
                    os,
                    ib);
              });
        }

        // PRelu scale — only bound on the last input batch, where the fused
        // activation is actually applied.
        if (prelu_scale != nullptr && is == input_slices - input_batch_size) {
          attach_uniform_buffer<float16_t>(
              prelu_scale_block,
              binding_point++,
              [&](float16_t* data, size_t size) {
                CAFFE_ENFORCE_GE(
                    size,
                    output_channels_per_batch * sizeof(float16_t),
                    "PRelu buffer size too small");
                for (int ob = 0; ob < output_channels_per_batch; ob++) {
                  // A single scale value is broadcast (channel-shared PRelu).
                  data[ob] = prelu_scale_size == geometry.output_channels
                      ? prelu_scale[4 * os + ob]
                      : prelu_scale[0];
                }
              });
        }

        // Bind the input slices and, for accumulation, the current contents
        // of the output textures being written.
        std::vector<texture_attachment> input_attachments;
        for (int ib = 0; ib < input_batch_size; ib++) {
          input_attachments.push_back(
              {input_image->textures[is + ib], inputData[ib]});
        }
        for (int ob = 0; ob < output_batch_size; ob++) {
          input_attachments.push_back(
              {output_image->textures[os + ob], previousData[ob]});
        }

        run(input_attachments,
            {output_image->textures.begin() + os,
             output_image->textures.begin() + os + output_batch_size},
            [&]() {
              glUniform2i(
                  outputSize->location,
                  output_image->texture_width,
                  output_image->texture_height);
              glUniform2i(inputTileRange->location, 0, 1);
              // First input batch seeds with bias; later ones accumulate.
              glUniform1i(accumulate->location, is != 0);
              glUniform1i(
                  fusePRelu->location,
                  prelu_scale != nullptr &&
                      (is == input_slices - input_batch_size));
            },
            output_image->texture_width,
            output_image->texture_height);
      }
    }
  }
}
554 
// Runs the tiled convolution: all channel tiles live in a single texture, so
// each pass covers one chunk of input tiles [it, it + input_tile_chunk_size)
// and binds one kernel uniform block per output-tile chunk. Partial sums
// across input-tile chunks are accumulated through previousData.
template <typename T>
void GLConvolution::run_tiled_conv(
    const GLImageVector<T>& input_images,
    const GLImageVector<T>& output_images) {
  for (int i = 0; i < input_images.size(); i++) {
    GLImage<T>* input_image = input_images[i];
    GLImage<T>* output_image = output_images[i];
    int input_slices = input_image->slices;
    int output_slices = output_image->slices;
    int input_tile_x = input_image->tile_x;
    int input_tile_y = input_image->tile_y;
    int input_tiles = input_image->tile_x * input_image->tile_y;
    int output_tiles = output_image->tile_x * output_image->tile_y;

    // ib counts input-tile chunks; it is the first tile of the chunk.
    for (int ib = 0, it = 0; it < input_tiles;
         ib++, it += input_tile_chunk_size) {
      // Note the order of the binding point needs to be the same as in the
      // constructor
      int binding_point = 0;

      // bias — full per-channel bias, indexed by tile number in the shader.
      attach_uniform_buffer<float16_t>(
          bias_block, binding_point++, [&](float16_t* data, size_t size) {
            CAFFE_ENFORCE_GE(
                size,
                geometry.output_channels * sizeof(float16_t),
                "Bias buffer size too small");
            for (int ob = 0; ob < geometry.output_channels; ob++) {
              data[ob] = bias[ob];
            }
          });

      // kernel weights: one uniform block per output-tile chunk.
      for (int ob = 0, ot = 0; ot < output_tiles;
           ob++, ot += output_tile_chunk_size) {
        attach_uniform_buffer<float16_t>(
            kernel_block[ob],
            binding_point++,
            [&](float16_t* data, size_t size) {
              CAFFE_ENFORCE_EQ(
                  size,
                  (4 * input_tile_chunk_size) * (4 * output_tile_chunk_size) *
                      geometry.kernel_size.y * geometry.kernel_size.x *
                      sizeof(float16_t),
                  "Kernel size mismatch");
              pack_kernel_data_for_tiled_conv(
                  data,
                  size,
                  input_image->channels,
                  output_image->channels,
                  {it, std::min(it + input_tile_chunk_size, input_tiles)},
                  {ot, std::min(ot + output_tile_chunk_size, output_tiles)});
            });
      }

      // PRelu scale — bound only for the last input-tile chunk, where the
      // fused activation is applied.
      if (prelu_scale != nullptr && ib == input_tile_batch_size - 1) {
        attach_uniform_buffer<float16_t>(
            prelu_scale_block,
            binding_point++,
            [&](float16_t* data, size_t size) {
              CAFFE_ENFORCE_GE(
                  size,
                  geometry.output_channels * sizeof(float16_t),
                  "PRelu buffer size too small");
              for (int ob = 0; ob < geometry.output_channels; ob++) {
                // A single scale value is broadcast (channel-shared PRelu).
                data[ob] = prelu_scale_size == geometry.output_channels
                    ? prelu_scale[ob]
                    : prelu_scale[0];
              }
            });
      }

      // Single texture in, single texture out; the previous output is also
      // bound as input for accumulation across chunks.
      std::vector<texture_attachment> input_attachments(
          {{input_image->textures[0], inputData[0]},
           {output_image->textures[0], previousData[0]}});

      run(input_attachments,
          {output_image->textures[0]},
          [&]() {
            glUniform2i(
                outputSize->location,
                output_image->texture_width,
                output_image->texture_height);
            // [inputTileFrom, inputTileTo)
            glUniform2i(
                inputTileRange->location,
                it,
                std::min(it + input_tile_chunk_size, input_tiles));

            // First chunk seeds with bias; later chunks accumulate.
            glUniform1i(accumulate->location, it != 0);
            glUniform1i(
                fusePRelu->location,
                prelu_scale != nullptr && (ib == input_tile_batch_size - 1));
          },
          output_image->texture_width,
          output_image->texture_height);
    }
  }
}
655 
656 namespace caffe2 {
657 
658 template <typename OPBase>
659 static void computeOutputHW(OPBase* op, int H, int W, int* OH, int* OW) {
660  Tensor<CPUContext> input, output;
661  input.Resize(1, 1, H, W);
662  op->SetOutputSize(input, &output, 1);
663  CAFFE_ENFORCE_EQ(output.ndim(), 4);
664  *OH = output.dim(2);
665  *OW = output.dim(3);
666 }
667 
668 static int computeOutputTileChunkSize(int output_tile_x,
669  int output_tile_y,
670  int kernel_width,
671  int kernel_height) {
672  static const int maxUniformBlockBufferSize = 16 * 1024;
673  return std::min(
674  output_tile_x * output_tile_y,
675  maxUniformBlockBufferSize / 4 /
676  (4 * kernel_width * kernel_height * (int)sizeof(float16_t)));
677 }
678 
679 static int computeInputTileChunkSize(
680  int input_tile_x,
681  int input_tile_y,
682  int output_tile_chunk_size,
683  int kernel_width,
684  int kernel_height) {
685  static const int maxUniformBlockBufferSize = 16 * 1024;
686  return std::min(
687  input_tile_x * input_tile_y,
688  maxUniformBlockBufferSize / 4 /
689  (4 * output_tile_chunk_size * kernel_width * kernel_height *
690  (int)sizeof(float16_t)));
691 }
692 
693 // Todo: optimize input/output batch size and use of uniforms/textures for
694 // kernel data
695 static void computeBatchSizes(
696  GLConvolution::descriptor& geometry,
697  int& input_batch_size,
698  int& output_batch_size) {
699  int kernel_size = std::max(geometry.kernel_size.x, geometry.kernel_size.y);
700  int input_slices = (geometry.input_channels + 3) / 4;
701  int output_slices = (geometry.output_channels + 3) / 4;
702 
703 #if CAFFE2_ANDROID
704  input_batch_size = input_slices % 2 == 0 ? 2 : 1;
705  output_batch_size = output_slices % 2 == 0 ? 2 : 1;
706 #else
707  if (iPhoneVersion() >= 8) {
708  // iPhone 6S and up
709  input_batch_size =
710  /* input_slices % 8 == 0 ? 8 : */ input_slices % 4 == 0
711  ? 4
712  : input_slices % 3 == 0 ? 3 : input_slices % 2 == 0 ? 2 : 1;
713  output_batch_size = output_slices % 4 == 0
714  ? 4
715  : output_slices % 3 == 0 ? 3 : output_slices % 2 == 0 ? 2 : 1;
716  }
717 #endif
718 }
719 
720 template <class T, bool fusePRelu, bool fuseRelu>
721 class OpenGLConvOp final : public ConvPoolOpBase<CPUContext>, ImageAllocator<T> {
722  public:
723  USE_OPERATOR_BASE_FUNCTIONS;
724  OpenGLConvOp(const OperatorDef& operator_def, Workspace* ws)
725  : ConvPoolOpBase<CPUContext>(operator_def, ws) {
726  OPERATOR_NEEDS_FEATURE(this->order_ == StorageOrder::NCHW, "OpenGL only supports NCHW order.");
727  OPERATOR_NEEDS_FEATURE(group_ == 1, "OpenGL only supports group == 1");
728  OPERATOR_NEEDS_FEATURE(
729  dilation_h() == 1 && dilation_w() == 1,
730  "OpenGL only supports dialation == 1");
731  }
732 
733  bool RunOnDeviceWithOrderNCHW() override {
734  const GLImageVector<T>& input = Inputs()[INPUT]->template Get<GLImageVector<T>>();
735  auto& filter = Input(FILTER);
736  auto& bias = Input(BIAS);
737 
738  const int num_images = input.size();
739  const int input_channels = input.channels();
740  const int input_width = input.width();
741  const int input_height = input.height();
742 
743  CAFFE_ENFORCE(filter.ndim(), 4);
744  const int M = filter.dim32(0);
745  const int kernel_width = filter.dim32(2);
746  const int kernel_height = filter.dim32(3);
747 
748  CAFFE_ENFORCE(filter.dim32(1) == input_channels, "");
749  CAFFE_ENFORCE(filter.dim32(2) == kernel_h(), "");
750  CAFFE_ENFORCE(filter.dim32(3) == kernel_w(), "");
751  CAFFE_ENFORCE(bias.ndim() == 1, "");
752  CAFFE_ENFORCE(bias.dim32(0) == M, "");
753 
754  int output_height;
755  int output_width;
756  const int output_channels = M;
757  computeOutputHW(this, input_height, input_width, &output_height, &output_width);
758 
759  float val = 0;
760  const float* prelu_scale = nullptr;
761  int prelu_scale_size = 0;
762  if (fusePRelu) {
763  auto& prelu = Input(PRELU);
764  prelu_scale = prelu.template data<float>();
765  prelu_scale_size = prelu.size();
766  } else if (fuseRelu) {
767  prelu_scale = &val;
768  prelu_scale_size = 1;
769  }
770 
771  const int input_tile_x = input.tile_x(), input_tile_y = input.tile_y();
772  int output_tile_x = 1, output_tile_y = 1;
773  int input_tiles = input_tile_x * input_tile_y, output_tiles = 1;
774  int input_tile_chunk_size = 1, output_tile_chunk_size = 1;
775  int input_tile_batch_size = 1, output_tile_batch_size = 1;
776 
777  const bool tiling = GetSingleArgument<int>("tiling", input_tile_x > 1 || input_tile_y > 1);
778 
779  if (tiling) {
780  // Turn on tiling
781  CAFFE_ENFORCE_EQ(input.slices(), 1, "Input needs to be tiled in a single texture");
782  computeOutputTiles(output_channels, output_tile_x, output_tile_y);
783  output_tiles = output_tile_x * output_tile_y;
784 
785  output_tile_chunk_size = computeOutputTileChunkSize(
786  output_tile_x, output_tile_y, kernel_width, kernel_height);
787  output_tile_batch_size = std::max(
788  MaxOutputTileBatchSize,
789  (output_tiles + output_tile_chunk_size - 1) / output_tile_chunk_size);
790  output_tile_chunk_size = (output_tiles + output_tile_batch_size - 1) / output_tile_batch_size;
791  output_tile_batch_size = (output_tiles + output_tile_chunk_size - 1) / output_tile_chunk_size;
792 
793  input_tile_chunk_size = computeInputTileChunkSize(
794  input_tile_x,
795  input_tile_y,
796  output_tile_chunk_size,
797  kernel_width,
798  kernel_height);
799  input_tile_batch_size = (input_tiles + input_tile_chunk_size - 1) / input_tile_chunk_size;
800  // input_tile_chunk_size = (input_tiles + input_tile_batch_size - 1) /
801  // input_tile_batch_size;
802  }
803  CAFFE_ENFORCE_GT(input_tile_chunk_size, 0);
804  CAFFE_ENFORCE_GT(output_tile_chunk_size, 0);
805  CAFFE_ENFORCE_LE(output_tile_batch_size, 8);
806 
807  int is_last = GetSingleArgument<int>("is_last", 0);
808 
810  num_images,
811  output_width,
812  output_height,
813  output_channels,
814  output_tile_x,
815  output_tile_y,
816  is_last);
817 
818  // TODO: figure out the dilation business
819  GLConvolution::descriptor geometry{input_channels,
820  output_channels,
821  {kernel_width, kernel_height},
822  {input_width, input_height},
823  {output_width, output_height},
824  {input_tile_x, input_tile_y},
825  {output_tile_x, output_tile_y},
826  {pad_l(), pad_t()},
827  {stride_w(), stride_h()},
828  false};
829 
830  if (!conv) {
831  int input_batch_size = 1, output_batch_size = 1;
832  if (!tiling) {
833  computeBatchSizes(geometry, input_batch_size, output_batch_size);
834  input_batch_size =
835  GetSingleArgument<int>("input_batch_size", input_batch_size);
836  output_batch_size = GetSingleArgument<int>("output_batch_size", output_batch_size);
837  }
838 
839  LOG(INFO) << input_channels << ": " << input_height << " X "
840  << input_width << " => " << output_channels << ": "
841  << output_height << " X " << output_width
842  << " Kernel: " << kernel_width << "X" << kernel_height;
843  if (tiling) {
844  LOG(INFO) << "Tiling: " << input_tile_x << " X " << input_tile_y
845  << " => " << output_tile_x << " X " << output_tile_y
846  << ", Texture size: " << input_width * input_tile_x << " X "
847  << input_height * input_tile_y << " => "
848  << output_width * output_tile_x << " X "
849  << output_height * output_tile_y
850  << ", Input tile batch size: " << input_tile_batch_size;
851  } else {
852  LOG(INFO) << "input_batch_size = " << input_batch_size
853  << ", output_batch_size = " << output_batch_size;
854  }
855 
856  conv.reset(new GLConvolution(geometry,
857  filter.template data<float>(),
858  bias.template data<float>(),
859  prelu_scale,
860  prelu_scale_size,
861  input_batch_size,
862  output_batch_size,
863  input_tiles,
864  output_tiles,
865  input_tile_chunk_size,
866  output_tile_chunk_size,
867  input_tile_batch_size,
868  output_tile_batch_size,
869  tiling));
870  }
871 
872  conv->convolution(input, *output);
873 
874  Outputs()[0]->Reset(output);
875 
876  return true;
877  }
878 
879  private:
880  std::unique_ptr<GLConvolution> conv;
881 
882  INPUT_TAGS(INPUT, FILTER, BIAS, PRELU);
883 };
884 
// Plain convolution: 3 inputs (INPUT, FILTER, BIAS).
REGISTER_CPU_OPERATOR(OpenGLConv, OpenGLConvOp<float16_t, false, false>);
OPERATOR_SCHEMA(OpenGLConv).NumInputs(3).NumOutputs(1);

// Convolution with fused PRelu: 4 inputs (adds the PRELU slope tensor).
REGISTER_CPU_OPERATOR(OpenGLConvPRelu, OpenGLConvOp<float16_t, true, false>);
OPERATOR_SCHEMA(OpenGLConvPRelu).NumInputs(4).NumOutputs(1);

// Convolution with fused Relu: 3 inputs (zero-slope PRelu internally).
REGISTER_CPU_OPERATOR(OpenGLConvRelu, OpenGLConvOp<float16_t, false, true>);
OPERATOR_SCHEMA(OpenGLConvRelu).NumInputs(3).NumOutputs(1);
893 
894 template <class T, bool fusePRelu, bool fuseRelu>
895 class OpenGLConvTransposeOp final : public ConvTransposeUnpoolBase<CPUContext>, ImageAllocator<T> {
896  public:
897  USE_OPERATOR_BASE_FUNCTIONS;
898  OpenGLConvTransposeOp(const OperatorDef& operator_def, Workspace* ws)
899  : ConvTransposeUnpoolBase<CPUContext>(operator_def, ws) {
900  OPERATOR_NEEDS_FEATURE(this->order_ == StorageOrder::NCHW, "OpenGL only supports NCHW order.");
901  OPERATOR_NEEDS_FEATURE(
902  adj_h() == 0 && adj_w() == 0,
903  "OpenGL only supports adj_h == 1 and adj_w == 1");
904  }
905 
906  bool RunOnDeviceWithOrderNCHW() override {
907  const GLImageVector<T>& input = Inputs()[INPUT]->template Get<GLImageVector<T>>();
908  auto& filter = Input(FILTER);
909  auto& bias = Input(BIAS);
910 
911  const int num_images = input.size();
912  const int input_channels = input.channels();
913  const int input_width = input.width();
914  const int input_height = input.height();
915 
916  CAFFE_ENFORCE(filter.ndim() == 4, "filter must be 4D tensor");
917  const int M = filter.dim32(0);
918  const int C = filter.dim32(1);
919  const int kernel_width = filter.dim32(2);
920  const int kernel_height = filter.dim32(3);
921 
922  CAFFE_ENFORCE(input_channels == M, "filter number must be equal to input channel number");
923  CAFFE_ENFORCE(filter.dim32(2) == kernel_h(), "filter height must be equal to kernel height");
924  CAFFE_ENFORCE(filter.dim32(3) == kernel_w(), "filter width must be equal to kernel width");
925  CAFFE_ENFORCE(bias.ndim() == 1, "bias must be 1D tensor");
926  CAFFE_ENFORCE(bias.dim32(0) == C, "bias dimension must be equal to output channel number");
927 
928  int output_height;
929  int output_width;
930  const int output_channels = C;
931  computeOutputHW(this, input_height, input_width, &output_height, &output_width);
932 
933  float val = 0;
934  const float* prelu_scale = nullptr;
935  int prelu_scale_size = 0;
936  if (fusePRelu) {
937  auto& prelu = Input(PRELU);
938  prelu_scale = prelu.template data<float>();
939  prelu_scale_size = prelu.size();
940  } else if (fuseRelu) {
941  prelu_scale = &val;
942  prelu_scale_size = 1;
943  }
944 
945  const int input_tile_x = input.tile_x(), input_tile_y = input.tile_y();
946  int output_tile_x = 1, output_tile_y = 1;
947  int input_tiles = input_tile_x * input_tile_y, output_tiles = 1;
948  int input_tile_chunk_size = 1, output_tile_chunk_size = 1,
949  input_tile_batch_size = 1, output_tile_batch_size = 1;
950 
951  const bool tiling = GetSingleArgument<int>("tiling", input_tile_x > 1 || input_tile_y > 1);
952 
953  if (tiling) {
954  // Turn on tiling
955  CAFFE_ENFORCE_EQ(input.slices(), 1, "Input needs to be tiled in a single texture");
956  computeOutputTiles(output_channels, output_tile_x, output_tile_y);
957  output_tiles = output_tile_x * output_tile_y;
958 
959  output_tile_chunk_size = computeOutputTileChunkSize(
960  output_tile_x, output_tile_y, kernel_width, kernel_height);
961  output_tile_batch_size = std::max(
962  MaxOutputTileBatchSize,
963  (output_tiles + output_tile_chunk_size - 1) / output_tile_chunk_size);
964  output_tile_chunk_size = (output_tiles + output_tile_batch_size - 1) / output_tile_batch_size;
965  output_tile_batch_size = (output_tiles + output_tile_chunk_size - 1) / output_tile_chunk_size;
966 
967  input_tile_chunk_size = computeInputTileChunkSize(
968  input_tile_x,
969  input_tile_y,
970  output_tile_chunk_size,
971  kernel_width,
972  kernel_height);
973  input_tile_batch_size = (input_tiles + input_tile_chunk_size - 1) / input_tile_chunk_size;
974  // input_tile_chunk_size = (input_tiles + input_tile_batch_size - 1) /
975  // input_tile_batch_size;
976  }
977  CAFFE_ENFORCE_GT(input_tile_chunk_size, 0);
978  CAFFE_ENFORCE_GT(output_tile_chunk_size, 0);
979  CAFFE_ENFORCE_LE(output_tile_batch_size, 8);
980 
981  int is_last = GetSingleArgument<int>("is_last", 0);
982 
984  num_images,
985  output_width,
986  output_height,
987  output_channels,
988  output_tile_x,
989  output_tile_y,
990  is_last);
991 
992  // TODO: figure out the adj business
993  GLConvolution::descriptor geometry{input_channels,
994  output_channels,
995  {kernel_width, kernel_height},
996  {input_width, input_height},
997  {output_width, output_height},
998  {input_tile_x, input_tile_y},
999  {output_tile_x, output_tile_y},
1000  {pad_l(), pad_t()},
1001  {stride_w(), stride_h()},
1002  true};
1003 
1004  if (!conv) {
1005  int input_batch_size = 1, output_batch_size = 1;
1006  if (!tiling) {
1007  computeBatchSizes(geometry, input_batch_size, output_batch_size);
1008  input_batch_size =
1009  GetSingleArgument<int>("input_batch_size", input_batch_size);
1010  output_batch_size = GetSingleArgument<int>("output_batch_size", output_batch_size);
1011  }
1012 
1013  LOG(INFO) << input_channels << ": " << input_height << " X "
1014  << input_width << " => " << output_channels << ": "
1015  << output_height << " X " << output_width
1016  << " Kernel: " << kernel_width << "X" << kernel_height;
1017 
1018  if (tiling) {
1019  LOG(INFO) << "Tiling: " << input_tile_x << " X " << input_tile_y
1020  << " => " << output_tile_x << " X " << output_tile_y
1021  << ", Texture size: " << input_width * input_tile_x << " X "
1022  << input_height * input_tile_y << " => "
1023  << output_width * output_tile_x << " X "
1024  << output_height * output_tile_y
1025  << ", Input tile batch size: " << input_tile_batch_size;
1026  } else {
1027  LOG(INFO) << "input_batch_size = " << input_batch_size
1028  << ", output_batch_size = " << output_batch_size;
1029  }
1030 
1031  conv.reset(new GLConvolution(geometry,
1032  filter.template data<float>(),
1033  bias.template data<float>(),
1034  prelu_scale,
1035  prelu_scale_size,
1036  input_batch_size,
1037  output_batch_size,
1038  input.tile_x() * input.tile_y(),
1039  output->tile_x() * output->tile_y(),
1040  input_tile_chunk_size,
1041  output_tile_chunk_size,
1042  input_tile_batch_size,
1043  output_tile_batch_size,
1044  tiling));
1045  }
1046 
1047  conv->convolution(input, *output);
1048 
1049  Outputs()[0]->Reset(output);
1050 
1051  return true;
1052  }
1053 
1054  private:
1055  std::unique_ptr<GLConvolution> conv;
1056 
1057  INPUT_TAGS(INPUT, FILTER, BIAS, PRELU);
1058 };
1059 
// Plain transposed convolution: inputs are (X, filter, bias).
REGISTER_CPU_OPERATOR(OpenGLConvTranspose, OpenGLConvTransposeOp<float16_t, false, false>);
OPERATOR_SCHEMA(OpenGLConvTranspose).NumInputs(3).NumOutputs(1);

// Transposed conv fused with PReLU: 4th input is the PReLU slope tensor.
REGISTER_CPU_OPERATOR(OpenGLConvTransposePRelu, OpenGLConvTransposeOp<float16_t, true, false>);
OPERATOR_SCHEMA(OpenGLConvTransposePRelu).NumInputs(4).NumOutputs(1);

// Transposed conv fused with plain ReLU: no extra input.
REGISTER_CPU_OPERATOR(OpenGLConvTransposeRelu, OpenGLConvTransposeOp<float16_t, false, true>);
OPERATOR_SCHEMA(OpenGLConvTransposeRelu).NumInputs(3).NumOutputs(1);
} // namespace caffe2
// Doxygen cross-reference residue (kept as comments so the file stays valid):
// Workspace is a class that holds all the related objects created during runtime: (1) all blobs...
// Definition: workspace.h:47
// A global dictionary that holds information about what Caffe2 modules have been loaded in the current process.