// NOTE(review): this region is a garbled extraction — original source line
// numbers are fused into the text and several lines are missing (e.g. 4,
// 10-12, 14-15, 19-20, 58, 109-116, 124-128, 184-194, 203-206, 346-354),
// including the closing `)GLSL";` of the raw string below. Code bytes are left
// untouched; only this comment is added.
//
// GLConvolution::fragment_shader: OpenGL ES 3.0 fragment shader used for both
// the tiled path (TILED_CONVOLUTION) and the batched path, each with an
// optional TRANSPOSED_CONVOLUTION variant. Kernel weights arrive packed as
// half floats in std140 uniform blocks and are expanded to a mat4 per tap via
// unpackHalf2x16 (see the unpackKernel macro). Bias and optional PRelu scales
// are packed two vec4s per uvec4. The `accumulate` uniform makes the shader
// add the previous pass's output instead of the bias (multi-pass accumulation
// over input slices/tiles); `fusePRelu` applies value<0 ? value*scale : value
// at the end via mix(). BOUNDS_CHECK_MODE == 0 compiles IN_BOUNDS away.
1 #include "GLConvolution.h" 2 #include "../core/GLContext.h" 3 #include "../core/ImageAllocator.h" 5 #include "caffe2/core/common.h" 6 #include "caffe2/core/context.h" 7 #include "caffe2/core/timer.h" 8 #include "caffe2/operators/conv_pool_op_base.h" 9 #include "caffe2/operators/conv_transpose_unpool_op_base.h" 13 #define MaxOutputTileBatchSize 2 16 const char* GLConvolution::fragment_shader = R
"GLSL(#version 300 es 17 #define TILED_CONVOLUTION $(TILED_CONVOLUTION) 18 #define TRANSPOSED_CONVOLUTION $(TRANSPOSED_CONVOLUTION) 21 #define INPUT_BATCH_SIZE $(INPUT_BATCH_SIZE) 22 #define OUTPUT_BATCH_SIZE $(OUTPUT_BATCH_SIZE) 25 #define INPUT_TILES $(INPUT_TILES) 26 #define OUTPUT_TILES $(OUTPUT_TILES) 27 #define INPUT_TILE_WIDTH $(INPUT_TILE_WIDTH) 28 #define INPUT_TILE_HEIGHT $(INPUT_TILE_HEIGHT) 29 #define OUTPUT_TILE_WIDTH $(OUTPUT_TILE_WIDTH) 30 #define OUTPUT_TILE_HEIGHT $(OUTPUT_TILE_HEIGHT) 31 #define INPUT_TILE_X $(INPUT_TILE_X) 32 #define OUTPUT_TILE_X $(OUTPUT_TILE_X) 33 #define INPUT_TILE_CHUNK_SIZE $(INPUT_TILE_CHUNK_SIZE) 34 #define OUTPUT_TILE_CHUNK_SIZE $(OUTPUT_TILE_CHUNK_SIZE) 35 #define OUTPUT_TILE_BATCH_SIZE $(OUTPUT_TILE_BATCH_SIZE) 37 #define BOUNDS_CHECK_MODE $(BOUNDS_CHECK_MODE) 40 const ivec2 input_padding = ivec2($(INPUT_PADDING_X), $(INPUT_PADDING_Y)); 41 const ivec2 input_stride = ivec2($(INPUT_STRIDE_X), $(INPUT_STRIDE_Y)); 42 const ivec2 kernel_size = ivec2($(KERNEL_SIZE_X), $(KERNEL_SIZE_Y)); 44 precision mediump float; 45 precision mediump int; 46 precision mediump sampler2D; 48 in highp vec2 v_texCoord; 50 #define unpackKernel(pk) \ 51 mat4(vec4(unpackHalf2x16(pk.packed_data[0].x), unpackHalf2x16(pk.packed_data[0].y)), \ 52 vec4(unpackHalf2x16(pk.packed_data[0].z), unpackHalf2x16(pk.packed_data[0].w)), \ 53 vec4(unpackHalf2x16(pk.packed_data[1].x), unpackHalf2x16(pk.packed_data[1].y)), \ 54 vec4(unpackHalf2x16(pk.packed_data[1].z), unpackHalf2x16(pk.packed_data[1].w))) 56 #if BOUNDS_CHECK_MODE == 0 57 #define IN_BOUNDS(p, p0, p1) (true) 59 #define IN_BOUNDS(p, p0, p1) (all(greaterThanEqual(p, p0)) && all(lessThan(p, p1))) 64 const ivec2 inputTileSize = ivec2(INPUT_TILE_WIDTH, INPUT_TILE_HEIGHT); 65 const ivec2 outputTileSize = ivec2(OUTPUT_TILE_WIDTH, OUTPUT_TILE_HEIGHT); 67 uniform ivec2 outputSize; 68 uniform bool accumulate; 69 uniform bool fusePRelu; 71 uniform ivec2 inputTileRange; 73 TEXTURE_INPUT(inputData[1]); 74 
TEXTURE_INPUT(previousData[1]); 77 highp uvec4 packed_data[2]; 81 packedKernel data[kernel_size.x * kernel_size.y]; 84 layout (std140) uniform Kernel_block { 85 kernel kernel_data[INPUT_TILE_CHUNK_SIZE * OUTPUT_TILE_CHUNK_SIZE]; 86 } kernel_block[OUTPUT_TILE_BATCH_SIZE]; 88 layout (std140) uniform bias_block { 89 highp uvec4 bias[(OUTPUT_TILES + 1) / 2]; 92 layout (std140) uniform prelu_scale_block { 93 highp uvec4 scale[(OUTPUT_TILES + 1) / 2]; 96 TEXTURE_OUTPUT(0, outputData0); 98 #if TRANSPOSED_CONVOLUTION 100 #define CONVOLUTION(ib) { \ 101 ivec2 p0 = (input_padding + input_stride - tileCoord % input_stride) % input_stride; \ 102 for (int y = p0.y; y < kernel_size.y; y += input_stride.y) { \ 103 for (int x = p0.x; x < kernel_size.x; x += input_stride.x) { \ 104 int i = y * kernel_size.x + x; \ 105 ivec2 idx = tileCoord + ivec2(x, y) - input_padding; \ 106 if IN_BOUNDS(idx, ivec2(0), inputTileSize * input_stride) { \ 107 vec4 data = TEXTURE_LOAD(inputData[0], inputTileOffset + idx / input_stride); \ 108 mediump mat4 k = unpackKernel(kernel_block[ib].kernel_data[kernelIdx].data[i]); \ 117 #define CONVOLUTION(ib) { \ 118 for (int y = 0, i = 0; y < kernel_size.y; y++) { \ 119 for (int x = 0; x < kernel_size.x; x++, i++) { \ 120 ivec2 idx = tileCoord + ivec2(x, y); \ 121 if IN_BOUNDS(idx, ivec2(0), inputTileSize) { \ 122 vec4 data = TEXTURE_LOAD(inputData[0], inputTileOffset + idx); \ 123 mediump mat4 k = unpackKernel(kernel_block[ib].kernel_data[kernelIdx].data[i]); \ 129 #endif // TRANSPOSED_CONVOLUTION 132 ivec2 inputSize = textureSize(inputData[0], 0); 133 ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize)); 135 ivec2 tile = texelCoord / outputTileSize; // 2D output tile idx 136 ivec2 tileCoord = texelCoord % outputTileSize; // in-tile coordinates 138 int tileNum = OUTPUT_TILE_X * tile.y + tile.x; // 1D output tile idx 140 #if !TRANSPOSED_CONVOLUTION 141 tileCoord = input_stride * tileCoord - input_padding; 144 highp vec4 sum = vec4(0); 146 for (int tile_idx 
= inputTileRange.x; tile_idx < inputTileRange.y; tile_idx++) { 147 int inTileX = tile_idx % INPUT_TILE_X; 148 int inTileY = tile_idx / INPUT_TILE_X; 149 int inTileId = tile_idx % INPUT_TILE_CHUNK_SIZE; // normalized input tile idx, used to index the kernel 151 int kernelIdx = OUTPUT_TILE_CHUNK_SIZE * inTileId + tileNum % OUTPUT_TILE_CHUNK_SIZE; 152 ivec2 inputTileOffset = ivec2(inTileX, inTileY) * inputTileSize; 154 int outputChunkIdx = tileNum / OUTPUT_TILE_CHUNK_SIZE; 155 if (outputChunkIdx == 0) { 158 #if OUTPUT_TILE_BATCH_SIZE > 1 159 else if (outputChunkIdx == 1) { 162 #if OUTPUT_TILE_BATCH_SIZE > 2 163 else if (outputChunkIdx == 2) { 166 #if OUTPUT_TILE_BATCH_SIZE > 3 167 else if (outputChunkIdx == 3) { 170 #if OUTPUT_TILE_BATCH_SIZE > 4 171 else if (outputChunkIdx == 4) { 174 #if OUTPUT_TILE_BATCH_SIZE > 5 175 else if (outputChunkIdx == 5) { 178 #if OUTPUT_TILE_BATCH_SIZE > 6 179 else if (outputChunkIdx == 6) { 182 #if OUTPUT_TILE_BATCH_SIZE > 7 183 else if (outputChunkIdx == 7) { 195 vec4 biasValue = (tileNum % 2 == 0) ? unpackHalf4x16(bias[tileNum/2].xy) : unpackHalf4x16(bias[tileNum/2].zw); 196 vec4 prevData = TEXTURE_LOAD(previousData[0], texelCoord); 197 vec4 value = sum + (accumulate ? prevData : biasValue); 199 vec4 preluValue = (tileNum % 2 == 0) ? unpackHalf4x16(scale[tileNum/2].xy) : unpackHalf4x16(scale[tileNum/2].zw); 201 vec4 o0 = fusePRelu ? 
mix(value * preluValue, value, vec4(greaterThan(value, vec4(0)))) : value; 202 outputData0 = TEXTURE_STORE(o0); 207 // batched convolution 209 uniform ivec2 outputSize; 210 uniform bool accumulate; 211 uniform bool fusePRelu; 213 TEXTURE_INPUT(inputData[INPUT_BATCH_SIZE]); 214 TEXTURE_INPUT(previousData[OUTPUT_BATCH_SIZE]); 216 struct packedKernel { 217 highp uvec4 packed_data[2]; 221 packedKernel data[kernel_size.x * kernel_size.y]; 224 layout (std140) uniform Kernel_block { 225 kernel kernel_data[OUTPUT_BATCH_SIZE]; 226 } kernel_block[INPUT_BATCH_SIZE]; 228 layout (std140) uniform bias_block { 229 highp uvec4 bias[(OUTPUT_BATCH_SIZE + 1) / 2]; 232 layout (std140) uniform prelu_scale_block { 233 highp uvec4 scale[(OUTPUT_BATCH_SIZE + 1) / 2]; 236 TEXTURE_OUTPUT(0, outputData0); 237 #if OUTPUT_BATCH_SIZE > 1 238 TEXTURE_OUTPUT(1, outputData1); 239 #if OUTPUT_BATCH_SIZE > 2 240 TEXTURE_OUTPUT(2, outputData2); 241 #if OUTPUT_BATCH_SIZE > 3 242 TEXTURE_OUTPUT(3, outputData3); 247 #if TRANSPOSED_CONVOLUTION 248 #define CONVOLUTION(ib) { \ 249 ivec2 p0 = (input_padding + input_stride - texelCoord % input_stride) % input_stride; \ 250 for (int y = p0.y; y < kernel_size.y; y += input_stride.y) { \ 251 for (int x = p0.x; x < kernel_size.x; x += input_stride.x) { \ 252 int i = y * kernel_size.x + x; \ 253 ivec2 idx = texelCoord + ivec2(x, y) - input_padding; \ 254 if IN_BOUNDS(idx, ivec2(0), inputSize * input_stride) { \ 255 vec4 data = TEXTURE_LOAD(inputData[ib], idx / input_stride); \ 256 for (int ob = 0; ob < OUTPUT_BATCH_SIZE; ob++) { \ 257 mediump mat4 k = unpackKernel(kernel_block[ib].kernel_data[ob].data[i]); \ 258 sum[ob] += k * data; \ 267 #define CONVOLUTION(ib) { \ 268 for (int y = 0, i = 0; y < kernel_size.y; y++) { \ 269 for (int x = 0; x < kernel_size.x; x++, i++) { \ 270 ivec2 idx = coord + ivec2(x, y); \ 271 if IN_BOUNDS(idx, ivec2(0), inputSize) { \ 272 vec4 data = TEXTURE_LOAD(inputData[ib], idx); \ 273 for (int ob = 0; ob < OUTPUT_BATCH_SIZE; ob++) { \ 
274 mediump mat4 k = unpackKernel(kernel_block[ib].kernel_data[ob].data[i]); \ 275 sum[ob] += k * data; \ 282 #endif // TRANSPOSED_CONVOLUTION 285 ivec2 inputSize = textureSize(inputData[0], 0); 286 ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize)); 288 #if !TRANSPOSED_CONVOLUTION 289 ivec2 coord = input_stride * texelCoord - input_padding; 292 highp vec4 sum[OUTPUT_BATCH_SIZE] = vec4[OUTPUT_BATCH_SIZE](vec4(0) 293 #if OUTPUT_BATCH_SIZE > 1 295 #if OUTPUT_BATCH_SIZE > 2 297 #if OUTPUT_BATCH_SIZE > 3 305 #if INPUT_BATCH_SIZE > 1 307 #if INPUT_BATCH_SIZE > 2 309 #if INPUT_BATCH_SIZE > 3 311 #if INPUT_BATCH_SIZE > 4 313 #if INPUT_BATCH_SIZE > 5 315 #if INPUT_BATCH_SIZE > 6 317 #if INPUT_BATCH_SIZE > 7 327 vec4 prev0 = TEXTURE_LOAD(previousData[0], texelCoord); 328 vec4 value = sum[0] + (accumulate ? prev0: unpackHalf4x16(bias[0].xy)); 329 vec4 o0 = fusePRelu ? mix(value * unpackHalf4x16(scale[0].xy), value, vec4(greaterThan(value, vec4(0)))) : value; 330 outputData0 = TEXTURE_STORE(o0); 331 #if OUTPUT_BATCH_SIZE > 1 332 vec4 prev1 = TEXTURE_LOAD(previousData[1], texelCoord); 333 value = sum[1] + (accumulate ? prev1 : unpackHalf4x16(bias[0].zw)); 334 vec4 o1 = fusePRelu ? mix(value * unpackHalf4x16(scale[0].zw), value, vec4(greaterThan(value, vec4(0)))) : value; 335 outputData1 = TEXTURE_STORE(o1); 336 #if OUTPUT_BATCH_SIZE > 2 337 vec4 prev2 = TEXTURE_LOAD(previousData[2], texelCoord); 338 value = sum[2] + (accumulate ? prev2 : unpackHalf4x16(bias[1].xy)); 339 vec4 o2 = fusePRelu ? mix(value * unpackHalf4x16(scale[1].xy), value, vec4(greaterThan(value, vec4(0)))) : value; 340 outputData2 = TEXTURE_STORE(o2); 341 #if OUTPUT_BATCH_SIZE > 3 342 vec4 prev3 = TEXTURE_LOAD(previousData[3], texelCoord); 343 value = sum[3] + (accumulate ? prev3: unpackHalf4x16(bias[1].zw)); 344 vec4 o3 = fusePRelu ? 
mix(value * unpackHalf4x16(scale[1].zw), value, vec4(greaterThan(value, vec4(0)))) : value; 345 outputData3 = TEXTURE_STORE(o3); 351 #endif // TILED_CONVOLUTION 355 void GLConvolution::pack_kernel_data_for_bached_conv(
// Body of GLConvolution::pack_kernel_data_for_bached_conv [sic — upstream typo
// for "batched"]. Repacks the float NCHW kernel into the half-float layout the
// batched-conv shader's Kernel_block expects:
//   packed_kernel_data[output_batch][ky][kx][input_lane][output_lane]
// with 4x4 lanes per kernel tap (one mat4 in the shader). For transposed
// convolution the source tensor is [in][out][ky][kx] and the tap is spatially
// flipped (kernel_size-1-y / -1-x); otherwise it is [out][in][ky][kx] copied
// straight through. Edge batches clamp the lane counts via
// std::min(4, channels - 4*(slice+batch)).
// NOTE(review): the signature (original lines 356-362) and several closing
// lines were dropped by the extraction; parameter meanings (data, kernel, is,
// os, ib, input_channels, output_channels, input/output_batch_size) are
// inferred from their uses below — confirm against upstream. The VLA-typed
// typedefs (runtime bounds like geometry.kernel_size.y) rely on a GCC/Clang
// extension, not standard C++.
363 typedef float16_t(packedKernel)[output_batch_size][geometry.kernel_size.y]
364 [geometry.kernel_size.x][4][4];
365 packedKernel& packed_kernel_data = *
reinterpret_cast<packedKernel*
>(data);
367 const int batch_input_channels = std::min(4, input_channels - 4 * (is + ib));
368 for (
int ob = 0; ob < output_batch_size; ob++) {
369 const int batch_output_channels =
370 std::min(4, output_channels - 4 * (os + ob));
371 for (
int out = 0; out < batch_output_channels; out++) {
372 for (
int in = 0; in < batch_input_channels; in++) {
373 for (
int y = 0; y < geometry.kernel_size.y; y++) {
374 for (
int x = 0; x < geometry.kernel_size.x; x++) {
// Transposed path: source layout [in][out][ky][kx], spatially flipped tap.
376 if (geometry.transposed) {
377 typedef float(kernelTensor)[input_channels][output_channels][geometry.kernel_size.y][geometry.kernel_size.x];
378 const kernelTensor& kernel_data = *
reinterpret_cast<const kernelTensor*
>(kernel);
379 packed_kernel_data[ob][y][x][in][out] =
380 kernel_data[4 * (is + ib) + in][4 * (os + ob) + out][geometry.kernel_size.y - 1 - y][geometry.kernel_size.x - 1 - x];
// Forward path: source layout [out][in][ky][kx], copied unflipped.
382 typedef float(kernelTensor)[output_channels][input_channels][geometry.kernel_size.y][geometry.kernel_size.x];
383 const kernelTensor& kernel_data = *
reinterpret_cast<const kernelTensor*
>(kernel);
384 packed_kernel_data[ob][y][x][in][out] = kernel_data[4 * (os + ob) + out][4 * (is + ib) + in][y][x];
// Repacks the float kernel for the tiled-conv shader's Kernel_block:
//   packed_kernel_data[input_tile - range.x][output_tile - range.x][ky][kx][in][out]
// covering only the (input_tile_range x output_tile_range) chunk uploaded this
// pass. Same transposed-vs-forward source layouts and spatial flip as the
// batched packer above. Lane counts are clamped for the last partial tile via
// std::min(4, channels - tile*4).
// NOTE(review): parameter lines 395-398 (data, kernel, channel counts) and
// several interior/closing lines were dropped by the extraction — confirm
// against upstream before editing logic.
394 void GLConvolution::pack_kernel_data_for_tiled_conv(
399 point input_tile_range,
400 point output_tile_range) {
402 packedKernel)[input_tile_chunk_size][output_tile_chunk_size]
403 [geometry.kernel_size.y][geometry.kernel_size.x][4][4];
404 packedKernel& packed_kernel_data = *
reinterpret_cast<packedKernel*
>(data);
406 for (
int it = input_tile_range.x; it < input_tile_range.y; it++) {
407 for (
int ot = output_tile_range.x; ot < output_tile_range.y; ot++) {
408 for (
int y = 0; y < geometry.kernel_size.y; y++) {
409 for (
int x = 0; x < geometry.kernel_size.x; x++) {
410 for (
int out = 0; out < std::min(4, (output_channels - ot * 4));
412 for (
int in = 0; in < std::min(4, (input_channels - it * 4));
// Transposed path: [in][out][ky][kx] source, flipped tap.
415 if (geometry.transposed) {
416 typedef float(kernelTensor)[input_channels][output_channels][geometry.kernel_size.y][geometry.kernel_size.x];
417 const kernelTensor& kernel_data = *
reinterpret_cast<const kernelTensor*
>(kernel);
418 packed_kernel_data[it - input_tile_range.x][ot - output_tile_range.x][y][x][in][out] =
419 kernel_data[4 * it + in] [4 * ot + out][geometry.kernel_size.y - 1 - y][geometry.kernel_size.x - 1 - x];
// Forward path: [out][in][ky][kx] source, unflipped.
421 typedef float(kernelTensor)[output_channels][input_channels][geometry.kernel_size.y][geometry.kernel_size.x];
422 const kernelTensor& kernel_data = *
reinterpret_cast<const kernelTensor*
>(kernel);
423 packed_kernel_data[it - input_tile_range.x][ot - output_tile_range.x][y][x][in][out] =
424 kernel_data[4 * ot + out][4 * it + in][y][x];
// GLConvolution::convolution: entry point; dispatches to the tiled or the
// batched execution path. NOTE(review): the parameter list and the dispatch
// condition (original lines 437-439, 441) were dropped by the extraction —
// presumably it branches on whether the images are tiled; confirm upstream.
435 template <
typename T>
436 void GLConvolution::convolution(
440 run_tiled_conv(input_images, output_images);
442 run_batched_conv(input_images, output_images);
// run_batched_conv: per image pair, iterates input slices (step
// input_batch_size) and output slices (step output_batch_size). For each
// (is, os) pass it uploads into uniform buffers: the bias slice, one packed
// kernel block per input batch (via pack_kernel_data_for_bached_conv), and —
// only on the final input batch, when prelu_scale is set — the PRelu scales.
// It binds the input-slice textures plus the current output textures as
// previousData (so passes after the first can accumulate:
// accumulate = (is != 0)), then runs the shader into the output textures for
// this output batch. NOTE(review): many interior lines (451-452, 455, 460-468,
// 470-471, 474-475, 480-483, 486-487, 489-490, 495-496, 499-505, 508-509,
// 511-512, 518-522, 527, 531-532, 536-537, 543-544, 547, 550-554) were
// dropped by the extraction; only comments are added here.
446 template <
typename T>
447 void GLConvolution::run_batched_conv(
450 for (
int i = 0; i < input_images.size(); i++) {
453 int input_slices = input_image->slices;
454 int output_slices = output_image->slices;
456 for (
int is = 0; is < input_slices; is += input_batch_size) {
457 for (
int os = 0; os < output_slices; os += output_batch_size) {
// Channels actually covered by this output batch (last batch may be partial).
458 const int output_channels_per_batch =
459 std::min(4 * output_batch_size, geometry.output_channels - 4 * os);
463 "GLConvolution::convolution - is: %d, os: %d\n",
469 int binding_point = 0;
// Bias slice for the output channels of this batch.
472 attach_uniform_buffer<float16_t>(
473 bias_block, binding_point++, [&](float16_t* data,
size_t size) {
476 output_channels_per_batch *
sizeof(float16_t),
477 "Bias buffer size too small");
478 for (
int ob = 0; ob < output_channels_per_batch; ob++) {
479 data[ob] = bias[4 * os + ob];
// One packed kernel uniform block per input batch.
484 for (
int ib = 0; ib < input_batch_size; ib++) {
485 attach_uniform_buffer<float16_t>(
488 [&](float16_t* data,
size_t size) {
491 4 * (4 * output_batch_size) * geometry.kernel_size.y *
492 geometry.kernel_size.x *
sizeof(float16_t),
493 "Kernel size mismatch");
494 pack_kernel_data_for_bached_conv(
497 input_image->channels,
498 output_image->channels,
// PRelu scales only on the final input batch (fused activation fires once).
506 if (prelu_scale !=
nullptr && is == input_slices - input_batch_size) {
507 attach_uniform_buffer<float16_t>(
510 [&](float16_t* data,
size_t size) {
513 output_channels_per_batch *
sizeof(float16_t),
514 "PRelu buffer size too small");
515 for (
int ob = 0; ob < output_channels_per_batch; ob++) {
516 data[ob] = prelu_scale_size == geometry.output_channels
517 ? prelu_scale[4 * os + ob]
// Inputs: this input batch's slice textures, plus the current output
// textures as previousData for cross-pass accumulation.
523 std::vector<texture_attachment> input_attachments;
524 for (
int ib = 0; ib < input_batch_size; ib++) {
525 input_attachments.push_back(
526 {input_image->textures[is + ib], inputData[ib]});
528 for (
int ob = 0; ob < output_batch_size; ob++) {
529 input_attachments.push_back(
530 {output_image->textures[os + ob], previousData[ob]});
533 run(input_attachments,
534 {output_image->textures.begin() + os,
535 output_image->textures.begin() + os + output_batch_size},
538 outputSize->location,
539 output_image->texture_width,
540 output_image->texture_height);
541 glUniform2i(inputTileRange->location, 0, 1);
542 glUniform1i(accumulate->location, is != 0);
545 prelu_scale !=
nullptr &&
546 (is == input_slices - input_batch_size));
548 output_image->texture_width,
549 output_image->texture_height);
// run_tiled_conv: tiled path — all input tiles live in a single texture. The
// outer loop walks input-tile chunks (input_tile_chunk_size tiles per pass,
// accumulating across passes: accumulate = (it != 0)). Each pass uploads the
// full bias, one packed kernel uniform block per output-tile chunk (via
// pack_kernel_data_for_tiled_conv), and — on the final input-tile batch only,
// when prelu_scale is set — the PRelu scales; inputTileRange tells the shader
// which input tiles to integrate this pass. NOTE(review): many interior lines
// (557-558, 560-561, 568, 571-575, 578-579, 583-587, 591-592, 594-595, 598,
// 601-602, 607-610, 613-614, 616-617, 622-627, 631, 634-635, 639-640, 642,
// 644, 646-647, 649, 652-654) were dropped by the extraction; only comments
// are added here.
555 template <
typename T>
556 void GLConvolution::run_tiled_conv(
559 for (
int i = 0; i < input_images.size(); i++) {
562 int input_slices = input_image->slices;
563 int output_slices = output_image->slices;
564 int input_tile_x = input_image->tile_x;
565 int input_tile_y = input_image->tile_y;
566 int input_tiles = input_image->tile_x * input_image->tile_y;
567 int output_tiles = output_image->tile_x * output_image->tile_y;
// ib counts input-tile batches; it is the first input tile of this pass.
569 for (
int ib = 0, it = 0; it < input_tiles;
570 ib++, it += input_tile_chunk_size) {
573 int binding_point = 0;
// Bias for all output channels (the tiled shader indexes it per tile).
576 attach_uniform_buffer<float16_t>(
577 bias_block, binding_point++, [&](float16_t* data,
size_t size) {
580 geometry.output_channels *
sizeof(float16_t),
581 "Bias buffer size too small");
582 for (
int ob = 0; ob < geometry.output_channels; ob++) {
// One kernel uniform block per output-tile chunk for this input-tile chunk.
588 for (
int ob = 0, ot = 0; ot < output_tiles;
589 ob++, ot += output_tile_chunk_size) {
590 attach_uniform_buffer<float16_t>(
593 [&](float16_t* data,
size_t size) {
596 (4 * input_tile_chunk_size) * (4 * output_tile_chunk_size) *
597 geometry.kernel_size.y * geometry.kernel_size.x *
599 "Kernel size mismatch");
600 pack_kernel_data_for_tiled_conv(
603 input_image->channels,
604 output_image->channels,
605 {it, std::min(it + input_tile_chunk_size, input_tiles)},
606 {ot, std::min(ot + output_tile_chunk_size, output_tiles)});
// PRelu scales only on the last input-tile batch.
611 if (prelu_scale !=
nullptr && ib == input_tile_batch_size - 1) {
612 attach_uniform_buffer<float16_t>(
615 [&](float16_t* data,
size_t size) {
618 geometry.output_channels *
sizeof(float16_t),
619 "PRelu buffer size too small");
620 for (
int ob = 0; ob < geometry.output_channels; ob++) {
621 data[ob] = prelu_scale_size == geometry.output_channels
// Single input texture (tiled) plus the output as previousData for
// accumulation across input-tile passes.
628 std::vector<texture_attachment> input_attachments(
629 {{input_image->textures[0], inputData[0]},
630 {output_image->textures[0], previousData[0]}});
632 run(input_attachments,
633 {output_image->textures[0]},
636 outputSize->location,
637 output_image->texture_width,
638 output_image->texture_height);
641 inputTileRange->location,
643 std::min(it + input_tile_chunk_size, input_tiles));
645 glUniform1i(accumulate->location, it != 0);
648 prelu_scale !=
nullptr && (ib == input_tile_batch_size - 1));
650 output_image->texture_width,
651 output_image->texture_height);
658 template <
typename OPBase>
659 static void computeOutputHW(OPBase* op,
int H,
int W,
int* OH,
int* OW) {
661 input.Resize(1, 1, H, W);
662 op->SetOutputSize(input, &output, 1);
663 CAFFE_ENFORCE_EQ(output.ndim(), 4);
668 static int computeOutputTileChunkSize(
int output_tile_x,
672 static const int maxUniformBlockBufferSize = 16 * 1024;
674 output_tile_x * output_tile_y,
675 maxUniformBlockBufferSize / 4 /
676 (4 * kernel_width * kernel_height * (
int)
sizeof(float16_t)));
679 static int computeInputTileChunkSize(
682 int output_tile_chunk_size,
685 static const int maxUniformBlockBufferSize = 16 * 1024;
687 input_tile_x * input_tile_y,
688 maxUniformBlockBufferSize / 4 /
689 (4 * output_tile_chunk_size * kernel_width * kernel_height *
690 (
int)
sizeof(float16_t)));
// computeBatchSizes: chooses how many 4-channel slices to process per shader
// pass, as the largest of {4,3,2,1} that evenly divides the slice count.
// The visible default path allows only 2 or 1; the iPhoneVersion() >= 8 path
// additionally allows 4 or 3. NOTE(review): the geometry parameter (line 696)
// and lines 702-703, 706, 708-709, 711, 714, 716-719 — likely a platform #if
// and the `? 4` arms of the conditionals — were dropped by the extraction;
// only comments are added here. `kernel_size` is computed but its use is not
// visible in this extraction.
695 static void computeBatchSizes(
697 int& input_batch_size,
698 int& output_batch_size) {
699 int kernel_size = std::max(geometry.kernel_size.x, geometry.kernel_size.y);
700 int input_slices = (geometry.input_channels + 3) / 4;
701 int output_slices = (geometry.output_channels + 3) / 4;
704 input_batch_size = input_slices % 2 == 0 ? 2 : 1;
705 output_batch_size = output_slices % 2 == 0 ? 2 : 1;
707 if (iPhoneVersion() >= 8) {
710 input_slices % 4 == 0
712 : input_slices % 3 == 0 ? 3 : input_slices % 2 == 0 ? 2 : 1;
713 output_batch_size = output_slices % 4 == 0
715 : output_slices % 3 == 0 ? 3 : output_slices % 2 == 0 ? 2 : 1;
// OpenGLConvOp<T, fusePRelu, fuseRelu> — NCHW-only OpenGL convolution
// operator (the class declaration line, original 721-722, was dropped by the
// extraction). RunOnDeviceWithOrderNCHW validates filter/bias shapes, derives
// output H/W via computeOutputHW, decides tiled vs batched execution from the
// "tiling" argument (default: on when the input image is tiled), sizes the
// tile/batch chunks under the 16KB uniform-block budget, constructs the
// GLConvolution descriptor, and runs it. With fusePRelu a fourth PRELU input
// supplies per-channel (or single) slopes; with fuseRelu prelu_scale_size is
// set to 1 (the scale value itself, line 767, is not visible here — presumably
// a zero slope; confirm upstream).
// NOTE(review): two latent bugs visible below that a comment-only edit cannot
// fix: (1) `CAFFE_ENFORCE(filter.ndim(), 4)` passes 4 as the *message*
// argument instead of asserting `filter.ndim() == 4`; (2) "dialation" is a
// typo for "dilation" in a runtime error string.
720 template <
class T,
bool fusePRelu,
bool fuseRelu>
723 USE_OPERATOR_BASE_FUNCTIONS;
726 OPERATOR_NEEDS_FEATURE(this->order_ == StorageOrder::NCHW,
"OpenGL only supports NCHW order.");
727 OPERATOR_NEEDS_FEATURE(group_ == 1,
"OpenGL only supports group == 1");
728 OPERATOR_NEEDS_FEATURE(
729 dilation_h() == 1 && dilation_w() == 1,
730 "OpenGL only supports dialation == 1");
733 bool RunOnDeviceWithOrderNCHW()
override {
734 const GLImageVector<T>& input = Inputs()[INPUT]->template Get<GLImageVector<T>>();
735 auto& filter = Input(FILTER);
736 auto& bias = Input(BIAS);
738 const int num_images = input.size();
739 const int input_channels = input.channels();
740 const int input_width = input.width();
741 const int input_height = input.height();
// BUG(review): comma should be `== 4` — as written this never fails on rank.
743 CAFFE_ENFORCE(filter.ndim(), 4);
744 const int M = filter.dim32(0);
745 const int kernel_width = filter.dim32(2);
746 const int kernel_height = filter.dim32(3);
748 CAFFE_ENFORCE(filter.dim32(1) == input_channels,
"");
749 CAFFE_ENFORCE(filter.dim32(2) == kernel_h(),
"");
750 CAFFE_ENFORCE(filter.dim32(3) == kernel_w(),
"");
751 CAFFE_ENFORCE(bias.ndim() == 1,
"");
752 CAFFE_ENFORCE(bias.dim32(0) == M,
"");
756 const int output_channels = M;
757 computeOutputHW(
this, input_height, input_width, &output_height, &output_width);
760 const float* prelu_scale =
nullptr;
761 int prelu_scale_size = 0;
763 auto& prelu = Input(PRELU);
764 prelu_scale = prelu.template data<float>();
765 prelu_scale_size = prelu.size();
766 }
else if (fuseRelu) {
768 prelu_scale_size = 1;
771 const int input_tile_x = input.tile_x(), input_tile_y = input.tile_y();
772 int output_tile_x = 1, output_tile_y = 1;
773 int input_tiles = input_tile_x * input_tile_y, output_tiles = 1;
774 int input_tile_chunk_size = 1, output_tile_chunk_size = 1;
775 int input_tile_batch_size = 1, output_tile_batch_size = 1;
777 const bool tiling = GetSingleArgument<int>(
"tiling", input_tile_x > 1 || input_tile_y > 1);
781 CAFFE_ENFORCE_EQ(input.slices(), 1,
"Input needs to be tiled in a single texture");
782 computeOutputTiles(output_channels, output_tile_x, output_tile_y);
783 output_tiles = output_tile_x * output_tile_y;
// Balance chunk size and batch count so chunks are as even as possible
// while each chunk's kernels still fit in one uniform block.
785 output_tile_chunk_size = computeOutputTileChunkSize(
786 output_tile_x, output_tile_y, kernel_width, kernel_height);
787 output_tile_batch_size = std::max(
788 MaxOutputTileBatchSize,
789 (output_tiles + output_tile_chunk_size - 1) / output_tile_chunk_size);
790 output_tile_chunk_size = (output_tiles + output_tile_batch_size - 1) / output_tile_batch_size;
791 output_tile_batch_size = (output_tiles + output_tile_chunk_size - 1) / output_tile_chunk_size;
793 input_tile_chunk_size = computeInputTileChunkSize(
796 output_tile_chunk_size,
799 input_tile_batch_size = (input_tiles + input_tile_chunk_size - 1) / input_tile_chunk_size;
// The shader supports at most 8 kernel uniform blocks (OUTPUT_TILE_BATCH_SIZE).
803 CAFFE_ENFORCE_GT(input_tile_chunk_size, 0);
804 CAFFE_ENFORCE_GT(output_tile_chunk_size, 0);
805 CAFFE_ENFORCE_LE(output_tile_batch_size, 8);
807 int is_last = GetSingleArgument<int>(
"is_last", 0);
821 {kernel_width, kernel_height},
822 {input_width, input_height},
823 {output_width, output_height},
824 {input_tile_x, input_tile_y},
825 {output_tile_x, output_tile_y},
827 {stride_w(), stride_h()},
831 int input_batch_size = 1, output_batch_size = 1;
833 computeBatchSizes(geometry, input_batch_size, output_batch_size);
835 GetSingleArgument<int>(
"input_batch_size", input_batch_size);
836 output_batch_size = GetSingleArgument<int>(
"output_batch_size", output_batch_size);
839 LOG(INFO) << input_channels <<
": " << input_height <<
" X " 840 << input_width <<
" => " << output_channels <<
": " 841 << output_height <<
" X " << output_width
842 <<
" Kernel: " << kernel_width <<
"X" << kernel_height;
844 LOG(INFO) <<
"Tiling: " << input_tile_x <<
" X " << input_tile_y
845 <<
" => " << output_tile_x <<
" X " << output_tile_y
846 <<
", Texture size: " << input_width * input_tile_x <<
" X " 847 << input_height * input_tile_y <<
" => " 848 << output_width * output_tile_x <<
" X " 849 << output_height * output_tile_y
850 <<
", Input tile batch size: " << input_tile_batch_size;
852 LOG(INFO) <<
"input_batch_size = " << input_batch_size
853 <<
", output_batch_size = " << output_batch_size;
857 filter.template data<float>(),
858 bias.template data<float>(),
865 input_tile_chunk_size,
866 output_tile_chunk_size,
867 input_tile_batch_size,
868 output_tile_batch_size,
872 conv->convolution(input, *output);
874 Outputs()[0]->Reset(output);
880 std::unique_ptr<GLConvolution> conv;
882 INPUT_TAGS(INPUT, FILTER, BIAS, PRELU);
886 OPERATOR_SCHEMA(OpenGLConv).NumInputs(3).NumOutputs(1);
889 OPERATOR_SCHEMA(OpenGLConvPRelu).NumInputs(4).NumOutputs(1);
892 OPERATOR_SCHEMA(OpenGLConvRelu).NumInputs(3).NumOutputs(1);
// OpenGLConvTransposeOp<T, fusePRelu, fuseRelu> — NCHW-only OpenGL transposed
// convolution (class declaration line, original 895-896, dropped by the
// extraction). Mirrors OpenGLConvOp above: shape validation (here filter is
// [M=input_channels][C=output_channels][kh][kw]), output H/W via
// computeOutputHW, tiled-vs-batched selection and chunk sizing, then
// GLConvolution with the transposed geometry.
// NOTE(review): the error string "OpenGL only supports adj_h == 1 and
// adj_w == 1" contradicts the condition it guards (`adj_h() == 0 &&
// adj_w() == 0`) — the message should say "== 0"; a comment-only edit cannot
// change a runtime string, so this is flagged for a code fix.
894 template <
class T,
bool fusePRelu,
bool fuseRelu>
897 USE_OPERATOR_BASE_FUNCTIONS;
900 OPERATOR_NEEDS_FEATURE(this->order_ == StorageOrder::NCHW,
"OpenGL only supports NCHW order.");
901 OPERATOR_NEEDS_FEATURE(
902 adj_h() == 0 && adj_w() == 0,
903 "OpenGL only supports adj_h == 1 and adj_w == 1");
906 bool RunOnDeviceWithOrderNCHW()
override {
907 const GLImageVector<T>& input = Inputs()[INPUT]->template Get<GLImageVector<T>>();
908 auto& filter = Input(FILTER);
909 auto& bias = Input(BIAS);
911 const int num_images = input.size();
912 const int input_channels = input.channels();
913 const int input_width = input.width();
914 const int input_height = input.height();
916 CAFFE_ENFORCE(filter.ndim() == 4,
"filter must be 4D tensor");
917 const int M = filter.dim32(0);
918 const int C = filter.dim32(1);
919 const int kernel_width = filter.dim32(2);
920 const int kernel_height = filter.dim32(3);
922 CAFFE_ENFORCE(input_channels == M,
"filter number must be equal to input channel number");
923 CAFFE_ENFORCE(filter.dim32(2) == kernel_h(),
"filter height must be equal to kernel height");
924 CAFFE_ENFORCE(filter.dim32(3) == kernel_w(),
"filter width must be equal to kernel width");
925 CAFFE_ENFORCE(bias.ndim() == 1,
"bias must be 1D tensor");
926 CAFFE_ENFORCE(bias.dim32(0) == C,
"bias dimension must be equal to output channel number");
930 const int output_channels = C;
931 computeOutputHW(
this, input_height, input_width, &output_height, &output_width);
934 const float* prelu_scale =
nullptr;
935 int prelu_scale_size = 0;
937 auto& prelu = Input(PRELU);
938 prelu_scale = prelu.template data<float>();
939 prelu_scale_size = prelu.size();
940 }
else if (fuseRelu) {
942 prelu_scale_size = 1;
945 const int input_tile_x = input.tile_x(), input_tile_y = input.tile_y();
946 int output_tile_x = 1, output_tile_y = 1;
947 int input_tiles = input_tile_x * input_tile_y, output_tiles = 1;
948 int input_tile_chunk_size = 1, output_tile_chunk_size = 1,
949 input_tile_batch_size = 1, output_tile_batch_size = 1;
951 const bool tiling = GetSingleArgument<int>(
"tiling", input_tile_x > 1 || input_tile_y > 1);
955 CAFFE_ENFORCE_EQ(input.slices(), 1,
"Input needs to be tiled in a single texture");
956 computeOutputTiles(output_channels, output_tile_x, output_tile_y);
957 output_tiles = output_tile_x * output_tile_y;
// Same chunk/batch balancing as the forward-conv operator above.
959 output_tile_chunk_size = computeOutputTileChunkSize(
960 output_tile_x, output_tile_y, kernel_width, kernel_height);
961 output_tile_batch_size = std::max(
962 MaxOutputTileBatchSize,
963 (output_tiles + output_tile_chunk_size - 1) / output_tile_chunk_size);
964 output_tile_chunk_size = (output_tiles + output_tile_batch_size - 1) / output_tile_batch_size;
965 output_tile_batch_size = (output_tiles + output_tile_chunk_size - 1) / output_tile_chunk_size;
967 input_tile_chunk_size = computeInputTileChunkSize(
970 output_tile_chunk_size,
973 input_tile_batch_size = (input_tiles + input_tile_chunk_size - 1) / input_tile_chunk_size;
977 CAFFE_ENFORCE_GT(input_tile_chunk_size, 0);
978 CAFFE_ENFORCE_GT(output_tile_chunk_size, 0);
979 CAFFE_ENFORCE_LE(output_tile_batch_size, 8);
981 int is_last = GetSingleArgument<int>(
"is_last", 0);
995 {kernel_width, kernel_height},
996 {input_width, input_height},
997 {output_width, output_height},
998 {input_tile_x, input_tile_y},
999 {output_tile_x, output_tile_y},
1001 {stride_w(), stride_h()},
1005 int input_batch_size = 1, output_batch_size = 1;
1007 computeBatchSizes(geometry, input_batch_size, output_batch_size);
1009 GetSingleArgument<int>(
"input_batch_size", input_batch_size);
1010 output_batch_size = GetSingleArgument<int>(
"output_batch_size", output_batch_size);
1013 LOG(INFO) << input_channels <<
": " << input_height <<
" X " 1014 << input_width <<
" => " << output_channels <<
": " 1015 << output_height <<
" X " << output_width
1016 <<
" Kernel: " << kernel_width <<
"X" << kernel_height;
1019 LOG(INFO) <<
"Tiling: " << input_tile_x <<
" X " << input_tile_y
1020 <<
" => " << output_tile_x <<
" X " << output_tile_y
1021 <<
", Texture size: " << input_width * input_tile_x <<
" X " 1022 << input_height * input_tile_y <<
" => " 1023 << output_width * output_tile_x <<
" X " 1024 << output_height * output_tile_y
1025 <<
", Input tile batch size: " << input_tile_batch_size;
1027 LOG(INFO) <<
"input_batch_size = " << input_batch_size
1028 <<
", output_batch_size = " << output_batch_size;
1032 filter.template data<float>(),
1033 bias.template data<float>(),
1038 input.tile_x() * input.tile_y(),
1039 output->tile_x() * output->tile_y(),
1040 input_tile_chunk_size,
1041 output_tile_chunk_size,
1042 input_tile_batch_size,
1043 output_tile_batch_size,
1047 conv->convolution(input, *output);
1049 Outputs()[0]->Reset(output);
1055 std::unique_ptr<GLConvolution> conv;
1057 INPUT_TAGS(INPUT, FILTER, BIAS, PRELU);
1061 OPERATOR_SCHEMA(OpenGLConvTranspose).NumInputs(3).NumOutputs(1);
1064 OPERATOR_SCHEMA(OpenGLConvTransposePRelu).NumInputs(4).NumOutputs(1);
1067 OPERATOR_SCHEMA(OpenGLConvTransposeRelu).NumInputs(3).NumOutputs(1);
Workspace is a class that holds all the related objects created during runtime: (1) all blobs, and (2) all instantiated networks. It is the owner of all these objects and deals with the scaffolding logistics.
A global dictionary that holds information about what Caffe2 modules have been loaded in the current runtime, and also utility functions to load modules.