2 #include "../core/GLFilter.h" 3 #include "../core/GLImage.h" 5 #include "caffe2/core/timer.h" 21 const std::vector<binding*> input_bindings() {
// Collect the reduce filter's named bindings into a single list. The names
// listed here match ivec2 uniforms declared in the reduce fragment shader.
22 std::vector<binding*> bindings({BINDING(inputTileSize),
24 BINDING(outputTileSize),
26 BINDING(spatialTileSize),
27 BINDING(inputTileRange),
// compute_sum_ defaults to false when the caller does not supply it.
35 bool compute_sum_ =
false,
// Replacement map for the fragment shader: each key matches a $(NAME)
// placeholder in the shader source, and the two boolean flags are converted
// to int before being stringified.
45 {{
"COMPUTE_SUM", caffe2::to_string((
int)compute_sum_)},
46 {
"INPUT_TILE_X", caffe2::to_string(input_tile_x)},
47 {
"TILED_SOFTMAX", caffe2::to_string(
int(tiled))}}) {}
55 static const char* fragment_shader;
60 const char* GLSoftmaxReduce::fragment_shader = R
"GLSL(#version 300 es 62 #define TILED_SOFTMAX $(TILED_SOFTMAX) 63 #define INPUT_TILE_X $(INPUT_TILE_X) 65 #define COMPUTE_SUM $(COMPUTE_SUM) 67 precision highp float; 68 precision mediump int; 70 in highp vec2 v_texCoord; 72 uniform ivec2 inputTileSize; 73 uniform ivec2 outputSize; 74 uniform ivec2 outputTileSize; 75 uniform ivec2 spatialTileSize; 76 uniform ivec2 tileSize; 77 uniform ivec2 inputTileRange; 79 TEXTURE_INPUT(inputData); 80 TEXTURE_OUTPUT(0, outputData); 84 ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize)); 85 ivec2 tile = texelCoord / outputTileSize; // 2D output tile idx 86 ivec2 tileCoord = texelCoord % outputTileSize; // in-tile coordinates 87 ivec2 sumArea = min(spatialTileSize, inputTileSize - tileCoord * spatialTileSize); 89 vec4 result = vec4(0.0); 90 for (int tileIdx = inputTileRange.x; tileIdx < inputTileRange.y; tileIdx++) { 91 int inTileX = tileIdx % INPUT_TILE_X; 92 int inTileY = tileIdx / INPUT_TILE_X; 93 ivec2 inputTileOffset = ivec2(inTileX, inTileY) * inputTileSize; 94 for (int y = 0; y < sumArea.y; y++) { 95 for (int x = 0; x < sumArea.x; x++) { 96 ivec2 idx = tileCoord + ivec2(x, y); 97 vec4 val = TEXTURE_LOAD(inputData, inputTileOffset + idx); 101 result = max(result, val); 107 outputData = TEXTURE_STORE(result); 111 ivec2 outputCoord = ivec2(v_texCoord * vec2(outputTileSize)); 112 ivec2 texelCoord = outputCoord * spatialTileSize; 113 ivec2 sumArea = min(spatialTileSize, inputTileSize - texelCoord); 114 vec4 result = vec4(0.0); 116 for (int y = 0; y < sumArea.y; y++) { 117 for (int x = 0; x < sumArea.x; x++) { 118 ivec2 idx = texelCoord + ivec2(x, y); 119 vec4 val = TEXTURE_LOAD(inputData, idx); 123 result = max(result, val); 128 outputData = TEXTURE_STORE(result); 133 template <
typename T>
// Runs the reduction filter once per input slice: texture `is` of input_image
// is attached as inputData and the result is rendered into texture `is` of
// output_image. The spatial tile extent comes from tile_size_x / tile_size_y.
// NOTE(review): the parameter list and some of the uniform-upload calls are
// only partially present here; confirm details against the full definition.
134 void GLSoftmaxReduce::reduce(
const GLImage<T>* input_image,
138 int input_slices = input_image->slices;
// NOTE(review): output_slices is not read by any statement shown below; the
// loop is bounded by input_slices only.
139 int output_slices = output_image->slices;
// One filter invocation per input slice.
141 for (
int is = 0; is < input_slices; is++) {
// Pair this slice's texture with the inputData binding.
142 std::vector<texture_attachment> input_attachments({{input_image->textures[is], inputData}});
// The output target is the one-element iterator range [begin + is, begin + is + 1).
143 run(input_attachments,
144 {output_image->textures.begin() + is,
145 output_image->textures.begin() + is + 1},
148 inputTileSize->location, input_image->width, input_image->height);
150 outputSize->location,
151 output_image->texture_width,
152 output_image->texture_height);
154 outputTileSize->location,
156 output_image->height);
158 tileSize->location, input_image->tile_x, input_image->tile_y);
// Upload the per-pass spatial tile extent as a two-integer uniform.
159 glUniform2i(spatialTileSize->location, tile_size_x, tile_size_y);
// (channels + 3) / 4 rounds the channel count up to whole groups of four and
// is listed next to tile_x * tile_y.
// NOTE(review): presumably the smaller of the two becomes the upper end of
// inputTileRange -- confirm against the full call.
161 inputTileRange->location,
164 (input_image->channels + 3) / 4,
165 input_image->tile_x * input_image->tile_y));
167 output_image->texture_width,
168 output_image->texture_height);
// Builds the list of four bindings named outputSize, inputData, maxData and
// sumData; the scale fragment shader declares outputSize as an ivec2 uniform
// and the other three as texture inputs.
179 const std::vector<binding*> input_bindings() {
180 std::vector<binding*> bindings(
181 {BINDING(outputSize), BINDING(inputData), BINDING(maxData), BINDING(sumData)});
// Replacement map for the scale fragment shader: the keys match its
// $(COMPUTE_EXP) and $(TILED_SOFTMAX) placeholders, and both flags are
// converted to int before being stringified.
193 {{
"COMPUTE_EXP", caffe2::to_string((
int)_compute_exp)},
194 {
"TILED_SOFTMAX", caffe2::to_string((
int)tiled)}}) {}
196 template <
typename T>
202 static const char* fragment_shader;
// Runs the scale filter once per input slice. For slice `is`, the matching
// textures of input_image, max_image and sum_image are attached as inputData,
// maxData and sumData, and the result is rendered into texture `is` of
// output_image.
// NOTE(review): the parameter list is only partially present here; confirm
// the remaining parameters against the full definition.
205 template <
typename T>
206 void GLSoftmaxScale::scale(
const GLImage<T>* input_image,
210 int input_slices = input_image->slices;
// NOTE(review): output_slices is not read by any statement shown below.
211 int output_slices = output_image->slices;
// One filter invocation per input slice.
213 for (
int is = 0; is < input_slices; is++) {
// Three inputs per pass: the data slice plus the same-index max and sum slices.
214 std::vector<texture_attachment> input_attachments({{input_image->textures[is], inputData},
215 {max_image->textures[is], maxData},
216 {sum_image->textures[is], sumData}});
// The output target is the one-element iterator range [begin + is, begin + is + 1).
217 run(input_attachments,
218 {output_image->textures.begin() + is,
219 output_image->textures.begin() + is + 1},
222 outputSize->location,
223 output_image->texture_width,
224 output_image->texture_height);
226 output_image->texture_width,
227 output_image->texture_height);
233 const char* GLSoftmaxScale::fragment_shader = R
"GLSL(#version 300 es 235 #define COMPUTE_EXP $(COMPUTE_EXP) 236 #define TILED_SOFTMAX $(TILED_SOFTMAX) 238 precision highp float; 239 precision mediump int; 241 in highp vec2 v_texCoord; 242 uniform ivec2 outputSize; 244 TEXTURE_INPUT(inputData); 245 TEXTURE_INPUT(maxData); 246 TEXTURE_INPUT(sumData); 247 TEXTURE_OUTPUT(0, outputData); 250 ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize)); 251 vec4 val = TEXTURE_LOAD(inputData, texelCoord); 253 vec4 maxVal = TEXTURE_LOAD(maxData, ivec2(0)); 255 float singleMax = max(max(max(maxVal.x, maxVal.y), maxVal.z), maxVal.w); 256 maxVal = vec4(singleMax, singleMax, singleMax, singleMax); 257 outputData = TEXTURE_STORE(exp(val - maxVal)); 259 outputData = TEXTURE_STORE(exp(val - maxVal)); 263 vec4 sumVal = TEXTURE_LOAD(sumData, ivec2(0)); 265 float singleSum = sumVal.x + sumVal.y + sumVal.z + sumVal.w; 266 sumVal = vec4(singleSum, singleSum, singleSum, singleSum); 267 outputData = TEXTURE_STORE(val / sumVal); 269 outputData = TEXTURE_STORE(val / sumVal); 276 #include "../core/ImageAllocator.h" 277 #include "caffe2/core/operator.h" 279 #ifndef CAFFE2_MOBILE 280 #error "Caffe2 mobile state not defined" 287 class OpenGLSoftmax final :
public Operator<CPUContext>, ImageAllocator<T> {
// Constructs the operator, reading the "order" argument (default "NCHW") into
// order_ and declaring that only StorageOrder::NCHW is supported.
289 OpenGLSoftmax(
const OperatorDef& operator_def, Workspace* ws)
290 : Operator<CPUContext>(operator_def, ws),
291 order_(StringToStorageOrder(OperatorBase::GetSingleArgument<string>(
"order",
"NCHW"))) {
292 OPERATOR_NEEDS_FEATURE(this->order_ == StorageOrder::NCHW,
"OpenGL only supports NCHW order.");
// Runs softmax over every image of the input GLImageVector. Per image the
// visible sequence is: a chain of f_max reduce passes ending in max_image, an
// f_exp scale pass producing after_exp_image, a chain of f_sum reduce passes
// ending in sum_image, and an f_scale scale pass producing output_image.
// NOTE(review): allocation of max / sum / after_exp / output_images and the
// construction of the four filters are not shown here.
295 bool RunOnDevice()
override {
// The input is fetched as a GLImageVector<T>; output dimensions and tiling
// below are copied straight from it.
296 const GLImageVector<T>& input = Inputs()[INPUT]->template Get<GLImageVector<T>>();
297 const int num_images = input.size();
298 const int input_channels = input.channels();
299 const int input_width = input.width();
300 const int input_height = input.height();
302 const int output_channels = input_channels;
303 const int output_width = input_width;
304 const int output_height = input_height;
// Optional integer argument "is_last"; defaults to 0 when absent.
306 int is_last = OperatorBase::GetSingleArgument<int>(
"is_last", 0);
308 const int input_tile_x = input.tile_x(), input_tile_y = input.tile_y();
309 const int output_tile_x = input_tile_x, output_tile_y = input_tile_y;
// Tiling counts as enabled when either tile dimension exceeds 1.
310 const bool tiled = input_tile_x > 1 || input_tile_y > 1;
313 input.slices(), 1,
"Input needs to be tiled in a single texture");
317 tiled || input_channels == 1,
318 "Softmax only works for input_channel == 1 or input_channel > 1 with tiling enabled.");
// Spatial tile extent used by every reduction pass except the last one.
321 const int tile_size_x = 16;
322 const int tile_size_y = 16;
// Running dimensions of the next scratch buffer, seeded from the input.
324 int max_buf_width = input_width;
325 int max_buf_height = input_height;
326 int max_buf_channels = input_channels;
327 vector<GLImageVector<T>*> reduce_buf;
// Build the chain of scratch buffers. Each iteration shrinks the running
// width and height by a round-up division by the tile size and appends one
// buffer. At least one buffer is always created; the loop continues only
// while the running height still exceeds tile_size_y (width is not tested).
329 while (reduce_buf.size() == 0 || (max_buf_height > tile_size_y)) {
330 max_buf_width = (max_buf_width + tile_size_x - 1) / tile_size_x;
331 max_buf_height = (max_buf_height + tile_size_y - 1) / tile_size_y;
// NOTE(review): the numerator adds input_tile_x * input_tile_y - 1, as a
// round-up division by the tile count would, but the divisor is the SUM
// input_tile_x + input_tile_y. Confirm whether the product was intended.
335 (max_buf_channels + input_tile_x * input_tile_y - 1) /
336 (input_tile_x + input_tile_y);
338 reduce_buf.push_back(ImageAllocator<T>::newImage(
// Per-image processing.
372 for (
int i = 0; i < num_images; i++) {
373 auto input_image = input[i];
374 auto max_image = (*max)[i];
375 auto sum_image = (*sum)[i];
376 auto after_exp_image = (*after_exp)[i];
377 auto output_image = (*output_images)[i];
// Max reduction chain: pass 0 reads input_image, pass k reads scratch buffer
// k - 1 (always its image 0). Passes before the last use the fixed tile size
// and write scratch buffer k; the last pass (ir == reduce_buf.size()) covers
// the whole remaining in->width x in->height area and writes max_image.
// NOTE(review): ir is an int compared against reduce_buf.size(), presumably an
// unsigned size type -- mixed-sign comparison.
379 for (
int ir = 0; ir < reduce_buf.size() + 1; ir++) {
380 const GLImage<T>* in = ir == 0 ? input_image : (*reduce_buf[ir - 1])[0];
381 GLImage<T>* out = ir == reduce_buf.size() ? max_image : (*reduce_buf[ir])[0];
383 const int running_tile_size_x =
384 ir < reduce_buf.size() ? tile_size_x : in->width;
385 const int running_tile_size_y =
386 ir < reduce_buf.size() ? tile_size_y : in->height;
387 f_max->reduce(in, out, running_tile_size_x, running_tile_size_y);
// Scale pass from input_image into after_exp_image with max_image and
// sum_image attached. NOTE(review): presumably the exp(value - max) branch of
// the scale shader; how f_exp is configured is not shown here.
390 f_exp->scale(input_image, max_image, sum_image, after_exp_image);
// Sum reduction chain: same buffer chain and tile-size rule as the max chain,
// but starting from after_exp_image and ending in sum_image.
393 for (
int ir = 0; ir < reduce_buf.size() + 1; ir++) {
394 const GLImage<T>* in = ir == 0 ? after_exp_image : (*reduce_buf[ir - 1])[0];
395 GLImage<T>* out = ir == reduce_buf.size() ? sum_image : (*reduce_buf[ir])[0];
396 const int running_tile_size_x = ir < reduce_buf.size() ? tile_size_x : in->width;
397 const int running_tile_size_y = ir < reduce_buf.size() ? tile_size_y : in->height;
398 f_sum->reduce(in, out, running_tile_size_x, running_tile_size_y);
// Final scale pass: reads after_exp_image (plus max_image and sum_image) and
// writes output_image.
402 f_scale->scale(after_exp_image, max_image, sum_image, output_image);
// Hand the output image vector to the OUTPUT slot.
405 Outputs()[OUTPUT]->Reset(output_images);
// Walk the scratch buffers. NOTE(review): the loop body is not shown here;
// presumably it releases each buffer -- confirm.
410 for (
auto&& rb : reduce_buf) {
// Filters used by RunOnDevice: f_max and f_sum are reduce filters, f_exp and
// f_scale are scale filters. Each is exclusively owned through unique_ptr.
418 std::unique_ptr<GLSoftmaxReduce> f_max;
419 std::unique_ptr<GLSoftmaxScale> f_exp;
420 std::unique_ptr<GLSoftmaxReduce> f_sum;
421 std::unique_ptr<GLSoftmaxScale> f_scale;
// NOTE(review): only the INPUT tag is referenced by the visible RunOnDevice
// code; FILTER and BIAS look unused for this operator -- confirm.
423 INPUT_TAGS(INPUT, FILTER, BIAS);
// Registers the float16_t instantiation of the operator under the name
// OpenGLSoftmax. The schema allows output 0 to be computed in place of input 0
// and declares the output as having the same type and shape as the input.
427 REGISTER_CPU_OPERATOR(OpenGLSoftmax, OpenGLSoftmax<float16_t>);
428 OPERATOR_SCHEMA(OpenGLSoftmax)
431 .AllowInplace({{0, 0}})
432 .IdenticalTypeAndShape();
434 #endif // CAFFE2_MOBILE
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...