Caffe2 - C++ API
A deep learning, cross-platform ML framework
GLSoftmax.cc
1 
2 #include "../core/GLFilter.h"
3 #include "../core/GLImage.h"
4 
5 #include "caffe2/core/timer.h"
6 #include <iostream>
7 #include <vector>
8 
9 class GLSoftmaxReduce : public GLFilter {
10  public:
11  binding* inputTileSize;
12  binding* outputSize;
13  binding* outputTileSize;
14  binding* tileSize;
15  binding* spatialTileSize;
16  binding* inputTileRange;
17  binding* inputData;
18  binding* maxData;
19  binding* sumData;
20 
21  const std::vector<binding*> input_bindings() {
22  std::vector<binding*> bindings({BINDING(inputTileSize),
23  BINDING(outputSize),
24  BINDING(outputTileSize),
25  BINDING(tileSize),
26  BINDING(spatialTileSize),
27  BINDING(inputTileRange),
28  BINDING(inputData),
29  BINDING(maxData),
30  BINDING(sumData)});
31  return bindings;
32  }
33 
35  bool compute_sum_ = false,
36  bool tiled = false,
37  int input_tile_x = 1)
38  : GLFilter(
39  "GLSoftmaxReduce",
40  vertex_shader,
41  fragment_shader,
42  input_bindings(),
43  {/* no uniform_blocks_bindings */},
44  {/* no attributes */},
45  {{"COMPUTE_SUM", caffe2::to_string((int)compute_sum_)},
46  {"INPUT_TILE_X", caffe2::to_string(input_tile_x)},
47  {"TILED_SOFTMAX", caffe2::to_string(int(tiled))}}) {}
48 
49  template <typename T>
50  void reduce(const GLImage<T>* input_image,
51  const GLImage<T>* output_image,
52  int tile_size_x,
53  int tile_size_y);
54 
55  static const char* fragment_shader;
56 };
57 
58 // MARK: GLSL
59 
60 const char* GLSoftmaxReduce::fragment_shader = R"GLSL(#version 300 es
61 
62 #define TILED_SOFTMAX $(TILED_SOFTMAX)
63 #define INPUT_TILE_X $(INPUT_TILE_X)
64 // Compute sum or max
65 #define COMPUTE_SUM $(COMPUTE_SUM)
66 
67 precision highp float;
68 precision mediump int;
69 
70 in highp vec2 v_texCoord;
71 
72 uniform ivec2 inputTileSize;
73 uniform ivec2 outputSize;
74 uniform ivec2 outputTileSize;
75 uniform ivec2 spatialTileSize;
76 uniform ivec2 tileSize;
77 uniform ivec2 inputTileRange;
78 
79 TEXTURE_INPUT(inputData);
80 TEXTURE_OUTPUT(0, outputData);
81 
82 #if TILED_SOFTMAX
83 void main() {
84  ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
85  ivec2 tile = texelCoord / outputTileSize; // 2D output tile idx
86  ivec2 tileCoord = texelCoord % outputTileSize; // in-tile coordinates
87  ivec2 sumArea = min(spatialTileSize, inputTileSize - tileCoord * spatialTileSize);
88 
89  vec4 result = vec4(0.0);
90  for (int tileIdx = inputTileRange.x; tileIdx < inputTileRange.y; tileIdx++) {
91  int inTileX = tileIdx % INPUT_TILE_X;
92  int inTileY = tileIdx / INPUT_TILE_X;
93  ivec2 inputTileOffset = ivec2(inTileX, inTileY) * inputTileSize;
94  for (int y = 0; y < sumArea.y; y++) {
95  for (int x = 0; x < sumArea.x; x++) {
96  ivec2 idx = tileCoord + ivec2(x, y);
97  vec4 val = TEXTURE_LOAD(inputData, inputTileOffset + idx);
98  #if COMPUTE_SUM
99  result += val;
100  #else
101  result = max(result, val);
102  #endif
103  }
104  }
105  }
106 
107  outputData = TEXTURE_STORE(result);
108 }
109 #else
110 void main() {
111  ivec2 outputCoord = ivec2(v_texCoord * vec2(outputTileSize));
112  ivec2 texelCoord = outputCoord * spatialTileSize;
113  ivec2 sumArea = min(spatialTileSize, inputTileSize - texelCoord);
114  vec4 result = vec4(0.0);
115 
116  for (int y = 0; y < sumArea.y; y++) {
117  for (int x = 0; x < sumArea.x; x++) {
118  ivec2 idx = texelCoord + ivec2(x, y);
119  vec4 val = TEXTURE_LOAD(inputData, idx);
120 #if COMPUTE_SUM
121  result += val;
122 #else
123  result = max(result, val);
124 #endif
125  }
126  }
127 
128  outputData = TEXTURE_STORE(result);
129 }
130 #endif
131 )GLSL";
132 
133 template <typename T>
134 void GLSoftmaxReduce::reduce(const GLImage<T>* input_image,
135  const GLImage<T>* output_image,
136  int tile_size_x,
137  int tile_size_y) {
138  int input_slices = input_image->slices;
139  int output_slices = output_image->slices;
140 
141  for (int is = 0; is < input_slices; is++) {
142  std::vector<texture_attachment> input_attachments({{input_image->textures[is], inputData}});
143  run(input_attachments,
144  {output_image->textures.begin() + is,
145  output_image->textures.begin() + is + 1},
146  [&]() {
147  glUniform2i(
148  inputTileSize->location, input_image->width, input_image->height);
149  glUniform2i(
150  outputSize->location,
151  output_image->texture_width,
152  output_image->texture_height);
153  glUniform2i(
154  outputTileSize->location,
155  output_image->width,
156  output_image->height);
157  glUniform2i(
158  tileSize->location, input_image->tile_x, input_image->tile_y);
159  glUniform2i(spatialTileSize->location, tile_size_x, tile_size_y);
160  glUniform2i(
161  inputTileRange->location,
162  0,
163  std::min(
164  (input_image->channels + 3) / 4,
165  input_image->tile_x * input_image->tile_y));
166  },
167  output_image->texture_width,
168  output_image->texture_height);
169  }
170 }
171 
172 class GLSoftmaxScale : public GLFilter {
173  public:
174  binding* outputSize;
175  binding* inputData;
176  binding* maxData;
177  binding* sumData;
178 
179  const std::vector<binding*> input_bindings() {
180  std::vector<binding*> bindings(
181  {BINDING(outputSize), BINDING(inputData), BINDING(maxData), BINDING(sumData)});
182  return bindings;
183  }
184 
185  GLSoftmaxScale(bool _compute_exp = false, bool tiled = false)
186  : GLFilter(
187  "GLSoftmaxScale",
188  vertex_shader,
189  fragment_shader,
190  input_bindings(),
191  {/* no uniform blocks */},
192  {/* no attributes */},
193  {{"COMPUTE_EXP", caffe2::to_string((int)_compute_exp)},
194  {"TILED_SOFTMAX", caffe2::to_string((int)tiled)}}) {}
195 
196  template <typename T>
197  void scale(const GLImage<T>* input_image,
198  const GLImage<T>* max_image,
199  const GLImage<T>* sum_image,
200  const GLImage<T>* output_image);
201 
202  static const char* fragment_shader;
203 };
204 
205 template <typename T>
206 void GLSoftmaxScale::scale(const GLImage<T>* input_image,
207  const GLImage<T>* max_image,
208  const GLImage<T>* sum_image,
209  const GLImage<T>* output_image) {
210  int input_slices = input_image->slices;
211  int output_slices = output_image->slices;
212 
213  for (int is = 0; is < input_slices; is++) {
214  std::vector<texture_attachment> input_attachments({{input_image->textures[is], inputData},
215  {max_image->textures[is], maxData},
216  {sum_image->textures[is], sumData}});
217  run(input_attachments,
218  {output_image->textures.begin() + is,
219  output_image->textures.begin() + is + 1},
220  [&]() {
221  glUniform2i(
222  outputSize->location,
223  output_image->texture_width,
224  output_image->texture_height);
225  },
226  output_image->texture_width,
227  output_image->texture_height);
228  }
229 }
230 
231 // MARK: GLSL
232 
// Fragment shader for GLSoftmaxScale.
//
// Reads one texel `val` of inputData at the output coordinate and writes:
//   - COMPUTE_EXP != 0: exp(val - maxVal), maxVal taken from texel (0, 0) of maxData;
//   - COMPUTE_EXP == 0: val / sumVal,      sumVal taken from texel (0, 0) of sumData.
// With TILED_SOFTMAX != 0 the four components of maxVal / sumVal are first
// collapsed to a single scalar (max resp. sum of x, y, z, w) that is then
// broadcast to all four components.
//
// The $(NAME) placeholders are presumably substituted by GLFilter from the
// replacement pairs passed by the GLSoftmaxScale constructor -- confirm in GLFilter.
233 const char* GLSoftmaxScale::fragment_shader = R"GLSL(#version 300 es
234 
235 #define COMPUTE_EXP $(COMPUTE_EXP)
236 #define TILED_SOFTMAX $(TILED_SOFTMAX)
237 
238 precision highp float;
239 precision mediump int;
240 
241 in highp vec2 v_texCoord;
242 uniform ivec2 outputSize;
243 
244 TEXTURE_INPUT(inputData);
245 TEXTURE_INPUT(maxData);
246 TEXTURE_INPUT(sumData);
247 TEXTURE_OUTPUT(0, outputData);
248 
249 void main() {
250  ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
251  vec4 val = TEXTURE_LOAD(inputData, texelCoord);
252 #if COMPUTE_EXP
253  vec4 maxVal = TEXTURE_LOAD(maxData, ivec2(0));
254  #if TILED_SOFTMAX
255  float singleMax = max(max(max(maxVal.x, maxVal.y), maxVal.z), maxVal.w);
256  maxVal = vec4(singleMax, singleMax, singleMax, singleMax);
257  outputData = TEXTURE_STORE(exp(val - maxVal));
258  #else
259  outputData = TEXTURE_STORE(exp(val - maxVal));
260  #endif
261 
262 #else
263  vec4 sumVal = TEXTURE_LOAD(sumData, ivec2(0));
264  #if TILED_SOFTMAX
265  float singleSum = sumVal.x + sumVal.y + sumVal.z + sumVal.w;
266  sumVal = vec4(singleSum, singleSum, singleSum, singleSum);
267  outputData = TEXTURE_STORE(val / sumVal);
268  #else
269  outputData = TEXTURE_STORE(val / sumVal);
270  #endif
271 #endif
272 
273 }
274 )GLSL";
275 
276 #include "../core/ImageAllocator.h"
277 #include "caffe2/core/operator.h"
278 
279 #ifndef CAFFE2_MOBILE
280 #error "Caffe2 mobile state not defined"
281 #endif
282 
283 #if CAFFE2_MOBILE
284 
285 namespace caffe2 {
286 template <class T>
287 class OpenGLSoftmax final : public Operator<CPUContext>, ImageAllocator<T> {
288  public:
289  OpenGLSoftmax(const OperatorDef& operator_def, Workspace* ws)
290  : Operator<CPUContext>(operator_def, ws),
291  order_(StringToStorageOrder(OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {
292  OPERATOR_NEEDS_FEATURE(this->order_ == StorageOrder::NCHW, "OpenGL only supports NCHW order.");
293  }
294 
295  bool RunOnDevice() override {
296  const GLImageVector<T>& input = Inputs()[INPUT]->template Get<GLImageVector<T>>();
297  const int num_images = input.size();
298  const int input_channels = input.channels();
299  const int input_width = input.width();
300  const int input_height = input.height();
301 
302  const int output_channels = input_channels;
303  const int output_width = input_width;
304  const int output_height = input_height;
305 
306  int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
307  // For tiling
308  const int input_tile_x = input.tile_x(), input_tile_y = input.tile_y();
309  const int output_tile_x = input_tile_x, output_tile_y = input_tile_y;
310  const bool tiled = input_tile_x > 1 || input_tile_y > 1;
311  if (tiled) {
312  CAFFE_ENFORCE_EQ(
313  input.slices(), 1, "Input needs to be tiled in a single texture");
314  }
315 
316  CAFFE_ENFORCE(
317  tiled || input_channels == 1,
318  "Softmax only works for input_channel == 1 or input_channel > 1 with tiling enabled.");
319 
320  // for spatial dimension
321  const int tile_size_x = 16;
322  const int tile_size_y = 16;
323 
324  int max_buf_width = input_width;
325  int max_buf_height = input_height;
326  int max_buf_channels = input_channels;
327  vector<GLImageVector<T>*> reduce_buf;
328 
329  while (reduce_buf.size() == 0 || (max_buf_height > tile_size_y)) {
330  max_buf_width = (max_buf_width + tile_size_x - 1) / tile_size_x;
331  max_buf_height = (max_buf_height + tile_size_y - 1) / tile_size_y;
332  if (tiled) {
333  // since we are summing over all the channels within a channel tile
334  max_buf_channels =
335  (max_buf_channels + input_tile_x * input_tile_y - 1) /
336  (input_tile_x + input_tile_y);
337  }
338  reduce_buf.push_back(ImageAllocator<T>::newImage(
339  1,
340  max_buf_width,
341  max_buf_height,
342  max_buf_channels,
343  output_tile_x,
344  output_tile_y));
345  }
346 
347  GLImageVector<T>* max = ImageAllocator<T>::newImage(num_images, 1, 1, 1);
348  GLImageVector<T>* sum = ImageAllocator<T>::newImage(num_images, 1, 1, 1);
349  GLImageVector<T>* after_exp = ImageAllocator<T>::newImage(
350  num_images,
351  output_width,
352  output_height,
353  output_channels,
354  output_tile_x,
355  output_tile_y);
356  GLImageVector<T>* output_images = ImageAllocator<T>::newImage(
357  num_images,
358  output_width,
359  output_height,
360  output_channels,
361  output_tile_x,
362  output_tile_y,
363  is_last);
364 
365  if (!f_max) {
366  f_max.reset(new GLSoftmaxReduce(false, tiled, input_tile_x));
367  f_exp.reset(new GLSoftmaxScale(true, tiled));
368  f_sum.reset(new GLSoftmaxReduce(true, tiled, input_tile_x));
369  f_scale.reset(new GLSoftmaxScale(false, tiled));
370  }
371 
372  for (int i = 0; i < num_images; i++) {
373  auto input_image = input[i];
374  auto max_image = (*max)[i];
375  auto sum_image = (*sum)[i];
376  auto after_exp_image = (*after_exp)[i];
377  auto output_image = (*output_images)[i];
378  // Get Max
379  for (int ir = 0; ir < reduce_buf.size() + 1; ir++) {
380  const GLImage<T>* in = ir == 0 ? input_image : (*reduce_buf[ir - 1])[0];
381  GLImage<T>* out = ir == reduce_buf.size() ? max_image : (*reduce_buf[ir])[0];
382 
383  const int running_tile_size_x =
384  ir < reduce_buf.size() ? tile_size_x : in->width;
385  const int running_tile_size_y =
386  ir < reduce_buf.size() ? tile_size_y : in->height;
387  f_max->reduce(in, out, running_tile_size_x, running_tile_size_y);
388  }
389  // scale vals by exp(x - max)
390  f_exp->scale(input_image, max_image, sum_image, after_exp_image);
391 
392  // Get sum of the exp
393  for (int ir = 0; ir < reduce_buf.size() + 1; ir++) {
394  const GLImage<T>* in = ir == 0 ? after_exp_image : (*reduce_buf[ir - 1])[0];
395  GLImage<T>* out = ir == reduce_buf.size() ? sum_image : (*reduce_buf[ir])[0];
396  const int running_tile_size_x = ir < reduce_buf.size() ? tile_size_x : in->width;
397  const int running_tile_size_y = ir < reduce_buf.size() ? tile_size_y : in->height;
398  f_sum->reduce(in, out, running_tile_size_x, running_tile_size_y);
399  }
400 
401  // Scale(softmax)
402  f_scale->scale(after_exp_image, max_image, sum_image, output_image);
403  }
404 
405  Outputs()[OUTPUT]->Reset(output_images);
406 
407  delete sum;
408  delete max;
409  delete after_exp;
410  for (auto&& rb : reduce_buf) {
411  delete rb;
412  }
413  return true;
414  }
415 
416  private:
417  StorageOrder order_;
418  std::unique_ptr<GLSoftmaxReduce> f_max;
419  std::unique_ptr<GLSoftmaxScale> f_exp;
420  std::unique_ptr<GLSoftmaxReduce> f_sum;
421  std::unique_ptr<GLSoftmaxScale> f_scale;
422 
423  INPUT_TAGS(INPUT, FILTER, BIAS);
424  OUTPUT_TAGS(OUTPUT);
425 };
426 
// Registers the float16_t instantiation of OpenGLSoftmax as the CPU-context
// operator named "OpenGLSoftmax".
427 REGISTER_CPU_OPERATOR(OpenGLSoftmax, OpenGLSoftmax<float16_t>);
// Schema: exactly one input and one output; output 0 is allowed to be in-place
// with input 0, and the output is declared to have the input's type and shape.
428 OPERATOR_SCHEMA(OpenGLSoftmax)
429  .NumInputs(1)
430  .NumOutputs(1)
431  .AllowInplace({{0, 0}})
432  .IdenticalTypeAndShape();
433 } // namespace caffe2
434 #endif // CAFFE2_MOBILE
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...