Caffe2 - C++ API
A deep learning, cross-platform ML framework
GLPool.cc
1 
2 #include "../core/GLFilter.h"
3 #include "../core/GLImage.h"
4 #include "../core/ImageAllocator.h"
5 
6 #include "caffe2/core/timer.h"
7 #include "caffe2/operators/pool_op.h"
8 
9 class GLPool : public GLFilter {
10  public:
11  typedef enum { AveragePool, MaxPool } PoolType;
12 
13  struct point {
14  int x;
15  int y;
16  };
17 
18  struct descriptor {
19  int channels;
20  point kernel_size;
21  point input_padding;
22  point input_stride;
23  point input_tile_size;
24  point output_tile_size;
25  };
26 
27  binding* inputData;
28  binding* kernelSize;
29  binding* outputSize;
30 
31  const descriptor geometry;
32 
33  GLPool(const descriptor& _geometry, PoolType poolType, bool _tiling)
34  : GLFilter(
35  "GLPool",
36  vertex_shader,
37  fragment_shader,
38  {
39  BINDING(inputData),
40  BINDING(kernelSize),
41  BINDING(outputSize),
42  },
43  {/* no uniform blocks */},
44  {/* no attributes */},
45  {{"KERNEL_SIZE_X", caffe2::to_string(_geometry.kernel_size.x)},
46  {"KERNEL_SIZE_Y", caffe2::to_string(_geometry.kernel_size.y)},
47  {"INPUT_PADDING_X", caffe2::to_string(_geometry.input_padding.x)},
48  {"INPUT_PADDING_Y", caffe2::to_string(_geometry.input_padding.y)},
49  {"INPUT_STRIDE_X", caffe2::to_string(_geometry.input_stride.x)},
50  {"INPUT_STRIDE_Y", caffe2::to_string(_geometry.input_stride.y)},
51  {"INPUT_TILE_WIDTH",
52  caffe2::to_string(_geometry.input_tile_size.x)},
53  {"INPUT_TILE_HEIGHT",
54  caffe2::to_string(_geometry.input_tile_size.y)},
55  {"OUTPUT_TILE_WIDTH",
56  caffe2::to_string(_geometry.output_tile_size.x)},
57  {"OUTPUT_TILE_HEIGHT",
58  caffe2::to_string(_geometry.output_tile_size.y)},
59  {"TILED_POOLING", caffe2::to_string(_tiling)},
60  {"MAX_POOL", caffe2::to_string(poolType == MaxPool)},
61  {"BOUNDS_CHECK_MODE", caffe2::to_string(1)}}),
62  geometry(_geometry) {}
63  ~GLPool() {}
64 
65  void pool(const GLImageVector<float16_t>& input_images,
66  const GLImageVector<float16_t>& output_images) {
67  for (int i = 0; i < input_images.size(); i++) {
68  auto input_image = input_images[i];
69  auto output_image = output_images[i];
70  int input_slices = input_image->slices;
71  int output_slices = output_image->slices;
72 
73  for (int is = 0; is < input_slices; is++) {
74  run({{input_image->textures[is], inputData}},
75  {output_image->textures[is]},
76  [&]() {
77  glUniform2i(outputSize->location, output_image->texture_width, output_image->texture_height);
78  glUniform2i(kernelSize->location, geometry.kernel_size.x, geometry.kernel_size.y);
79  },
80  output_image->texture_width,
81  output_image->texture_height);
82  }
83  }
84  }
85 
86  private:
87  /*
88  * Computes BOUNDS_CHECK_MODE for the convolution parameters.
89  *
90  * @retval 0 if bounds check can be skipped
91  * @retval non-zero if bounds check can not be skipped
92  */
93  inline static int bounds_check_mode(bool tiling, const descriptor& geometry) {
94  if (tiling) {
95  return 1;
96  }
97 
98  if (GLContext::getGLContext()->GL_EXT_texture_border_clamp_defined() ||
99  (geometry.input_padding.x == 0 && geometry.input_padding.y == 0)) {
100  return 0;
101  } else {
102  return 1;
103  }
104  }
105 
106  static const char* fragment_shader;
107 };
108 
109 // MARK: GLSL
// GLSL ES 3.00 fragment shader source used by GLPool.
//
// The $(NAME) tokens correspond to the name/value pairs built in the GLPool
// constructor; presumably GLFilter substitutes them before compiling the
// shader (TODO confirm). TEXTURE_INPUT, TEXTURE_OUTPUT, TEXTURE_LOAD and
// TEXTURE_STORE are not defined inside this string and are presumably
// supplied by GLFilter as well.
//
// The preprocessor picks one of four POOL bodies from TILED_POOLING and
// MAX_POOL:
//   - max pooling starts from MIN_FLOAT and keeps the component-wise maximum
//     of every texel that passes IN_BOUNDS;
//   - average pooling sums the texels that pass IN_BOUNDS and divides by the
//     number of texels accumulated.
// In the tiled variant the output texel is split into a tile index and an
// in-tile coordinate, and the window is bounds-checked against one input tile;
// in the untiled variant it is bounds-checked against the whole input texture.
//
// The loops iterate over the `kernelSize` uniform; the compile-time
// `kernel_size` constant is declared but not referenced by the shader.
const char* GLPool::fragment_shader = R"GLSL(#version 300 es
#define TILED_POOLING $(TILED_POOLING)
#define MAX_POOL $(MAX_POOL)

// tiling
#define INPUT_TILE_WIDTH $(INPUT_TILE_WIDTH)
#define INPUT_TILE_HEIGHT $(INPUT_TILE_HEIGHT)
#define OUTPUT_TILE_WIDTH $(OUTPUT_TILE_WIDTH)
#define OUTPUT_TILE_HEIGHT $(OUTPUT_TILE_HEIGHT)

#define BOUNDS_CHECK_MODE $(BOUNDS_CHECK_MODE)

precision mediump float;
precision mediump int;

in highp vec2 v_texCoord;

const ivec2 input_padding = ivec2($(INPUT_PADDING_X), $(INPUT_PADDING_Y));
const ivec2 input_stride = ivec2($(INPUT_STRIDE_X), $(INPUT_STRIDE_Y));
const ivec2 kernel_size = ivec2($(KERNEL_SIZE_X), $(KERNEL_SIZE_Y));

uniform ivec2 kernelSize;
uniform ivec2 outputSize;

TEXTURE_INPUT(inputData);
TEXTURE_OUTPUT(0, outputData);

#if BOUNDS_CHECK_MODE == 0
 #define IN_BOUNDS(p, p0, p1) (true)
#else
 #define IN_BOUNDS(p, p0, p1) (all(greaterThanEqual(p, p0)) && all(lessThan(p, p1)))
#endif

// MIN_FLOAT is -2^14, which is the minimum precision requirement for mediump in OpenGL ES 3.0
const float MIN_FLOAT = -exp2(14.0);

#if TILED_POOLING

const ivec2 inputTileSize = ivec2(INPUT_TILE_WIDTH, INPUT_TILE_HEIGHT);
const ivec2 outputTileSize = ivec2(OUTPUT_TILE_WIDTH, OUTPUT_TILE_HEIGHT);

// tiled pooling
#if MAX_POOL

#define POOL { \
 pool = vec4(MIN_FLOAT); \
 for (int y = 0; y < kernelSize.y; y++) { \
 for (int x = 0; x < kernelSize.x; x++) { \
 ivec2 idx = tileCoord + ivec2(x, y); \
 if IN_BOUNDS(idx, ivec2(0), inputTileSize) { \
 vec4 data = TEXTURE_LOAD(inputData, inputTileOffset + idx); \
 pool = max(pool, data); \
 } \
 } \
 } \
}

#else

#define POOL { \
 int count = 0; \
 for (int y = 0; y < kernelSize.y; y++) { \
 for (int x = 0; x < kernelSize.x; x++) { \
 ivec2 idx = tileCoord + ivec2(x, y); \
 if IN_BOUNDS(idx, ivec2(0), inputTileSize) { \
 vec4 data = TEXTURE_LOAD(inputData, inputTileOffset + idx); \
 pool += data;\
 count += 1; \
 } \
 } \
 } \
 pool = pool / float(count); \
}

#endif // MAX_POOL

void main() {
 ivec2 inputSize = textureSize(inputData, 0);
 ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));

 ivec2 tile = texelCoord / outputTileSize; // 2D output tile idx
 ivec2 tileCoord = texelCoord % outputTileSize; // in-tile coordinates
 tileCoord = input_stride * tileCoord - input_padding;

 ivec2 inputTileOffset = tile * inputTileSize;

#if MAX_POOL
 vec4 pool = vec4(0);
#else
 highp vec4 pool = vec4(0);
#endif

 POOL;

 outputData = TEXTURE_STORE(pool);
}

#else

// no tiling
#if MAX_POOL

#define POOL { \
 pool = vec4(MIN_FLOAT); \
 for (int y = 0; y < kernelSize.y; y++) { \
 for (int x = 0; x < kernelSize.x; x++) { \
 ivec2 idx = texelCoord + ivec2(x, y); \
 if IN_BOUNDS(idx, ivec2(0), inputSize) { \
 vec4 data = TEXTURE_LOAD(inputData, idx); \
 pool = max(pool, data); \
 } \
 } \
 } \
}

#else

#define POOL { \
 int count = 0; \
 for (int y = 0; y < kernelSize.y; y++) { \
 for (int x = 0; x < kernelSize.x; x++) { \
 ivec2 idx = texelCoord + ivec2(x, y); \
 if IN_BOUNDS(idx, ivec2(0), inputSize) { \
 vec4 data = TEXTURE_LOAD(inputData, idx); \
 pool += data; \
 count += 1; \
 } \
 } \
 } \
 pool = pool / float(count); \
}

#endif // MAX_POOL

void main() {
 ivec2 inputSize = textureSize(inputData, 0);
 ivec2 texelCoord = input_stride * ivec2(v_texCoord * vec2(outputSize)) - input_padding;
#if MAX_POOL
 vec4 pool = vec4(0);
#else
 highp vec4 pool = vec4(0);
#endif

 POOL;

 outputData = TEXTURE_STORE(pool);
}
#endif // TILED_POOLING

)GLSL";
260 
261 namespace caffe2 {
262 
263 template <typename OPBase>
264 static void computeOutputHW(OPBase* op, int H, int W, int* OH, int* OW) {
265  Tensor<CPUContext> input, output;
266  input.Resize(1, 1, H, W);
267  op->SetOutputSize(input, &output, 1);
268  CAFFE_ENFORCE_EQ(output.ndim(), 4);
269  *OH = output.dim(2);
270  *OW = output.dim(3);
271 }
272 
273 template <typename T, GLPool::PoolType poolType>
274 class GLPoolOp final : public ConvPoolOpBase<CPUContext>, ImageAllocator<float16_t> {
275  public:
276  GLPoolOp(const OperatorDef& operator_def, Workspace* ws)
277  : ConvPoolOpBase<CPUContext>(operator_def, ws) {
278  OPERATOR_NEEDS_FEATURE(order_ == StorageOrder::NCHW, "OpenGL only supports NCHW order.");
279  CAFFE_ENFORCE(dilation_h() == 1 && dilation_w() == 1,
280  "Pooling op does not support dilation right now.");
281  if (!global_pooling_) {
282  CAFFE_ENFORCE(pad_t() < kernel_h() && pad_b() < kernel_h() && pad_l() < kernel_w() &&
283  pad_r() < kernel_w(),
284  "Pad should be smaller than kernel.");
285  }
286  }
287 
288  bool RunOnDeviceWithOrderNCHW() override {
289  const GLImageVector<T>& input = OperatorBase::Inputs()[0]->template Get<GLImageVector<T>>();
290  const int num_images = input.size();
291  const int input_channels = input.channels();
292  const int input_width = input.width();
293  const int input_height = input.height();
294 
295  int output_height;
296  int output_width;
297  const int output_channels = input_channels;
298 
299  computeOutputHW(this, input_height, input_width, &output_height, &output_width);
300 
301  int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
302 
303  const int input_tile_x = input.tile_x(), input_tile_y = input.tile_y();
304  const int output_tile_x = input_tile_x, output_tile_y = input_tile_y;
305 
307  num_images, output_width, output_height, output_channels, output_tile_x, output_tile_y, is_last);
308 
309  GLPool::descriptor geometry{input_channels,
310  {kernel_w(), kernel_h()},
311  {pad_l(), pad_t()},
312  {stride_w(), stride_h()},
313  {input_width, input_height},
314  {output_height, output_width}};
315 
316  if (!glPool_) {
317  LOG(INFO) << input_channels << ": " << input_height << " X " << input_width << " => " << output_channels << ": "
318  << output_height << " X " << output_width << " Kernel: " << kernel_w() << "X" << kernel_h()
319  << " Tiling: " << input_tile_x << "X" << input_tile_y;
320 
321  glPool_.reset(new GLPool(geometry, poolType, input_tile_x > 1 || input_tile_y > 1));
322  }
323 
324  glPool_->pool(input, *output);
325 
326  OperatorBase::Outputs()[0]->Reset(output);
327 
328  return true;
329  }
330 
331  private:
332  std::unique_ptr<GLPool> glPool_;
333 };
334 
335 namespace {
336 REGISTER_CPU_OPERATOR(OpenGLAveragePool, GLPoolOp<float16_t, GLPool::AveragePool>);
337 REGISTER_CPU_OPERATOR(OpenGLMaxPool, GLPoolOp<float16_t, GLPool::MaxPool>);
338 OPERATOR_SCHEMA(OpenGLAveragePool).NumInputs(1).NumOutputs(1);
339 OPERATOR_SCHEMA(OpenGLMaxPool).NumInputs(1).NumOutputs(1);
340 }; // namespace
341 }; // namespace caffe2
Workspace is a class that holds all the related objects created during runtime: (1) all blobs...
Definition: workspace.h:47
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...
Definition: GLPool.cc:9