Caffe2 - C++ API
A deep learning, cross-platform ML framework
GLInstanceNorm.cc
1 
2 #include "../core/GLFilter.h"
3 #include "../core/GLImage.h"
4 #include "../core/ImageAllocator.h"
5 
6 #include "caffe2/core/operator.h"
7 #include "caffe2/core/timer.h"
8 #include <iostream>
9 #include <vector>
10 
11 class GLReduce : public GLFilter {
12  public:
13  binding* inputSize;
14  binding* outputSize;
15  binding* tileSize;
16  binding* inv_pixel_count;
17  binding* epsilon;
18  binding* inputData;
19  binding* averageData;
20 
21  bool compute_inv_stdev;
22  bool compute_norm;
23 
24  const std::vector<binding*> input_bindings(bool compute_norm_) {
25  std::vector<binding*> bindings({BINDING(inputSize),
26  BINDING(outputSize),
27  BINDING(tileSize),
28  BINDING(inv_pixel_count),
29  BINDING(epsilon),
30  BINDING(inputData)});
31  if (compute_norm_) {
32  bindings.push_back(BINDING(averageData));
33  }
34  return bindings;
35  }
36 
37  GLReduce(bool compute_inv_stdev_ = false, bool compute_norm_ = false)
38  : GLFilter("GLReduce",
39  vertex_shader,
40  fragment_shader,
41  input_bindings(compute_norm_),
42  {/* no uniform_blocks_bindings */},
43  {/* no attributes */},
44  {{"COMPUTE_INV_STDEV", caffe2::to_string((int)compute_inv_stdev_)},
45  {"COMPUTE_NORM", caffe2::to_string((int)compute_norm_)}}),
46  compute_inv_stdev(compute_inv_stdev_),
47  compute_norm(compute_norm_) {}
48 
49  template <typename T>
50  void reduce(const GLImage<T>* input_image,
51  const GLImage<T>* output_image,
52  int tile_size_x,
53  int tile_size_y,
54  float inv_pixel_count_ = 1.0,
55  float epsilon_ = 0.0);
56 
57  template <typename T>
58  void norm(const GLImage<T>* input_image,
59  const GLImage<T>* avg_image,
60  const GLImage<T>* output_image,
61  int tile_size_x,
62  int tile_size_y,
63  float inv_pixel_count_);
64 
65  static const char* fragment_shader;
66 };
67 
68 // MARK: GLSL
69 
// GLSL ES 3.00 fragment shader source for GLReduce.
//
// Each output texel covers a tileSize window of inputData starting at
// outputCoord * tileSize; the window is clamped with min() so it never reads
// past inputSize. The accumulated sum is multiplied by inv_pixel_count.
//   COMPUTE_NORM      : subtracts the texel at (0, 0) of averageData from every
//                       sample and accumulates the squared difference.
//   COMPUTE_INV_STDEV : writes inversesqrt(sum * inv_pixel_count + epsilon).
//   neither           : writes sum * inv_pixel_count + epsilon.
// The $(...) tokens are placeholders matching the replacement names passed to
// the GLFilter constructor; TEXTURE_INPUT / TEXTURE_OUTPUT / TEXTURE_LOAD /
// TEXTURE_STORE are not defined in this file (presumably injected by GLFilter
// -- confirm there).
70 const char* GLReduce::fragment_shader = R"GLSL(#version 300 es
71 
72 #define COMPUTE_INV_STDEV $(COMPUTE_INV_STDEV)
73 #define COMPUTE_NORM $(COMPUTE_NORM)
74 
75 precision mediump float;
76 precision mediump int;
77 
78 in highp vec2 v_texCoord;
79 
80 uniform ivec2 inputSize;
81 uniform ivec2 outputSize;
82 uniform ivec2 tileSize;
83 uniform float inv_pixel_count;
84 uniform float epsilon;
85 
86 #if COMPUTE_NORM
87 TEXTURE_INPUT(averageData);
88 #endif
89 
90 TEXTURE_INPUT(inputData);
91 TEXTURE_OUTPUT(0, outputData);
92 
93 void main() {
94  ivec2 outputCoord = ivec2(v_texCoord * vec2(outputSize));
95  ivec2 texelCoord = outputCoord * tileSize;
96  ivec2 sumArea = min(tileSize, inputSize - texelCoord);
97  highp vec4 sum = vec4(0.0);
98 
99 #if COMPUTE_NORM
100  vec4 avg = TEXTURE_LOAD(averageData, ivec2(0));
101 #endif
102 
103  for (int y = 0; y < sumArea.y; y++) {
104  for (int x = 0; x < sumArea.x; x++) {
105  ivec2 idx = texelCoord + ivec2(x, y);
106  vec4 val = TEXTURE_LOAD(inputData, idx);
107 #if COMPUTE_NORM
108  val -= avg;
109  sum += val * val;
110 #else
111  sum += val;
112 #endif
113  }
114  }
115 
116 #if COMPUTE_INV_STDEV
117  outputData = TEXTURE_STORE(inversesqrt(sum * vec4(inv_pixel_count) + vec4(epsilon)));
118 #elif COMPUTE_NORM
119  outputData = TEXTURE_STORE(sum * vec4(inv_pixel_count));
120 #else
121  outputData = TEXTURE_STORE(sum * vec4(inv_pixel_count) + vec4(epsilon));
122 #endif
123 }
124 
125 )GLSL";
126 
127 template <typename T>
128 void GLReduce::reduce(const GLImage<T>* input_image,
129  const GLImage<T>* output_image,
130  int tile_size_x,
131  int tile_size_y,
132  float inv_pixel_count_,
133  float epsilon_) {
134  int input_slices = input_image->slices;
135  int output_slices = output_image->slices;
136 
137  for (int is = 0; is < input_slices; is++) {
138  std::vector<texture_attachment> input_attachments({{input_image->textures[is], inputData}});
139 
140  run(input_attachments,
141  {output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
142  [&]() {
143  glUniform2i(inputSize->location, input_image->width, input_image->height);
144  glUniform2i(outputSize->location, output_image->width, output_image->height);
145  glUniform2i(tileSize->location, tile_size_x, tile_size_y);
146  glUniform1f(inv_pixel_count->location, inv_pixel_count_);
147  glUniform1f(epsilon->location, epsilon_);
148  },
149  output_image->width,
150  output_image->height);
151  }
152 }
153 
154 template <typename T>
155 void GLReduce::norm(const GLImage<T>* input_image,
156  const GLImage<T>* avg_image,
157  const GLImage<T>* output_image,
158  int tile_size_x,
159  int tile_size_y,
160  float inv_pixel_count_) {
161  int input_slices = input_image->slices;
162  int output_slices = output_image->slices;
163 
164  for (int is = 0; is < input_slices; is++) {
165  std::vector<texture_attachment> input_attachments(
166  {{input_image->textures[is], inputData}, {avg_image->textures[is], averageData}});
167 
168  run(input_attachments,
169  {output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
170  [&]() {
171  glUniform2i(inputSize->location, input_image->width, input_image->height);
172  glUniform2i(outputSize->location, output_image->width, output_image->height);
173  glUniform2i(tileSize->location, tile_size_x, tile_size_y);
174  glUniform1f(inv_pixel_count->location, inv_pixel_count_);
175  },
176  output_image->width,
177  output_image->height);
178  }
179 }
180 
181 class GLScale : public GLFilter {
182  public:
183  binding* outputSize;
184  binding* inputData;
185  binding* averageData;
186  binding* normData;
187 
188  binding* scale_factor;
189  binding* bias_factor;
190  binding* prelu_scale_factor;
191 
192  const int channels;
193  const float* scale;
194  const float* bias;
195  const float* prelu_scale;
196  const int prelu_size;
197 
198  const std::vector<binding*> input_bindings(bool fuse_prelu) {
199  std::vector<binding*> bindings({BINDING(outputSize),
200  BINDING(scale_factor),
201  BINDING(bias_factor),
202  BINDING(inputData),
203  BINDING(averageData),
204  BINDING(normData)});
205  if (fuse_prelu) {
206  bindings.push_back(prelu_scale_factor = new binding({"prelu_scale_factor"}));
207  }
208  return bindings;
209  }
210 
211  GLScale(const int _channels,
212  const float* _scale,
213  const float* _bias,
214  const float* _prelu_scale = nullptr,
215  const int _prelu_size = 0)
216  : GLFilter("GLScale",
217  vertex_shader,
218  fragment_shader,
219  input_bindings(_prelu_scale != nullptr),
220  {/* no uniform blocks */},
221  {/* no attributes */},
222  {{"FUSE_PRELU", caffe2::to_string(_prelu_scale != nullptr)}}),
223  channels(_channels),
224  scale(_scale),
225  bias(_bias),
226  prelu_scale(_prelu_scale),
227  prelu_size(_prelu_size) {}
228 
229  template <typename T>
230  void scale_and_shift(const GLImage<T>* input_image,
231  const GLImage<T>* avg_image,
232  const GLImage<T>* norm_image,
233  const GLImage<T>* output_image);
234 
235  static const char* fragment_shader;
236 };
237 
238 // MARK: GLSL
239 
// GLSL ES 3.00 fragment shader source for GLScale.
//
// For every output texel it loads the input value, the texel at (0, 0) of
// averageData and the texel at (0, 0) of normData, and computes
//   (val - avg) * inv_stdev * scale_factor + bias_factor.
// With FUSE_PRELU enabled, components that are not greater than zero are
// additionally multiplied by prelu_scale_factor (selected via mix() with a
// greaterThan() mask); positive components pass through unchanged.
// $(FUSE_PRELU) is a placeholder matching the replacement name passed to the
// GLFilter constructor; the TEXTURE_* macros are not defined in this file
// (presumably injected by GLFilter -- confirm there).
240 const char* GLScale::fragment_shader = R"GLSL(#version 300 es
241 
242 #define FUSE_PRELU $(FUSE_PRELU)
243 
244 precision mediump float;
245 precision mediump int;
246 
247 in highp vec2 v_texCoord;
248 uniform ivec2 outputSize;
249 uniform vec4 scale_factor;
250 uniform vec4 bias_factor;
251 
252 #if FUSE_PRELU
253 uniform vec4 prelu_scale_factor;
254 #endif
255 
256 TEXTURE_INPUT(inputData);
257 TEXTURE_INPUT(averageData);
258 TEXTURE_INPUT(normData);
259 TEXTURE_OUTPUT(0, outputData);
260 
261 void main() {
262  ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize));
263 
264  vec4 val = TEXTURE_LOAD(inputData, texelCoord);
265  vec4 avg = TEXTURE_LOAD(averageData, ivec2(0));
266  vec4 inv_stdev = TEXTURE_LOAD(normData, ivec2(0));
267 
268 #if FUSE_PRELU
269  vec4 result = (val - avg) * inv_stdev * scale_factor + bias_factor;
270  vec4 o = mix(result * prelu_scale_factor, result, vec4(greaterThan(result, vec4(0))));
271  outputData = TEXTURE_STORE(o);
272 #else
273  vec4 o = (val - avg) * inv_stdev * scale_factor + bias_factor;
274  outputData = TEXTURE_STORE(o);
275 #endif
276 }
277 
278 )GLSL";
279 
280 template <typename T>
281 void GLScale::scale_and_shift(const GLImage<T>* input_image,
282  const GLImage<T>* avg_image,
283  const GLImage<T>* norm_image,
284  const GLImage<T>* output_image) {
285  int input_slices = input_image->slices;
286  int output_slices = output_image->slices;
287 
288  for (int is = 0; is < input_slices; is++) {
289  std::vector<texture_attachment> input_attachments({{input_image->textures[is], inputData},
290  {avg_image->textures[is], averageData},
291  {norm_image->textures[is], normData}});
292 
293  run(input_attachments,
294  {output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
295  [&]() {
296  glUniform2i(outputSize->location, output_image->width, output_image->height);
297  glUniform4f(scale_factor->location,
298  scale[4 * is],
299  channels > 4 * is + 1 ? scale[4 * is + 1] : 0,
300  channels > 4 * is + 2 ? scale[4 * is + 2] : 0,
301  channels > 4 * is + 3 ? scale[4 * is + 3] : 0);
302  glUniform4f(bias_factor->location,
303  bias[4 * is],
304  channels > 4 * is + 1 ? bias[4 * is + 1] : 0,
305  channels > 4 * is + 2 ? bias[4 * is + 2] : 0,
306  channels > 4 * is + 3 ? bias[4 * is + 3] : 0);
307  if (prelu_scale != nullptr) {
308  glUniform4f(prelu_scale_factor->location,
309  prelu_size == channels ? prelu_scale[4 * is] : prelu_scale[0],
310  channels > 4 * is + 1 && prelu_size == channels ? prelu_scale[4 * is + 1]
311  : prelu_scale[0],
312  channels > 4 * is + 2 && prelu_size == channels ? prelu_scale[4 * is + 2]
313  : prelu_scale[0],
314  channels > 4 * is + 3 && prelu_size == channels ? prelu_scale[4 * is + 3]
315  : prelu_scale[0]);
316  }
317  },
318  output_image->width,
319  output_image->height);
320  }
321 }
322 
323 namespace caffe2 {
324 template <class T, bool FUSE_PRELU>
325 class OpenGLInstanceNormPReluOp final : public Operator<CPUContext>, ImageAllocator<T> {
326  public:
327  OpenGLInstanceNormPReluOp(const OperatorDef& operator_def, Workspace* ws)
328  : Operator<CPUContext>(operator_def, ws),
329  epsilon_(OperatorBase::GetSingleArgument<float>("epsilon", 1e-5)),
330  order_(StringToStorageOrder(OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {
331  CAFFE_ENFORCE(epsilon_ >= 0, "Must pass a nonnegative epsilon.");
332  OPERATOR_NEEDS_FEATURE(this->order_ == StorageOrder::NCHW, "Metal only supports NCHW order.");
333  }
334 
335  bool RunOnDevice() override {
336  const GLImageVector<T>& input = Inputs()[INPUT]->template Get<GLImageVector<T>>();
337  const int num_images = input.size();
338  const int input_channels = input.channels();
339  const int input_width = input.width();
340  const int input_height = input.height();
341 
342  const int output_channels = input_channels;
343  const int output_width = input_width;
344  const int output_height = input_height;
345 
346  int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
347 
348  const int tile_size_x = 16;
349  const int tile_size_y = 16;
350  int avg_buf_width = input_width;
351  int avg_buf_height = input_height;
352 
353  vector<GLImageVector<T>*> reduce_buf;
354  while (reduce_buf.size() == 0 ||
355  (avg_buf_width > tile_size_x && avg_buf_height > tile_size_y)) {
356  avg_buf_width = (avg_buf_width + tile_size_x - 1) / tile_size_x;
357  avg_buf_height = (avg_buf_height + tile_size_y - 1) / tile_size_y;
358 
359  reduce_buf.push_back(
360  ImageAllocator<T>::newImage(1, avg_buf_width, avg_buf_height, output_channels));
361  }
362 
363  GLImageVector<T>* avg = ImageAllocator<T>::newImage(num_images, 1, 1, output_channels);
364  GLImageVector<T>* inv_stdev = ImageAllocator<T>::newImage(num_images, 1, 1, output_channels);
366  num_images, output_width, output_height, output_channels, is_last);
367  const float* prelu_data = nullptr;
368  int prelu_size = 0;
369  if (FUSE_PRELU) {
370  DCHECK_EQ(InputSize(), 4);
371  const auto& prelu_scale = Input(PRELU);
372  prelu_data = prelu_scale.template data<float>();
373  prelu_size = prelu_scale.size();
374  } else {
375  DCHECK_EQ(InputSize(), 3);
376  }
377 
378  const auto& scale = Input(SCALE);
379  const auto& bias = Input(BIAS);
380 
381  if (!f_reduce) {
382  f_reduce.reset(new GLReduce());
383  f_norm.reset(new GLReduce(false, true));
384  f_stdDev.reset(new GLReduce(true, false));
385  f_scale.reset(new GLScale(input_channels,
386  scale.template data<float>(),
387  bias.template data<float>(),
388  prelu_data,
389  prelu_size));
390  }
391 
392  for (int i = 0; i < num_images; i++) {
393  for (int k = 0; k < reduce_buf.size() + 1; k++) {
394  const GLImage<T>* in = k == 0 ? input[i] : (*reduce_buf[k - 1])[0];
395  GLImage<T>* out = k == reduce_buf.size() ? (*avg)[i] : (*reduce_buf[k])[0];
396 
397  float norm = k < reduce_buf.size()
398  ? 1.0 / (tile_size_x * tile_size_y)
399  : (float)pow(tile_size_x * tile_size_y, reduce_buf.size()) /
400  (float)(input_width * input_height);
401  const int running_tile_size_x = k < reduce_buf.size() ? tile_size_x : in->width;
402  const int running_tile_size_y = k < reduce_buf.size() ? tile_size_y : in->height;
403  f_reduce->reduce(in, out, running_tile_size_x, running_tile_size_y, norm);
404  }
405 
406  for (int k = 0; k < reduce_buf.size() + 1; k++) {
407  const GLImage<T>* in = k == 0 ? input[i] : (*reduce_buf[k - 1])[0];
408  GLImage<T>* out = k == reduce_buf.size() ? (*inv_stdev)[i] : (*reduce_buf[k])[0];
409 
410  float norm = k < reduce_buf.size()
411  ? 1.0 / (tile_size_x * tile_size_y)
412  : (float)pow(tile_size_x * tile_size_y, reduce_buf.size()) /
413  (float)(input_width * input_height);
414 
415  if (k == 0) {
416  f_norm->norm(in, (*avg)[i], out, tile_size_x, tile_size_y, norm);
417  } else if (k < reduce_buf.size()) {
418  f_reduce->reduce(in, out, tile_size_x, tile_size_y, norm);
419  } else {
420  const int running_tile_size_x = k < reduce_buf.size() ? tile_size_x : in->width;
421  const int running_tile_size_y = k < reduce_buf.size() ? tile_size_y : in->height;
422  f_stdDev->reduce(in, out, running_tile_size_x, running_tile_size_y, norm, epsilon_);
423  }
424  }
425 
426  f_scale->scale_and_shift(input[i], (*avg)[i], (*inv_stdev)[i], (*output)[i]);
427  }
428  Outputs()[OUTPUT]->Reset(output);
429  if (OutputSize() > 1) {
430  Outputs()[MEAN]->Reset(avg);
431  Outputs()[INV_STDEV]->Reset(inv_stdev);
432  } else {
433  delete avg;
434  delete inv_stdev;
435  }
436  for (auto&& rb : reduce_buf) {
437  delete rb;
438  }
439 
440  return true;
441  }
442 
443  private:
444  float epsilon_;
445  StorageOrder order_;
446  std::unique_ptr<GLReduce> f_reduce;
447  std::unique_ptr<GLReduce> f_norm;
448  std::unique_ptr<GLReduce> f_stdDev;
449  std::unique_ptr<GLScale> f_scale;
450 
451  INPUT_TAGS(INPUT, SCALE, BIAS, PRELU);
452  OUTPUT_TAGS(OUTPUT, MEAN, INV_STDEV);
453 };
454 
// Register both instantiations of the operator for float16_t images:
// OpenGLInstanceNorm (FUSE_PRELU = false) and OpenGLInstanceNormPRelu
// (FUSE_PRELU = true). Both schemas accept 3 to 4 inputs and 1 to 3 outputs and
// allow output 0 to be computed in place of input 0.
// NOTE(review): RunOnDevice DCHECKs exactly 3 inputs for the plain variant and
// exactly 4 for the PReLU variant, which is stricter than NumInputs(3, 4).
455 REGISTER_CPU_OPERATOR(OpenGLInstanceNorm, OpenGLInstanceNormPReluOp<float16_t, false>);
456 OPERATOR_SCHEMA(OpenGLInstanceNorm).NumInputs(3, 4).NumOutputs(1, 3).AllowInplace({{0, 0}});
457 REGISTER_CPU_OPERATOR(OpenGLInstanceNormPRelu, OpenGLInstanceNormPReluOp<float16_t, true>);
458 OPERATOR_SCHEMA(OpenGLInstanceNormPRelu).NumInputs(3, 4).NumOutputs(1, 3).AllowInplace({{0, 0}});
459 } // namespace caffe2
Workspace is a class that holds all the related objects created during runtime: (1) all blobs...
Definition: workspace.h:47
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...