Caffe2 - C++ API
A deep learning, cross-platform ML framework
GLPRelu.cc
1 
2 #include "../core/GLFilter.h"
3 #include "../core/GLImage.h"
4 #include "../core/ImageAllocator.h"
5 
6 #include "caffe2/core/operator.h"
7 #include "caffe2/core/timer.h"
8 #include <iostream>
9 #include <vector>
10 
11 class GLPRelu : public GLFilter {
12  public:
13  typedef enum { PRelu = 0, Relu = 1 } ReluType;
14 
15  const float* scale;
16 
17  binding* inputData;
18  binding* scale_block;
19 
20  const int scale_size;
21  const int channels;
22  const int output_tile_x;
23  const int output_tile_y;
24  const int output_tile_width;
25  const int output_tile_height;
26 
27  GLPRelu(
28  const float* _scale,
29  const int _scale_size,
30  const int _channels,
31  int _output_tile_x,
32  int _output_tile_y,
33  int _output_tile_width,
34  int _output_tile_height)
35  : GLFilter(
36  "GLPRelu",
37  vertex_shader,
38  fragment_shader,
39  std::vector<binding*>({BINDING(inputData)}),
40  std::vector<binding*>({BINDING(scale_block)}),
41  {/* no attributes */},
42  {{"USE_RELU", caffe2::to_string(PRelu)},
43  {"OUTPUT_TILES",
44  caffe2::to_string(_output_tile_x * _output_tile_y)},
45  {"OUTPUT_TILE_X", caffe2::to_string(_output_tile_x)},
46  {"OUTPUT_TILE_WIDTH", caffe2::to_string(_output_tile_width)},
47  {"OUTPUT_TILE_HEIGHT", caffe2::to_string(_output_tile_height)},
48  {"TILED_PRELU",
49  caffe2::to_string(_output_tile_x > 1 || _output_tile_y > 1)}}),
50  scale(_scale),
51  scale_size(_scale_size),
52  channels(_channels),
53  output_tile_x(_output_tile_x),
54  output_tile_y(_output_tile_y),
55  output_tile_width(_output_tile_width),
56  output_tile_height(_output_tile_height) {}
57 
58  GLPRelu(const int _channels)
59  : GLFilter("GLRelu",
60  vertex_shader,
61  fragment_shader,
62  std::vector<binding*>({BINDING(inputData)}),
63  {/* no uniform blocks */},
64  {/* no attributes */},
65  {{"USE_RELU", caffe2::to_string(Relu)},
66  {"OUTPUT_TILES", caffe2::to_string(1)},
67  {"OUTPUT_TILE_X", caffe2::to_string(1)},
68  {"OUTPUT_TILE_WIDTH", caffe2::to_string(1)},
69  {"OUTPUT_TILE_HEIGHT", caffe2::to_string(1)},
70  {"TILED_PRELU", caffe2::to_string(0)}}),
71  scale(nullptr),
72  scale_block(nullptr),
73  scale_size(0),
74  channels(_channels),
75  output_tile_x(1),
76  output_tile_y(1),
77  output_tile_width(1),
78  output_tile_height(1) {}
79 
80  template <typename T>
81  void prelu(const GLImageVector<T>& input_images,
82  const GLImageVector<T>& output_images,
83  GLPRelu::ReluType reluType);
84 
85  static const char* fragment_shader;
86 };
87 
88 // MARK: GLSL
89 
// Fragment shader source (GLSL ES 3.00) shared by the ReLU and PReLU filters.
//
// The $(NAME) placeholders match the replacement keys the GLPRelu
// constructors hand to GLFilter (USE_RELU, TILED_PRELU, OUTPUT_TILES,
// OUTPUT_TILE_X, OUTPUT_TILE_WIDTH, OUTPUT_TILE_HEIGHT).
// NOTE(review): TEXTURE_INPUT / TEXTURE_OUTPUT / TEXTURE_LOAD / TEXTURE_STORE
// are not defined in this string - presumably GLFilter injects them; confirm.
//
// Three mutually exclusive main() variants are selected by the preprocessor:
//   * USE_RELU:          out = max(value, 0)
//   * TILED_PRELU:       the slope is chosen per tile; each uvec4 of the
//                        uniform block packs the half-float slopes of two
//                        tiles (.xy for even tile indices, .zw for odd ones)
//   * otherwise (PReLU): a single uvec4 whose .xy holds four half-float slopes
// Both PReLU variants compute value > 0 ? value : value * slope through mix().
90 const char* GLPRelu::fragment_shader = R"GLSL(#version 300 es
91 #define TILED_PRELU $(TILED_PRELU)
92 #define USE_RELU $(USE_RELU)
93 
94 // tiling
95 #define OUTPUT_TILES $(OUTPUT_TILES)
96 #define OUTPUT_TILE_X $(OUTPUT_TILE_X)
97 #define OUTPUT_TILE_WIDTH $(OUTPUT_TILE_WIDTH)
98 #define OUTPUT_TILE_HEIGHT $(OUTPUT_TILE_HEIGHT)
99 
100 // common
101 precision mediump float;
102 precision highp int;
103 
104 TEXTURE_INPUT(inputData);
105 TEXTURE_OUTPUT(0, outputData);
106 
107 in highp vec2 v_texCoord;
108 
109 #if USE_RELU
110 
111 // Relu
112 void main() {
113  ivec2 inputSize = textureSize(inputData, 0);
114  ivec2 texelCoord = ivec2(v_texCoord * vec2(inputSize));
115  vec4 value = TEXTURE_LOAD(inputData, texelCoord);
116  outputData = TEXTURE_STORE(max(value, vec4(0.0)));
117 }
118 
119 #else
120 
121 #if TILED_PRELU
122 const ivec2 outputTileSize = ivec2(OUTPUT_TILE_WIDTH, OUTPUT_TILE_HEIGHT);
123 
124 layout (std140) uniform scale_block {
125  highp uvec4 scale[(OUTPUT_TILES + 1) / 2];
126 };
127 
128 void main() {
129  ivec2 inputSize = textureSize(inputData, 0);
130  ivec2 texelCoord = ivec2(v_texCoord * vec2(inputSize));
131 
132  ivec2 tile = texelCoord / outputTileSize; // 2D output tile idx
133  int tileNum = OUTPUT_TILE_X * tile.y + tile.x; // 1D output tile idx
134 
135  // outputData = value > 0 ? value : value * weight;
136  vec4 value = TEXTURE_LOAD(inputData, texelCoord);
137  vec4 preluValue = (tileNum % 2 == 0) ? unpackHalf4x16(scale[tileNum/2].xy) : unpackHalf4x16(scale[tileNum/2].zw);
138  value = mix(value * preluValue, value, vec4(greaterThan(value, vec4(0))));
139  outputData = TEXTURE_STORE(value);
140 }
141 #else
142 layout (std140) uniform scale_block {
143  highp uvec4 scale;
144 };
145 void main() {
146  ivec2 inputSize = textureSize(inputData, 0);
147  ivec2 texelCoord = ivec2(v_texCoord * vec2(inputSize));
148 
149  // outputData = value > 0 ? value : value * weight;
150  vec4 value = TEXTURE_LOAD(inputData, texelCoord);
151  value = mix(value * unpackHalf4x16(scale.xy), value, vec4(greaterThan(value, vec4(0))));
152  outputData = TEXTURE_STORE(value);
153 }
154 #endif // TILED_PRELU
155 
156 #endif // USE_RELU
157 
158 )GLSL";
159 
160 template <typename T>
161 void GLPRelu::prelu(const GLImageVector<T>& input_images,
162  const GLImageVector<T>& output_images,
163  GLPRelu::ReluType reluType) {
164  int num_images = input_images.size();
165  for (int i = 0; i < num_images; i++) {
166  GLImage<T>* input_image = input_images[i];
167  GLImage<T>* output_image = output_images[i];
168  int input_slices = input_image->slices;
169  int output_slices = output_image->slices;
170 
171  for (int is = 0; is < input_slices; is++) {
172  if (reluType == PRelu) {
173  attach_uniform_buffer<float16_t>(scale_block, 0, [&](float16_t* data, size_t size) {
174  int output_tiles = output_tile_x * output_tile_y;
175  for (int j = 0, k = 4 * is * output_tiles;
176  k < std::min(channels, 4 * (is + 1) * output_tiles);
177  j++, k++) {
178  data[j] = scale_size == channels ? scale[k] : scale[0];
179  }
180  });
181  }
182 
183  std::vector<texture_attachment> input_attachments;
184 
185  input_attachments.push_back({input_image->textures[is], inputData});
186 
187  run(input_attachments,
188  {output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
189  [&]() {},
190  output_image->texture_width,
191  output_image->texture_height);
192  }
193  }
194 }
195 
196 namespace caffe2 {
197 template <typename T, GLPRelu::ReluType reluType>
198 class OpenGLPReluOp final : public Operator<CPUContext>, ImageAllocator<T> {
199  public:
200  OpenGLPReluOp(const OperatorDef& operator_def, Workspace* ws)
201  : Operator<CPUContext>(operator_def, ws),
202  order_(StringToStorageOrder(OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {
203  OPERATOR_NEEDS_FEATURE(this->order_ == StorageOrder::NCHW, "OpenGL only supports NCHW order.");
204  }
205 
206  bool RunOnDevice() override {
207  const GLImageVector<T>& input = Inputs()[0]->template Get<GLImageVector<T>>();
208  const int num_images = input.size();
209  const int input_channels = input.channels();
210  const int input_width = input.width();
211  const int input_height = input.height();
212 
213  const int output_channels = input_channels;
214  const int output_width = input_width;
215  const int output_height = input_height;
216 
217  int is_last = OperatorBase::GetSingleArgument<int>("is_last", 0);
218 
219  const int input_tile_x = input.tile_x(), input_tile_y = input.tile_y();
220  const int output_tile_x = input_tile_x, output_tile_y = input_tile_y;
221  if (input_tile_x > 1 || input_tile_y > 1) {
222  CAFFE_ENFORCE_EQ(input.slices(), 1, "Input needs to be tiled in a single texture");
223  }
224 
225  GLImageVector<T>* output = ImageAllocator<T>::newImage(num_images,
226  output_width,
227  output_height,
228  output_channels,
229  output_tile_x,
230  output_tile_y,
231  is_last);
232 
233  const auto* scale = reluType == GLPRelu::PRelu ? &Input(1) : nullptr;
234 
235  if (!_prelu) {
236  if (reluType == GLPRelu::PRelu) {
237  _prelu.reset(new GLPRelu(scale->template data<float>(),
238  scale->size(),
239  input_channels,
240  output_tile_x,
241  output_tile_y,
242  output_width,
243  output_height));
244  } else {
245  _prelu.reset(new GLPRelu(input_channels));
246  }
247  }
248 
249  _prelu->prelu(input, *output, reluType);
250 
251  Outputs()[0]->Reset(output);
252 
253  return true;
254  }
255 
256  private:
257  StorageOrder order_;
258  std::unique_ptr<GLPRelu> _prelu;
259 };
260 
261 REGISTER_CPU_OPERATOR(OpenGLPRelu, OpenGLPReluOp<float16_t, GLPRelu::PRelu>);
262 OPERATOR_SCHEMA(OpenGLPRelu)
263  .NumInputs(2)
264  .NumOutputs(1)
265  .AllowInplace({{0, 0}})
266  .IdenticalTypeAndShape();
267 REGISTER_CPU_OPERATOR(OpenGLRelu, OpenGLPReluOp<float16_t, GLPRelu::Relu>);
268 OPERATOR_SCHEMA(OpenGLRelu)
269  .NumInputs(1)
270  .NumOutputs(1)
271  .AllowInplace({{0, 0}})
272  .IdenticalTypeAndShape();
273 } // namespace caffe2
Workspace is a class that holds all the related objects created during runtime: (1) all blobs...
Definition: workspace.h:47
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...