2 #include "../core/GLFilter.h" 3 #include "../core/GLImage.h" 4 #include "../core/ImageAllocator.h" 6 #include "caffe2/core/operator.h" 7 #include "caffe2/core/timer.h" 16 binding* inv_pixel_count;
21 bool compute_inv_stdev;
24 const std::vector<binding*> input_bindings(
bool compute_norm_) {
25 std::vector<binding*> bindings({BINDING(inputSize),
28 BINDING(inv_pixel_count),
32 bindings.push_back(BINDING(averageData));
37 GLReduce(
bool compute_inv_stdev_ =
false,
bool compute_norm_ =
false)
41 input_bindings(compute_norm_),
44 {{
"COMPUTE_INV_STDEV", caffe2::to_string((
int)compute_inv_stdev_)},
45 {
"COMPUTE_NORM", caffe2::to_string((
int)compute_norm_)}}),
46 compute_inv_stdev(compute_inv_stdev_),
47 compute_norm(compute_norm_) {}
54 float inv_pixel_count_ = 1.0,
55 float epsilon_ = 0.0);
63 float inv_pixel_count_);
65 static const char* fragment_shader;
70 const char* GLReduce::fragment_shader = R
"GLSL(#version 300 es 72 #define COMPUTE_INV_STDEV $(COMPUTE_INV_STDEV) 73 #define COMPUTE_NORM $(COMPUTE_NORM) 75 precision mediump float; 76 precision mediump int; 78 in highp vec2 v_texCoord; 80 uniform ivec2 inputSize; 81 uniform ivec2 outputSize; 82 uniform ivec2 tileSize; 83 uniform float inv_pixel_count; 84 uniform float epsilon; 87 TEXTURE_INPUT(averageData); 90 TEXTURE_INPUT(inputData); 91 TEXTURE_OUTPUT(0, outputData); 94 ivec2 outputCoord = ivec2(v_texCoord * vec2(outputSize)); 95 ivec2 texelCoord = outputCoord * tileSize; 96 ivec2 sumArea = min(tileSize, inputSize - texelCoord); 97 highp vec4 sum = vec4(0.0); 100 vec4 avg = TEXTURE_LOAD(averageData, ivec2(0)); 103 for (int y = 0; y < sumArea.y; y++) { 104 for (int x = 0; x < sumArea.x; x++) { 105 ivec2 idx = texelCoord + ivec2(x, y); 106 vec4 val = TEXTURE_LOAD(inputData, idx); 116 #if COMPUTE_INV_STDEV 117 outputData = TEXTURE_STORE(inversesqrt(sum * vec4(inv_pixel_count) + vec4(epsilon))); 119 outputData = TEXTURE_STORE(sum * vec4(inv_pixel_count)); 121 outputData = TEXTURE_STORE(sum * vec4(inv_pixel_count) + vec4(epsilon)); 127 template <
typename T>
128 void GLReduce::reduce(
const GLImage<T>* input_image,
132 float inv_pixel_count_,
134 int input_slices = input_image->slices;
135 int output_slices = output_image->slices;
137 for (
int is = 0; is < input_slices; is++) {
138 std::vector<texture_attachment> input_attachments({{input_image->textures[is], inputData}});
140 run(input_attachments,
141 {output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
143 glUniform2i(inputSize->location, input_image->width, input_image->height);
144 glUniform2i(outputSize->location, output_image->width, output_image->height);
145 glUniform2i(tileSize->location, tile_size_x, tile_size_y);
146 glUniform1f(inv_pixel_count->location, inv_pixel_count_);
147 glUniform1f(epsilon->location, epsilon_);
150 output_image->height);
154 template <
typename T>
155 void GLReduce::norm(
const GLImage<T>* input_image,
160 float inv_pixel_count_) {
161 int input_slices = input_image->slices;
162 int output_slices = output_image->slices;
164 for (
int is = 0; is < input_slices; is++) {
165 std::vector<texture_attachment> input_attachments(
166 {{input_image->textures[is], inputData}, {avg_image->textures[is], averageData}});
168 run(input_attachments,
169 {output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
171 glUniform2i(inputSize->location, input_image->width, input_image->height);
172 glUniform2i(outputSize->location, output_image->width, output_image->height);
173 glUniform2i(tileSize->location, tile_size_x, tile_size_y);
174 glUniform1f(inv_pixel_count->location, inv_pixel_count_);
177 output_image->height);
185 binding* averageData;
188 binding* scale_factor;
189 binding* bias_factor;
190 binding* prelu_scale_factor;
195 const float* prelu_scale;
196 const int prelu_size;
198 const std::vector<binding*> input_bindings(
bool fuse_prelu) {
199 std::vector<binding*> bindings({BINDING(outputSize),
200 BINDING(scale_factor),
201 BINDING(bias_factor),
203 BINDING(averageData),
206 bindings.push_back(prelu_scale_factor =
new binding({
"prelu_scale_factor"}));
214 const float* _prelu_scale =
nullptr,
215 const int _prelu_size = 0)
219 input_bindings(_prelu_scale !=
nullptr),
222 {{
"FUSE_PRELU", caffe2::to_string(_prelu_scale !=
nullptr)}}),
226 prelu_scale(_prelu_scale),
227 prelu_size(_prelu_size) {}
229 template <
typename T>
230 void scale_and_shift(
const GLImage<T>* input_image,
235 static const char* fragment_shader;
240 const char* GLScale::fragment_shader = R
"GLSL(#version 300 es 242 #define FUSE_PRELU $(FUSE_PRELU) 244 precision mediump float; 245 precision mediump int; 247 in highp vec2 v_texCoord; 248 uniform ivec2 outputSize; 249 uniform vec4 scale_factor; 250 uniform vec4 bias_factor; 253 uniform vec4 prelu_scale_factor; 256 TEXTURE_INPUT(inputData); 257 TEXTURE_INPUT(averageData); 258 TEXTURE_INPUT(normData); 259 TEXTURE_OUTPUT(0, outputData); 262 ivec2 texelCoord = ivec2(v_texCoord * vec2(outputSize)); 264 vec4 val = TEXTURE_LOAD(inputData, texelCoord); 265 vec4 avg = TEXTURE_LOAD(averageData, ivec2(0)); 266 vec4 inv_stdev = TEXTURE_LOAD(normData, ivec2(0)); 269 vec4 result = (val - avg) * inv_stdev * scale_factor + bias_factor; 270 vec4 o = mix(result * prelu_scale_factor, result, vec4(greaterThan(result, vec4(0)))); 271 outputData = TEXTURE_STORE(o); 273 vec4 o = (val - avg) * inv_stdev * scale_factor + bias_factor; 274 outputData = TEXTURE_STORE(o); 280 template <
typename T>
281 void GLScale::scale_and_shift(
const GLImage<T>* input_image,
285 int input_slices = input_image->slices;
286 int output_slices = output_image->slices;
288 for (
int is = 0; is < input_slices; is++) {
289 std::vector<texture_attachment> input_attachments({{input_image->textures[is], inputData},
290 {avg_image->textures[is], averageData},
291 {norm_image->textures[is], normData}});
293 run(input_attachments,
294 {output_image->textures.begin() + is, output_image->textures.begin() + is + 1},
296 glUniform2i(outputSize->location, output_image->width, output_image->height);
297 glUniform4f(scale_factor->location,
299 channels > 4 * is + 1 ? scale[4 * is + 1] : 0,
300 channels > 4 * is + 2 ? scale[4 * is + 2] : 0,
301 channels > 4 * is + 3 ? scale[4 * is + 3] : 0);
302 glUniform4f(bias_factor->location,
304 channels > 4 * is + 1 ? bias[4 * is + 1] : 0,
305 channels > 4 * is + 2 ? bias[4 * is + 2] : 0,
306 channels > 4 * is + 3 ? bias[4 * is + 3] : 0);
307 if (prelu_scale !=
nullptr) {
308 glUniform4f(prelu_scale_factor->location,
309 prelu_size == channels ? prelu_scale[4 * is] : prelu_scale[0],
310 channels > 4 * is + 1 && prelu_size == channels ? prelu_scale[4 * is + 1]
312 channels > 4 * is + 2 && prelu_size == channels ? prelu_scale[4 * is + 2]
314 channels > 4 * is + 3 && prelu_size == channels ? prelu_scale[4 * is + 3]
319 output_image->height);
324 template <
class T,
bool FUSE_PRELU>
329 epsilon_(OperatorBase::GetSingleArgument<float>(
"epsilon", 1e-5)),
330 order_(StringToStorageOrder(OperatorBase::GetSingleArgument<string>(
"order",
"NCHW"))) {
331 CAFFE_ENFORCE(epsilon_ >= 0,
"Must pass a nonnegative epsilon.");
332 OPERATOR_NEEDS_FEATURE(this->order_ == StorageOrder::NCHW,
"Metal only supports NCHW order.");
335 bool RunOnDevice()
override {
336 const GLImageVector<T>& input = Inputs()[INPUT]->template Get<GLImageVector<T>>();
337 const int num_images = input.size();
338 const int input_channels = input.channels();
339 const int input_width = input.width();
340 const int input_height = input.height();
342 const int output_channels = input_channels;
343 const int output_width = input_width;
344 const int output_height = input_height;
346 int is_last = OperatorBase::GetSingleArgument<int>(
"is_last", 0);
348 const int tile_size_x = 16;
349 const int tile_size_y = 16;
350 int avg_buf_width = input_width;
351 int avg_buf_height = input_height;
353 vector<GLImageVector<T>*> reduce_buf;
354 while (reduce_buf.size() == 0 ||
355 (avg_buf_width > tile_size_x && avg_buf_height > tile_size_y)) {
356 avg_buf_width = (avg_buf_width + tile_size_x - 1) / tile_size_x;
357 avg_buf_height = (avg_buf_height + tile_size_y - 1) / tile_size_y;
359 reduce_buf.push_back(
366 num_images, output_width, output_height, output_channels, is_last);
367 const float* prelu_data =
nullptr;
370 DCHECK_EQ(InputSize(), 4);
371 const auto& prelu_scale = Input(PRELU);
372 prelu_data = prelu_scale.template data<float>();
373 prelu_size = prelu_scale.size();
375 DCHECK_EQ(InputSize(), 3);
378 const auto& scale = Input(SCALE);
379 const auto& bias = Input(BIAS);
383 f_norm.reset(
new GLReduce(
false,
true));
384 f_stdDev.reset(
new GLReduce(
true,
false));
385 f_scale.reset(
new GLScale(input_channels,
386 scale.template data<float>(),
387 bias.template data<float>(),
392 for (
int i = 0; i < num_images; i++) {
393 for (
int k = 0; k < reduce_buf.size() + 1; k++) {
394 const GLImage<T>* in = k == 0 ? input[i] : (*reduce_buf[k - 1])[0];
395 GLImage<T>* out = k == reduce_buf.size() ? (*avg)[i] : (*reduce_buf[k])[0];
397 float norm = k < reduce_buf.size()
398 ? 1.0 / (tile_size_x * tile_size_y)
399 : (
float)pow(tile_size_x * tile_size_y, reduce_buf.size()) /
400 (
float)(input_width * input_height);
401 const int running_tile_size_x = k < reduce_buf.size() ? tile_size_x : in->width;
402 const int running_tile_size_y = k < reduce_buf.size() ? tile_size_y : in->height;
403 f_reduce->reduce(in, out, running_tile_size_x, running_tile_size_y, norm);
406 for (
int k = 0; k < reduce_buf.size() + 1; k++) {
407 const GLImage<T>* in = k == 0 ? input[i] : (*reduce_buf[k - 1])[0];
408 GLImage<T>* out = k == reduce_buf.size() ? (*inv_stdev)[i] : (*reduce_buf[k])[0];
410 float norm = k < reduce_buf.size()
411 ? 1.0 / (tile_size_x * tile_size_y)
412 : (
float)pow(tile_size_x * tile_size_y, reduce_buf.size()) /
413 (
float)(input_width * input_height);
416 f_norm->norm(in, (*avg)[i], out, tile_size_x, tile_size_y, norm);
417 }
else if (k < reduce_buf.size()) {
418 f_reduce->reduce(in, out, tile_size_x, tile_size_y, norm);
420 const int running_tile_size_x = k < reduce_buf.size() ? tile_size_x : in->width;
421 const int running_tile_size_y = k < reduce_buf.size() ? tile_size_y : in->height;
422 f_stdDev->reduce(in, out, running_tile_size_x, running_tile_size_y, norm, epsilon_);
426 f_scale->scale_and_shift(input[i], (*avg)[i], (*inv_stdev)[i], (*output)[i]);
428 Outputs()[OUTPUT]->Reset(output);
429 if (OutputSize() > 1) {
430 Outputs()[MEAN]->Reset(avg);
431 Outputs()[INV_STDEV]->Reset(inv_stdev);
436 for (
auto&& rb : reduce_buf) {
446 std::unique_ptr<GLReduce> f_reduce;
447 std::unique_ptr<GLReduce> f_norm;
448 std::unique_ptr<GLReduce> f_stdDev;
449 std::unique_ptr<GLScale> f_scale;
451 INPUT_TAGS(INPUT, SCALE, BIAS, PRELU);
452 OUTPUT_TAGS(OUTPUT, MEAN, INV_STDEV);
456 OPERATOR_SCHEMA(OpenGLInstanceNorm).NumInputs(3, 4).NumOutputs(1, 3).AllowInplace({{0, 0}});
458 OPERATOR_SCHEMA(OpenGLInstanceNormPRelu).NumInputs(3, 4).NumOutputs(1, 3).AllowInplace({{0, 0}});
Workspace is a class that holds all the related objects created during runtime: (1) all blobs...
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...