Caffe2 - C++ API
A deep learning, cross-platform ML framework
instance_norm_gradient_op.cc
#include "caffe2/operators/instance_norm_op.h"

namespace caffe2 {

template <typename T, typename Context>
bool InstanceNormGradientOp<T, Context>::RunOnDeviceWithOrderNHWC() {
  const auto& input = Input(INPUT);
  const auto& scale = Input(SCALE);
  const auto& bias = Input(BIAS);
  const auto& output_grad = Input(OUTPUT_GRAD);
  const auto& mean = InputSize() >= 5 ? Input(MEAN) : mean_;
  const auto& inv_stdev = InputSize() >= 6 ? Input(INV_STDEV) : inv_stdev_;
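  // MEAN (input 4) and INV_STDEV (input 5) are optional: if the forward pass
  // saved them they are consumed here, otherwise they are recomputed below
  // into the member buffers mean_ and inv_stdev_.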
  auto input_grad = Output(INPUT_GRAD);
  auto scale_grad = Output(SCALE_GRAD);
  auto bias_grad = Output(BIAS_GRAD);
  CAFFE_ENFORCE_EQ(4, input.ndim());
  const int N = input.dim32(0);
  const int H = input.dim32(1);
  const int W = input.dim32(2);
  const int C = input.dim32(3);
  CAFFE_ENFORCE_EQ(1, scale.ndim());
  CAFFE_ENFORCE_EQ(C, scale.dim32(0));
  CAFFE_ENFORCE_EQ(1, bias.ndim());
  CAFFE_ENFORCE_EQ(C, bias.dim32(0));
  CAFFE_ENFORCE_EQ(4, output_grad.ndim());
  CAFFE_ENFORCE_EQ(N, output_grad.dim32(0));
  CAFFE_ENFORCE_EQ(H, output_grad.dim32(1));
  CAFFE_ENFORCE_EQ(W, output_grad.dim32(2));
  CAFFE_ENFORCE_EQ(C, output_grad.dim32(3));
  input_grad->ResizeLike(input);
  scale_grad->ResizeLike(scale);
  bias_grad->ResizeLike(bias);

  ConstEigenVectorArrayMap<T> scale_arr(scale.template data<T>(), C);
  ConstEigenVectorArrayMap<T> bias_arr(bias.template data<T>(), C);
  EigenVectorArrayMap<T> scale_grad_arr(
      scale_grad->template mutable_data<T>(), C);
  EigenVectorArrayMap<T> bias_grad_arr(
      bias_grad->template mutable_data<T>(), C);

  // Resize before we get into the per-instance loop
  if (InputSize() < 5) {
    mean_.Resize(N, C);
  }
  if (InputSize() < 6) {
    inv_stdev_.Resize(N, C);
  }

  // Zero the scale/bias gradient accumulators before the per-instance loop
  // adds into them; mutable_data does not guarantee zero-initialized memory.
  // (The NCHW path below does the same before its accumulation loop.)
  scale_grad_arr.setZero();
  bias_grad_arr.setZero();

  // looping over per-instance and using Eigen blocks to extract out
  // a chunk of channels
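  // In NHWC order the C channel values of each pixel are contiguous, so the
  // column-major C x (H * W) maps below hold one pixel's channels per column;
  // rowwise() reductions therefore aggregate each channel over all pixels.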
  for (int n = 0; n < N; ++n) {
    // All Eigen mats and arrs in here are per-instance.
    ConstEigenArrayMap<T> input_mat(
        input.template data<T>() + n * C * H * W, C, H * W);
    ConstEigenArrayMap<T> output_grad_mat(
        output_grad.template data<T>() + n * C * H * W, C, H * W);
    EigenArrayMap<T> input_grad_mat(
        input_grad->template mutable_data<T>() + n * C * H * W, C, H * W);

    // Compute mean if it wasn't passed in
    if (InputSize() < 5) {
      EigenVectorArrayMap<T> mean_mutable_arr(
          mean_.template mutable_data<T>() + n * C, C);
      mean_mutable_arr = input_mat.rowwise().mean();
    }
    CAFFE_ENFORCE_EQ(2, mean.ndim());
    CAFFE_ENFORCE_EQ(N, mean.dim32(0));
    CAFFE_ENFORCE_EQ(C, mean.dim32(1));
    ConstEigenVectorArrayMap<T> mean_arr(mean.template data<T>() + n * C, C);

    // subtract mean
    input_grad_mat = input_mat.colwise() - mean_arr;

    // Compute 1 / stdev if it wasn't passed in
    if (InputSize() < 6) {
      EigenVectorArrayMap<T> inv_stdev_mutable_arr(
          inv_stdev_.template mutable_data<T>() + n * C, C);

      // Square the diffs along each channel and take the mean to get var
      inv_stdev_mutable_arr = input_grad_mat.pow(2).rowwise().mean();
      // sqrt to get stdev and take the inverse
      inv_stdev_mutable_arr =
          (inv_stdev_mutable_arr + epsilon_).sqrt().inverse();
    }
    CAFFE_ENFORCE_EQ(2, inv_stdev.ndim());
    CAFFE_ENFORCE_EQ(N, inv_stdev.dim32(0));
    CAFFE_ENFORCE_EQ(C, inv_stdev.dim32(1));

    ConstEigenVectorArrayMap<T> inv_stdev_arr(
        inv_stdev.template data<T>() + n * C, C);

    // for each channel
    // dl/dbias = sum_j dl/dy_j
    bias_grad_arr += output_grad_mat.rowwise().sum();
    // for each channel
    // dl/dscale = sum_j dl/dy_j (x_j - mu) / stdev
    scale_grad_arr +=
        ((input_grad_mat.colwise() * inv_stdev_arr) * output_grad_mat)
            .rowwise()
            .sum();

    // dl/dx_j = this gross thing
    // Derived gradient and manually massaged it to minimize extra storage
    // and number of vectorized calls. Verified it with the autograd package
    // in python.
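    // Written out, the steps below compute, per channel (input_grad_mat
    // currently holds x_j - mu):
    //   dl/dx_j = (s / stdev) * (dl/dy_j - mean_k(dl/dy_k)
    //             - (x_j - mu) / stdev^2 * mean_k(dl/dy_k * (x_k - mu)))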

    // a = -1/(HW) sum_j dl/dy_j * (x_j - mu) / stdev^3
    const auto temp = (inv_stdev_arr.pow(3) *
                       (input_grad_mat * output_grad_mat).rowwise().mean() *
                       -1).eval();
    // b_j = a * (x_j - mu)
    input_grad_mat.colwise() *= temp;

    // c_j = b_j + dl/dy_j / stdev
    input_grad_mat += output_grad_mat.colwise() * inv_stdev_arr;

    // dl/dx_j = s * (c_j - mean(c_j))
    const auto result_mean = input_grad_mat.rowwise().mean().eval();
    input_grad_mat.colwise() -= result_mean;
    input_grad_mat.colwise() *= scale_arr;
  }

  return true;
}

template <typename T, typename Context>
bool InstanceNormGradientOp<T, Context>::RunOnDeviceWithOrderNCHW() {
  const auto& input = Input(INPUT);
  const auto& scale = Input(SCALE);
  const auto& bias = Input(BIAS);
  const auto& output_grad = Input(OUTPUT_GRAD);
  const auto& mean = InputSize() >= 5 ? Input(MEAN) : mean_;
  const auto& inv_stdev = InputSize() >= 6 ? Input(INV_STDEV) : inv_stdev_;
  auto input_grad = Output(INPUT_GRAD);
  auto scale_grad = Output(SCALE_GRAD);
  auto bias_grad = Output(BIAS_GRAD);
  CAFFE_ENFORCE_EQ(4, input.ndim());
  const int N = input.dim32(0);
  const int C = input.dim32(1);
  const int H = input.dim32(2);
  const int W = input.dim32(3);
  CAFFE_ENFORCE_EQ(1, scale.ndim());
  CAFFE_ENFORCE_EQ(C, scale.dim32(0));
  CAFFE_ENFORCE_EQ(1, bias.ndim());
  CAFFE_ENFORCE_EQ(C, bias.dim32(0));
  CAFFE_ENFORCE_EQ(4, output_grad.ndim());
  CAFFE_ENFORCE_EQ(N, output_grad.dim32(0));
  CAFFE_ENFORCE_EQ(C, output_grad.dim32(1));
  CAFFE_ENFORCE_EQ(H, output_grad.dim32(2));
  CAFFE_ENFORCE_EQ(W, output_grad.dim32(3));
  input_grad->ResizeLike(input);
  scale_grad->ResizeLike(scale);
  bias_grad->ResizeLike(bias);

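  // In NCHW order each of the N * C feature maps is a contiguous H * W block,
  // so the column-major (H * W) x (N * C) maps below hold one feature map per
  // column; colwise() reductions aggregate over the spatial extent.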
  ConstEigenArrayMap<T> input_mat(input.template data<T>(), H * W, N * C);
  ConstEigenVectorArrayMap<T> scale_arr(scale.template data<T>(), C);
  ConstEigenVectorArrayMap<T> bias_arr(bias.template data<T>(), C);
  ConstEigenArrayMap<T> output_grad_mat(
      output_grad.template data<T>(), H * W, N * C);

  EigenArrayMap<T> input_grad_mat(
      input_grad->template mutable_data<T>(), H * W, N * C);
  EigenVectorArrayMap<T> scale_grad_arr(
      scale_grad->template mutable_data<T>(), C);
  EigenVectorArrayMap<T> bias_grad_arr(
      bias_grad->template mutable_data<T>(), C);

  // Compute mean if it wasn't passed in
  if (InputSize() < 5) {
    mean_.Resize(N, C);
    EigenVectorArrayMap<T> mean_mutable_arr(
        mean_.template mutable_data<T>(), N * C);
    mean_mutable_arr = input_mat.colwise().mean();
  }
  CAFFE_ENFORCE_EQ(2, mean.ndim());
  CAFFE_ENFORCE_EQ(N, mean.dim32(0));
  CAFFE_ENFORCE_EQ(C, mean.dim32(1));
  ConstEigenVectorArrayMap<T> mean_arr(mean.template data<T>(), N * C);

  // subtract mean
  input_grad_mat = input_mat.rowwise() - mean_arr.transpose();

  // compute 1 / stdev if not passed in
  if (InputSize() < 6) {
    inv_stdev_.Resize(N, C);
    EigenVectorArrayMap<T> inv_stdev_mutable_arr(
        inv_stdev_.template mutable_data<T>(), N * C);

    // Square the diffs along each column and take mean to get var
    inv_stdev_mutable_arr = input_grad_mat.pow(2).colwise().mean();
    // sqrt to get stdev and then invert
    inv_stdev_mutable_arr = (inv_stdev_mutable_arr + epsilon_).sqrt().inverse();
  }
  CAFFE_ENFORCE_EQ(2, inv_stdev.ndim());
  CAFFE_ENFORCE_EQ(N, inv_stdev.dim32(0));
  CAFFE_ENFORCE_EQ(C, inv_stdev.dim32(1));

  ConstEigenVectorArrayMap<T> inv_stdev_arr(
      inv_stdev.template data<T>(), N * C);

  // Visit comments in the NHWC version about these gradients. scale and bias
  // grads are about the same, but the input grads no longer slice out one
  // example at a time and instead vectorize across all N * C feature maps.

  // scale and bias gradients
  scale_grad_arr.setZero();
  bias_grad_arr.setZero();
  for (int n = 0; n < N; ++n) {
    scale_grad_arr += ((input_grad_mat.rowwise() * inv_stdev_arr.transpose()) *
                       output_grad_mat)
                          .block(0, n * C, H * W, C)
                          .colwise()
                          .sum();
    bias_grad_arr += output_grad_mat.block(0, n * C, H * W, C).colwise().sum();
  }

  // input gradient
  const auto temp = ((inv_stdev_arr.pow(3).transpose() *
                      (input_grad_mat * output_grad_mat).colwise().mean()) *
                     -1).eval();
  input_grad_mat.rowwise() *= temp;

  input_grad_mat += output_grad_mat.rowwise() * inv_stdev_arr.transpose();

  const auto result_mean = input_grad_mat.colwise().mean().eval();
  input_grad_mat.rowwise() -= result_mean;

  for (int n = 0; n < N; ++n) {
    input_grad_mat.block(0, n * C, H * W, C).rowwise() *= scale_arr.transpose();
  }

  return true;
}

class GetInstanceNormGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    vector<string> inputs{I(0), I(1), I(2), GO(0)};
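    // If the forward op also exposed its saved mean (output 1) and inverse
    // stdev (output 2), pass them to the gradient op so they need not be
    // recomputed.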
    if (def_.output_size() >= 2) {
      inputs.push_back(O(1));
    }
    if (def_.output_size() >= 3) {
      inputs.push_back(O(2));
    }
    return SingleGradientDef(
        "InstanceNormGradient",
        "",
        inputs,
        vector<string>{GI(0), GI(1), GI(2)});
  }
};

REGISTER_CPU_OPERATOR(
    InstanceNormGradient,
    InstanceNormGradientOp<float, CPUContext>);

OPERATOR_SCHEMA(InstanceNormGradient).NumInputs(4, 6).NumOutputs(3);

REGISTER_GRADIENT(InstanceNorm, GetInstanceNormGradient);
} // namespace caffe2
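For reference, the gradients both kernels implement can be written in closed form by consolidating the step-by-step comments above (the a, b_j, c_j massaging in the NHWC loop). With n indexing instances, c channels, j and k the H * W spatial positions, and mu_{nc}, sigma_{nc} the per-feature-map mean and stdev (sigma = sqrt(var + epsilon)):

\frac{\partial \ell}{\partial b_c} = \sum_{n,j} \frac{\partial \ell}{\partial y_{ncj}},
\qquad
\frac{\partial \ell}{\partial s_c} = \sum_{n,j} \frac{\partial \ell}{\partial y_{ncj}} \, \frac{x_{ncj} - \mu_{nc}}{\sigma_{nc}},

\frac{\partial \ell}{\partial x_{ncj}} = \frac{s_c}{\sigma_{nc}} \left( \frac{\partial \ell}{\partial y_{ncj}} - \frac{1}{HW} \sum_{k} \frac{\partial \ell}{\partial y_{nck}} - \frac{x_{ncj} - \mu_{nc}}{\sigma_{nc}^2} \cdot \frac{1}{HW} \sum_{k} \frac{\partial \ell}{\partial y_{nck}} \, (x_{nck} - \mu_{nc}) \right)

The subtraction of mean(c_j) in the code matches the mean_k term here because the b_j part averages to zero: a * mean_j(x_j - mu) = 0.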