Caffe2 - C++ API
A deep learning, cross-platform ML framework
instance_norm_op.cc
#include "caffe2/operators/instance_norm_op.h"

namespace caffe2 {

// Here live two separate implementations of the forward and backward passes
// of instance normalization, one for the NHWC order and the other for the
// NCHW order. Having two implementations lets us use Eigen's vectorized
// operations without an expensive tensor transpose.

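// For each sample n and channel c, instance normalization computes
//
//   Y[n, c] = (X[n, c] - mean[n, c]) * inv_stdev[n, c] * scale[c] + bias[c]
//
// where mean[n, c] and var[n, c] are taken over the spatial (H, W)
// positions and inv_stdev[n, c] = 1 / sqrt(var[n, c] + epsilon).
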
template <typename T, typename Context>
bool InstanceNormOp<T, Context>::RunOnDeviceWithOrderNHWC() {
  const auto& X = Input(INPUT);
  auto* Y = Output(OUTPUT);
  CAFFE_ENFORCE(Y != &X, "Can't run InstanceNorm NHWC in-place");
  auto* mean = OutputSize() > 1 ? Output(MEAN) : &mean_;
  auto* inv_stdev = OutputSize() > 1 ? Output(INV_STDEV) : &inv_stdev_;
  const int N = X.dim32(0);
  const int H = X.dim32(1);
  const int W = X.dim32(2);
  const int C = X.dim32(3);
  const size_t offset = H * W * C;

  CAFFE_ENFORCE_EQ(Input(SCALE).size(), C);
  CAFFE_ENFORCE_EQ(Input(BIAS).size(), C);

  Y->ResizeLike(X);
  mean->Resize(N, C);
  inv_stdev->Resize(N, C);
  ConstEigenVectorArrayMap<T> scale(Input(SCALE).template data<T>(), C);
  ConstEigenVectorArrayMap<T> bias(Input(BIAS).template data<T>(), C);
  for (int n = 0; n < N; ++n) {
    ConstEigenArrayMap<T> Xmat(X.template data<T>() + offset * n, C, H * W);
    EigenArrayMap<T> Ymat(Y->template mutable_data<T>() + offset * n, C, H * W);
    EigenVectorArrayMap<T> mean_arr(
        mean->template mutable_data<T>() + n * C, C);
    EigenVectorArrayMap<T> inv_stdev_arr(
        inv_stdev->template mutable_data<T>() + n * C, C);

    // The following effectively does the row-wise mean computation:
    //   mean_arr = Xmat.rowwise().mean();
    // but manually vectorizes over columns.
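    // (Xmat is a column-major C x (H * W) map, so each column is one spatial
    // position's C contiguous channel values; accumulating column by column
    // keeps every add a contiguous, vectorizable length-C operation.)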
    mean_arr = Xmat.col(0);
    for (int i = 1; i < H * W; ++i) {
      mean_arr += Xmat.col(i);
    }
    mean_arr *= 1. / (H * W);
    Ymat = Xmat.colwise() - mean_arr;
    // The following effectively does the row-wise squared norm computation,
    // but manually vectorizes over columns similar to the mean case.
    inv_stdev_arr = Ymat.col(0) * Ymat.col(0);
    for (int i = 1; i < H * W; ++i) {
      inv_stdev_arr += Ymat.col(i) * Ymat.col(i);
    }
    inv_stdev_arr = (inv_stdev_arr / (H * W) + epsilon_).sqrt().inverse();
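    // Scale the centered values by inv_stdev * scale and add bias, with both
    // per-channel vectors broadcast across the spatial columns.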
    Ymat = (Ymat.colwise() * (inv_stdev_arr * scale)).colwise() + bias;
  }
  return true;
}

template <typename T, typename Context>
bool InstanceNormOp<T, Context>::RunOnDeviceWithOrderNCHW() {
  const auto& X = Input(INPUT);
  const auto& scale = Input(SCALE);
  const auto& bias = Input(BIAS);
  auto* Y = Output(OUTPUT);
  auto* mean = OutputSize() > 1 ? Output(MEAN) : &mean_;
  auto* inv_stdev = OutputSize() > 1 ? Output(INV_STDEV) : &inv_stdev_;
  const int N = X.dim32(0);
  const int C = X.dim32(1);
  const int H = X.dim32(2);
  const int W = X.dim32(3);

  CAFFE_ENFORCE_EQ(scale.size(), C);
  CAFFE_ENFORCE_EQ(bias.size(), C);

  Y->ResizeLike(X);
  mean->Resize(N, C);
  inv_stdev->Resize(N, C);

  const auto* Xdata = X.template data<T>();
  auto* Ydata = Y->template mutable_data<T>();
  const auto* scale_data = scale.template data<T>();
  const auto* bias_data = bias.template data<T>();
  auto* mean_data = mean->template mutable_data<T>();
  auto* inv_stdev_data = inv_stdev->template mutable_data<T>();

  // TODO: benchmark parallelization strategies.
  for (auto i = 0; i < N * C; ++i) {
    ConstEigenVectorArrayMap<T> Xi(Xdata + H * W * i, H * W);
    const T Xi_mean = Xi.mean();
    const T squared_norm = (Xi - Xi_mean).matrix().squaredNorm();
    const T inv_stdev = 1.0 / std::sqrt(squared_norm / (H * W) + epsilon_);
    mean_data[i] = Xi_mean;
    inv_stdev_data[i] = inv_stdev;
    EigenVectorArrayMap<T> Yi(Ydata + H * W * i, H * W);
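    // Fold normalization and the affine transform into a single multiply-add:
    //   (Xi - Xi_mean) * inv_stdev * scale + bias
    //     == Xi * channel_scale + channel_shift.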
    const T channel_scale = inv_stdev * scale_data[i % C];
    const T channel_shift = bias_data[i % C] - Xi_mean * channel_scale;
    Yi = Xi * channel_scale + channel_shift;
  }

  return true;
}

REGISTER_CPU_OPERATOR(InstanceNorm, InstanceNormOp<float, CPUContext>);

OPERATOR_SCHEMA(InstanceNorm)
    .NumInputs(3)
    .NumOutputs(1, 3)
    .AllowInplace({{0, 0}})
    .SetDoc(R"DOC(
Carries out instance normalization as described in the paper
https://arxiv.org/abs/1607.08022. Depending on the mode it is run in,
there are multiple cases for the number of outputs, which we list below:

  * Output case #1: output
  * Output case #2: output, saved_mean
    - don't use; this case doesn't make sense, but it won't crash
  * Output case #3: output, saved_mean, saved_inv_stdev
    - makes sense for training only

In training mode, case #3 is faster because the backward pass can reuse
the saved mean and inv_stdev in the gradient computation.
)DOC")
    .Arg("epsilon", "The epsilon value to use to avoid division by zero.")
    .Arg("order", "A StorageOrder string.")
    .Input(
        0,
        "input",
        "The input 4-dimensional tensor of shape NCHW or NHWC, depending "
        "on the order parameter.")
    .Input(1, "scale", "The input 1-dimensional scale tensor of size C.")
    .Input(2, "bias", "The input 1-dimensional bias tensor of size C.")
    .Output(
        0,
        "output",
        "The output 4-dimensional tensor of the same shape as input.")
    .Output(
        1,
        "saved_mean",
        "Optional saved mean used during training to speed up gradient "
        "computation. Should not be used for testing.")
    .Output(
        2,
        "saved_inv_stdev",
        "Optional saved inverse stdev used during training to speed up "
        "gradient computation. Should not be used for testing.");
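
// A minimal usage sketch of this operator from C++. The blob names and the
// feeding step are illustrative assumptions, not part of this file; it
// assumes the standard Caffe2 core headers ("caffe2/core/operator.h",
// "caffe2/core/workspace.h") and the protobuf-generated OperatorDef API:
//
//   caffe2::Workspace ws;
//   // Feed a 4D float "input" blob (NCHW) plus 1D "scale" and "bias" blobs
//   // of size C into ws beforehand (e.g. from a Python front end).
//   caffe2::OperatorDef def;
//   def.set_type("InstanceNorm");
//   def.add_input("input");
//   def.add_input("scale");
//   def.add_input("bias");
//   def.add_output("output");
//   auto* order_arg = def.add_arg();
//   order_arg->set_name("order");
//   order_arg->set_s("NCHW");
//   auto* eps_arg = def.add_arg();
//   eps_arg->set_name("epsilon");
//   eps_arg->set_f(1e-5f);
//   CAFFE_ENFORCE(ws.RunOperatorOnce(def));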

} // namespace caffe2