Caffe2 - C++ API
A deep learning, cross-platform ML framework
local_response_normalization_op.cc
1 #include "caffe2/operators/local_response_normalization_op.h"
2 
3 namespace caffe2 {
4 
5 template<>
6 bool LRNOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
7  // Note(Yangqing): this one is copied from my Caffe implementation.
8  auto& X = Input(0);
9  auto* Y = Output(0);
10  DCHECK_EQ(X.ndim(), 4);
11  const int N = X.dim32(0);
12  const int C = X.dim32(1);
13  const int H = X.dim32(2);
14  const int W = X.dim32(3);
15  const int image_size = C * H * W;
16  const float* Xdata = X.data<float>();
17  Y->ResizeLike(X);
18  float* Ydata = Y->mutable_data<float>();
19 
20  if (OutputSize() > 1) {
21  scale_ = Output(1);
22  } else {
23  if (!scale_) {
24  scale_ = &local_scale_tensor_;
25  }
26  }
27  scale_->ResizeLike(X);
28  float* scale_data = scale_->mutable_data<float>();
29  math::Set<float, CPUContext>(X.size(), bias_, scale_data, &context_);
30  TensorCPU padded_square(
31  vector<TIndex>{C + size_ - 1, H, W});
32  float* padded_square_data = padded_square.mutable_data<float>();
33  math::Set<float, CPUContext>(padded_square.size(), 0., padded_square_data,
34  &context_);
35  const float alpha_over_size = alpha_ / size_;
36  // go through the images
37  for (int n = 0; n < N; ++n) {
38  // compute the padded square
39  math::Sqr<float, CPUContext>(image_size, Xdata + image_size * n,
40  padded_square_data + pre_pad_ * H * W,
41  &context_);
42  // Create the first channel scale
43  for (int c = 0; c < size_; ++c) {
44  math::Axpy<float, CPUContext>(
45  H * W, alpha_over_size, padded_square_data + c * H * W,
46  scale_data + image_size * n, &context_);
47  }
48  for (int c = 1; c < C; ++c) {
49  float* this_scale_slice = scale_data + n * image_size + c * H * W;
50  // copy previous scale
51  context_.Copy<float, CPUContext, CPUContext>(
52  H * W, this_scale_slice - H * W, this_scale_slice);
53  // add head
54  math::Axpy<float, CPUContext>(
55  H * W, alpha_over_size, padded_square_data + (c + size_ - 1) * H * W,
56  this_scale_slice, &context_);
57  // subtract tail
58  math::Axpy<float, CPUContext>(
59  H * W, -alpha_over_size, padded_square_data + (c - 1) * H * W,
60  this_scale_slice, &context_);
61  }
62  }
63  math::Powx<float, CPUContext>(
64  X.size(), scale_data, -beta_, Ydata, &context_);
65  math::Mul<float, CPUContext>(X.size(), Ydata, Xdata, Ydata, &context_);
66  return true;
67 }

template<>
bool LRNOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
  // Note(Yangqing): This one is copied from my Decaf implementation. How many
  // variants have I written...?
  auto& X = Input(0);
  auto* Y = Output(0);
  DCHECK_EQ(X.ndim(), 4);
  const int N = X.dim32(0);
  const int H = X.dim32(1);
  const int W = X.dim32(2);
  const int C = X.dim32(3);
  const int num_rows = N * H * W;
  const float* Xdata = X.data<float>();
  Y->ResizeLike(X);
  float* Ydata = Y->mutable_data<float>();

  if (OutputSize() > 1) {
    scale_ = Output(1);
  } else {
    if (!scale_) {
      scale_ = &local_scale_tensor_;
    }
  }
  scale_->ResizeLike(X);
  float* scale_data = scale_->mutable_data<float>();

  TensorCPU padded_square(vector<TIndex>(1, C + size_ - 1));
  float* padded_square_data = padded_square.mutable_data<float>();
  math::Set<float, CPUContext>(padded_square.size(), 0., padded_square_data,
                               &context_);
  const float alpha_over_size = alpha_ / size_;

  for (int n = 0; n < num_rows; ++n) {
    for (int c = 0; c < C; ++c) {
      padded_square_data[c + pre_pad_] =
          Xdata[n * C + c] * Xdata[n * C + c] * alpha_over_size;
    }
    float accum_scale = 0.;
    for (int i = 0; i < size_ - 1; ++i) {
      accum_scale += padded_square_data[i];
    }
    for (int c = 0; c < C; ++c) {
      accum_scale += padded_square_data[c + size_ - 1];
      scale_data[n * C + c] = bias_ + accum_scale;
      accum_scale -= padded_square_data[c];
    }
  }
  math::Powx<float, CPUContext>(
      X.size(), scale_data, -beta_, Ydata, &context_);
  math::Mul<float, CPUContext>(X.size(), Ydata, Xdata, Ydata, &context_);
  return true;
}
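Both layout-specific forward paths above compute the same cross-channel normalization; only the traversal order and the sliding-window bookkeeping differ. At each spatial position the window covers `size_` consecutive channels around the current one (the squared inputs are written into a zero-padded buffer at offset `pre_pad_`), and `scale_data` and `Ydata` hold

\[
s_i \;=\; \mathrm{bias} \;+\; \frac{\alpha}{\mathrm{size}} \sum_{j \in \mathcal{N}(i)} x_j^2,
\qquad
y_i \;=\; x_i \, s_i^{-\beta},
\]

which is exactly what the trailing `math::Powx` (raising `scale_data` to `-beta_`) and `math::Mul` calls evaluate once `scale_data` has been filled.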

template <>
bool LRNGradientOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
  auto& X = Input(0);
  auto& Y = Input(1);
  auto& dY = Input(2);
  auto* dX = Output(0);
  DCHECK_EQ(X.ndim(), 4);
  const int N = X.dim32(0);
  const int C = X.dim32(1);
  const int H = X.dim32(2);
  const int W = X.dim32(3);
  const int image_size = C * H * W;
  // Loosely checking the size, assuming that the shapes will be the same as
  // long as the sizes check out.
  DCHECK_EQ(X.size(), Y.size());
  DCHECK_EQ(X.size(), dY.size());
  dX->ResizeLike(X);

  const float* Xdata = X.data<float>();
  const float* Ydata = Y.data<float>();
  if (!scale_) {
    scale_ = &local_scale_tensor_;
  }
  scale_->ResizeLike(X);
  float* scale_data = scale_->mutable_data<float>();
  const float* dYdata = dY.data<float>();
  float* dXdata = dX->mutable_data<float>();

  TensorCPU padded_ratio(vector<TIndex>{C + size_ - 1, H, W});
  float* padded_ratio_data = padded_ratio.mutable_data<float>();
  // Compute scale (copied from LRNOp) - reusing padded_ratio
  math::Set<float, CPUContext>(X.size(), bias_, scale_data, &context_);
  math::Set<float, CPUContext>(padded_ratio.size(), 0., padded_ratio_data,
                               &context_);
  const float alpha_over_size = alpha_ / size_;
  // go through the images
  for (int n = 0; n < N; ++n) {
    // compute the padded square
    math::Sqr<float, CPUContext>(image_size, Xdata + image_size * n,
                                 padded_ratio_data + pre_pad_ * H * W,
                                 &context_);
    // Create the first channel scale
    for (int c = 0; c < size_; ++c) {
      math::Axpy<float, CPUContext>(
          H * W, alpha_over_size, padded_ratio_data + c * H * W,
          scale_data + image_size * n, &context_);
    }
    for (int c = 1; c < C; ++c) {
      float* this_scale_slice = scale_data + n * image_size + c * H * W;
      // copy previous scale
      context_.Copy<float, CPUContext, CPUContext>(
          H * W, this_scale_slice - H * W, this_scale_slice);
      // add head
      math::Axpy<float, CPUContext>(
          H * W, alpha_over_size, padded_ratio_data + (c + size_ - 1) * H * W,
          this_scale_slice, &context_);
      // subtract tail
      math::Axpy<float, CPUContext>(
          H * W, -alpha_over_size, padded_ratio_data + (c - 1) * H * W,
          this_scale_slice, &context_);
    }
  }

  math::Set<float, CPUContext>(padded_ratio.size(), 0., padded_ratio_data,
                               &context_);
  TensorCPU accum_ratio(vector<TIndex>{H, W});
  float* accum_ratio_data = accum_ratio.mutable_data<float>();

  const float cache_ratio = 2. * alpha_ * beta_ / size_;
  const int inverse_pre_pad = size_ - (size_ + 1) / 2;

  int offset = 0;
  for (int n = 0; n < N; ++n) {
    // first, compute diff_i * y_i / s_i
    math::Mul<float, CPUContext>(
        image_size, dYdata + offset, Ydata + offset,
        padded_ratio_data + inverse_pre_pad * H * W, &context_);
    math::Div<float, CPUContext>(
        image_size, padded_ratio_data + inverse_pre_pad * H * W,
        scale_data + offset,
        padded_ratio_data + inverse_pre_pad * H * W, &context_);
    // Now, compute the accumulated ratios and the bottom diff
    math::Set<float, CPUContext>(accum_ratio.size(), 0., accum_ratio_data,
                                 &context_);
    for (int c = 0; c < size_ - 1; ++c) {
      math::Axpy<float, CPUContext>(H * W, 1,
                                    padded_ratio_data + c * H * W,
                                    accum_ratio_data, &context_);
    }
    for (int c = 0; c < C; ++c) {
      for (int hw = 0; hw < H * W; ++hw) {
        accum_ratio_data[hw] += padded_ratio_data[(c + size_ - 1) * H * W + hw];
        dXdata[offset] =
            dYdata[offset] * pow(scale_data[offset], -beta_) -
            cache_ratio * accum_ratio_data[hw] * Xdata[offset];
        accum_ratio_data[hw] -= padded_ratio_data[c * H * W + hw];
        ++offset;
      }
    }
  }
  return true;
}

template <>
bool LRNGradientOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
  auto& X = Input(0);
  auto& Y = Input(1);
  auto& dY = Input(2);
  auto* dX = Output(0);
  DCHECK_EQ(X.ndim(), 4);
  const int N = X.dim32(0);
  const int H = X.dim32(1);
  const int W = X.dim32(2);
  const int C = X.dim32(3);
  const int num_rows = N * H * W;
  const float* Xdata = X.data<float>();
  // Loosely checking the size, assuming that the shapes will be the same as
  // long as the sizes check out.
  DCHECK_EQ(X.size(), Y.size());
  DCHECK_EQ(X.size(), dY.size());
  dX->ResizeLike(X);
  if (!scale_) {
    scale_ = &local_scale_tensor_;
  }
  scale_->ResizeLike(X);
  TensorCPU padded_ratio(vector<TIndex>(1, C + size_ - 1));
  float* padded_ratio_data = padded_ratio.mutable_data<float>();
  float* scale_data = scale_->mutable_data<float>();
  // Compute scale (copied from LRNOp) - reusing padded_ratio
  math::Set<float, CPUContext>(X.size(), bias_, scale_data, &context_);
  math::Set<float, CPUContext>(padded_ratio.size(), 0., padded_ratio_data,
                               &context_);
  const float alpha_over_size = alpha_ / size_;

  for (int n = 0; n < num_rows; ++n) {
    for (int c = 0; c < C; ++c) {
      padded_ratio_data[c + pre_pad_] =
          Xdata[n * C + c] * Xdata[n * C + c] * alpha_over_size;
    }
    float accum_scale = 0.;
    for (int i = 0; i < size_ - 1; ++i) {
      accum_scale += padded_ratio_data[i];
    }
    for (int c = 0; c < C; ++c) {
      accum_scale += padded_ratio_data[c + size_ - 1];
      scale_data[n * C + c] = bias_ + accum_scale;
      accum_scale -= padded_ratio_data[c];
    }
  }

  math::Set<float, CPUContext>(padded_ratio.size(), 0., padded_ratio_data,
                               &context_);
  // the ratio 2*alpha*beta/size
  const float cache_ratio = 2. * alpha_ * beta_ / size_;
  const float* Ydata = Y.data<float>();

  const float* dYdata = dY.data<float>();
  float* dXdata = dX->mutable_data<float>();
  for (int n = 0; n < num_rows; ++n) {
    const int offset = n * C;
    for (int c = 0; c < C; ++c) {
      padded_ratio_data[c + pre_pad_] =
          Ydata[offset + c] * dYdata[offset + c] / scale_data[offset + c];
    }
    float accum_ratio = 0.;
    for (int c = 0; c < size_ - 1; ++c) {
      accum_ratio += padded_ratio_data[c];
    }
    for (int c = 0; c < C; ++c) {
      accum_ratio += padded_ratio_data[c + size_ - 1];
      dXdata[offset + c] =
          dYdata[offset + c] * pow(scale_data[offset + c], -beta_) -
          cache_ratio * Xdata[offset + c] * accum_ratio;
      accum_ratio -= padded_ratio_data[c];
    }
  }
  return true;
}
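Both gradient paths evaluate the analytic derivative of the forward expression. `padded_ratio_data` holds the ratios \( r_j = \frac{\partial L}{\partial y_j}\, y_j / s_j \), the running window sum of those ratios is kept in `accum_ratio` (add the head element, subtract the tail), and `cache_ratio` is the constant \( 2\alpha\beta/\mathrm{size} \), so each element of `dXdata` is

\[
\frac{\partial L}{\partial x_i}
\;=\;
\frac{\partial L}{\partial y_i}\, s_i^{-\beta}
\;-\;
\frac{2\alpha\beta}{\mathrm{size}}\; x_i \sum_{j \,:\, i \in \mathcal{N}(j)} \frac{\partial L}{\partial y_j}\, \frac{y_j}{s_j},
\]

where the sum runs over every channel \( j \) whose normalization window contains channel \( i \).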

REGISTER_CPU_OPERATOR(LRN, LRNOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(LRNGradient, LRNGradientOp<float, CPUContext>);

OPERATOR_SCHEMA(LRN).NumInputs(1).NumOutputs(1, 2).InheritOnnxSchema("LRN");
OPERATOR_SCHEMA(LRNGradient).NumInputs(3).NumOutputs(1);

class GetLRNGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    return SingleGradientDef(
        "LRNGradient", "",
        vector<string>{I(0), O(0), GO(0)},
        vector<string>{GI(0)});
  }
};
REGISTER_GRADIENT(LRN, GetLRNGradient);

} // namespace caffe2
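For orientation, here is a minimal, hedged sketch of driving the registered CPU operator directly from C++. The blob names, tensor shape, and literal argument values are purely illustrative; the argument names (`size`, `alpha`, `beta`, `bias`, `order`) mirror the members (`size_`, `alpha_`, `beta_`, `bias_`) and the layout dispatch used above, and the workspace/protobuf calls are the standard Caffe2 core APIs.

// Illustrative sketch only: blob names, shape, and argument values are
// made up for this example; the argument names follow the op's members.
#include <iostream>

#include "caffe2/core/init.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/workspace.h"

int main(int argc, char** argv) {
  caffe2::GlobalInit(&argc, &argv);
  caffe2::Workspace ws;

  // Create an NCHW input blob: 1 image, 8 channels, 4x4 spatial extent.
  auto* X = ws.CreateBlob("X")->GetMutable<caffe2::TensorCPU>();
  X->Resize(1, 8, 4, 4);
  float* xdata = X->mutable_data<float>();
  for (int i = 0; i < X->size(); ++i) {
    xdata[i] = 0.01f * i;
  }

  // Assemble the OperatorDef described by OPERATOR_SCHEMA(LRN) above.
  caffe2::OperatorDef def;
  def.set_type("LRN");
  def.add_input("X");
  def.add_output("Y");
  auto add_float_arg = [&def](const std::string& name, float value) {
    auto* arg = def.add_arg();
    arg->set_name(name);
    arg->set_f(value);
  };
  {
    auto* size_arg = def.add_arg();
    size_arg->set_name("size");
    size_arg->set_i(5);
  }
  add_float_arg("alpha", 1e-4f);
  add_float_arg("beta", 0.75f);
  add_float_arg("bias", 1.0f);
  {
    auto* order_arg = def.add_arg();
    order_arg->set_name("order");
    order_arg->set_s("NCHW");  // dispatches to RunOnDeviceWithOrderNCHW
  }

  // Instantiate the registered LRNOp<float, CPUContext> and run it once.
  auto op = caffe2::CreateOperator(def, &ws);
  op->Run();

  const auto& Y = ws.GetBlob("Y")->Get<caffe2::TensorCPU>();
  // Y now holds x_i * s_i^{-beta} for every element of X.
  std::cout << "Y[0] = " << Y.data<float>()[0] << std::endl;
  return 0;
}

Adding a second output to `def` would additionally expose the intermediate scale tensor, which is what the `NumOutputs(1, 2)` schema and the `OutputSize() > 1` branch in the forward paths allow.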