1 #include "caffe2/operators/local_response_normalization_op.h" 6 bool LRNOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
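  // Local Response Normalization across channels: for each element x_i,
  //   scale_i = bias + (alpha / size) * sum_{j in window(i)} x_j^2
  //   y_i     = x_i * scale_i^(-beta)
  // where window(i) spans up to `size` channels centered on channel i
  // (pre_pad_ channels before it).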
  DCHECK_EQ(X.ndim(), 4);
  const int N = X.dim32(0);
  const int C = X.dim32(1);
  const int H = X.dim32(2);
  const int W = X.dim32(3);
  const int image_size = C * H * W;
  const float* Xdata = X.data<float>();
  Y->ResizeLike(X);
  float* Ydata = Y->mutable_data<float>();
  if (OutputSize() > 1) {
    scale_ = Output(1);
  } else {
    if (!scale_) {
      scale_ = &local_scale_tensor_;
    }
  }
  scale_->ResizeLike(X);
  float* scale_data = scale_->mutable_data<float>();
  math::Set<float, CPUContext>(X.size(), bias_, scale_data, &context_);
  TensorCPU padded_square(vector<TIndex>{C + size_ - 1, H, W});
  float* padded_square_data = padded_square.mutable_data<float>();
  math::Set<float, CPUContext>(padded_square.size(), 0., padded_square_data,
                               &context_);
  const float alpha_over_size = alpha_ / size_;
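  // The scale tensor is built with a sliding window over the padded squares:
  // the window for channel 0 is summed explicitly, and every later channel
  // reuses the previous slice, adding the incoming head slice and subtracting
  // the outgoing tail slice. This keeps the per-image cost at O(C) slice
  // operations instead of O(C * size).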
  for (int n = 0; n < N; ++n) {
    // Compute the padded square of the input.
    math::Sqr<float, CPUContext>(image_size, Xdata + image_size * n,
                                 padded_square_data + pre_pad_ * H * W,
                                 &context_);
    // Create the scale for the first channel.
    for (int c = 0; c < size_; ++c) {
      math::Axpy<float, CPUContext>(
          H * W, alpha_over_size, padded_square_data + c * H * W,
          scale_data + image_size * n, &context_);
    }
    for (int c = 1; c < C; ++c) {
      float* this_scale_slice = scale_data + n * image_size + c * H * W;
      // Copy the previous scale slice.
      context_.Copy<float, CPUContext, CPUContext>(
          H * W, this_scale_slice - H * W, this_scale_slice);
      // Add the head of the window.
      math::Axpy<float, CPUContext>(
          H * W, alpha_over_size, padded_square_data + (c + size_ - 1) * H * W,
          this_scale_slice, &context_);
      // Subtract the tail of the window.
      math::Axpy<float, CPUContext>(
          H * W, -alpha_over_size, padded_square_data + (c - 1) * H * W,
          this_scale_slice, &context_);
    }
  }
  math::Powx<float, CPUContext>(
      X.size(), scale_data, -beta_, Ydata, &context_);
  math::Mul<float, CPUContext>(X.size(), Ydata, Xdata, Ydata, &context_);
  return true;
}
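// In NHWC order the channels are innermost, so the normalization window of
// each spatial position is a contiguous run of squared values. The NHWC path
// can therefore use a scalar running sum per (n, h, w) row instead of the
// per-slice vector updates needed for NCHW.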
template <>
bool LRNOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
  auto& X = Input(0);
  auto* Y = Output(0);
  DCHECK_EQ(X.ndim(), 4);
  const int N = X.dim32(0);
  const int H = X.dim32(1);
  const int W = X.dim32(2);
  const int C = X.dim32(3);
  const int num_rows = N * H * W;
  const float* Xdata = X.data<float>();
  Y->ResizeLike(X);
  float* Ydata = Y->mutable_data<float>();
  if (OutputSize() > 1) {
    scale_ = Output(1);
  } else {
    if (!scale_) {
      scale_ = &local_scale_tensor_;
    }
  }
  scale_->ResizeLike(X);
  float* scale_data = scale_->mutable_data<float>();
  TensorCPU padded_square(vector<TIndex>(1, C + size_ - 1));
  float* padded_square_data = padded_square.mutable_data<float>();
  math::Set<float, CPUContext>(padded_square.size(), 0., padded_square_data,
                               &context_);
  const float alpha_over_size = alpha_ / size_;
  for (int n = 0; n < num_rows; ++n) {
    // Compute the padded squares for this row, pre-scaled by alpha / size.
    for (int c = 0; c < C; ++c) {
      padded_square_data[c + pre_pad_] =
          Xdata[n * C + c] * Xdata[n * C + c] * alpha_over_size;
    }
    // Seed the running sum with the first (size_ - 1) entries of the window.
    float accum_scale = 0.;
    for (int i = 0; i < size_ - 1; ++i) {
      accum_scale += padded_square_data[i];
    }
    // Slide the window: add the head, record the scale, then drop the tail.
    for (int c = 0; c < C; ++c) {
      accum_scale += padded_square_data[c + size_ - 1];
      scale_data[n * C + c] = bias_ + accum_scale;
      accum_scale -= padded_square_data[c];
    }
  }
  math::Powx<float, CPUContext>(
      X.size(), scale_data, -beta_, Ydata, &context_);
  math::Mul<float, CPUContext>(X.size(), Ydata, Xdata, Ydata, &context_);
  return true;
}
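// The gradient operators below recompute scale_ from X instead of taking it
// as an input, so LRNGradient only consumes X, Y, and dY.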
template <>
bool LRNGradientOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
  auto& X = Input(0);
  auto& Y = Input(1);
  auto& dY = Input(2);
  auto* dX = Output(0);
  DCHECK_EQ(X.ndim(), 4);
  const int N = X.dim32(0);
  const int C = X.dim32(1);
  const int H = X.dim32(2);
  const int W = X.dim32(3);
  const int image_size = C * H * W;
  // Loosely check the sizes; assume the shapes match if the sizes do.
  DCHECK_EQ(X.size(), Y.size());
  DCHECK_EQ(X.size(), dY.size());
  dX->ResizeLike(X);
  const float* Xdata = X.data<float>();
  const float* Ydata = Y.data<float>();
  if (!scale_) {
    scale_ = &local_scale_tensor_;
  }
  scale_->ResizeLike(X);
  float* scale_data = scale_->mutable_data<float>();
  const float* dYdata = dY.data<float>();
  float* dXdata = dX->mutable_data<float>();
  TensorCPU padded_ratio(vector<TIndex>{C + size_ - 1, H, W});
  float* padded_ratio_data = padded_ratio.mutable_data<float>();
  // First, recompute scale_ exactly as in the forward pass, reusing
  // padded_ratio as scratch space for the padded squares.
  math::Set<float, CPUContext>(X.size(), bias_, scale_data, &context_);
  math::Set<float, CPUContext>(padded_ratio.size(), 0., padded_ratio_data,
                               &context_);
  const float alpha_over_size = alpha_ / size_;
  for (int n = 0; n < N; ++n) {
    // Compute the padded square of the input.
    math::Sqr<float, CPUContext>(image_size, Xdata + image_size * n,
                                 padded_ratio_data + pre_pad_ * H * W,
                                 &context_);
    // Create the scale for the first channel.
    for (int c = 0; c < size_; ++c) {
      math::Axpy<float, CPUContext>(
          H * W, alpha_over_size, padded_ratio_data + c * H * W,
          scale_data + image_size * n, &context_);
    }
    for (int c = 1; c < C; ++c) {
      float* this_scale_slice = scale_data + n * image_size + c * H * W;
      // Copy the previous scale slice.
      context_.Copy<float, CPUContext, CPUContext>(
          H * W, this_scale_slice - H * W, this_scale_slice);
      // Add the head of the window.
      math::Axpy<float, CPUContext>(
          H * W, alpha_over_size, padded_ratio_data + (c + size_ - 1) * H * W,
          this_scale_slice, &context_);
      // Subtract the tail of the window.
      math::Axpy<float, CPUContext>(
          H * W, -alpha_over_size, padded_ratio_data + (c - 1) * H * W,
          this_scale_slice, &context_);
    }
  }
  math::Set<float, CPUContext>(padded_ratio.size(), 0., padded_ratio_data,
                               &context_);
  TensorCPU accum_ratio(vector<TIndex>{H, W});
  float* accum_ratio_data = accum_ratio.mutable_data<float>();

  const float cache_ratio = 2. * alpha_ * beta_ / size_;
  const int inverse_pre_pad = size_ - (size_ + 1) / 2;
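  // Differentiating y_i = x_i * scale_i^(-beta), where scale_i depends on
  // x_j for every j in window(i), gives
  //   dx_i = dy_i * scale_i^(-beta)
  //          - (2 * alpha * beta / size) * x_i *
  //            sum_{j : i in window(j)} dy_j * y_j / scale_j.
  // cache_ratio caches the constant factor and padded_ratio holds the
  // per-element terms dy_j * y_j / scale_j; inverse_pre_pad re-centers the
  // window because the sum runs over the windows that contain i, not the
  // window centered at i.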
  int offset = 0;
  for (int n = 0; n < N; ++n) {
    // First, compute dY_j * Y_j / scale_j for every element of this image.
    math::Mul<float, CPUContext>(
        image_size, dYdata + offset, Ydata + offset,
        padded_ratio_data + inverse_pre_pad * H * W, &context_);
    math::Div<float, CPUContext>(
        image_size, padded_ratio_data + inverse_pre_pad * H * W,
        scale_data + offset,
        padded_ratio_data + inverse_pre_pad * H * W, &context_);
    // Now accumulate the ratios and compute the input gradient.
    math::Set<float, CPUContext>(accum_ratio.size(), 0., accum_ratio_data,
                                 &context_);
    for (int c = 0; c < size_ - 1; ++c) {
      math::Axpy<float, CPUContext>(H * W, 1,
                                    padded_ratio_data + c * H * W,
                                    accum_ratio_data, &context_);
    }
    for (int c = 0; c < C; ++c) {
      for (int hw = 0; hw < H * W; ++hw) {
        accum_ratio_data[hw] += padded_ratio_data[(c + size_ - 1) * H * W + hw];
        dXdata[offset] =
            dYdata[offset] * pow(scale_data[offset], -beta_) -
            cache_ratio * accum_ratio_data[hw] * Xdata[offset];
        accum_ratio_data[hw] -= padded_ratio_data[c * H * W + hw];
        ++offset;
      }
    }
  }
  return true;
}
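// The NHWC gradient applies the same formula with a scalar running sum over
// each row's contiguous channel window.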
template <>
bool LRNGradientOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
  auto& X = Input(0);
  auto& Y = Input(1);
  auto& dY = Input(2);
  auto* dX = Output(0);
  DCHECK_EQ(X.ndim(), 4);
  const int N = X.dim32(0);
  const int H = X.dim32(1);
  const int W = X.dim32(2);
  const int C = X.dim32(3);
  const int num_rows = N * H * W;
  const float* Xdata = X.data<float>();
  // Loosely check the sizes; assume the shapes match if the sizes do.
  DCHECK_EQ(X.size(), Y.size());
  DCHECK_EQ(X.size(), dY.size());
  dX->ResizeLike(X);
  if (!scale_) {
    scale_ = &local_scale_tensor_;
  }
  scale_->ResizeLike(X);
  TensorCPU padded_ratio(vector<TIndex>(1, C + size_ - 1));
  float* padded_ratio_data = padded_ratio.mutable_data<float>();
  float* scale_data = scale_->mutable_data<float>();
  // First, recompute scale_ exactly as in the forward pass.
  math::Set<float, CPUContext>(X.size(), bias_, scale_data, &context_);
  math::Set<float, CPUContext>(padded_ratio.size(), 0., padded_ratio_data,
                               &context_);
  const float alpha_over_size = alpha_ / size_;
  for (int n = 0; n < num_rows; ++n) {
    for (int c = 0; c < C; ++c) {
      padded_ratio_data[c + pre_pad_] =
          Xdata[n * C + c] * Xdata[n * C + c] * alpha_over_size;
    }
    float accum_scale = 0.;
    for (int i = 0; i < size_ - 1; ++i) {
      accum_scale += padded_ratio_data[i];
    }
    for (int c = 0; c < C; ++c) {
      accum_scale += padded_ratio_data[c + size_ - 1];
      scale_data[n * C + c] = bias_ + accum_scale;
      accum_scale -= padded_ratio_data[c];
    }
  }
  math::Set<float, CPUContext>(padded_ratio.size(), 0., padded_ratio_data,
                               &context_);
  // The constant ratio 2 * alpha * beta / size.
  const float cache_ratio = 2. * alpha_ * beta_ / size_;
  const float* Ydata = Y.data<float>();
  const float* dYdata = dY.data<float>();
  float* dXdata = dX->mutable_data<float>();
  for (int n = 0; n < num_rows; ++n) {
    const int offset = n * C;
    // Compute dY_j * Y_j / scale_j for this row.
    for (int c = 0; c < C; ++c) {
      padded_ratio_data[c + pre_pad_] =
          Ydata[offset + c] * dYdata[offset + c] / scale_data[offset + c];
    }
    float accum_ratio = 0.;
    for (int c = 0; c < size_ - 1; ++c) {
      accum_ratio += padded_ratio_data[c];
    }
    for (int c = 0; c < C; ++c) {
      accum_ratio += padded_ratio_data[c + size_ - 1];
      dXdata[offset + c] =
          dYdata[offset + c] * pow(scale_data[offset + c], -beta_) -
          cache_ratio * Xdata[offset + c] * accum_ratio;
      accum_ratio -= padded_ratio_data[c];
    }
  }
  return true;
}
REGISTER_CPU_OPERATOR(LRN, LRNOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(LRNGradient, LRNGradientOp<float, CPUContext>);
OPERATOR_SCHEMA(LRN).NumInputs(1).NumOutputs(1, 2).InheritOnnxSchema("LRN");
OPERATOR_SCHEMA(LRNGradient).NumInputs(3).NumOutputs(1);
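// The gradient maker binds LRNGradient to the forward input X = I(0), the
// forward output Y = O(0), and the output gradient GO(0), producing the
// input gradient GI(0).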
class GetLRNGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    return SingleGradientDef(
        "LRNGradient", "",
        vector<string>{I(0), O(0), GO(0)},
        vector<string>{GI(0)});
  }
};
REGISTER_GRADIENT(LRN, GetLRNGradient);

}  // namespace caffe2