Caffe2 - C++ API
A deep learning, cross platform ML framework
prelu_op.cc
1 #include "caffe2/operators/prelu_op.h"
2 #include "caffe2/utils/math.h"
3 
4 #include "caffe2/core/types.h"
5 #include "caffe2/utils/cpu_neon.h"
6 
7 namespace caffe2 {
8 
9 #ifdef __ARM_NEON__
10 namespace {
11 
12 void runNeonPrelu(float* out, const float* in, int size, float w) {
13  float32x4_t vZero = vdupq_n_f32(0.0f);
14  float32x4_t vW = vdupq_n_f32(w);
15 
16  constexpr int kVecSizeInFloat = sizeof(float32x4_t) / sizeof(float);
17 
18  if (size < kVecSizeInFloat) {
19  for (int i = 0; i < size; ++i) {
20  float v = in[i];
21  out[i] = v > 0 ? v : v * w;
22  }
23 
24  return;
25  }
26 
27  // We want to load aligned from the input, but assume the output is unaligned
28  int prologue =
29  kVecSizeInFloat -
30  // remainder in floats
31  (((uintptr_t) in) % (sizeof(float32x4_t))) / sizeof(float);
32 
33  int i = 0;
34 
35  // Prologue loop
36  for (; i < prologue; ++i) {
37  float v = in[i];
38  out[i] = v > 0 ? v : v * w;
39  }
40 
41  // The loop is manually unrolled by 6; seems to be the limit for
42  // armv7 to avoid register spills
43  constexpr int kUnroll = 6;
44  constexpr int kFloatsPerLoop = kUnroll * kVecSizeInFloat;
45 
46  int remainder = size - prologue;
47  int vectorizable = prologue + (remainder / kFloatsPerLoop) * kFloatsPerLoop;
48 
49  for (; i < vectorizable; i += kFloatsPerLoop) {
50  float32x4_t v0 = vld1q_f32_aligned(in + i + 0);
51  float32x4_t v1 = vld1q_f32_aligned(in + i + 4);
52  float32x4_t v2 = vld1q_f32_aligned(in + i + 8);
53  float32x4_t v3 = vld1q_f32_aligned(in + i + 12);
54  float32x4_t v4 = vld1q_f32_aligned(in + i + 16);
55  float32x4_t v5 = vld1q_f32_aligned(in + i + 20);
56 
57  uint32x4_t gz0 = vcgtq_f32(v0, vZero);
58  uint32x4_t gz1 = vcgtq_f32(v1, vZero);
59  uint32x4_t gz2 = vcgtq_f32(v2, vZero);
60  uint32x4_t gz3 = vcgtq_f32(v3, vZero);
61  uint32x4_t gz4 = vcgtq_f32(v4, vZero);
62  uint32x4_t gz5 = vcgtq_f32(v5, vZero);
63 
64  float32x4_t v0neg = vmulq_f32(v0, vW);
65  float32x4_t v1neg = vmulq_f32(v1, vW);
66  float32x4_t v2neg = vmulq_f32(v2, vW);
67  float32x4_t v3neg = vmulq_f32(v3, vW);
68  float32x4_t v4neg = vmulq_f32(v4, vW);
69  float32x4_t v5neg = vmulq_f32(v5, vW);
70 
71  // v0 > 0 ? v0 : v0 * w
72  v0 = vbslq_f32(gz0, v0, v0neg);
73  v1 = vbslq_f32(gz1, v1, v1neg);
74  v2 = vbslq_f32(gz2, v2, v2neg);
75  v3 = vbslq_f32(gz3, v3, v3neg);
76  v4 = vbslq_f32(gz4, v4, v4neg);
77  v5 = vbslq_f32(gz5, v5, v5neg);
78 
79  vst1q_f32(out + i + 0, v0);
80  vst1q_f32(out + i + 4, v1);
81  vst1q_f32(out + i + 8, v2);
82  vst1q_f32(out + i + 12, v3);
83  vst1q_f32(out + i + 16, v4);
84  vst1q_f32(out + i + 20, v5);
85  }
86 
87  for (; i < size; ++i) {
88  float v = in[i];
89  out[i] = v > 0 ? v : v * w;
90  }
91 }
92 
93 }
94 #endif // __ARM_NEON__
95 
96 template <>
97 bool PReluOp<float, CPUContext>::RunOnDevice() {
98  const auto& X = Input(0);
99  const auto& W = Input(1);
100  auto* Y = Output(0);
101  Y->ResizeLike(X);
102  const auto* Xdata = X.template data<float>();
103  const auto* Wdata = W.template data<float>();
104  auto* Ydata = Y->template mutable_data<float>();
105 
106  const auto C = order_ == StorageOrder::NCHW ? X.dim(1) : X.dim(X.ndim() - 1);
107  const auto C_shared = (W.size() == 1);
108 
109  if (!C_shared) {
110  CAFFE_ENFORCE_EQ(C, W.size());
111  }
112 
113  if (C_shared) {
114 #ifdef __ARM_NEON__
115  // The function is completely pointwise
116  runNeonPrelu(Ydata, Xdata, X.size(), Wdata[0]);
117 #else
118  ConstEigenVectorMap<float> Xvec(Xdata, X.size());
119  EigenVectorMap<float> Yvec(Ydata, Y->size());
120  Yvec = Xvec.cwiseMax(0.f) + Xvec.cwiseMin(0.f) * Wdata[0];
121 #endif // __ARM_NEON__
122  return true;
123  }
124 
125  // non-shared case.
126  switch (order_) {
127  case StorageOrder::NCHW: {
128  const auto N = X.dim(0);
129  const auto dim = X.size_from_dim(2);
130 
131 #ifdef __ARM_NEON__
132  // Pointwise for each channel
133  for (int n = 0; n < N; ++n) {
134  for (int c = 0; c < C; ++c) {
135  runNeonPrelu(Ydata + (n * C + c) * dim,
136  Xdata + (n * C + c) * dim,
137  dim, Wdata[c]);
138  }
139  }
140 #else
141  int nc = 0;
142  for (int n = 0; n < N; ++n) {
143  for (int c = 0; c < C; ++c) {
144  ConstEigenVectorMap<float> Xvec(Xdata + nc * dim, dim);
145  EigenVectorMap<float>(Ydata + nc * dim, dim) =
146  Xvec.cwiseMax(0.f) + Xvec.cwiseMin(0.f) * Wdata[c];
147  nc++;
148  }
149  }
150 #endif
151  break;
152  }
153  case StorageOrder::NHWC: {
154  // Lay out matrix as (NHW, C) and multiply by C
155  const auto NHW = X.size() / C;
156  ConstEigenArrayMap<float> Xmat(Xdata, C, NHW);
157  ConstEigenVectorArrayMap<float> Wvec(Wdata, C);
158  EigenArrayMap<float> Ymat(Ydata, C, NHW);
159  Ymat = (Xmat > 0).select(Xmat, Xmat.colwise() * Wvec);
160  break;
161  }
162  default:
163  CAFFE_THROW("Unknown storage order: ", order_);
164  }
165  return true;
166 }
167 
168 template <>
169 bool PReluGradientOp<float, CPUContext>::RunOnDevice() {
170  auto& Y = Input(0);
171  auto& dY = Input(1);
172  auto& X = Input(2);
173  auto& W = Input(3);
174 
175  CAFFE_ENFORCE(&Y != &X, "Cannot backpropagate through an in-place PReLU");
176  auto* dX = Output(0);
177  auto* dW = Output(1);
178 
179  DCHECK_EQ(dY.size(), Y.size());
180  dX->ResizeLike(Y);
181  dW->ResizeLike(W);
182 
183  const auto C = order_ == StorageOrder::NCHW ? X.dim(1) : X.dim(X.ndim() - 1);
184  const auto C_shared = (W.size() == 1);
185 
186  const float* Ydata = Y.data<float>();
187  const float* dYdata = dY.data<float>();
188  const float* Xdata = X.data<float>();
189  const float* Wdata = W.data<float>();
190  float* dXdata = dX->mutable_data<float>();
191  float* dWdata = dW->mutable_data<float>();
192 
193  // non-shared case.
194  switch (order_) {
195  case StorageOrder::NCHW: {
196  const auto dim = X.size_from_dim(2);
197  const auto div_factor = C_shared ? C : 1;
198  for (auto c = 0; c < W.size(); ++c) {
199  dWdata[c] = 0;
200  }
201 
202  for (int i = 0; i < Y.size(); ++i) {
203  if (Xdata[i] <= 0) {
204  int c = (i / dim) % C / div_factor;
205  dWdata[c] += dYdata[i] * Xdata[i];
206  }
207  }
208 
209  for (int i = 0; i < Y.size(); ++i) {
210  if (Xdata[i] > 0) {
211  dXdata[i] = dYdata[i];
212  } else {
213  int c = (i / dim) % C / div_factor;
214  dXdata[i] = Wdata[c] * dYdata[i];
215  }
216  }
217  break;
218  }
219  case StorageOrder::NHWC: {
220  const auto NHW = X.size() / C;
221  ConstEigenVectorArrayMap<float> Wvec(Wdata, W.size());
222  EigenVectorArrayMap<float> dWvec(dWdata, dW->size());
223 
224  ConstEigenArrayMap<float> Ymat(Ydata, C, NHW);
225  ConstEigenArrayMap<float> dYmat(dYdata, C, NHW);
226  ConstEigenArrayMap<float> Xmat(Xdata, C, NHW);
227  EigenArrayMap<float> dXmat(dXdata, C, NHW);
228 
229  if (C_shared) {
230  dXmat = (Xmat > 0).select(dYmat, dYmat * Wdata[0]);
231  dWdata[0] =
232  (Xmat > 0)
233  .select(
234  Xmat.cwiseMin(0.0f), // zero gradients on the 'if' path.
235  dYmat * Xmat)
236  .sum();
237  } else {
238  dXmat = (Xmat > 0).select(dYmat, dYmat.colwise() * Wvec);
239  dWvec = (Xmat > 0)
240  .select(
241  Xmat.cwiseMin(0.0f), // zero gradients on the 'if' path.
242  dYmat * Xmat)
243  .rowwise()
244  .sum();
245  }
246  break;
247  }
248  default:
249  CAFFE_THROW("Unknown storage order: ", order_);
250  }
251 
252  return true;
253 }
254 
255 REGISTER_CPU_OPERATOR(PRelu, PReluOp<float, CPUContext>);
256 REGISTER_CPU_OPERATOR(PReluGradient, PReluGradientOp<float, CPUContext>);
257 
258 // Input: X, Slope, output: Y
259 OPERATOR_SCHEMA(PRelu)
260  .NumInputs(2)
261  .NumOutputs(1)
262  .AllowInplace({{0, 0}})
263  .IdenticalTypeAndShapeOfInput(0)
264  .SetDoc(R"DOC(
265 
266 PRelu takes input data (Tensor<T>) and slope tensor as input, and produces one
267 output data (Tensor<T>) where the function `f(x) = slope * x for x < 0`,
268 `f(x) = x for x >= 0`., is applied to the data tensor elementwise.
269 
270 )DOC")
271  .Input(0, "X", "1D input tensor")
272  .Input(
273  1,
274  "Slope",
275  "1D slope tensor. If `Slope` is of size 1, the value is shared"
276  "across different channels")
277  .Output(0, "Y", "1D input tensor")
278  .InheritOnnxSchema("PRelu");
279 
280 // Input: Y, dY, output: dX
281 OPERATOR_SCHEMA(PReluGradient).NumInputs(4).NumOutputs(2).SetDoc(R"DOC(
282 
283 PReluGradient takes both Y and dY and uses this to update dX and dW according
284 to the chain rule and derivatives of the rectified linear function.
285 
286 )DOC");
287 
289  using GradientMakerBase::GradientMakerBase;
290  vector<OperatorDef> GetGradientDefs() override {
291  return SingleGradientDef(
292  def_.type() + "Gradient",
293  "",
294  vector<string>{O(0), GO(0), I(0), I(1)},
295  vector<string>{GI(0), GI(1)});
296  }
297 };
298 REGISTER_GRADIENT(PRelu, GetPReluGradient);
299 
300 } // namespace caffe2
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...
static vector< OperatorDef > SingleGradientDef(const Args &...args)
a helper function to allow one to create one single operator def, which is usually the case for many ...