// NOTE(review): this listing looks like a lossy extraction. The leading
// integers (1, 2, 4, ...) appear to be original line numbers, and several
// original lines (for example the declarations of v, i and prologue, and the
// closing braces) are not visible here.
//
// runNeonPrelu: computes out[k] = in[k] > 0 ? in[k] : in[k] * w for size
// floats, using NEON intrinsics for the bulk of the range and scalar loops
// for the leading and trailing elements.
//   out  - destination buffer (written with vst1q_f32 and scalar stores)
//   in   - source buffer (read with vld1q_f32_aligned in the vector loop)
//   size - number of floats to process
//   w    - slope multiplied into inputs that are not greater than zero
1 #include "caffe2/operators/prelu_op.h" 2 #include "caffe2/utils/math.h" 4 #include "caffe2/core/types.h" 5 #include "caffe2/utils/cpu_neon.h" 12 void runNeonPrelu(
float* out,
const float* in,
int size,
float w) {
// Broadcast 0.0f and w into every lane for the vector compare and multiply.
13 float32x4_t vZero = vdupq_n_f32(0.0f);
14 float32x4_t vW = vdupq_n_f32(w);
// Number of floats held by one float32x4_t (ratio of the two sizeof values).
16 constexpr
int kVecSizeInFloat =
sizeof(float32x4_t) /
sizeof(
float);
// Inputs shorter than one vector are processed by a scalar loop.
// NOTE(review): the statements closing this branch are not visible here.
18 if (size < kVecSizeInFloat) {
19 for (
int i = 0; i < size; ++i) {
21 out[i] = v > 0 ? v : v * w;
// Offset of the input address within a vector-sized window, converted from
// bytes to a float count. Presumably part of the prologue computation, whose
// declaration is not visible here.
31 (((uintptr_t) in) % (
sizeof(float32x4_t))) /
sizeof(
float);
// Scalar loop over the first prologue elements, ahead of the vector loop that
// uses the _aligned load variant.
36 for (; i < prologue; ++i) {
38 out[i] = v > 0 ? v : v * w;
// Each vector iteration handles kUnroll registers, that is
// kUnroll * kVecSizeInFloat floats.
43 constexpr
int kUnroll = 6;
44 constexpr
int kFloatsPerLoop = kUnroll * kVecSizeInFloat;
// Upper index bound for the unrolled loop: prologue plus the largest multiple
// of kFloatsPerLoop that fits in the elements remaining after the prologue.
46 int remainder = size - prologue;
47 int vectorizable = prologue + (remainder / kFloatsPerLoop) * kFloatsPerLoop;
49 for (; i < vectorizable; i += kFloatsPerLoop) {
// Load six consecutive vectors from in.
50 float32x4_t v0 = vld1q_f32_aligned(in + i + 0);
51 float32x4_t v1 = vld1q_f32_aligned(in + i + 4);
52 float32x4_t v2 = vld1q_f32_aligned(in + i + 8);
53 float32x4_t v3 = vld1q_f32_aligned(in + i + 12);
54 float32x4_t v4 = vld1q_f32_aligned(in + i + 16);
55 float32x4_t v5 = vld1q_f32_aligned(in + i + 20);
// Per-lane mask: set where the input is greater than zero.
57 uint32x4_t gz0 = vcgtq_f32(v0, vZero);
58 uint32x4_t gz1 = vcgtq_f32(v1, vZero);
59 uint32x4_t gz2 = vcgtq_f32(v2, vZero);
60 uint32x4_t gz3 = vcgtq_f32(v3, vZero);
61 uint32x4_t gz4 = vcgtq_f32(v4, vZero);
62 uint32x4_t gz5 = vcgtq_f32(v5, vZero);
// Scaled candidate (input * w) computed for every lane.
64 float32x4_t v0neg = vmulq_f32(v0, vW);
65 float32x4_t v1neg = vmulq_f32(v1, vW);
66 float32x4_t v2neg = vmulq_f32(v2, vW);
67 float32x4_t v3neg = vmulq_f32(v3, vW);
68 float32x4_t v4neg = vmulq_f32(v4, vW);
69 float32x4_t v5neg = vmulq_f32(v5, vW);
// Select per lane: the original value where the mask is set, the scaled value
// otherwise.
72 v0 = vbslq_f32(gz0, v0, v0neg);
73 v1 = vbslq_f32(gz1, v1, v1neg);
74 v2 = vbslq_f32(gz2, v2, v2neg);
75 v3 = vbslq_f32(gz3, v3, v3neg);
76 v4 = vbslq_f32(gz4, v4, v4neg);
77 v5 = vbslq_f32(gz5, v5, v5neg);
// Store the six result vectors; the plain vst1q_f32 store is used for out,
// unlike the _aligned loads used for in.
79 vst1q_f32(out + i + 0, v0);
80 vst1q_f32(out + i + 4, v1);
81 vst1q_f32(out + i + 8, v2);
82 vst1q_f32(out + i + 12, v3);
83 vst1q_f32(out + i + 16, v4);
84 vst1q_f32(out + i + 20, v5);
// Scalar loop for the elements the unrolled loop did not cover.
87 for (; i < size; ++i) {
89 out[i] = v > 0 ? v : v * w;
// PReluOp<float, CPUContext>::RunOnDevice: forward pass. Reads X = Input(0)
// and the slope tensor W = Input(1) and writes the result through Ydata.
// NOTE(review): lossy listing. The declaration of Y, the branch on C_shared,
// the switch on order_ and the preprocessor guards that choose between the
// NEON and Eigen paths are not visible here.
94 #endif // __ARM_NEON__ 97 bool PReluOp<float, CPUContext>::RunOnDevice() {
98 const auto& X = Input(0);
99 const auto& W = Input(1);
102 const auto* Xdata = X.template data<float>();
103 const auto* Wdata = W.template data<float>();
104 auto* Ydata = Y->template mutable_data<float>();
// C is the channel count: dim 1 for NCHW, otherwise the last dim of X.
106 const auto C = order_ == StorageOrder::NCHW ? X.dim(1) : X.dim(X.ndim() - 1);
// A single-element W means one slope is shared by every channel.
107 const auto C_shared = (W.size() == 1);
// Requires one slope per channel.
// NOTE(review): presumably only checked when the slope is not shared; the
// enclosing condition is not visible here.
110 CAFFE_ENFORCE_EQ(C, W.size());
// Shared slope: a single pass over all of X using Wdata[0], either through
// the NEON helper or through the Eigen expression max(x, 0) + min(x, 0) * w.
116 runNeonPrelu(Ydata, Xdata, X.size(), Wdata[0]);
118 ConstEigenVectorMap<float> Xvec(Xdata, X.size());
119 EigenVectorMap<float> Yvec(Ydata, Y->size());
120 Yvec = Xvec.cwiseMax(0.f) + Xvec.cwiseMin(0.f) * Wdata[0];
121 #endif // __ARM_NEON__ 127 case StorageOrder::NCHW: {
// N is the leading dim; dim is used below as the element count of one (n, c)
// slice, since slice offsets are computed as (n * C + c) * dim.
128 const auto N = X.dim(0);
129 const auto dim = X.size_from_dim(2);
// NEON path: one helper call per (n, c) slice. The remaining call arguments
// are not visible here.
133 for (
int n = 0; n < N; ++n) {
134 for (
int c = 0; c < C; ++c) {
135 runNeonPrelu(Ydata + (n * C + c) * dim,
136 Xdata + (n * C + c) * dim,
// Eigen path: same expression as the shared case, with the slope Wdata[c].
// NOTE(review): nc is used as the slice index but its declaration and update
// are not visible here.
142 for (
int n = 0; n < N; ++n) {
143 for (
int c = 0; c < C; ++c) {
144 ConstEigenVectorMap<float> Xvec(Xdata + nc * dim, dim);
145 EigenVectorMap<float>(Ydata + nc * dim, dim) =
146 Xvec.cwiseMax(0.f) + Xvec.cwiseMin(0.f) * Wdata[c];
153 case StorageOrder::NHWC: {
// NHWC: map X and Y as C-by-NHW arrays. Entries greater than zero pass
// through; all others are multiplied by the per-channel slope vector Wvec.
155 const auto NHW = X.size() / C;
156 ConstEigenArrayMap<float> Xmat(Xdata, C, NHW);
157 ConstEigenVectorArrayMap<float> Wvec(Wdata, C);
158 EigenArrayMap<float> Ymat(Ydata, C, NHW);
159 Ymat = (Xmat > 0).select(Xmat, Xmat.colwise() * Wvec);
// Any other storage order is rejected.
163 CAFFE_THROW(
"Unknown storage order: ", order_);
// PReluGradientOp<float, CPUContext>::RunOnDevice: backward pass. Writes
// dX = Output(0) and dW = Output(1) from Y, dY, X and W.
// NOTE(review): lossy listing. The declarations of Y, dY, X and W, the
// conditions that choose between the alternative assignments below, and the
// statements that fill dWvec in the NHWC case are not visible here.
169 bool PReluGradientOp<float, CPUContext>::RunOnDevice() {
// Compares the addresses of the Y and X tensor objects to reject in-place use.
175 CAFFE_ENFORCE(&Y != &X,
"Cannot backpropagate through an in-place PReLU");
176 auto* dX = Output(0);
177 auto* dW = Output(1);
// NOTE(review): DCHECK_EQ is presumably a debug-only check - confirm whether
// a size mismatch is caught in release builds.
179 DCHECK_EQ(dY.size(), Y.size());
// C is the channel count: dim 1 for NCHW, otherwise the last dim of X.
183 const auto C = order_ == StorageOrder::NCHW ? X.dim(1) : X.dim(X.ndim() - 1);
// A single-element W means one slope is shared by every channel.
184 const auto C_shared = (W.size() == 1);
186 const float* Ydata = Y.data<
float>();
187 const float* dYdata = dY.data<
float>();
188 const float* Xdata = X.data<
float>();
189 const float* Wdata = W.data<
float>();
190 float* dXdata = dX->mutable_data<
float>();
191 float* dWdata = dW->mutable_data<
float>();
195 case StorageOrder::NCHW: {
// With a shared slope, dividing by C maps every channel index (0 .. C-1) to
// slot 0; otherwise div_factor is 1 and the channel index is used as is.
196 const auto dim = X.size_from_dim(2);
197 const auto div_factor = C_shared ? C : 1;
// Loop over the W.size() slope entries; the body is not visible here
// (presumably it resets dWdata[c] before the accumulation below).
198 for (
auto c = 0; c < W.size(); ++c) {
// Accumulate dW: the channel of flat index i is (i / dim) % C.
// NOTE(review): the accumulation is presumably guarded by a condition on
// Xdata[i] that is not visible here.
// NOTE(review): i is an int compared against Y.size(); confirm that tensor
// sizes cannot exceed the int range.
202 for (
int i = 0; i < Y.size(); ++i) {
204 int c = (i / dim) % C / div_factor;
205 dWdata[c] += dYdata[i] * Xdata[i];
// dX: two alternative assignments, either dY unchanged or dY scaled by the
// slope of the channel. The condition choosing between them is not visible
// here (presumably the sign of Xdata[i]).
209 for (
int i = 0; i < Y.size(); ++i) {
211 dXdata[i] = dYdata[i];
213 int c = (i / dim) % C / div_factor;
214 dXdata[i] = Wdata[c] * dYdata[i];
219 case StorageOrder::NHWC: {
// NHWC: map the flat buffers as C-by-NHW arrays and W, dW as vectors.
220 const auto NHW = X.size() / C;
221 ConstEigenVectorArrayMap<float> Wvec(Wdata, W.size());
222 EigenVectorArrayMap<float> dWvec(dWdata, dW->size());
224 ConstEigenArrayMap<float> Ymat(Ydata, C, NHW);
225 ConstEigenArrayMap<float> dYmat(dYdata, C, NHW);
226 ConstEigenArrayMap<float> Xmat(Xdata, C, NHW);
227 EigenArrayMap<float> dXmat(dXdata, C, NHW);
// Single-slope form: dX = dY where X > 0, otherwise dY * Wdata[0].
230 dXmat = (Xmat > 0).select(dYmat, dYmat * Wdata[0]);
// Per-channel form: dX = dY where X > 0, otherwise dY scaled column by column
// with Wvec.
238 dXmat = (Xmat > 0).select(dYmat, dYmat.colwise() * Wvec);
// Any other storage order is rejected.
249 CAFFE_THROW(
"Unknown storage order: ", order_);
// Register the float CPU implementations under the operator names PRelu and
// PReluGradient.
255 REGISTER_CPU_OPERATOR(PRelu, PReluOp<float, CPUContext>);
256 REGISTER_CPU_OPERATOR(PReluGradient, PReluGradientOp<float, CPUContext>);
// Schema for the PRelu operator: output 0 may be computed in place over input
// 0 (AllowInplace) and is declared via IdenticalTypeAndShapeOfInput(0).
// Fixes in the description strings: a space is added between the two adjacent
// slope-description literals (they previously concatenated without one), and
// output Y is now described as an output tensor rather than an input tensor.
// NOTE(review): lossy listing. The input/output counts, the wrapper around
// the description text and the start of the slope Input(...) call are not
// visible here.
259 OPERATOR_SCHEMA(PRelu)
262 .AllowInplace({{0, 0}})
263 .IdenticalTypeAndShapeOfInput(0)
266 PRelu takes input data (Tensor<T>) and slope tensor as input, and produces one 267 output data (Tensor<T>) where the function `f(x) = slope * x for x < 0`, 268 `f(x) = x for x >= 0`., is applied to the data tensor elementwise. 271 .Input(0, "X",
"1D input tensor")
275 "1D slope tensor. If `Slope` is of size 1, the value is shared " 276 "across different channels")
277 .Output(0,
"Y",
"1D output tensor")
278 .InheritOnnxSchema(
"PRelu");
// Schema for PReluGradient (4 inputs, 2 outputs), followed by a fragment of
// the gradient maker. GetGradientDefs names an operator of type def_.type()
// with the suffix Gradient, taking {O(0), GO(0), I(0), I(1)} as inputs and
// producing {GI(0), GI(1)}.
// NOTE(review): lossy listing. The end of the schema description, the class
// header of the gradient maker and the call that receives these arguments
// (presumably SingleGradientDef) are not visible here.
281 OPERATOR_SCHEMA(PReluGradient).NumInputs(4).NumOutputs(2).SetDoc(R
"DOC( 283 PReluGradient takes both Y and dY and uses this to update dX and dW according 284 to the chain rule and derivatives of the rectified linear function. 289 using GradientMakerBase::GradientMakerBase;
290 vector<OperatorDef> GetGradientDefs()
override {
292 def_.type() +
"Gradient",
294 vector<string>{O(0), GO(0), I(0), I(1)},
295 vector<string>{GI(0), GI(1)});
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...
static vector< OperatorDef > SingleGradientDef(const Args &...args)
a helper function to allow one to create one single operator def, which is usually the case for many ...