Caffe2 - C++ API
A deep learning, cross-platform ML framework
lars_op.cc
#include "caffe2/sgd/lars_op.h"
#include <math.h>
#include "caffe2/utils/math.h"

namespace caffe2 {

template <>
void LarsOp<float, CPUContext>::Compute(
    TIndex N,
    const float* X_data,
    const float* dX_data,
    float offset,
    float* lr_rescale_data) {
  *lr_rescale_data = 1.0;

  // L2 norm of the parameter tensor X.
  float X_norm =
      sqrtf((ConstEigenVectorMap<float>(X_data, N).array()).square().sum());

  if (X_norm > 0) {
    // L2 norm of the gradient tensor dX.
    float dX_norm =
        sqrtf((ConstEigenVectorMap<float>(dX_data, N).array()).square().sum());
    // Rescaling factor: 1 / (norm(dX) / norm(X) + offset).
    *lr_rescale_data /= (dX_norm / X_norm + offset);
  }
}

REGISTER_CPU_OPERATOR(Lars, LarsOp<float, CPUContext>);

OPERATOR_SCHEMA(Lars)
    .NumInputs(2)
    .NumOutputs(1)
    .SetDoc(R"DOC(
Implements Layer-wise Adaptive Rate Scaling (LARS) as described in
https://arxiv.org/abs/1708.03888. Without weight decay, given a global
learning rate lr, a parameter tensor X, and its gradient dX, the local
learning rate for X is

    local_lr = lr * norm(X) / ( norm(dX) + offset * norm(X) )

             = lr / ( norm(dX) / norm(X) + offset ),

where offset is a preset hyperparameter that avoids numerical issues.
This implementation uses the L2 norm and outputs the rescaling factor

    1 / ( norm(dX) / norm(X) + offset ).

)DOC")
    .Input(0, "X", "Parameter tensor")
    .Input(1, "dX", "Gradient tensor")
    .Output(0, "lr_rescale", "Local learning rate rescaling factor")
    .Arg("offset", "Rescaling offset parameter");

SHOULD_NOT_DO_GRADIENT(Lars);
} // namespace caffe2
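
Note that the operator emits only the rescaling factor; the caller multiplies it by the global learning rate to obtain the per-layer local_lr. The following is a minimal standalone sketch of that composition in plain C++, outside Caffe2; l2_norm and lars_rescale are illustrative names, not part of the Caffe2 API, and the worked values in main() are made up for demonstration.

#include <cmath>
#include <cstddef>
#include <iostream>

// L2 norm of a flat float buffer (illustrative helper, not Caffe2 code).
float l2_norm(const float* v, std::size_t n) {
  float s = 0.f;
  for (std::size_t i = 0; i < n; ++i) {
    s += v[i] * v[i];
  }
  return std::sqrt(s);
}

// Returns 1 / (norm(dX) / norm(X) + offset), or 1 when norm(X) == 0,
// mirroring the CPU kernel above.
float lars_rescale(const float* X, const float* dX, std::size_t n, float offset) {
  float rescale = 1.0f;
  float X_norm = l2_norm(X, n);
  if (X_norm > 0) {
    float dX_norm = l2_norm(dX, n);
    rescale /= (dX_norm / X_norm + offset);
  }
  return rescale;
}

int main() {
  // Worked example: norm(X) = 5, norm(dX) = 1, offset = 0.1,
  // so rescale = 1 / (1/5 + 0.1) = 1 / 0.3 ~ 3.333.
  float X[] = {3.f, 4.f, 0.f};
  float dX[] = {0.6f, 0.8f, 0.f};
  float rescale = lars_rescale(X, dX, 3, 0.1f);
  float lr = 0.01f;               // global learning rate
  float local_lr = lr * rescale;  // effective per-layer learning rate
  std::cout << "rescale = " << rescale << ", local_lr = " << local_lr << "\n";
}

Because the factor shrinks as norm(dX) grows relative to norm(X), layers with proportionally large gradients get a smaller effective step, which is the layer-wise adaptation LARS is designed to provide.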