1 #include "caffe2/sgd/lars_op.h" 3 #include "caffe2/utils/math.h" 8 void LarsOp<float, CPUContext>::Compute(
13 float* lr_rescale_data) {
14 *lr_rescale_data = 1.0;
17 sqrtf((ConstEigenVectorMap<float>(X_data, N).array()).square().sum());
21 sqrtf((ConstEigenVectorMap<float>(dX_data, N).array()).square().sum());
22 *lr_rescale_data /= (dX_norm / X_norm + offset);
26 REGISTER_CPU_OPERATOR(Lars, LarsOp<float, CPUContext>);
OPERATOR_SCHEMA(Lars)
    .NumInputs(2)
    .NumOutputs(1)
    .SetDoc(R"DOC(
Implement Layer-wise Adaptive Rate Scaling (LARS) as in
https://arxiv.org/abs/1708.03888. Without weight decay, given a global
learning rate lr, parameter tensor X and its gradient dX, the local learning
rate for X will be

local_lr = lr * norm(X) / ( norm(dX) + offset * norm(X) )

         = lr / ( norm(dX) / norm(X) + offset ),

where offset is a preset hyper-parameter to avoid numerical issues.
In this implementation, we use the l2 norm and output the rescaling factor

1 / ( norm(dX) / norm(X) + offset ).

)DOC")
    .Input(0, "X", "Parameter tensor")
    .Input(1, "dX", "Gradient tensor")
    .Output(0, "lr_rescale", "Local learning rate rescaling factor")
    .Arg("offset", "rescaling offset parameter");
SHOULD_NOT_DO_GRADIENT(Lars);

} // namespace caffe2
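// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the operator above: a self-contained
// re-derivation of the rescaling factor that Compute() writes out, using raw
// Eigen instead of the Caffe2 wrappers. The function name LarsRescale is
// hypothetical and exists only for this example.

#include <cmath>

#include <Eigen/Core>

float LarsRescale(const float* X, const float* dX, int N, float offset) {
  // l2 norm of the parameter tensor.
  float X_norm = std::sqrt(
      Eigen::Map<const Eigen::VectorXf>(X, N).array().square().sum());
  if (X_norm == 0.f) {
    // Degenerate parameter tensor: keep the global learning rate unchanged.
    return 1.f;
  }
  // l2 norm of the gradient tensor.
  float dX_norm = std::sqrt(
      Eigen::Map<const Eigen::VectorXf>(dX, N).array().square().sum());
  return 1.f / (dX_norm / X_norm + offset);
}

// Usage: multiply the returned factor into the global learning rate, e.g.
//   float local_lr = lr * LarsRescale(X, dX, N, 0.1f);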