from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from collections import namedtuple, defaultdict
from past.builtins import basestring

import numpy as np

from caffe2.python import core, scope, workspace
from caffe2.python.modeling import parameter_info
from caffe2.proto import caffe2_pb2
_OPTIMIZER_ITERATION_NAME = "optimizer_iteration"
_LEARNING_RATE_INJECTION = "lr_injection"

AuxOptimizerParams = namedtuple("AuxOptimizerParams", ["local", "shared"])
_optimizer_instance_count = defaultdict(int)


class Optimizer(object):
    def __init__(self):
        self._aux_params = AuxOptimizerParams(local=[], shared=[])
        # Each optimizer instance gets a distinct count so that blob names
        # created by different instances do not collide.
        self._instance_num = _optimizer_instance_count[self.__class__.__name__]
        _optimizer_instance_count[self.__class__.__name__] += 1
        self._lr_multiplier = None
        self._lr_multiplier_on_gpu = False

    '''
    Adds optimization operators to the net for a given parameter and its
    gradient. The parameter is specified either by 'param' being a
    ParameterInfo object (in which case param.grad has to be set), or by
    'param' being a BlobReference and 'grad' being a BlobReference for its
    gradient.
    '''
    def __call__(self, net, param_init_net, param, grad=None):
        if grad is None:
            assert isinstance(param, parameter_info.ParameterInfo), (
                "Expected parameter to be of type ParameterInfo, got {}".format(
                    param))
            assert param.grad is not None
        else:
            if isinstance(param, basestring):
                param = core.BlobReference(param)
            param = parameter_info.ParameterInfo(
                param_id=None, param=param, grad=grad)

        self._run(net, param_init_net, param)

    def _run(self, net, param_init_net, param_info):
        raise Exception("Not Implemented")

    def get_cpu_blob_name(self, base_str, node_name=''):
        classname = self.__class__.__name__
        return '%s_%d_%s%s_cpu' % (classname, self._instance_num, base_str, node_name)

    def get_gpu_blob_name(self, base_str, gpu_id, node_name):
        classname = self.__class__.__name__
        return '%s_%d_%s%s_gpu%d' % (
            classname, self._instance_num, base_str, node_name, gpu_id,
        )

    def make_unique_blob_name(self, base_str):
        """
        Returns a blob name that will be unique to the current device
        and optimizer instance.
        """
        current_scope = scope.CurrentDeviceScope()
        if current_scope is None:
            return self.get_cpu_blob_name(base_str)

        if current_scope.device_type == caffe2_pb2.CUDA:
            return self.get_gpu_blob_name(
                base_str, current_scope.cuda_gpu_id, current_scope.node_name
            )
        else:
            return self.get_cpu_blob_name(base_str, current_scope.node_name)

    def build_lr(self, net, param_init_net, base_learning_rate,
                 learning_rate_blob=None, policy="fixed",
                 iter_val=0, **kwargs):
        if learning_rate_blob is None:
            learning_rate_blob = self.make_unique_blob_name('lr')

        optimization_iter_blob = _OPTIMIZER_ITERATION_NAME
        if not param_init_net.BlobIsDefined(optimization_iter_blob):
            # The iteration counter lives on CPU; AtomicIter bumps it once
            # per step under a mutex.
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                iteration = param_init_net.ConstantFill(
                    [], optimization_iter_blob, shape=[1],
                    value=iter_val,
                    dtype=core.DataType.INT64)
                iter_mutex = param_init_net.CreateMutex(
                    [], ["iteration_mutex"]
                )
                net.AtomicIter([iter_mutex, iteration], [iteration])
        else:
            iteration = param_init_net.GetBlobRef(optimization_iter_blob)

        if not net.BlobIsDefined(learning_rate_blob):
            # Since we are minimizing, we do "descent", so the learning rate
            # passed to the op is negated.
            lr = net.LearningRate(
                [iteration],
                learning_rate_blob,
                base_lr=-base_learning_rate,
                policy=policy,
                **kwargs
            )
        else:
            lr = net.GetBlobRef(learning_rate_blob)

        if self._lr_multiplier is not None:
            current_scope = scope.CurrentDeviceScope()
            if (current_scope is not None
                    and current_scope.device_type == caffe2_pb2.CUDA
                    and not self._lr_multiplier_on_gpu):
                # The multiplier was fed on CPU; copy it next to the GPU lr
                # blob before multiplying.
                lr_multiplier = net.CopyFromCPUInput(
                    self._lr_multiplier,
                    self.make_unique_blob_name('lr_multiplier')
                )
            else:
                lr_multiplier = self._lr_multiplier
            lr = net.Mul(
                [lr, lr_multiplier],
                self.make_unique_blob_name('scaled_lr'),
                broadcast=1,
            )

        return lr

    def add_lr_multiplier(self, lr_multiplier, is_gpu_blob=False):
        self._lr_multiplier = lr_multiplier
        self._lr_multiplier_on_gpu = is_gpu_blob

    @staticmethod
    def dedup(net, sparse_dedup_aggregator, grad):
        assert isinstance(grad, core.GradientSlice), (
            "Dedup only works for sparse gradient, got {}".format(grad))
        if sparse_dedup_aggregator:
            return net.DeduplicateGradientSlices(
                grad, aggregator=sparse_dedup_aggregator)
        else:
            return grad
155 """Returns a list of auxiliary parameters. 158 aux_params: A namedtuple, AuxParams. 160 aux_params.local stores a list of blobs. Each blob is a local 161 auxiliary parameter. A local auxiliary parameter is a parameter in 162 parallel to a learning rate parameter. Take adagrad as an example, 163 the local auxiliary parameter is the squared sum parameter, because 164 every learning rate has a squared sum associated with it. 166 aux_params.shared also stores a list of blobs. Each blob is a shared 167 auxiliary parameter. A shared auxiliary parameter is a parameter 168 that is shared across all the learning rate parameters. Take adam as 169 an example, the iteration parameter is a shared parameter, because 170 all the learning rates share the same iteration parameter. 182 def scale_learning_rate(self, *args, **kwargs):
183 raise NotImplementedError(
184 "Optimizer Need to Implement `scale_learning_rate` method.")


class SgdOptimizer(Optimizer):
    def __init__(self, base_learning_rate=0.01, policy='fixed',
                 momentum=0.0, nesterov=1, sparse_dedup_aggregator=None,
                 lars=None, **kwargs):
        super(SgdOptimizer, self).__init__()
        self.base_learning_rate = base_learning_rate
        self.policy = policy
        self.momentum = momentum
        self.nesterov = nesterov
        self.sparse_dedup_aggregator = sparse_dedup_aggregator
        self.lars = lars
        self.init_kwargs = kwargs

    def _run(self, net, param_init_net, param_info):
        param = param_info.blob
        grad = param_info.grad
        if self.base_learning_rate == 0:
            return
        assert self.base_learning_rate > 0, (
            "Expect positive base learning rate, got {}".format(
                self.base_learning_rate))

        # LARS is not applied to sparse gradients here.
        if self.lars is not None and not isinstance(grad, core.GradientSlice):
            assert self.lars >= 0, (
                'Lars offset must be nonnegative, got {}'.format(self.lars))
            lr_lars_multiplier = net.Lars(
                [param, grad],
                self.make_unique_blob_name(str(param) + "_lars"),
                offset=self.lars)
            current_scope = scope.CurrentDeviceScope()
            self.add_lr_multiplier(
                lr_lars_multiplier,
                is_gpu_blob=(current_scope is not None
                             and current_scope.device_type == caffe2_pb2.CUDA),
            )

        # We need a negative sign for the learning rate when it is used
        # directly with WeightedSum below (no momentum).
        lr_sign = -1 if self.momentum else 1
        lr = self.build_lr(
            net, param_init_net,
            base_learning_rate=self.base_learning_rate * lr_sign,
            policy=self.policy,
            **(self.init_kwargs)
        )

        dev = scope.CurrentDeviceScope()
        if dev is None:
            dev = core.DeviceOption(caffe2_pb2.CPU)

        # ONE is a constant used by the WeightedSum update below.
        ONE = param_init_net.ConstantFill(
            [],
            "ONE_{}_{}{}".format(dev.device_type, dev.cuda_gpu_id, dev.node_name),
            shape=[1],
            value=1.0
        )
        self._aux_params.shared.append(ONE)

        if self.momentum > 0:
            momentum_data = param_init_net.ConstantFill(
                param, str(param) + "_momentum", value=0.)
            self._aux_params.local.append(momentum_data)

        if isinstance(grad, core.GradientSlice):
            grad = self.dedup(net, self.sparse_dedup_aggregator, grad)
            if self.momentum > 0.:
                net.SparseMomentumSGDUpdate(
                    [grad.values, momentum_data, lr, param, grad.indices],
                    [grad.values, momentum_data, param],
                    momentum=self.momentum,
                    nesterov=self.nesterov)
            else:
                net.ScatterWeightedSum(
                    [param, ONE, grad.indices, grad.values, lr],
                    param
                )
        else:
            if self.momentum > 0.:
                net.MomentumSGDUpdate(
                    [grad, momentum_data, lr, param],
                    [grad, momentum_data, param],
                    momentum=self.momentum,
                    nesterov=self.nesterov)
            else:
                coeff = lr
                net.WeightedSum(
                    [param, ONE, grad, coeff],
                    param
                )

    def scale_learning_rate(self, scale):
        self.base_learning_rate *= scale
        return
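
# Illustrative sketch (not in the original file): applying SgdOptimizer to a
# single parameter by hand. `net`, `param_init_net`, "w" and "w_grad" are
# hypothetical; normally build_sgd() below does this for every parameter.
#
#   sgd = SgdOptimizer(base_learning_rate=0.1, momentum=0.9, policy="step",
#                      stepsize=1000, gamma=0.999)
#   sgd(net, param_init_net, "w", "w_grad")  # adds LR and update operators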


class MultiPrecisionSgdOptimizer(SgdOptimizer):
    def __init__(self, base_learning_rate=0.1, momentum=0.0,
                 policy="fixed", nesterov=1, sparse_dedup_aggregator=None,
                 **kwargs):
        super(MultiPrecisionSgdOptimizer, self).__init__(
            base_learning_rate=base_learning_rate,
            policy=policy,
            momentum=momentum,
            nesterov=nesterov,
            sparse_dedup_aggregator=sparse_dedup_aggregator,
            **kwargs
        )

    def _run(self, net, param_init_net, param_info):
        param = param_info.blob
        param_fp32 = param_info.blob_copy[core.DataType.FLOAT] \
            if param_info.blob_copy is not None else None

        # If we have a straight fp32 parameter, run the base class.
        if param_fp32 is None:
            return SgdOptimizer._run(self, net, param_init_net, param_info)

        grad = param_info.grad
        if self.base_learning_rate == 0:
            return
        assert self.base_learning_rate > 0, (
            "Expect positive base learning rate, got {}".format(
                self.base_learning_rate))

        lr = self.build_lr(
            net, param_init_net,
            base_learning_rate=-self.base_learning_rate,
            policy=self.policy,
            **(self.init_kwargs)
        )

        momentum_data = param_init_net.ConstantFill(
            param_fp32, str(param) + "_momentum", value=0.)
        self._aux_params.local.append(momentum_data)

        assert not isinstance(grad, core.GradientSlice), (
            "MultiPrecisionSgd does not support sparse gradients")

        # Copy the fp16 gradient to fp32.
        grad_fp32 = net.HalfToFloat(grad, grad + "_fp32")

        # Run the fused update in fp32.
        net.MomentumSGDUpdate(
            [grad_fp32, momentum_data, lr, param_fp32],
            [grad_fp32, momentum_data, param_fp32],
            momentum=self.momentum,
            nesterov=self.nesterov)

        # Copy the updated parameter back to fp16.
        net.FloatToHalf(param_fp32, param)


class FP16SgdOptimizer(SgdOptimizer):
    def __init__(self, base_learning_rate=0.1, momentum=0.0,
                 policy="fixed", nesterov=1, weight_decay=0.0001,
                 sparse_dedup_aggregator=None,
                 **kwargs):
        super(FP16SgdOptimizer, self).__init__(
            base_learning_rate=base_learning_rate,
            policy=policy,
            momentum=momentum,
            nesterov=nesterov,
            sparse_dedup_aggregator=sparse_dedup_aggregator,
            **kwargs
        )
        self.weight_decay = weight_decay

    def _run(self, net, param_init_net, param_info, fp32_update=False):

        fp32_update_flag = 0
        param_name = str(param_info.blob)

        # Should only be triggered in FP16 training by SpatialBN, which
        # requires FP32 params in cuDNN.
        if param_name.find("spatbn") != -1:
            fp32_update = True

        if fp32_update:
            # Doing a full FP32 update: treat the parameter blob itself as
            # the FP32 copy.
            fp32_update_flag = 1
            param = param_info.blob
            param_fp32 = param_info.blob
        else:
            if param_info.blob_copy is None:
                # No separate FP32 copy exists, so update the blob in place.
                fp32_update_flag = 1
                param = param_info.blob
                param_fp32 = param_info.blob
            else:
                if core.DataType.FLOAT in param_info.blob_copy:
                    param = param_info.blob
                    param_fp32 = param_info.blob_copy[core.DataType.FLOAT]
                elif core.DataType.FLOAT16 in param_info.blob_copy:
                    param = param_info.blob_copy[core.DataType.FLOAT16]
                    param_fp32 = param_info.blob
                else:
                    assert (False), (
                        "Unrecognized parameter format to be updated "
                        "by FP16 Optimizer. Parameter: {}".format(param_info.name)
                    )

        grad = param_info.grad
        if self.base_learning_rate == 0:
            return
        assert self.base_learning_rate > 0, (
            "Expect positive base learning rate, got {}".format(
                self.base_learning_rate))

        lr = self.build_lr(
            net, param_init_net,
            base_learning_rate=-self.base_learning_rate,
            policy=self.policy,
            **(self.init_kwargs)
        )

        momentum_data_fp32 = param_init_net.ConstantFill(
            param_fp32, str(param) + "_momentum_fp32", value=0.)
        momentum_data = param_init_net.FloatToHalf(
            momentum_data_fp32, str(param) + "_momentum")
        self._aux_params.local.append(momentum_data)

        assert not isinstance(grad, core.GradientSlice), (
            "FP16Sgd does not support sparse gradients")

        if fp32_update_flag == 0:
            net.FP16MomentumSGDUpdate(
                [grad, momentum_data, lr, param],
                [grad, momentum_data, param],
                momentum=self.momentum,
                nesterov=self.nesterov,
                weight_decay=self.weight_decay)
        else:
            # The flag is 1, so do an FP32 update with the FP32 momentum.
            net.FP32MomentumSGDUpdate(
                [grad, momentum_data_fp32, lr, param],
                [grad, momentum_data_fp32, param],
                momentum=self.momentum,
                nesterov=self.nesterov,
                weight_decay=self.weight_decay)


class WeightDecayBuilder(Optimizer):
    def __init__(self, weight_decay):
        self.weight_decay = weight_decay

    def _run(self, net, param_init_net, param_info):
        dev = scope.CurrentDeviceScope()
        if dev is None:
            dev = core.DeviceOption(caffe2_pb2.CPU)

        ONE = param_init_net.ConstantFill(
            [],
            "ONE_{}_{}".format(dev.device_type, dev.cuda_gpu_id),
            shape=[1],
            value=1.0
        )
        WD = param_init_net.ConstantFill(
            [], "wd_{}_{}".format(dev.device_type, dev.cuda_gpu_id),
            shape=[1], value=self.weight_decay
        )

        if isinstance(param_info.grad, core.GradientSlice):
            raise ValueError(
                "Weight decay does not yet support sparse gradients")
        else:
            net.WeightedSum(
                [param_info.grad, ONE, param_info.blob, WD],
                param_info.grad,
            )
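
# The WeightedSum above folds L2 regularization into the gradient:
# grad <- 1 * grad + weight_decay * param. A plain-Python sketch of the same
# arithmetic (hypothetical numbers, for intuition only):
#
#   weight_decay = 0.0001
#   param, grad = 2.0, 0.5
#   grad = 1.0 * grad + weight_decay * param  # -> 0.5002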


class AdagradOptimizer(Optimizer):
    def __init__(self, alpha=0.01, epsilon=1e-4, decay=1, policy="fixed",
                 sparse_dedup_aggregator=None, rowWise=False, engine='',
                 lars=None, **kwargs):
        super(AdagradOptimizer, self).__init__()
        self.alpha = alpha
        self.epsilon = epsilon
        self.decay = decay
        self.policy = policy
        self.sparse_dedup_aggregator = sparse_dedup_aggregator
        self.rowWise = rowWise
        self.engine = engine
        self.lars = lars
        self.init_kwargs = kwargs

    def _run(self, net, param_init_net, param_info):
        param = param_info.blob
        grad = param_info.grad

        if self.alpha <= 0:
            return

        if self.lars is not None and not isinstance(grad, core.GradientSlice):
            assert self.lars >= 0, (
                'Lars offset must be nonnegative, got {}'.format(self.lars))
            lr_lars_multiplier = net.Lars(
                [param, grad],
                self.make_unique_blob_name(str(param) + "_lars"),
                offset=self.lars)
            current_scope = scope.CurrentDeviceScope()
            self.add_lr_multiplier(
                lr_lars_multiplier,
                is_gpu_blob=(current_scope is not None
                             and current_scope.device_type == caffe2_pb2.CUDA),
            )

        lr = self.build_lr(
            net, param_init_net,
            base_learning_rate=self.alpha,
            policy=self.policy,
            **(self.init_kwargs)
        )

        if self.rowWise:
            shapes, types = workspace.InferShapesAndTypes([param_init_net])
            if str(param) not in shapes:
                # Shape inference failed for this param; derive the number
                # of rows from a runtime Shape/Slice instead.
                shape = param_init_net.Shape(param, str(param) + "_shape")
                num_rows = param_init_net.Slice(
                    [shape],
                    str(shape) + "_numrows",
                    starts=[0], ends=[1]
                )
                param_squared_sum = param_init_net.ConstantFill(
                    num_rows,
                    str(param) + "_avg_squared_sum",
                    input_as_shape=1,
                    value=0.0
                )
            else:
                param_squared_sum = param_init_net.ConstantFill(
                    [],
                    str(param) + "_avg_squared_sum",
                    shape=[shapes[str(param)][0]],
                    value=0.0
                )
        else:
            param_squared_sum = param_init_net.ConstantFill(
                [param],
                str(param) + "_squared_sum",
                value=0.0
            )

        self._aux_params.local.append(param_squared_sum)

        if self.rowWise:
            assert isinstance(grad, core.GradientSlice),\
                'If SparseAdagrad with rowWise=True, gradient must be '\
                'a gradientslice. Please ensure that rowWise is not enabled '\
                'for the dense Adagrad optimizer, as it is not supported.'

        if isinstance(grad, core.GradientSlice):
            assert self.decay == 1.,\
                'Decay is not implemented for SparseAdagrad and must be set to 1'
            grad = self.dedup(net, self.sparse_dedup_aggregator, grad)
            if self.rowWise:
                op = 'RowWiseSparseAdagrad'
            else:
                op = 'SparseAdagrad'
            net.__getattr__(op)(
                [param, param_squared_sum, grad.indices, grad.values, lr],
                [param, param_squared_sum],
                epsilon=self.epsilon,
                engine=self.engine
            )
        else:
            net.Adagrad(
                [param, param_squared_sum, grad, lr],
                [param, param_squared_sum],
                epsilon=self.epsilon,
                decay=float(self.decay),
                engine=self.engine
            )

    def scale_learning_rate(self, scale):
        self.alpha *= scale
        return
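
# For intuition (a sketch, not code from this file): the dense Adagrad op
# keeps a running sum of squared gradients and scales each step by its
# inverse square root, roughly per element:
#
#   squared_sum = decay * squared_sum + grad ** 2
#   param += lr * grad / (squared_sum ** 0.5 + epsilon)
#
# where lr is negative (build_lr negates base_lr for descent). This is why
# every parameter gets a local "squared_sum" auxiliary blob.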


class FtrlOptimizer(Optimizer):
    def __init__(self, alpha=0.01, beta=1e-4, lambda1=0, lambda2=0,
                 sparse_dedup_aggregator=None, engine=''):
        super(FtrlOptimizer, self).__init__()
        self.alpha = alpha
        self.beta = beta
        self.lambda1 = lambda1
        self.lambda2 = lambda2
        self.sparse_dedup_aggregator = sparse_dedup_aggregator
        self.engine = engine

    def _run(self, net, param_init_net, param_info):
        param = param_info.blob
        grad = param_info.grad

        if self.alpha <= 0:
            return

        nz = param_init_net.ConstantFill(
            [param],
            str(param) + "_ftrl_nz",
            extra_shape=[2],
            value=0.0
        )
        self._aux_params.local.append(nz)
        if isinstance(grad, core.GradientSlice):
            grad = self.dedup(net, self.sparse_dedup_aggregator, grad)
            net.SparseFtrl(
                [param, nz, grad.indices, grad.values],
                [param, nz],
                engine=self.engine,
                alpha=self.alpha,
                beta=self.beta,
                lambda1=self.lambda1,
                lambda2=self.lambda2
            )
        else:
            net.Ftrl(
                [param, nz, grad],
                [param, nz],
                engine=self.engine,
                alpha=self.alpha,
                beta=self.beta,
                lambda1=self.lambda1,
                lambda2=self.lambda2
            )

    def scale_learning_rate(self, scale):
        self.alpha *= scale
        return


class AdamOptimizer(Optimizer):
    def __init__(self, alpha=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8,
                 policy='fixed', sparse_dedup_aggregator=None, rowWise=False,
                 engine='', **kwargs):
        super(AdamOptimizer, self).__init__()
        self.alpha = alpha
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.policy = policy
        self.sparse_dedup_aggregator = sparse_dedup_aggregator
        self.rowWise = rowWise
        self.engine = engine
        self.init_kwargs = kwargs

    def _run(self, net, param_init_net, param_info):
        param = param_info.blob
        grad = param_info.grad

        if self.alpha <= 0:
            return

        lr = self.build_lr(
            net, param_init_net,
            base_learning_rate=self.alpha,
            policy=self.policy,
            **(self.init_kwargs)
        )
        iteration = param_init_net.GetBlobRef(_OPTIMIZER_ITERATION_NAME)

        m1 = param_init_net.ConstantFill(
            [param],
            param + "_first_moment",
            value=0.0
        )
        if self.rowWise:
            shapes, types = workspace.InferShapesAndTypes([param_init_net])
            m2 = param_init_net.ConstantFill(
                [],
                param + "_avg_second_moment",
                shape=[shapes[param][0]],
                value=0.0
            )
        else:
            m2 = param_init_net.ConstantFill(
                [param],
                param + "_second_moment",
                value=0.0
            )

        self._aux_params.shared.append(iteration)
        self._aux_params.local.append(m1)
        self._aux_params.local.append(m2)

        if self.rowWise:
            assert isinstance(grad, core.GradientSlice),\
                'If SparseAdam with rowWise=True, gradient must be '\
                'a gradientslice. Please ensure that rowWise is not enabled '\
                'for the dense Adam optimizer, as it is not supported.'

        if isinstance(grad, core.GradientSlice):
            grad = self.dedup(net, self.sparse_dedup_aggregator, grad)
            if self.rowWise:
                op = 'RowWiseSparseAdam'
            else:
                op = 'SparseAdam'
            net.__getattr__(op)(
                [param, m1, m2, grad.indices, grad.values, lr, iteration],
                [param, m1, m2],
                beta1=self.beta1,
                beta2=self.beta2,
                epsilon=self.epsilon
            )
        else:
            net.Adam(
                [param, m1, m2, grad, lr, iteration],
                [param, m1, m2],
                beta1=self.beta1,
                beta2=self.beta2,
                epsilon=self.epsilon
            )

    def scale_learning_rate(self, scale):
        self.alpha *= scale
        return
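
# Sketch of the textbook Adam update (Kingma & Ba, 2014) that the dense op
# above corresponds to; `t` comes from the shared iteration blob and lr is
# negative, as built by build_lr:
#
#   m1 = beta1 * m1 + (1 - beta1) * grad
#   m2 = beta2 * m2 + (1 - beta2) * grad ** 2
#   m1_hat = m1 / (1 - beta1 ** t)
#   m2_hat = m2 / (1 - beta2 ** t)
#   param += lr * m1_hat / (m2_hat ** 0.5 + epsilon)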
720 """YellowFin: An automatic tuner for momentum SGD 722 See https://arxiv.org/abs/1706.03471 for more details. This implementation 723 has separate learning rate and momentum per each parameter.""" 733 sparse_dedup_aggregator=
None,
735 super(YellowFinOptimizer, self).__init__()

    def _run(self, net, param_init_net, param_info):

        # Note: this is the number of persistent scalars in the YellowFin
        # optimizer; it must match the number used by the operator.
        SCALARS_MEMORY_SIZE = 5

        param = param_info.blob
        grad = param_info.grad
        moment = param_init_net.ConstantFill(
            [param], param + "_moment", value=0.0)
        curv_win = param_init_net.ConstantFill(
            [], param + "_curv_win", shape=[self.curv_win_width], value=0.0)
        g_avg = param_init_net.ConstantFill(
            [param], param + "_g_avg", value=0.0)
        g2_avg = param_init_net.ConstantFill(
            [param], param + "_g2_avg", value=0.0)
        lr_avg = param_init_net.ConstantFill(
            [], param + "_lr_avg", shape=[1], value=self.alpha)
        mu_avg = param_init_net.ConstantFill(
            [], param + "_mu_avg", shape=[1], value=self.mu)
        scalars_memory = param_init_net.ConstantFill(
            [],
            param + "_scalars_memory",
            shape=[SCALARS_MEMORY_SIZE],
            value=0.0
        )

        assert self.alpha > 0
        assert not isinstance(grad, core.GradientSlice), \
            "YellowFin does not support sparse gradients"

        if not param_init_net.BlobIsDefined(_OPTIMIZER_ITERATION_NAME):
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                iteration = param_init_net.ConstantFill(
                    [],
                    _OPTIMIZER_ITERATION_NAME,
                    shape=[1],
                    value=0,
                    dtype=core.DataType.INT64)
                iter_mutex = param_init_net.CreateMutex([],
                                                        ["iteration_mutex"])
                net.AtomicIter([iter_mutex, iteration], [iteration])
        else:
            iteration = param_init_net.GetBlobRef(_OPTIMIZER_ITERATION_NAME)

        self._aux_params.shared.append(iteration)
        self._aux_params.local.append(moment)
        self._aux_params.local.append(lr_avg)
        self._aux_params.local.append(mu_avg)
        self._aux_params.local.append(curv_win)
        self._aux_params.local.append(g_avg)
        self._aux_params.local.append(g2_avg)
        self._aux_params.local.append(scalars_memory)

        yf_in_out_args = [
            param,
            moment,
            lr_avg,
            mu_avg,
            curv_win,
            g_avg,
            g2_avg,
            scalars_memory
        ]
        net.YellowFin(
            yf_in_out_args + [grad, iteration],
            yf_in_out_args,
            beta=self.beta,
            curv_win_width=self.curv_win_width,
            epsilon=self.epsilon,
            zero_debias=self.zero_debias)

    def scale_learning_rate(self, scale):
        self.alpha *= scale
        return


class RmsPropOptimizer(Optimizer):
    def __init__(self, alpha=0.01, decay=0.9, momentum=0.0, epsilon=1e-5,
                 policy='fixed', engine='', **kwargs):
        super(RmsPropOptimizer, self).__init__()
        self.alpha = alpha
        self.decay = decay
        self.momentum = momentum
        self.epsilon = epsilon
        self.policy = policy
        self.engine = engine
        self.init_kwargs = kwargs

    def _run(self, net, param_init_net, param_info):
        param = param_info.blob
        grad = param_info.grad

        assert self.alpha > 0
        assert not isinstance(grad, core.GradientSlice), \
            "RmsPropOptimizer doesn't support sparse gradients"

        dev = scope.CurrentDeviceScope()
        if dev is None:
            dev = core.DeviceOption(caffe2_pb2.CPU)

        ONE = param_init_net.ConstantFill(
            [],
            "ONE_{}_{}".format(dev.device_type, dev.cuda_gpu_id),
            shape=[1],
            value=1.0
        )

        lr = self.build_lr(
            net, param_init_net,
            base_learning_rate=-self.alpha,
            policy=self.policy,
            **(self.init_kwargs)
        )

        grad_o = param_init_net.ConstantFill(
            [param],
            str(param) + "_grad_o",
            value=0.0
        )
        ms = param_init_net.ConstantFill(
            [param],
            str(param) + "_mean_squares",
            value=0.0
        )
        mom = param_init_net.ConstantFill(
            [param],
            str(param) + "_momentum",
            value=0.0
        )
        self._aux_params.local.append(ms)
        self._aux_params.local.append(mom)

        net.RmsProp(
            [grad, ms, mom, ONE],
            [grad_o, ms, mom],
            decay=self.decay,
            momentum=self.momentum,
            epsilon=self.epsilon,
            engine=self.engine,
        )
        net.MomentumSGDUpdate(
            [grad_o, mom, lr, param],
            [grad_o, mom, param],
        )

    def scale_learning_rate(self, scale):
        self.alpha *= scale
        return
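
# Sketch of the standard RMSProp recurrence this op pair corresponds to
# (for intuition only; the exact semantics live in the C++ operators):
#
#   ms  = decay * ms + (1 - decay) * grad ** 2   # running mean of squares
#   mom = momentum * mom + lr * grad / (ms + epsilon) ** 0.5
#   param -= mom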


def _get_param_to_device(model):
    # Infer blob devices by walking both nets and observing which device
    # each blob is created on.
    param_to_device = core.InferBlobDevices(model.net)
    param_to_device.update(core.InferBlobDevices(model.param_init_net))
    return param_to_device


def get_param_device(param_name, grad, param_to_device=None, default_device=None):
    device = default_device
    param_to_device = param_to_device or {}
    # We first check whether the parameter's device has already been
    # inferred; if not, we check its gradient.
    if param_name in param_to_device:
        device = param_to_device[param_name]
    else:
        if isinstance(grad, core.GradientSlice):
            if str(grad.values) in param_to_device:
                device = param_to_device[str(grad.values)]
            elif str(grad.indices) in param_to_device:
                device = param_to_device[str(grad.indices)]
        else:
            grad_name = str(grad)
            if grad_name in param_to_device:
                device = param_to_device[grad_name]

    assert device is not None,\
        "Cannot infer device for {}: no op creates it".format(param_name)
    return device


def get_lr_injection():
    """
    Gets the current value of lr_injection, a multiplier for all base
    learning rates.
    Must set allow_lr_injection=True when building the optimizer, as this
    relies on synchronization over CPU.
    """
    return workspace.FetchBlob(_LEARNING_RATE_INJECTION)


def set_lr_injection(lr_injection_value):
    """
    Sets lr_injection, a multiplier for all base learning rates.
    Must set allow_lr_injection=True when building the optimizer, as this
    relies on synchronization over CPU.
    """
    workspace.FeedBlob(
        _LEARNING_RATE_INJECTION,
        np.array(
            [float(lr_injection_value)],
            dtype=np.float32,
        )
    )
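
# Illustrative usage sketch (not from this file): scaling every base
# learning rate at runtime, e.g. to back off when the loss spikes. Assumes
# the optimizer was built with allow_lr_injection=True.
#
#   set_lr_injection(0.5)      # halve all base learning rates
#   print(get_lr_injection())  # -> array([0.5], dtype=float32)
#   set_lr_injection(1.0)      # restore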


def _calc_norm_ratio(
    model, params, name_scope, param_to_device, max_gradient_norm
):
    with core.NameScope(name_scope):
        grad_squared_sums = []
        for i, param in enumerate(params):
            device = get_param_device(
                str(param.blob), param.grad, param_to_device
            )

            with core.DeviceScope(device):
                grad = (
                    param.grad
                    if not isinstance(
                        param.grad,
                        core.GradientSlice,
                    ) else param.grad.values
                )
                grad_squared_sum_name = 'grad_{}_squared_sum'.format(i)
                grad_squared_sum = model.net.SumSqrElements(
                    grad,
                    grad_squared_sum_name,
                )
                grad_squared_sum_cpu = model.net.EnsureCPUOutput(
                    grad_squared_sum
                )
                grad_squared_sums.append(grad_squared_sum_cpu)

        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
            grad_squared_full_sum = model.net.Sum(
                grad_squared_sums,
                'grad_squared_full_sum',
            )
            global_norm = model.net.Pow(
                grad_squared_full_sum,
                'global_norm',
                exponent=0.5,
            )
            clip_norm = model.param_init_net.ConstantFill(
                [],
                'clip_norm',
                shape=[],
                value=float(max_gradient_norm),
            )
            max_norm = model.net.Max(
                [global_norm, clip_norm],
                'max_norm',
            )
            norm_ratio = model.net.Div(
                [clip_norm, max_norm],
                'norm_ratio',
            )
            return norm_ratio
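
# The ratio computed above implements global-norm gradient clipping as a
# learning-rate multiplier. In plain terms (hypothetical numbers):
#
#   global_norm = (sum of all squared gradient elements) ** 0.5  # e.g. 10.0
#   norm_ratio = clip_norm / max(global_norm, clip_norm)         # 5.0 / 10.0
#
# so steps shrink by 0.5 when the global norm exceeds clip_norm and are left
# unchanged (ratio 1.0) otherwise.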


def _build(
    model,
    optimizer,
    weights_only=False,
    use_param_info_optim=True,
    max_gradient_norm=None,
    allow_lr_injection=False,
):
    param_to_device = _get_param_to_device(model)

    # Validate there are no duplicate params.
    model.Validate()

    params = []
    for param_info in model.GetOptimizationParamInfo():
        if weights_only and param_info.blob not in model.weights:
            continue
        params.append(param_info)

    lr_multiplier = None
    if max_gradient_norm is not None:
        lr_multiplier = _calc_norm_ratio(
            model,
            params,
            'norm_clipped_grad_update',
            param_to_device,
            max_gradient_norm,
        )

    if allow_lr_injection:
        if not model.net.BlobIsDefined(_LEARNING_RATE_INJECTION):
            lr_injection = model.param_init_net.ConstantFill(
                [],
                _LEARNING_RATE_INJECTION,
                shape=[1],
                value=1.0,
            )
        else:
            lr_injection = _LEARNING_RATE_INJECTION

        if lr_multiplier is None:
            lr_multiplier = lr_injection
        else:
            lr_multiplier = model.net.Mul(
                [lr_multiplier, lr_injection],
                'lr_multiplier',
                broadcast=1,
            )
    optimizer.add_lr_multiplier(lr_multiplier)

    for param_info in params:
        param_name = str(param_info.blob)

        device = get_param_device(param_name, param_info.grad, param_to_device)

        with core.DeviceScope(device):
            if param_info.optimizer and use_param_info_optim:
                param_info.optimizer(model.net, model.param_init_net, param_info)
            else:
                optimizer(model.net, model.param_init_net, param_info)
    return optimizer


def add_weight_decay(model, weight_decay):
    """Adds a decay to weights in the model.

    This is a form of L2 regularization.

    Args:
        weight_decay: strength of the regularization
    """
    _build(
        model,
        WeightDecayBuilder(weight_decay=weight_decay),
        weights_only=True,
        use_param_info_optim=False,
    )


def build_sgd(
    model,
    base_learning_rate,
    max_gradient_norm=None,
    allow_lr_injection=False,
    **kwargs
):
    sgd_optimizer = SgdOptimizer(base_learning_rate, **kwargs)
    return _build(
        model,
        sgd_optimizer,
        max_gradient_norm=max_gradient_norm,
        allow_lr_injection=allow_lr_injection,
    )
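
# Illustrative end-to-end sketch (not from this file): builders are the usual
# entry point. The model and hyperparameters below are hypothetical.
#
#   from caffe2.python import model_helper
#   from caffe2.python.optimizer import build_sgd
#
#   model = model_helper.ModelHelper(name="mnist_train")
#   # ... add forward ops, a loss, and model.AddGradientOperators([loss]) ...
#   build_sgd(model, base_learning_rate=0.01, policy="step",
#             stepsize=1000, gamma=0.999, momentum=0.9)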


def build_multi_precision_sgd(
    model,
    base_learning_rate,
    max_gradient_norm=None,
    allow_lr_injection=False,
    **kwargs
):
    multi_prec_sgd_optimizer = MultiPrecisionSgdOptimizer(
        base_learning_rate, **kwargs
    )
    return _build(
        model,
        multi_prec_sgd_optimizer,
        max_gradient_norm=max_gradient_norm,
        allow_lr_injection=allow_lr_injection,
    )


def build_fp16_sgd(model, base_learning_rate, **kwargs):
    fp16_sgd_optimizer = FP16SgdOptimizer(
        base_learning_rate, **kwargs
    )
    return _build(model, fp16_sgd_optimizer)


def build_ftrl(model, engine="SIMD", **kwargs):
    if engine == "SIMD":
        assert core.IsOperator('Ftrl_ENGINE_SIMD')
        assert core.IsOperator('SparseFtrl_ENGINE_SIMD')
    ftrl_optimizer = FtrlOptimizer(engine=engine, **kwargs)
    return _build(model, ftrl_optimizer)


def build_adagrad(
    model,
    base_learning_rate,
    parameters=None,
    max_gradient_norm=None,
    allow_lr_injection=False,
    **kwargs
):
    adagrad_optimizer = AdagradOptimizer(alpha=base_learning_rate, **kwargs)
    return _build(
        model,
        adagrad_optimizer,
        max_gradient_norm=max_gradient_norm,
        allow_lr_injection=allow_lr_injection,
    )


def build_adam(
    model,
    base_learning_rate,
    parameters=None,
    max_gradient_norm=None,
    allow_lr_injection=False,
    **kwargs
):
    adam_optimizer = AdamOptimizer(alpha=base_learning_rate, **kwargs)
    return _build(
        model,
        adam_optimizer,
        max_gradient_norm=max_gradient_norm,
        allow_lr_injection=allow_lr_injection,
    )
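
# Sketch combining the builder options above (hypothetical model and values):
# max_gradient_norm installs the _calc_norm_ratio multiplier, and
# allow_lr_injection exposes the runtime set_lr_injection() knob.
#
#   build_adam(model, base_learning_rate=1e-3,
#              max_gradient_norm=5.0, allow_lr_injection=True)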


def build_yellowfin(model, base_learning_rate=0.1, **kwargs):
    yellowfin_optimizer = YellowFinOptimizer(
        alpha=base_learning_rate,
        **kwargs)
    return _build(model, yellowfin_optimizer)


def build_rms_prop(
    model,
    base_learning_rate,
    parameters=None,
    max_gradient_norm=None,
    allow_lr_injection=False,
    **kwargs
):
    rms_prop_optimizer = RmsPropOptimizer(alpha=base_learning_rate, **kwargs)
    return _build(
        model,
        rms_prop_optimizer,
        max_gradient_norm=max_gradient_norm,
        allow_lr_injection=allow_lr_injection,
    )