Caffe2 - Python API
A deep learning, cross-platform ML framework
optimizer.py
1 ## @package optimizer
2 # Module caffe2.python.optimizer
3 from __future__ import absolute_import
4 from __future__ import division
5 from __future__ import print_function
6 from __future__ import unicode_literals
7 
8 from collections import namedtuple, defaultdict
9 from past.builtins import basestring
10 
11 import numpy as np
12 
13 from caffe2.python import core, scope, workspace
14 from caffe2.python.modeling import parameter_info
15 from caffe2.proto import caffe2_pb2
16 
17 
18 _OPTIMIZER_ITERATION_NAME = "optimizer_iteration"
19 _LEARNING_RATE_INJECTION = "lr_injection"
20 
21 AuxOptimizerParams = namedtuple("AuxOptimizerParams", ["local", "shared"])
22 _optimizer_instance_count = defaultdict(int)
23 
24 
25 class Optimizer(object):
26  def __init__(self):
27  self._aux_params = AuxOptimizerParams(local=[], shared=[])
28  self._instance_num = _optimizer_instance_count[self.__class__.__name__]
29  _optimizer_instance_count[self.__class__.__name__] += 1
30  self._lr_multiplier = None
31  self._lr_multiplier_on_gpu = False
32 
33  '''
34  Adds optimization operators to the net for the given parameter and its
35  gradient. The parameter is specified either by 'param' being a
36  ParameterInfo object, in which case param.grad must be set,
37 
38  or by 'param' being a BlobReference and 'grad' being a BlobReference for its
39  gradient.
40  '''
41  def __call__(self, net, param_init_net, param, grad=None):
42  if grad is None:
43  assert isinstance(param, parameter_info.ParameterInfo), (
44  "Expected parameter to be of type ParameterInfo, got {}".format(
45  param
46  ))
47  assert param.grad is not None
48  else:
49  if isinstance(param, basestring):
50  param = core.BlobReference(param)
51  param = parameter_info.ParameterInfo(
52  param_id=None, param=param, grad=grad)
53 
54  self._run(net, param_init_net, param)
55 
56  def _run(self, net, param_init_net, param_info):
57  raise NotImplementedError()
58 
59  def get_cpu_blob_name(self, base_str, node_name=''):
60  classname = self.__class__.__name__
61  return '%s_%d_%s%s_cpu' % (classname, self._instance_num, base_str, node_name)
62 
63  def get_gpu_blob_name(self, base_str, gpu_id, node_name):
64  classname = self.__class__.__name__
65  return '%s_%d_%s%s_gpu%d' % (
66  classname, self._instance_num, base_str, node_name, gpu_id,
67  )
68 
69  def make_unique_blob_name(self, base_str):
70  """
71  Returns a blob name that will be unique to the current device
72  and optimizer instance.
73  """
74  current_scope = scope.CurrentDeviceScope()
75  if current_scope is None:
76  return self.get_cpu_blob_name(base_str)
77 
78  if current_scope.device_type == caffe2_pb2.CUDA:
79  return self.get_gpu_blob_name(
80  base_str, current_scope.cuda_gpu_id, current_scope.node_name
81  )
82  else:
83  return self.get_cpu_blob_name(base_str, current_scope.node_name)
84 
85  def build_lr(self, net, param_init_net, base_learning_rate,
86  learning_rate_blob=None, policy="fixed",
87  iter_val=0, **kwargs):
88  if learning_rate_blob is None:
89  learning_rate_blob = self.make_unique_blob_name('lr')
90 
91  optimization_iter_blob = _OPTIMIZER_ITERATION_NAME
92  if not param_init_net.BlobIsDefined(optimization_iter_blob):
93  # Add training operators.
94  with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
95  iteration = param_init_net.ConstantFill(
96  [], optimization_iter_blob, shape=[1],
97  value=iter_val,
98  dtype=core.DataType.INT64)
99  iter_mutex = param_init_net.CreateMutex(
100  [], ["iteration_mutex"]
101  )
102  net.AtomicIter([iter_mutex, iteration], [iteration])
103  else:
104  iteration = param_init_net.GetBlobRef(optimization_iter_blob)
105 
106  if not net.BlobIsDefined(learning_rate_blob):
107  # There is one interesting thing here: since we are minimizing, we are
108  # doing "descent" so the learning rate is set to be negative.
109  lr = net.LearningRate(
110  [iteration],
111  learning_rate_blob,
112  base_lr=-base_learning_rate,
113  policy=policy,
114  **kwargs
115  )
116  else:
117  lr = net.GetBlobRef(learning_rate_blob)
118 
119  if self._lr_multiplier is not None:
120  current_scope = scope.CurrentDeviceScope()
121  if (current_scope is not None
122  and current_scope.device_type == caffe2_pb2.CUDA
123  and not self._lr_multiplier_on_gpu):
124  lr_multiplier = net.CopyFromCPUInput(
125  self._lr_multiplier,
126  self.make_unique_blob_name('lr_multiplier')
127  )
128  else:
129  lr_multiplier = self._lr_multiplier
130 
131  scaled_lr = net.Mul(
132  [lr, lr_multiplier],
133  self.make_unique_blob_name('scaled_lr'),
134  broadcast=1,
135  )
136  lr = scaled_lr
137 
138  return lr, iteration
139 
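 # Note on the sign convention (descriptive comment, not in the original file):
 # build_lr passes base_lr=-base_learning_rate to the LearningRate op, so the
 # blob returned above already carries a negative sign. Update ops that *add*
 # the scaled gradient to the parameter, such as WeightedSum([param, ONE, grad,
 # lr]) in SgdOptimizer below, therefore perform descent: param <- param + lr *
 # grad with lr < 0. If add_lr_multiplier() was called, the (negative) rate is
 # additionally scaled by the multiplier blob.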
140  def add_lr_multiplier(self, lr_multiplier, is_gpu_blob=False):
141  self._lr_multiplier = lr_multiplier
142  self._lr_multiplier_on_gpu = is_gpu_blob
143 
144  @staticmethod
145  def dedup(net, sparse_dedup_aggregator, grad):
146  assert isinstance(grad, core.GradientSlice), (
147  "Dedup only works for sparse gradient, got {}".format(grad))
148  if sparse_dedup_aggregator:
149  return net.DeduplicateGradientSlices(
150  grad, aggregator=sparse_dedup_aggregator)
151  else:
152  return grad
153 
153 
154  def get_auxiliary_parameters(self):
155  """Returns the auxiliary parameters of this optimizer.
156 
157  Returns:
158  aux_params: An AuxOptimizerParams namedtuple with fields 'local' and 'shared'.
159 
160  aux_params.local stores a list of blobs. Each blob is a local
161  auxiliary parameter. A local auxiliary parameter is kept in parallel
162  with a learning rate parameter. Take Adagrad as an example: the local
163  auxiliary parameter is the squared sum blob, because every learning
164  rate has a squared sum associated with it.
165 
166  aux_params.shared also stores a list of blobs. Each blob is a shared
167  auxiliary parameter. A shared auxiliary parameter is a parameter
168  that is shared across all the learning rate parameters. Take Adam as
169  an example: the iteration parameter is a shared parameter, because
170  all the learning rates share the same iteration counter.
171  """
172  return self._aux_params
173 
174  # TODO(xlwang): In transfer learning, a parameter initialized from a
175  # pretrained model may require a different learning rate than one that is
176  # freshly initialized. To this end, we implement a Python solution where
177  # `base_learning_rate` is scaled by `scale` via `scale_learning_rate`;
178  # alternatively, the same effect could be achieved by rewriting the
179  # LearningRate operator in C++.
180  # Note that it is the responsibility of each specific optimizer to decide
181  # what logic should be used for `scale_learning_rate`.
182  def scale_learning_rate(self, *args, **kwargs):
183  raise NotImplementedError(
184  "Optimizer subclasses need to implement the `scale_learning_rate` method.")
185 
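Below is a minimal, illustrative sketch (not part of the original module) of how an Optimizer instance is invoked and how its auxiliary parameters can be read back for checkpointing. The helper name `_example_apply_optimizer` and the blob names `w`/`w_grad` are assumptions.

def _example_apply_optimizer(net, param_init_net):
    # Hypothetical usage sketch: 'w' is assumed to exist in param_init_net and
    # 'w_grad' to have been produced by AddGradientOperators on 'net'.
    opt = SgdOptimizer(base_learning_rate=0.1, momentum=0.9)

    # Option 1: a BlobReference parameter plus its gradient blob.
    opt(net, param_init_net,
        core.BlobReference("w"), grad=core.BlobReference("w_grad"))

    # Option 2: a ParameterInfo whose .grad is already set, e.g.
    #   info = parameter_info.ParameterInfo(
    #       param_id=None, param=core.BlobReference("w"),
    #       grad=core.BlobReference("w_grad"))
    #   opt(net, param_init_net, info)

    # Auxiliary state (momentum blobs, iteration counters, ...) created by the
    # optimizer is exposed for checkpointing via get_auxiliary_parameters().
    aux = opt.get_auxiliary_parameters()
    return aux.local, aux.shared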
186 
187 class SgdOptimizer(Optimizer):
188  def __init__(self, base_learning_rate=0.01, policy='fixed',
189  momentum=0.0, nesterov=1, sparse_dedup_aggregator=None,
190  lars=None, **kwargs):
191  super(SgdOptimizer, self).__init__()
192  self.base_learning_rate = base_learning_rate
193  self.policy = policy
194  self.momentum = momentum
195  self.nesterov = nesterov
196  self.sparse_dedup_aggregator = sparse_dedup_aggregator
197  self.lars = lars
198  self.init_kwargs = kwargs
199 
200  def _run(self, net, param_init_net, param_info):
201  param = param_info.blob
202  grad = param_info.grad
203  if self.base_learning_rate == 0:
204  return
205  assert self.base_learning_rate > 0, (
206  "Expect positive base learning rate, got {}".format(
207  self.base_learning_rate))
208 
209  # TODO(zqq): support LARS for sparse parameters
210  if self.lars is not None and not isinstance(grad, core.GradientSlice):
211  assert self.lars >= 0, (
212  'Lars offset must be nonnegative, got {}'.format(self.lars))
213  lr_lars_multiplier = net.Lars(
214  [param, grad],
215  self.make_unique_blob_name(str(param) + "_lars"),
216  offset=self.lars)
217  current_scope = scope.CurrentDeviceScope()
218  self.add_lr_multiplier(
219  lr_lars_multiplier,
220  is_gpu_blob=(current_scope is not None
221  and current_scope.device_type == caffe2_pb2.CUDA),
222  )
223 
224  # We need a negative sign for the LR when it is used directly with the
225  # WeightedSum op below.
226  lr_sign = -1 if self.momentum else 1
227  lr, _ = self.build_lr(
228  net, param_init_net,
229  base_learning_rate=self.base_learning_rate * lr_sign,
230  policy=self.policy,
231  **(self.init_kwargs)
232  )
233 
234  dev = scope.CurrentDeviceScope()
235  if dev is None:
236  dev = core.DeviceOption(caffe2_pb2.CPU)
237 
238  # Each GPU/CPU must have its own ONE blob, thus modify the name
239  # to include device information.
240  ONE = param_init_net.ConstantFill(
241  [],
242  "ONE_{}_{}{}".format(dev.device_type, dev.cuda_gpu_id, dev.node_name),
243  shape=[1],
244  value=1.0
245  )
246 
247  self._aux_params.shared.append(ONE)
248 
249  if self.momentum > 0:
250  momentum_data = param_init_net.ConstantFill(
251  param, str(param) + "_momentum", value=0.)
252  self._aux_params.local.append(momentum_data)
253 
254  if isinstance(grad, core.GradientSlice):
255  grad = self.dedup(net, self.sparse_dedup_aggregator, grad)
256  if self.momentum > 0.:
257  net.SparseMomentumSGDUpdate(
258  [grad.values, momentum_data, lr, param, grad.indices],
259  [grad.values, momentum_data, param],
260  momentum=self.momentum,
261  nesterov=self.nesterov)
262  else:
263  net.ScatterWeightedSum(
264  [param, ONE, grad.indices, grad.values, lr],
265  param
266  )
267  else:
268  if self.momentum > 0.:
269  net.MomentumSGDUpdate(
270  [grad, momentum_data, lr, param],
271  [grad, momentum_data, param],
272  momentum=self.momentum,
273  nesterov=self.nesterov)
274  else:
275  coeff = lr
276 
277  net.WeightedSum(
278  [param, ONE, grad, coeff],
279  param
280  )
281 
282  def scale_learning_rate(self, scale):
283  self.base_learning_rate *= scale
284  return
285 
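For reference, a minimal arithmetic sketch (illustrative only, not part of the original file) of the dense, momentum-free update encoded by the WeightedSum path above; lr is the negative rate produced by build_lr.

def _sgd_weighted_sum_reference(param, grad, base_learning_rate):
    # WeightedSum([param, ONE, grad, lr], param) computes param*1.0 + grad*lr
    # in place; with lr = -base_learning_rate this is plain gradient descent.
    lr = -base_learning_rate
    return param + lr * grad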
286 
287 class MultiPrecisionSgdOptimizer(SgdOptimizer):
288  def __init__(self, base_learning_rate=0.1, momentum=0.0,
289  policy="fixed", nesterov=1, sparse_dedup_aggregator=None,
290  **kwargs):
291  super(MultiPrecisionSgdOptimizer, self).__init__(
292  base_learning_rate=base_learning_rate,
293  policy=policy,
294  momentum=momentum,
295  nesterov=nesterov,
296  sparse_dedup_aggregator=sparse_dedup_aggregator,
297  **kwargs
298  )
299 
300  def _run(self, net, param_init_net, param_info):
301  param = param_info.blob
302  param_fp32 = param_info.blob_copy[core.DataType.FLOAT] \
303  if param_info.blob_copy is not None else None
304 
305  # If we have a straight fp32 parameter, run the base class
306  if param_fp32 is None:
307  return SgdOptimizer._run(self, net, param_init_net, param_info)
308 
309  grad = param_info.grad
310  if self.base_learning_rate == 0:
311  return
312  assert self.base_learning_rate > 0, (
313  "Expect positive base learning rate, got {}".format(
314  self.base_learning_rate))
315 
316  lr, _ = self.build_lr(
317  net, param_init_net,
318  base_learning_rate=-self.base_learning_rate,
319  policy=self.policy,
320  **(self.init_kwargs)
321  )
322 
323  momentum_data = param_init_net.ConstantFill(
324  param_fp32, str(param) + "_momentum", value=0.)
325  self._aux_params.local.append(momentum_data)
326 
327  assert not isinstance(grad, core.GradientSlice), (
328  "MultiPrecisionSgd does not support sparse gradients")
329 
330  # Copy gradient to fp32
331  grad_fp32 = net.HalfToFloat(grad, grad + "_fp32")
332 
333  # update (fused) in fp32
334  net.MomentumSGDUpdate(
335  [grad_fp32, momentum_data, lr, param_fp32],
336  [grad_fp32, momentum_data, param_fp32],
337  momentum=self.momentum,
338  nesterov=self.nesterov)
339 
340  # Copy updated param back to fp16
341  net.FloatToHalf(param_fp32, param)
342 
343 
344 class FP16SgdOptimizer(SgdOptimizer):
345  def __init__(self, base_learning_rate=0.1, momentum=0.0,
346  policy="fixed", nesterov=1, weight_decay=0.0001,
347  sparse_dedup_aggregator=None,
348  **kwargs):
349  super(FP16SgdOptimizer, self).__init__(
350  base_learning_rate=base_learning_rate,
351  policy=policy,
352  momentum=momentum,
353  nesterov=nesterov,
354  sparse_dedup_aggregator=sparse_dedup_aggregator,
355  **kwargs
356  )
357  self.weight_decay = weight_decay
358 
359  def _run(self, net, param_init_net, param_info, fp32_update=False):
360 
361  fp32_update_flag = 0
362  param_name = str(param_info.blob)
363 
364  # should only be triggered in FP16 training by SpatialBN, which
365  # requires FP32 params in CuDNN.
366  if param_name.find("spatbn") != -1:
367  fp32_update = True
368 
369  if fp32_update:
370  # Doing a 32-bit update.
371  # We have to assume param_info.blob is FP32, as there is no way
372  # (that I currently know of) to query a blob's type in Python.
373  fp32_update_flag = 1
374  param = param_info.blob
375  param_fp32 = param_info.blob
376  else:
377  if param_info.blob_copy is None:
378  # Doing a 32-bit update.
379  # We have to assume param_info.blob is FP32, as there is no way
380  # (that I currently know of) to query a blob's type in Python.
381  fp32_update_flag = 1
382  param = param_info.blob
383  param_fp32 = param_info.blob
384  else:
385  if core.DataType.FLOAT in param_info.blob_copy:
386  param = param_info.blob
387  param_fp32 = param_info.blob_copy[core.DataType.FLOAT]
388  elif core.DataType.FLOAT16 in param_info.blob_copy:
389  param = param_info.blob_copy[core.DataType.FLOAT16]
390  param_fp32 = param_info.blob
391  else:
392  assert (False), (
393  "Unrecognized parameter format to be updated "
394  "by FP16 Optimizer. Parameter: {}".format(param_info.name)
395  )
396 
397  grad = param_info.grad
398 
399  if self.base_learning_rate == 0:
400  return
401  assert self.base_learning_rate > 0, (
402  "Expect positive base learning rate, got {}".format(
403  self.base_learning_rate))
404 
405  lr, _ = self.build_lr(
406  net, param_init_net,
407  base_learning_rate=-self.base_learning_rate,
408  policy=self.policy,
409  **(self.init_kwargs)
410  )
411 
412  momentum_data_fp32 = param_init_net.ConstantFill(
413  param_fp32, str(param) + "_momentum_fp32", value=0.)
414 
415  momentum_data = param_init_net.FloatToHalf(
416  momentum_data_fp32, str(param) + "_momentum")
417 
418  self._aux_params.local.append(momentum_data)
419 
420  assert not isinstance(grad, core.GradientSlice), (
421  "FP16Sgd does not support sparse gradients")
422 
423  if fp32_update_flag == 0:
424  net.FP16MomentumSGDUpdate(
425  [grad, momentum_data, lr, param],
426  [grad, momentum_data, param],
427  momentum=self.momentum,
428  nesterov=self.nesterov,
429  weight_decay=self.weight_decay)
430  else:
431  # flag set to 1, therefore doing FP32 update
432  net.FP32MomentumSGDUpdate(
433  [grad, momentum_data_fp32, lr, param],
434  [grad, momentum_data_fp32, param],
435  momentum=self.momentum,
436  nesterov=self.nesterov,
437  weight_decay=self.weight_decay)
438 
439 
440 class WeightDecayBuilder(Optimizer):
441  def __init__(self, weight_decay):
442  self.weight_decay = weight_decay
443 
444  def _run(self, net, param_init_net, param_info):
445  dev = scope.CurrentDeviceScope()
446  if dev is None:
447  dev = core.DeviceOption(caffe2_pb2.CPU)
448 
449  ONE = param_init_net.ConstantFill(
450  [],
451  "ONE_{}_{}".format(dev.device_type, dev.cuda_gpu_id),
452  shape=[1],
453  value=1.0
454  )
455  WD = param_init_net.ConstantFill(
456  [], "wd_{}_{}".format(dev.device_type, dev.cuda_gpu_id),
457  shape=[1], value=self.weight_decay
458  )
459 
460  if isinstance(param_info.grad, core.GradientSlice):
461  raise ValueError(
462  "Weight decay does not yet support sparse gradients")
463  else:
464  net.WeightedSum(
465  [param_info.grad, ONE, param_info.blob, WD],
466  param_info.grad,
467  )
468 
469 
470 class AdagradOptimizer(Optimizer):
471  def __init__(self, alpha=0.01, epsilon=1e-4, decay=1, policy="fixed",
472  sparse_dedup_aggregator=None, rowWise=False, engine='',
473  lars=None, **kwargs):
474  super(AdagradOptimizer, self).__init__()
475  self.alpha = alpha
476  self.epsilon = epsilon
477  self.decay = decay
478  self.policy = policy
479  self.sparse_dedup_aggregator = sparse_dedup_aggregator
480  self.rowWise = rowWise
481  self.engine = engine
482  self.lars = lars
483  self.init_kwargs = kwargs
484 
485  def _run(self, net, param_init_net, param_info):
486  param = param_info.blob
487  grad = param_info.grad
488 
489  if self.alpha <= 0:
490  return
491 
492  if self.lars is not None and not isinstance(grad, core.GradientSlice):
493  assert self.lars >= 0, (
494  'Lars offset must be nonnegative, got {}'.format(self.lars))
495  lr_lars_multiplier = net.Lars(
496  [param, grad],
497  self.make_unique_blob_name(str(param) + "_lars"),
498  offset=self.lars)
499  current_scope = scope.CurrentDeviceScope()
500  self.add_lr_multiplier(
501  lr_lars_multiplier,
502  is_gpu_blob=(current_scope is not None
503  and current_scope.device_type == caffe2_pb2.CUDA),
504  )
505 
506  lr, _ = self.build_lr(
507  net, param_init_net,
508  base_learning_rate=self.alpha,
509  policy=self.policy,
510  **(self.init_kwargs)
511  )
512 
513  if self.rowWise:
514  shapes, types = workspace.InferShapesAndTypes([param_init_net])
515  if str(param) not in shapes:
516  # Type/shape inference is not available for this param; fall back
517  # on Shape/Slice logic.
518  shape = param_init_net.Shape(param, str(param) + "_shape")
519  num_rows = param_init_net.Slice(
520  [shape],
521  str(shape) + "_numrows",
522  starts=[0], ends=[1]
523  )
524  param_squared_sum = param_init_net.ConstantFill(
525  num_rows,
526  str(param) + "_avg_squared_sum",
527  input_as_shape=1,
528  value=0.0
529  )
530  else:
531  param_squared_sum = param_init_net.ConstantFill(
532  [],
533  str(param) + "_avg_squared_sum",
534  shape=[shapes[str(param)][0]],
535  value=0.0
536  )
537 
538  else:
539  param_squared_sum = param_init_net.ConstantFill(
540  [param],
541  str(param) + "_squared_sum",
542  value=0.0
543  )
544 
545  self._aux_params.local.append(param_squared_sum)
546 
547  if self.rowWise:
548  assert isinstance(grad, core.GradientSlice),\
549  'When rowWise=True, the gradient must be '\
550  'a GradientSlice. Please ensure that rowWise is not enabled '\
551  'for the dense Adagrad optimizer, as it is not supported.'
552  if isinstance(grad, core.GradientSlice):
553  assert self.decay == 1.,\
554  'Decay is not implemented for SparseAdagrad and must be set to 1'
555  grad = self.dedup(net, self.sparse_dedup_aggregator, grad)
556  if self.rowWise:
557  op = 'RowWiseSparseAdagrad'
558  else:
559  op = 'SparseAdagrad'
560  net.__getattr__(op)(
561  [param, param_squared_sum, grad.indices, grad.values, lr],
562  [param, param_squared_sum],
563  epsilon=self.epsilon,
564  engine=self.engine
565  )
566  else:
567  net.Adagrad(
568  [param, param_squared_sum, grad, lr],
569  [param, param_squared_sum],
570  epsilon=self.epsilon,
571  decay=float(self.decay),
572  engine=self.engine
573  )
574 
575  def scale_learning_rate(self, scale):
576  self.alpha *= scale
577  return
578 
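As a reading aid, a small NumPy sketch of the textbook dense Adagrad step that the fused Adagrad op above corresponds to (lr is negative, as produced by build_lr); the helper is illustrative only and relies on the module-level `import numpy as np`.

def _adagrad_reference(param, squared_sum, grad, lr, epsilon, decay=1.0):
    # Accumulate squared gradients, then take a per-coordinate scaled step.
    squared_sum = decay * squared_sum + grad * grad
    param = param + lr * grad / (np.sqrt(squared_sum) + epsilon)
    return param, squared_sum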
579 
580 class FtrlOptimizer(Optimizer):
581  def __init__(self, alpha=0.01, beta=1e-4, lambda1=0, lambda2=0,
582  sparse_dedup_aggregator=None, engine=''):
583  super(FtrlOptimizer, self).__init__()
584  self.alpha = alpha
585  self.beta = beta
586  self.lambda1 = lambda1
587  self.lambda2 = lambda2
588  self.sparse_dedup_aggregator = sparse_dedup_aggregator
589  self.engine = engine
590 
591  def _run(self, net, param_init_net, param_info):
592  param = param_info.blob
593  grad = param_info.grad
594 
595  if self.alpha <= 0:
596  return
597 
598  nz = param_init_net.ConstantFill(
599  [param],
600  str(param) + "_ftrl_nz",
601  extra_shape=[2],
602  value=0.0
603  )
604  self._aux_params.local.append(nz)
605  if isinstance(grad, core.GradientSlice):
606  grad = self.dedup(net, self.sparse_dedup_aggregator, grad)
607  net.SparseFtrl(
608  [param, nz, grad.indices, grad.values],
609  [param, nz],
610  engine=self.engine,
611  alpha=self.alpha,
612  beta=self.beta,
613  lambda1=self.lambda1,
614  lambda2=self.lambda2
615  )
616  else:
617  net.Ftrl(
618  [param, nz, grad],
619  [param, nz],
620  engine=self.engine,
621  alpha=self.alpha,
622  beta=self.beta,
623  lambda1=self.lambda1,
624  lambda2=self.lambda2
625  )
626 
627  def scale_learning_rate(self, scale):
628  self.alpha *= scale
629  return
630 
631 
632 class AdamOptimizer(Optimizer):
633  def __init__(self, alpha=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8,
634  policy='fixed', sparse_dedup_aggregator=None, rowWise=False,
635  engine='', **kwargs):
636  super(AdamOptimizer, self).__init__()
637  self.alpha = alpha
638  self.beta1 = beta1
639  self.beta2 = beta2
640  self.epsilon = epsilon
641  self.policy = policy
642  self.sparse_dedup_aggregator = sparse_dedup_aggregator
643  self.rowWise = rowWise
644  self.engine = engine
645  self.init_kwargs = kwargs
646 
647  def _run(self, net, param_init_net, param_info):
648  param = param_info.blob
649  grad = param_info.grad
650 
651  if self.alpha <= 0:
652  return
653 
654  lr, iteration = self.build_lr(
655  net, param_init_net,
656  base_learning_rate=self.alpha,
657  policy=self.policy,
658  **(self.init_kwargs)
659  )
660 
661  m1 = param_init_net.ConstantFill(
662  [param],
663  param + "_first_moment",
664  value=0.0
665  )
666  if self.rowWise:
667  shapes, types = workspace.InferShapesAndTypes([param_init_net])
668  m2 = param_init_net.ConstantFill(
669  [],
670  param + "_avg_second_moment",
671  shape=[shapes[param][0]],
672  value=0.0
673  )
674 
675  else:
676 
677  m2 = param_init_net.ConstantFill(
678  [param],
679  param + "_second_moment",
680  value=0.0
681  )
682 
683  self._aux_params.shared.append(iteration)
684  self._aux_params.local.append(m1)
685  self._aux_params.local.append(m2)
686 
687  if self.rowWise:
688  assert isinstance(grad, core.GradientSlice),\
689  'When rowWise=True, the gradient must be '\
690  'a GradientSlice. Please ensure that rowWise is not enabled '\
691  'for the dense Adam optimizer, as it is not supported.'
692  if isinstance(grad, core.GradientSlice):
693  grad = self.dedup(net, self.sparse_dedup_aggregator, grad)
694  if self.rowWise:
695  op = 'RowWiseSparseAdam'
696  else:
697  op = 'SparseAdam'
698  net.__getattr__(op)(
699  [param, m1, m2, grad.indices, grad.values, lr, iteration],
700  [param, m1, m2],
701  beta1=self.beta1,
702  beta2=self.beta2,
703  epsilon=self.epsilon
704  )
705 
706  else:
707  net.Adam(
708  [param, m1, m2, grad, lr, iteration],
709  [param, m1, m2],
710  beta1=self.beta1,
711  beta2=self.beta2,
712  epsilon=self.epsilon)
713 
714  def scale_learning_rate(self, scale):
715  self.alpha *= scale
716  return
717 
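Similarly, a NumPy sketch of the textbook Adam step that the fused Adam op above corresponds to (t is the 1-based iteration count and lr is negative); illustrative only, not part of the original module.

def _adam_reference(param, m1, m2, grad, lr, t, beta1, beta2, epsilon):
    # Update biased first/second moment estimates, then take a bias-corrected step.
    m1 = beta1 * m1 + (1.0 - beta1) * grad
    m2 = beta2 * m2 + (1.0 - beta2) * grad * grad
    correction = np.sqrt(1.0 - beta2 ** t) / (1.0 - beta1 ** t)
    param = param + lr * correction * m1 / (np.sqrt(m2) + epsilon)
    return param, m1, m2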
718 
719 class YellowFinOptimizer(Optimizer):
720  """YellowFin: An automatic tuner for momentum SGD
721 
722  See https://arxiv.org/abs/1706.03471 for more details. This implementation
723  has a separate learning rate and momentum for each parameter."""
724 
725  def __init__(self,
726  alpha=0.1,
727  mu=0.0,
728  beta=0.999,
729  curv_win_width=20,
730  zero_debias=True,
731  epsilon=0.1**6,
732  policy='fixed',
733  sparse_dedup_aggregator=None,
734  **kwargs):
735  super(YellowFinOptimizer, self).__init__()
736  self.alpha = alpha
737  self.mu = mu
738  self.beta = beta
739  self.curv_win_width = curv_win_width
740  self.zero_debias = zero_debias
741  self.epsilon = epsilon
742  self.policy = policy
743  self.sparse_dedup_aggregator = sparse_dedup_aggregator
744  self.init_kwargs = kwargs
745 
746  def _run(self, net, param_init_net, param_info):
747 
748  # Note: This is the number of persistent scalars used by the YellowFin
749  # optimizer. It should always match the number of scalars being used,
750  # and the same number must be used by the YellowFin operator itself.
751  SCALARS_MEMORY_SIZE = 5
752 
753  param = param_info.blob
754  grad = param_info.grad
755  moment = param_init_net.ConstantFill(
756  [param],
757  param + "_moment",
758  value=0.0
759  )
760  curv_win = param_init_net.ConstantFill(
761  [],
762  param + "_curv_win",
763  shape=[self.curv_win_width],
764  value=0.0
765  )
766  g_avg = param_init_net.ConstantFill(
767  [param],
768  param + "_g_avg",
769  value=0.0
770  )
771  g2_avg = param_init_net.ConstantFill(
772  [param],
773  param + "_g2_avg",
774  value=0.0
775  )
776  lr_avg = param_init_net.ConstantFill(
777  [],
778  param + "_lr_avg",
779  shape=[1],
780  value=self.alpha
781  )
782  mu_avg = param_init_net.ConstantFill(
783  [],
784  param + "_mu_avg",
785  shape=[1],
786  value=self.mu
787  )
788  scalars_memory = param_init_net.ConstantFill(
789  [],
790  param + "_scalars_memory",
791  shape=[SCALARS_MEMORY_SIZE],
792  value=0.0
793  )
794 
795  assert self.alpha > 0
796  assert not isinstance(grad, core.GradientSlice), \
797  "YellowFin does not support sparse gradients"
798 
799  if not param_init_net.BlobIsDefined(_OPTIMIZER_ITERATION_NAME):
800  # Add training operators.
801  with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
802  iteration = param_init_net.ConstantFill(
803  [],
804  _OPTIMIZER_ITERATION_NAME,
805  shape=[1],
806  value=0,
807  dtype=core.DataType.INT64)
808  iter_mutex = param_init_net.CreateMutex([],
809  ["iteration_mutex"])
810  net.AtomicIter([iter_mutex, iteration], [iteration])
811  else:
812  iteration = param_init_net.GetBlobRef(_OPTIMIZER_ITERATION_NAME)
813 
814  self._aux_params.shared.append(iteration)
815  self._aux_params.local.append(moment)
816  self._aux_params.local.append(lr_avg)
817  self._aux_params.local.append(mu_avg)
818  self._aux_params.local.append(curv_win)
819  self._aux_params.local.append(g_avg)
820  self._aux_params.local.append(g2_avg)
821  self._aux_params.local.append(scalars_memory)
822 
823  yf_in_out_args = [
824  param,
825  moment,
826  lr_avg,
827  mu_avg,
828  curv_win,
829  g_avg,
830  g2_avg,
831  scalars_memory
832  ]
833 
834  net.YellowFin(
835  yf_in_out_args + [grad, iteration],
836  yf_in_out_args,
837  beta=self.beta,
838  epsilon=self.epsilon,
839  curv_win_width=self.curv_win_width,
840  zero_debias=self.zero_debias)
841 
842  def scale_learning_rate(self, scale):
843  self.alpha *= scale
844  return
845 
846 
847 class RmsPropOptimizer(Optimizer):
848  def __init__(
849  self,
850  alpha=0.01,
851  decay=0.9,
852  momentum=0.0,
853  epsilon=1e-5,
854  policy='fixed',
855  engine='',
856  **kwargs
857  ):
858  super(RmsPropOptimizer, self).__init__()
859  self.alpha = alpha
860  self.decay = decay
861  self.momentum = momentum
862  self.epsilon = epsilon
863  self.policy = policy
864  self.engine = engine
865  self.init_kwargs = kwargs
866 
867  def _run(self, net, param_init_net, param_info):
868  param = param_info.blob
869  grad = param_info.grad
870 
871  assert self.alpha > 0
872  assert not isinstance(grad, core.GradientSlice), \
873  "RmsPropOptimizer doesn't support sparse gradients"
874 
875  dev = scope.CurrentDeviceScope()
876  if dev is None:
877  dev = core.DeviceOption(caffe2_pb2.CPU)
878 
879  ONE = param_init_net.ConstantFill(
880  [],
881  "ONE_{}_{}".format(dev.device_type, dev.cuda_gpu_id),
882  shape=[1],
883  value=1.0
884  )
885 
886  lr, _ = self.build_lr(
887  net,
888  param_init_net,
889  base_learning_rate=-self.alpha,
890  policy=self.policy,
891  **(self.init_kwargs)
892  )
893 
894  grad_o = param_init_net.ConstantFill(
895  [param],
896  str(param) + "_grad_o",
897  value=0.0,
898  )
899 
900  ms = param_init_net.ConstantFill(
901  [param],
902  str(param) + "_mean_squares",
903  value=0.0,
904  )
905 
906  mom = param_init_net.ConstantFill(
907  [param],
908  str(param) + "_momentum",
909  value=0.0,
910  )
911 
912  self._aux_params.local.append(ms)
913  self._aux_params.local.append(mom)
914 
915  net.RmsProp(
916  [grad, ms, mom, ONE],
917  [grad_o, ms, mom],
918  decay=self.decay,
919  momentum=self.momentum,
920  epsilon=self.epsilon,
921  engine=self.engine,
922  )
923 
924  net.MomentumSGDUpdate(
925  [grad_o, mom, lr, param],
926  [grad_o, mom, param],
927  )
928 
929  def scale_learning_rate(self, scale):
930  self.alpha *= scale
931  return
932 
933 
934 def _get_param_to_device(model):
935  # Infer blob devices by going through the net and param_init_net
936  # ops and observing the device used to create or use the blob.
937  param_to_device = core.InferBlobDevices(model.net)
938  param_to_device.update(core.InferBlobDevices(model.param_init_net))
939  return param_to_device
940 
941 
942 def get_param_device(param_name, grad, param_to_device=None, default_device=None):
943  device = default_device
944  param_to_device = param_to_device or {}
945  # We first check whether the parameter's device has been inferred. If not,
946  # we check the gradient. This can happen if the parameter is not the output
947  # of any op but was created by a FetchBlob.
948  if param_name in param_to_device:
949  device = param_to_device[param_name]
950  else:
951  if isinstance(grad, core.GradientSlice):
952  grad = grad
953  if str(grad.values) in param_to_device:
954  device = param_to_device[str(grad.values)]
955  elif str(grad.indices) in param_to_device:
956  device = param_to_device[str(grad.indices)]
957  else:
958  grad_name = str(grad)
959  if grad_name in param_to_device:
960  device = param_to_device[grad_name]
961 
962  assert device is not None,\
963  "Cannot infer device for {}: no op creates it".format(param_name)
964  return device
965 
966 
967 def get_lr_injection():
968  """
969  Gets current value for lr_injection, a multiplier for all base
970  learning rates.
971  Must set allow_lr_injection=True when building optimizer, as it
972  relies on synchronization over CPU.
973  """
974  return workspace.FetchBlob(_LEARNING_RATE_INJECTION)
975 
976 
977 def set_lr_injection(lr_injection_value):
978  """
979  Sets lr_injection, a multiplier for all base learning rates.
980  Must set allow_lr_injection=True when building optimizer, as it
981  relies on synchronization over CPU.
982  """
983  workspace.FeedBlob(
984  _LEARNING_RATE_INJECTION,
985  np.array(
986  [float(lr_injection_value)],
987  dtype=np.float32,
988  ),
989  )
990 
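A minimal usage sketch for the two helpers above; it assumes the optimizer was built with allow_lr_injection=True and that a training loop exists elsewhere (the helper name is hypothetical).

def _example_lr_injection():
    # Halve every base learning rate mid-training, then read the value back.
    set_lr_injection(0.5)
    return get_lr_injection()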
991 
992 def _calc_norm_ratio(
993  model, params, name_scope, param_to_device, max_gradient_norm
994 ):
995  with core.NameScope(name_scope):
996  grad_squared_sums = []
997  for i, param in enumerate(params):
998  device = get_param_device(
999  str(param.blob), param.grad, param_to_device
1000  )
1001 
1002  with core.DeviceScope(device):
1003  grad = (
1004  param.grad
1005  if not isinstance(
1006  param.grad,
1007  core.GradientSlice,
1008  ) else param.grad.values
1009  )
1010 
1011  grad_squared_sum_name = 'grad_{}_squared_sum'.format(i)
1012  grad_squared_sum = model.net.SumSqrElements(
1013  grad,
1014  grad_squared_sum_name,
1015  )
1016  grad_squared_sum_cpu = model.net.EnsureCPUOutput(
1017  grad_squared_sum
1018  )
1019  grad_squared_sums.append(grad_squared_sum_cpu)
1020 
1021  with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
1022  grad_squared_full_sum = model.net.Sum(
1023  grad_squared_sums,
1024  'grad_squared_full_sum',
1025  )
1026  global_norm = model.net.Pow(
1027  grad_squared_full_sum,
1028  'global_norm',
1029  exponent=0.5,
1030  )
1031  clip_norm = model.param_init_net.ConstantFill(
1032  [],
1033  'clip_norm',
1034  shape=[],
1035  value=float(max_gradient_norm),
1036  )
1037  max_norm = model.net.Max(
1038  [global_norm, clip_norm],
1039  'max_norm',
1040  )
1041  norm_ratio = model.net.Div(
1042  [clip_norm, max_norm],
1043  'norm_ratio',
1044  )
1045  return norm_ratio
1046 
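The function above yields the scalar norm_ratio = clip_norm / max(global_norm, clip_norm) that later scales the learning rate. A NumPy sketch of the same computation (illustrative only):

def _norm_ratio_reference(grads, max_gradient_norm):
    # Global L2 norm over all gradient arrays, then the clipping ratio in (0, 1].
    global_norm = np.sqrt(sum(float(np.sum(g * g)) for g in grads))
    return max_gradient_norm / max(global_norm, max_gradient_norm)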
1047 
1048 def _build(
1049  model,
1050  optimizer,
1051  weights_only=False,
1052  use_param_info_optim=True,
1053  max_gradient_norm=None,
1054  allow_lr_injection=False,
1055 ):
1056  param_to_device = _get_param_to_device(model)
1057 
1058  # Validate there are no duplicate params
1059  model.Validate()
1060 
1061  params = []
1062  for param_info in model.GetOptimizationParamInfo():
1063  if weights_only and param_info.blob not in model.weights:
1064  continue
1065  params.append(param_info)
1066 
1067  lr_multiplier = None
1068  if max_gradient_norm is not None:
1069  lr_multiplier = _calc_norm_ratio(
1070  model,
1071  params,
1072  'norm_clipped_grad_update',
1073  param_to_device,
1074  max_gradient_norm,
1075  )
1076 
1077  if allow_lr_injection:
1078  if not model.net.BlobIsDefined(_LEARNING_RATE_INJECTION):
1079  lr_injection = model.param_init_net.ConstantFill(
1080  [],
1081  _LEARNING_RATE_INJECTION,
1082  shape=[1],
1083  value=1.0,
1084  )
1085  else:
1086  lr_injection = _LEARNING_RATE_INJECTION
1087 
1088  if lr_multiplier is None:
1089  lr_multiplier = lr_injection
1090  else:
1091  lr_multiplier = model.net.Mul(
1092  [lr_multiplier, lr_injection],
1093  'lr_multiplier',
1094  broadcast=1,
1095  )
1096  optimizer.add_lr_multiplier(lr_multiplier)
1097 
1098  for param_info in params:
1099  param_name = str(param_info.blob)
1100 
1101  device = get_param_device(param_name, param_info.grad, param_to_device)
1102 
1103  with core.DeviceScope(device):
1104  if param_info.optimizer and use_param_info_optim:
1105  param_info.optimizer(model.net, model.param_init_net, param_info)
1106  else:
1107  optimizer(model.net, model.param_init_net, param_info)
1108  return optimizer
1109 
1110 
1111 def add_weight_decay(model, weight_decay):
1112  """Adds a decay to weights in the model.
1113 
1114  This is a form of L2 regularization.
1115 
1116  Args:
1117  weight_decay: strength of the regularization
1118  """
1119  _build(
1120  model,
1121  WeightDecayBuilder(weight_decay=weight_decay),
1122  weights_only=True,
1123  use_param_info_optim=False,
1124  )
1125 
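WeightDecayBuilder rewrites each gradient in place via WeightedSum([grad, ONE, param, WD], grad). In plain arithmetic (illustrative sketch, not part of the original module):

def _weight_decay_reference(grad, param, weight_decay):
    # L2 regularization: the decayed parameter is folded into the gradient.
    return grad + weight_decay * param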
1126 
1127 def build_sgd(
1128  model,
1129  base_learning_rate,
1130  max_gradient_norm=None,
1131  allow_lr_injection=False,
1132  **kwargs
1133 ):
1134  sgd_optimizer = SgdOptimizer(base_learning_rate, **kwargs)
1135  return _build(
1136  model,
1137  sgd_optimizer,
1138  max_gradient_norm=max_gradient_norm,
1139  allow_lr_injection=allow_lr_injection,
1140  )
1141 
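A hedged end-to-end sketch of build_sgd usage; the assumption that `model` is a ModelHelper with gradients already added, and the specific hyperparameter values, are illustrative only.

def _example_build_sgd(model):
    # 'model' is assumed to be a model_helper.ModelHelper whose forward pass and
    # AddGradientOperators(...) call were set up elsewhere.
    build_sgd(
        model,
        base_learning_rate=0.1,
        momentum=0.9,
        policy="step",
        stepsize=1,                  # policy kwargs are forwarded to the LearningRate op
        gamma=0.999,
        max_gradient_norm=10.0,      # enable global gradient-norm clipping
        allow_lr_injection=True,     # enable get_lr_injection()/set_lr_injection()
    )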
1142 
1143 def build_multi_precision_sgd(
1144  model,
1145  base_learning_rate,
1146  max_gradient_norm=None,
1147  allow_lr_injection=False,
1148  **kwargs
1149 ):
1150  multi_prec_sgd_optimizer = MultiPrecisionSgdOptimizer(
1151  base_learning_rate, **kwargs
1152  )
1153  return _build(
1154  model,
1155  multi_prec_sgd_optimizer,
1156  max_gradient_norm=max_gradient_norm,
1157  allow_lr_injection=allow_lr_injection,
1158  )
1159 
1160 
1161 def build_fp16_sgd(model, base_learning_rate, **kwargs):
1162  fp16_sgd_optimizer = FP16SgdOptimizer(
1163  base_learning_rate, **kwargs
1164  )
1165  return _build(model, fp16_sgd_optimizer)
1166 
1167 
1168 def build_ftrl(model, engine="SIMD", **kwargs):
1169  if engine == "SIMD":
1170  assert core.IsOperator('Ftrl_ENGINE_SIMD')
1171  assert core.IsOperator('SparseFtrl_ENGINE_SIMD')
1172  ftrl_optimizer = FtrlOptimizer(engine=engine, **kwargs)
1173  return _build(model, ftrl_optimizer)
1174 
1175 
1176 def build_adagrad(
1177  model,
1178  base_learning_rate,
1179  parameters=None,
1180  max_gradient_norm=None,
1181  allow_lr_injection=False,
1182  **kwargs
1183 ):
1184  adagrad_optimizer = AdagradOptimizer(alpha=base_learning_rate, **kwargs)
1185  return _build(
1186  model,
1187  adagrad_optimizer,
1188  max_gradient_norm=max_gradient_norm,
1189  allow_lr_injection=allow_lr_injection,
1190  )
1191 
1192 
1193 def build_adam(
1194  model,
1195  base_learning_rate,
1196  max_gradient_norm=None,
1197  allow_lr_injection=False,
1198  **kwargs
1199 ):
1200  adam_optimizer = AdamOptimizer(alpha=base_learning_rate, **kwargs)
1201  return _build(
1202  model,
1203  adam_optimizer,
1204  max_gradient_norm=max_gradient_norm,
1205  allow_lr_injection=allow_lr_injection,
1206  )
1207 
1208 
1209 def build_yellowfin(model, base_learning_rate=0.1, **kwargs):
1210  yellowfin_optimizer = YellowFinOptimizer(
1211  alpha=base_learning_rate,
1212  **kwargs)
1213  return _build(model, yellowfin_optimizer)
1214 
1215 
1216 def build_rms_prop(
1217  model,
1218  base_learning_rate,
1219  max_gradient_norm=None,
1220  allow_lr_injection=False,
1221  **kwargs
1222 ):
1223  rms_prop_optimizer = RmsPropOptimizer(alpha=base_learning_rate, **kwargs)
1224  return _build(
1225  model,
1226  rms_prop_optimizer,
1227  max_gradient_norm=max_gradient_norm,
1228  allow_lr_injection=allow_lr_injection,
1229  )