Caffe2 - Python API
A deep learning, cross platform ML framework
gradient_clipping.py
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from caffe2.python import core
from caffe2.proto import caffe2_pb2
from caffe2.python.optimizer import get_param_device
from caffe2.python.modeling.net_modifier import NetModifier

import logging

logger = logging.getLogger(__name__)


class GradientClipping(NetModifier):

    L1_NORM = 'l1_norm'
    L2_NORM = 'l2_norm'

    BY_NORM = 'by_norm'

    GRAD_CLIP_METHODS = [BY_NORM]
    CLIP_GRADIENT_NORM_TYPES = [L2_NORM, L1_NORM]

    def __init__(self, grad_clip_method, clip_norm_type, clip_threshold,
                 use_parameter_norm=False, compute_norm_ratio=False):
        """
        Clips gradients to avoid gradient magnitude explosion or vanishing gradients.

        Args:
            grad_clip_method: method used to clip the gradients
            clip_norm_type: type of norm used in the necessary computation
            clip_threshold: threshold used to determine whether to clip
            use_parameter_norm: a boolean to indicate whether to incorporate
                the norm of the parameter
            compute_norm_ratio: a boolean to compute the ratio between gradient
                norm and parameter norm explicitly, for debugging purposes
        """

        assert grad_clip_method in self.GRAD_CLIP_METHODS, (
            "This method of clipping, {}, has not been implemented.".format(
                grad_clip_method))

        assert clip_norm_type in self.CLIP_GRADIENT_NORM_TYPES, (
            "This type of norm, {}, has not been implemented.".format(
                clip_norm_type))

        self.grad_clip_method = grad_clip_method
        self.clip_norm_type = clip_norm_type
        self.clip_threshold = float(clip_threshold)
        self.use_parameter_norm = use_parameter_norm
        self.compute_norm_ratio = compute_norm_ratio

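    # NetModifier entry point: for every (param, gradient) pair in grad_map,
    # append the norm-computation and clipping operators for that gradient to `net`.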
    def modify_net(self, net, init_net=None, grad_map=None, blob_to_device=None):

        assert grad_map is not None

        CPU = core.DeviceOption(caffe2_pb2.CPU)

        for param, grad in grad_map.items():

            # currently sparse gradients won't be clipped;
            # further implementation is needed to enable it
            if isinstance(grad, core.GradientSlice):
                continue

            device = get_param_device(
                param,
                grad_map[str(param)],
                param_to_device=blob_to_device,
                default_device=CPU,
            )

            with core.DeviceScope(device):
                if self.grad_clip_method == self.BY_NORM:
                    if self.clip_norm_type == self.L2_NORM:
                        p = 2
                    elif self.clip_norm_type == self.L1_NORM:
                        p = 1

                    grad_norm = net.LpNorm(
                        [grad],
                        net.NextScopedBlob(prefix=str(grad) + '_l{}_norm'.format(p)),
                        p=p,
                    )

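                    # LpNorm with p=2 yields the sum of squares, so take the
                    # square root to obtain the actual L2 norm of the gradient.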
                    if p == 2:
                        grad_norm = net.Pow([grad_norm], exponent=0.5)

                    op_inputs = [grad, grad_norm]

                    if self.use_parameter_norm:
                        param_norm = net.LpNorm(
                            [param],
                            net.NextScopedBlob(
                                prefix=str(param) + '_l{}_norm'.format(p)),
                            p=p,
                        )

                        if p == 2:
                            param_norm = net.Pow([param_norm], exponent=0.5)

                        op_inputs.append(param_norm)

                        if self.compute_norm_ratio:
                            net.Div(
                                [grad_norm, param_norm],
                                [net.NextScopedBlob(
                                    prefix=str(param) + '_norm_ratio')]
                            )

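                    # Rescale the gradient in place when its norm exceeds the
                    # threshold; with use_parameter_norm, the parameter norm is
                    # passed as an additional input so that clipping is
                    # effectively relative to the parameter's magnitude.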
                    net.ClipTensorByScaling(
                        op_inputs,
                        [grad],
                        threshold=self.clip_threshold,
                    )
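A minimal sketch of how this modifier might be applied to a ModelHelper-based training net. The tiny model below (the 'data' and 'label' blobs, the FC layer, and the squared-L2 loss) is an illustrative assumption and not part of this file; only the GradientClipping construction and the modify_net call mirror the code above.

from caffe2.python import brew, model_helper
from caffe2.python.modeling.gradient_clipping import GradientClipping

# Illustrative toy model: one FC layer and a squared-L2 loss.
model = model_helper.ModelHelper(name="clip_demo")
fc = brew.fc(model, 'data', 'fc_out', dim_in=16, dim_out=1)
dist = model.net.SquaredL2Distance([fc, 'label'], 'dist')
loss = model.net.AveragedLoss(dist, 'loss')

# Generate gradients; ModelHelper records a param -> gradient map.
model.AddGradientOperators([loss])

# Rescale any dense gradient whose L2 norm exceeds 1.0.
clipper = GradientClipping(
    grad_clip_method='by_norm',   # only BY_NORM is implemented
    clip_norm_type='l2_norm',
    clip_threshold=1.0,
)
clipper.modify_net(model.net, grad_map=model.param_to_grad)

Because ClipTensorByScaling writes its result back into the gradient blob, any optimizer operators added after this point consume the clipped gradients.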