from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import logging

from caffe2.proto import caffe2_pb2
from caffe2.python import core
from caffe2.python.optimizer import get_param_device
from caffe2.python.modeling.net_modifier import NetModifier

logger = logging.getLogger(__name__)

class GradientClipping(NetModifier):

    L1_NORM = 'l1_norm'
    L2_NORM = 'l2_norm'

    # gradient clipping is currently only supported for dense gradients
    BY_NORM = 'by_norm'

    GRAD_CLIP_METHODS = [BY_NORM]
    CLIP_GRADIENT_NORM_TYPES = [L2_NORM, L1_NORM]

    def __init__(self, grad_clip_method, clip_norm_type, clip_threshold,
                 use_parameter_norm=False, compute_norm_ratio=False):
        """
        Clips gradients to avoid gradient magnitude explosion or vanishing gradients.

        Args:
        grad_clip_method: ways to clip the gradients
        clip_norm_type: type of norm used in the necessary computation
        clip_threshold: threshold used to determine whether to clip
        use_parameter_norm: a boolean to indicate whether to incorporate
            the norm of the parameter
        compute_norm_ratio: a boolean to compute the ratio between gradient norm
            and parameter norm explicitly, for debugging purposes
        """

        assert grad_clip_method in self.GRAD_CLIP_METHODS, (
            "This method of clipping, {}, has not been implemented.".format(
                grad_clip_method))

        assert clip_norm_type in self.CLIP_GRADIENT_NORM_TYPES, (
            "This norm type, {}, has not been implemented.".format(
                clip_norm_type))

        self.grad_clip_method = grad_clip_method
        self.clip_norm_type = clip_norm_type
        self.clip_threshold = float(clip_threshold)
        self.use_parameter_norm = use_parameter_norm
        self.compute_norm_ratio = compute_norm_ratio

    def modify_net(self, net, init_net=None, grad_map=None, blob_to_device=None):

        assert grad_map is not None

        CPU = core.DeviceOption(caffe2_pb2.CPU)

        for param, grad in grad_map.items():

            # sparse gradients (GradientSlice) are not clipped; supporting
            # them would require further implementation
            if isinstance(grad, core.GradientSlice):
                continue

            device = get_param_device(
                param,
                grad_map[str(param)],
                param_to_device=blob_to_device,
                default_device=CPU,
            )

            with core.DeviceScope(device):
                if self.grad_clip_method == self.BY_NORM:
                    if self.clip_norm_type == self.L2_NORM:
                        p = 2
                    elif self.clip_norm_type == self.L1_NORM:
                        p = 1

                    grad_norm = net.LpNorm(
                        [grad],
                        net.NextScopedBlob(prefix=str(grad) +
                                           '_l{}_norm'.format(p)),
                        p=p,
                    )

                    if p == 2:
                        # LpNorm with p=2 returns the squared L2 norm, so
                        # take the square root to recover the actual norm
                        grad_norm = net.Pow([grad_norm], exponent=0.5)

                    op_inputs = [grad, grad_norm]

                    if self.use_parameter_norm:
                        param_norm = net.LpNorm(
                            [param],
                            net.NextScopedBlob(
                                prefix=str(param) + '_l{}_norm'.format(p)),
                            p=p,
                        )

                        if p == 2:
                            param_norm = net.Pow([param_norm], exponent=0.5)

                        op_inputs.append(param_norm)

                        if self.compute_norm_ratio:
                            # expose grad_norm / param_norm as a named blob
                            # purely for debugging
                            net.Div(
                                [grad_norm, param_norm],
                                [net.NextScopedBlob(
                                    prefix=str(param) + '_norm_ratio')]
                            )

                    # scale the gradient down when its norm exceeds the
                    # threshold (with use_parameter_norm, the parameter norm
                    # is passed in as an additional scaling input)
                    net.ClipTensorByScaling(
                        op_inputs,
                        [grad],
                        threshold=self.clip_threshold,
                    )
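
# A minimal usage sketch (an assumption for illustration, not part of this
# module): the net, loss blob, and threshold below are hypothetical.
# GradientClipping is a NetModifier, so once gradient operators exist in a
# net it can be applied by passing the param -> grad map, e.g. the one
# returned by AddGradientOperators:
#
#     from caffe2.python import core
#
#     train_net = core.Net("train_net")
#     # ... forward ops producing a scalar blob named "loss" go here ...
#     grad_map = train_net.AddGradientOperators(["loss"])
#
#     clipper = GradientClipping(
#         grad_clip_method=GradientClipping.BY_NORM,
#         clip_norm_type=GradientClipping.L2_NORM,
#         clip_threshold=1.0,
#     )
#     clipper.modify_net(train_net, grad_map=grad_map)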