in optimizer.py
import blocksparse as bs
# mpi_rank() is assumed to be provided by the surrounding project's MPI utilities.


def bs_adam(grads, variables, learning_rate, beta2=0.999,
            grad_scale=1.0, max_grad_norm=1.0,
            fp16_mean_var=True, static_loss_scaling=False, **kwargs):
    # Set clip_norm to a large value to disable clipping while still collecting
    # the global norm; the norm is also used for dynamic loss scaling.
    if not max_grad_norm:
        max_grad_norm = 9e9
    if static_loss_scaling:
        global_norm, norm_scale = bs.clip_by_global_norm(grads,
                                                         grad_scale=grad_scale,
                                                         clip_norm=max_grad_norm,
                                                         saturate=65504.0,  # max finite fp16 value
                                                         zero_nans=True)
    else:
        # First compute the unfiltered global norm so the caller can detect
        # NaN/inf overflow for dynamic loss scaling, then zero out infs/NaNs
        # and recompute the norm scale that is actually used for clipping.
        global_norm, _ = bs.clip_by_global_norm(grads,
                                                grad_scale=grad_scale,
                                                clip_norm=max_grad_norm)
        grads = [bs.filter_tensor(g, zero_infs=True, zero_nans=True) for g in grads]
        _, norm_scale = bs.clip_by_global_norm(grads,
                                               grad_scale=grad_scale,
                                               clip_norm=max_grad_norm)
    adam = bs.AdamOptimizer(
        learning_rate=learning_rate,
        beta2=beta2,
        norm_scale=norm_scale,
        grad_scale=grad_scale,
        fp16=fp16_mean_var,
        zero_init_variables=mpi_rank() != 0,  # zero-init optimizer state on non-root MPI ranks
        saturate=65504.0, zero_nans=True)
    return adam.apply_gradients(zip(grads, variables)), global_norm
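A minimal usage sketch, not part of the original file: it wires bs_adam into a TensorFlow 1.x graph with a toy regression model. It assumes blocksparse is installed with GPU support, that a single-process run reports mpi_rank() == 0, and that grad_scale follows the usual convention of undoing a static loss scale (grad_scale = 1 / loss_scale); the loss scale value and hyperparameters below are illustrative only.

import numpy as np
import tensorflow as tf

# Toy model: a single linear layer, just to have trainable variables and a scalar loss.
x = tf.placeholder(tf.float32, [None, 8])
y = tf.placeholder(tf.float32, [None, 1])
w = tf.get_variable("w", [8, 1], dtype=tf.float32)
loss = tf.reduce_mean(tf.square(tf.matmul(x, w) - y))

# Static loss scaling: scale the loss up before differentiation, undo it in the optimizer.
loss_scale = 2.0 ** 15  # hypothetical fixed loss scale
params = tf.trainable_variables()
grads = tf.gradients(loss * loss_scale, params)
train_op, global_norm = bs_adam(grads, params,
                                learning_rate=1e-4,
                                grad_scale=1.0 / loss_scale,
                                max_grad_norm=1.0,
                                static_loss_scaling=True)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    feed = {x: np.random.randn(32, 8).astype(np.float32),
            y: np.random.randn(32, 1).astype(np.float32)}
    _, gn = sess.run([train_op, global_norm], feed_dict=feed)
    print("global grad norm:", gn)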