in benchmarks/horovod-resnet/train_imagenet_resnet_hvd.py
def apply_gradients(self, gradvars, *args, **kwargs):
    # L2 norm of every variable and of every gradient (0.0 for missing grads).
    v_list = [tf.norm(tensor=v, ord=2) for _, v in gradvars]
    g_list = [tf.norm(tensor=g, ord=2) if g is not None else 0.0 for g, _ in gradvars]
    v_norms = tf.stack(v_list)
    g_norms = tf.stack(g_list)
    zeds = tf.zeros_like(v_norms)
    # Fall back to epsilon if the weights or grads are 0, to avoid division
    # by zero; this also prevents biases from getting stuck at their
    # initialization (0.).
    cond = tf.logical_and(tf.not_equal(v_norms, zeds), tf.not_equal(g_norms, zeds))
    # LARC trust ratio per variable: eta * ||w|| / ||g||.
    true_vals = tf.scalar_mul(self._eta, tf.div(v_norms, g_norms))
    false_vals = tf.fill(tf.shape(v_norms), self._epsilon)
    larc_local_lr = tf.where(cond, true_vals, false_vals)
    if self._clip:
        ones = tf.ones_like(v_norms)
        lr = tf.fill(tf.shape(v_norms), self._learning_rate)
        # We need the gradients to compute the local learning rates, so
        # compute_gradients of the wrapped optimizer has to be called first,
        # at which point the global learning rate is already fixed. We
        # therefore scale the gradients instead of the learning rate,
        # clipping the per-variable rate to min(local_lr / lr, 1).
        larc_local_lr = tf.minimum(tf.div(larc_local_lr, lr), ones)
    # Rescale each gradient by its per-variable LARC factor, leaving
    # variables without gradients untouched.
    gradvars = [
        (tf.multiply(larc_local_lr[i], g), v) if g is not None else (None, v)
        for i, (g, v) in enumerate(gradvars)
    ]
    return self._optimizer.apply_gradients(gradvars, *args, **kwargs)
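For context, here is a minimal sketch of how this wrapper might be driven in a TF1 training script. The `LarcOptimizer` constructor signature is an assumption: its argument names simply mirror the private fields the method above reads (`self._optimizer`, `self._learning_rate`, `self._eta`, `self._epsilon`, `self._clip`), and the real signature in the file may differ. The toy model and hyperparameter values are illustrative only.

import tensorflow as tf

# Toy model: a weight matrix and a zero-initialized bias with a quadratic
# loss. The bias starts at 0, so its ||w|| is 0 and apply_gradients falls
# back to epsilon instead of freezing it, as the comment above describes.
w = tf.get_variable("w", shape=[4, 4], initializer=tf.ones_initializer())
b = tf.get_variable("b", shape=[4], initializer=tf.zeros_initializer())
loss = tf.reduce_sum(tf.square(w)) + tf.reduce_sum(tf.square(b))

base_lr = 0.1
base_opt = tf.train.MomentumOptimizer(learning_rate=base_lr, momentum=0.9)

# Hypothetical constructor; argument names mirror the fields used above.
opt = LarcOptimizer(optimizer=base_opt, learning_rate=base_lr,
                    eta=0.002, epsilon=1e-5, clip=True)

# compute_gradients runs on the wrapped optimizer with the LR already fixed;
# apply_gradients then rescales each (grad, var) pair by the clipped LARC
# ratio min(eta * ||w|| / (lr * ||g||), 1) before the momentum update.
gradvars = base_opt.compute_gradients(loss)
train_op = opt.apply_gradients(gradvars)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(train_op)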