in optim.py
import tensorflow as tf
import horovod.tensorflow as hvd
# `Z` is the repo's tensor-ops module providing allreduce_mean() (cross-worker mean).


def adam2_old(params, cost_or_grads, lr=3e-4, mom1=0.9, mom2=0.999, epsilon=1e-8):
    updates = []
    if type(cost_or_grads) is not list:
        gs = tf.gradients(cost_or_grads, params)
    else:
        gs = cost_or_grads

    # All-reduce: average the gradients and the squared gradients across
    # workers, so the second moment is estimated from the per-worker
    # gradients rather than from the square of the globally averaged gradient.
    grads1 = [Z.allreduce_mean(g) for g in gs]
    grads2 = [Z.allreduce_mean(tf.square(g)) for g in gs]

    # Shorten the second-moment decay to account for the hvd.size()-times
    # larger effective batch, clipping at 0.
    mom2 = tf.maximum(0., 1. - (hvd.size() * (1. - mom2)))

    # Step counter and bias-corrected learning rate (standard Adam correction).
    t = tf.Variable(1., trainable=False, name='adam_t')
    lr_t = lr * tf.sqrt(1. - tf.pow(mom2, t)) / (1. - tf.pow(mom1, t))
    updates.append(t.assign_add(1))

    for p, g1, g2 in zip(params, grads1, grads2):
        # Second-moment accumulator for this parameter.
        mg = tf.Variable(tf.zeros(p.get_shape()), trainable=False,
                         name=p.name.split(':')[0] + '_adam_mg')
        if mom1 > 0:
            # First-moment (momentum) accumulator.
            v = tf.Variable(tf.zeros(p.get_shape()), trainable=False,
                            name=p.name.split(':')[0] + '_adam_v')
            v_t = mom1 * v + (1. - mom1) * g1
            updates.append(v.assign(v_t))
        else:
            v_t = g1
        mg_t = mom2 * mg + (1. - mom2) * g2
        delta_t = v_t / (tf.sqrt(mg_t) + epsilon)
        p_t = p - lr_t * delta_t
        updates.append(mg.assign(mg_t))
        updates.append(p.assign(p_t))
    return tf.group(*updates)
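
For context, a minimal usage sketch follows, assuming a Horovod-enabled TF1 session; the toy softmax-regression model, random batches, and step count are illustrative assumptions, not part of optim.py.

# Usage sketch (hypothetical wiring, not from the original file).
import numpy as np
import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()

# Toy model: softmax regression on 784-dim inputs (illustrative only).
x = tf.placeholder(tf.float32, [None, 784])
y = tf.placeholder(tf.float32, [None, 10])
w = tf.Variable(tf.zeros([784, 10]), name='w')
b = tf.Variable(tf.zeros([10]), name='b')
logits = tf.matmul(x, w) + b
loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits_v2(labels=y, logits=logits))

# Passing the scalar cost lets adam2_old call tf.gradients itself;
# a precomputed gradient list would also be accepted.
train_op = adam2_old([w, b], loss, lr=3e-4)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # Start all workers from rank 0's initial weights.
    sess.run(hvd.broadcast_global_variables(0))
    for _ in range(100):
        batch_x = np.random.rand(64, 784).astype(np.float32)
        batch_y = np.eye(10, dtype=np.float32)[np.random.randint(10, size=64)]
        sess.run(train_op, feed_dict={x: batch_x, y: batch_y})

Launched under something like `horovodrun -np 4 python train.py`, hvd.size() is 4, so the mom2 rescaling inside adam2_old shortens the second-moment halflife accordingly.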