in gossip_sgd_adpsgd.py [0:0]
def update_learning_rate(optimizer, epoch, itr=None, itr_per_epoch=None,
                         scale=1):
    """
    1) Linearly warmup to reference learning rate (5 epochs)
    2) Decay learning rate exponentially (epochs 30, 60, 80)
    ** note: args.lr is the reference learning rate from which to scale up
    ** note: minimum global batch-size is 256
    """
    target_lr = args.lr * args.batch_size * scale * args.world_size / 256

    # Optionally drive the schedule off global (cluster-wide) counters
    # instead of this worker's local epoch/iteration counters.
    if args.global_itr is not None and args.global_epoch is not None:
        epoch = args.global_epoch
        itr_per_epoch *= args.world_size
        itr = args.global_itr % itr_per_epoch

    lr = None
    if args.warmup and epoch < 5:  # warmup to scaled lr
        if target_lr <= args.lr:
            lr = target_lr
        else:
            assert itr is not None and itr_per_epoch is not None
            # Linearly interpolate from args.lr up to target_lr over the
            # first 5 epochs.
            count = epoch * itr_per_epoch + itr + 1
            incr = (target_lr - args.lr) * (count / (5 * itr_per_epoch))
            lr = args.lr + incr
    else:
        lr = target_lr
        # Apply every decay factor whose epoch threshold has been reached.
        for e in args.lr_schedule:
            if epoch >= e:
                lr *= args.lr_schedule[e]

    if lr is not None:
        log.debug('Updating learning rate to {}'.format(lr))
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr
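

# Minimal usage sketch (not part of the original file): one way this schedule
# could be driven from a training loop. It assumes the module-level `args`
# namespace and `log` logger that update_learning_rate reads; the attribute
# names mirror those accessed above, while the concrete values, the toy
# model, and the loop bounds are hypothetical.
if __name__ == '__main__':
    import argparse
    import logging

    import torch

    logging.basicConfig(level=logging.DEBUG)
    log = logging.getLogger(__name__)
    args = argparse.Namespace(
        lr=0.1,                                    # reference learning rate
        batch_size=32,                             # per-worker batch size
        world_size=32,                             # number of workers
        warmup=True,                               # 5-epoch linear warmup
        lr_schedule={30: 0.1, 60: 0.1, 80: 0.1},   # epoch -> decay factor
        global_itr=None,                           # use local counters here
        global_epoch=None)

    model = torch.nn.Linear(10, 2)
    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr)

    itr_per_epoch = 100
    for epoch in range(90):
        for itr in range(itr_per_epoch):
            update_learning_rate(optimizer, epoch, itr=itr,
                                 itr_per_epoch=itr_per_epoch)
            # ... forward / backward / optimizer.step() goes here ...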