in DST/learn.py [0:0]
# Assumed imports for this excerpt: in the transformers versions this code targets,
# AdamW and get_linear_schedule_with_warmup are available from transformers.optimization
# (newer stacks provide AdamW via torch.optim instead).
from transformers.optimization import AdamW, get_linear_schedule_with_warmup


def get_optimizers(model, args, train_num, optimizer_name, specify_adafactor_lr):
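    """
    Build the optimizer and (optionally) the learning-rate scheduler for training.

    optimizer_name selects 'adam' (AdamW with a linear warmup/decay schedule) or
    'adafactor'; specify_adafactor_lr chooses between a fixed Adafactor learning
    rate (scheduler is None) and Adafactor's relative-step schedule.
    """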
    # Prepare optimizer and schedule (linear warmup and decay).
    # Bias and LayerNorm weights are excluded from weight decay.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    # Effective batch size across GPUs and accumulation steps determines the number of update steps.
    overall_batch_size = args.number_of_gpu * args.batch_size_per_gpu * args.gradient_accumulation_steps
    num_training_steps = train_num * args.epoch_num // overall_batch_size
    print('----------')
    if optimizer_name == 'adam':
        print('Use Adam Optimizer for Training.')
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=num_training_steps)
    elif optimizer_name == 'adafactor':
        from transformers.optimization import Adafactor, AdafactorSchedule
        print('Use Adafactor Optimizer for Training.')
        if specify_adafactor_lr:
            print('Use a specified learning rate.')
            # Fixed learning rate: disable Adafactor's relative-step and parameter-scaled updates.
            optimizer = Adafactor(
                optimizer_grouped_parameters,
                lr=1e-3,
                eps=(1e-30, 1e-3),
                clip_threshold=1.0,
                decay_rate=-0.8,
                beta1=None,
                weight_decay=0.0,
                relative_step=False,
                scale_parameter=False,
                warmup_init=False,
            )
            scheduler = None
        else:
            print('Do not specify a learning rate; rely on the Adafactor relative-step schedule.')
            optimizer = Adafactor(
                optimizer_grouped_parameters,
                scale_parameter=True,
                relative_step=True,
                warmup_init=True,
                lr=None,
            )
            scheduler = AdafactorSchedule(optimizer)
    else:
        raise ValueError('Unknown optimizer name: {}'.format(optimizer_name))
    print('----------')
    return optimizer, scheduler
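

# Usage sketch (illustrative only): the argument values and the model below are hypothetical
# placeholders; only the attribute names are taken from what get_optimizers reads off `args`.
if __name__ == '__main__':
    from argparse import Namespace
    from transformers import T5ForConditionalGeneration  # placeholder model class, an assumption

    demo_args = Namespace(
        weight_decay=0.0, learning_rate=1e-4, adam_epsilon=1e-8, warmup_steps=100,
        number_of_gpu=1, batch_size_per_gpu=4, gradient_accumulation_steps=2, epoch_num=10,
    )
    demo_model = T5ForConditionalGeneration.from_pretrained('t5-small')
    optimizer, scheduler = get_optimizers(
        demo_model, demo_args, train_num=8000,
        optimizer_name='adam', specify_adafactor_lr=False,
    )
    # Typical update step: loss.backward(); optimizer.step(); scheduler.step(); optimizer.zero_grad()
    print(type(optimizer).__name__, type(scheduler).__name__)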