in abstractive_summarization/src/others/optimizer.py [0:0]
def build_optim(args, model, checkpoint, pretrained_model=None):
    """ Build optimizer """
    if args.recadam:
        print("Using RecAdam")
        # Parameters whose names contain these substrings are excluded from weight decay.
        no_decay = ["bias", "layer_norm.weight", "layernorm_embedding.weight"]
        # Four groups: (decayed / non-decayed) x (pretrained backbone / newly added parameters).
        # Only backbone parameters (args.model_type appears in the name) get anneal_w > 0,
        # i.e. only they are pulled back towards their pretrained values during fine-tuning.
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters() if
                           not any(nd in n for nd in no_decay) and args.model_type in n],
                "weight_decay": args.weight_decay,
                "anneal_w": args.anneal_w,
                "pretrain_params": [p_p for p_n, p_p in pretrained_model.named_parameters() if
                                    not any(nd in p_n for nd in no_decay) and args.model_type in p_n]
            },
            {
                "params": [p for n, p in model.named_parameters() if
                           not any(nd in n for nd in no_decay) and args.model_type not in n],
                "weight_decay": args.weight_decay,
                "anneal_w": 0.0,
                "pretrain_params": [p_p for p_n, p_p in pretrained_model.named_parameters() if
                                    not any(nd in p_n for nd in no_decay) and args.model_type not in p_n]
            },
            {
                "params": [p for n, p in model.named_parameters() if
                           any(nd in n for nd in no_decay) and args.model_type in n],
                "weight_decay": 0.0,
                "anneal_w": args.anneal_w,
                "pretrain_params": [p_p for p_n, p_p in pretrained_model.named_parameters() if
                                    any(nd in p_n for nd in no_decay) and args.model_type in p_n]
            },
            {
                "params": [p for n, p in model.named_parameters() if
                           any(nd in n for nd in no_decay) and args.model_type not in n],
                "weight_decay": 0.0,
                "anneal_w": 0.0,
                "pretrain_params": [p_p for p_n, p_p in pretrained_model.named_parameters() if
                                    any(nd in p_n for nd in no_decay) and args.model_type not in p_n]
            }
        ]
        optim = RecAdam(optimizer_grouped_parameters, lr=args.lr, eps=args.adam_epsilon,
                        anneal_fun=args.anneal_fun, anneal_k=args.anneal_k,
                        anneal_t0=args.anneal_t0, pretrain_cof=args.pretrain_cof)
    else:
        optim = Optimizer(
            args.optim, args.lr, args.max_grad_norm,
            beta1=args.beta1, beta2=args.beta2,
            decay_method=args.decay_method,
            warmup_steps=args.warmup_steps, model_size=args.enc_hidden_size)
        optim.set_parameters(list(model.named_parameters()))
        # Resume the optimizer state when continuing from an existing (non-xsum) checkpoint.
        if args.train_from != '' and 'xsum' not in args.train_from:
            optim.optimizer.load_state_dict(checkpoint['optim'])
            if args.visible_gpu != '-1':
                # Move the restored optimizer state tensors onto the GPU.
                for state in optim.optimizer.state.values():
                    for k, v in state.items():
                        if torch.is_tensor(v):
                            state[k] = v.cuda()
            if (optim.method == 'adam') and (len(optim.optimizer.state) < 1):
                raise RuntimeError(
                    "Error: loaded Adam optimizer from existing model" +
                    " but optimizer state is empty")

    return optim
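
For context, here is a minimal usage sketch of the RecAdam path of this builder. The Namespace values, the build_model constructor, and the deep-copied pretrained_model are illustrative assumptions, not the repo's actual training script; only the field names mirror the attributes read above.

# Hypothetical usage sketch (not part of the repo): build a RecAdam optimizer,
# using a frozen copy of the pretrained model as the anchor for annealing.
import copy
from argparse import Namespace

args = Namespace(
    recadam=True, model_type='bart',     # substring used to split backbone vs. new params
    lr=3e-5, adam_epsilon=1e-8, weight_decay=0.01,
    anneal_w=1.0, anneal_fun='sigmoid', anneal_k=0.5, anneal_t0=250,
    pretrain_cof=5000.0,
)

model = build_model(args)                # hypothetical model constructor
pretrained_model = copy.deepcopy(model)  # snapshot of the pretrained weights to recall
for p in pretrained_model.parameters():
    p.requires_grad = False              # the snapshot is only read, never updated

optim = build_optim(args, model, checkpoint=None, pretrained_model=pretrained_model)

Keeping pretrained_model on the same device as model, with gradients disabled, is enough here: the optimizer only reads those tensors when computing the recall penalty towards the pretrained weights.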