in src/run_fusion_in_decoder.py
from torch.optim import AdamW  # assumed import source; the original may pull AdamW from transformers instead
from transformers import get_linear_schedule_with_warmup


def configure_optimizers(self):
    # Prepare optimizer and schedule (linear warmup and decay).
    # Biases and LayerNorm weights are excluded from weight decay.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": self.hparams.optim.weight_decay,
        },
        {
            "params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=self.learning_rate, eps=self.hparams.optim.adam_epsilon)

    # Warm up linearly for `warmup_steps`, then decay the LR to zero over `t_total` optimizer steps.
    scheduler = {
        "scheduler": get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=self.warmup_steps, num_training_steps=self.t_total
        ),
        "monitor": "val_loss",  # metric watched by callbacks; not used by this fixed schedule
        "interval": "step",     # step the LR every optimizer step rather than every epoch
        "frequency": 1,
    }
    return [optimizer], [scheduler]
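
The schedule above assumes `self.warmup_steps` and `self.t_total` are already set on the module before training starts. A minimal sketch of how those step counts could be derived from the dataset size and trainer settings follows; the helper name `compute_schedule_steps`, the `warmup_ratio` default, and the example numbers are illustrative assumptions, not values taken from this repository.

import math


def compute_schedule_steps(num_train_examples, batch_size,
                           accumulate_grad_batches=1, max_epochs=1,
                           warmup_ratio=0.1):
    """Return (warmup_steps, t_total) for get_linear_schedule_with_warmup.

    t_total counts optimizer steps, so gradient accumulation divides the
    per-epoch step count; warmup_ratio is the fraction of total steps
    spent warming up before the linear decay begins.
    """
    steps_per_epoch = math.ceil(num_train_examples / (batch_size * accumulate_grad_batches))
    t_total = steps_per_epoch * max_epochs
    warmup_steps = int(warmup_ratio * t_total)
    return warmup_steps, t_total


# Hypothetical example: 100,000 training examples, batch size 4, accumulation 8, 10 epochs.
warmup_steps, t_total = compute_schedule_steps(100_000, batch_size=4,
                                               accumulate_grad_batches=8, max_epochs=10)

Because `interval` is set to `"step"` in `configure_optimizers`, Lightning advances this schedule once per optimizer step, so `t_total` should match the number of optimizer updates, not the number of raw batches.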