in models.py [0:0]
def get_bert_optim(network, lr, weight_decay):
no_decay = ["bias", "LayerNorm.weight"]
decay_params = []
nodecay_params = []
for n, p in network.named_parameters():
if any(nd in n for nd in no_decay):
decay_params.append(p)
else:
nodecay_params.append(p)
optimizer_grouped_parameters = [
{
"params": decay_params,
"weight_decay": weight_decay,
},
{
"params": nodecay_params,
"weight_decay": 0.0,
},
]
optimizer = AdamW(
optimizer_grouped_parameters,
lr=lr,
eps=1e-8)
return optimizer