in metaicl/model.py [0:0]
# Assumed imports: Adafactor, AdamW, and get_linear_schedule_with_warmup come from the transformers library.
def setup_optimizer(self, optimization, num_training_steps, lr, weight_decay, warmup_steps):
    # Exclude biases and LayerNorm weights from weight decay; the per-group
    # 'weight_decay' values below override any default passed to the optimizer.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': weight_decay},
        {'params': [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    if optimization == "adafactor":
        # Adafactor with a fixed (non-relative) learning rate and no built-in warmup.
        optimizer = Adafactor(optimizer_grouped_parameters,
                              lr=lr,
                              relative_step=False,
                              warmup_init=False,
                              weight_decay=weight_decay)
        scheduler = None
    elif optimization.startswith("adamw"):
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=lr,
                          eps=1e-08,
                          weight_decay=weight_decay)
        if self.fp16:
            # setup_fp16 (helper defined elsewhere) wraps the model and optimizer
            # for mixed-precision training.
            self.model, optimizer = setup_fp16(self.model, optimizer)
        if optimization == "adamw":
            # Linear warmup for `warmup_steps`, then linear decay to zero.
            scheduler = get_linear_schedule_with_warmup(optimizer,
                                                        num_warmup_steps=warmup_steps,
                                                        num_training_steps=num_training_steps)
        else:
            raise NotImplementedError()
    elif optimization == "8bit-adam":
        # 8-bit Adam from bitsandbytes to reduce optimizer-state memory.
        import bitsandbytes as bnb
        optimizer = bnb.optim.Adam8bit(optimizer_grouped_parameters,
                                       lr=lr, betas=(0.9, 0.995))
        if self.fp16:
            self.model, optimizer = setup_fp16(self.model, optimizer)
        scheduler = get_linear_schedule_with_warmup(optimizer,
                                                    num_warmup_steps=warmup_steps,
                                                    num_training_steps=num_training_steps)
    else:
        raise NotImplementedError()
    self.optimizer = optimizer
    self.scheduler = scheduler
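
The fp16 branches above call a setup_fp16 helper that is not shown in this excerpt. As a rough sketch of what such a helper typically does, here is a minimal version built on NVIDIA apex's amp; the apex dependency and the "O1" opt_level are assumptions, and the repository's actual setup_fp16 may differ.

# Minimal sketch of a setup_fp16 helper (assumption: apex amp with opt_level "O1";
# the actual helper in metaicl/model.py may differ).
def setup_fp16(model, optimizer):
    try:
        from apex import amp
    except ImportError:
        raise ImportError("fp16 training requires NVIDIA apex (https://github.com/NVIDIA/apex)")
    # amp.initialize patches the model and optimizer for mixed-precision training.
    model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
    return model, optimizer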
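
For context, a minimal sketch of how this method might be called when setting up training; the constructor arguments, the load() call, and the hyperparameter values are illustrative assumptions rather than code taken from the repository.

# Illustrative call site (argument values and construction are assumptions, not repository defaults).
model = MetaICLModel(logger=logger, out_dir="checkpoints")  # hypothetical constructor arguments
model.load()                      # load the underlying LM before building the optimizer
model.setup_optimizer(optimization="adamw",
                      num_training_steps=30000,
                      lr=1e-5,
                      weight_decay=0.0,
                      warmup_steps=0)
# model.optimizer and model.scheduler are then used in the training loop
# (scheduler may be None, e.g. for adafactor).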