in pycls/core/optimizer.py
def construct_optimizer(model):
    """Constructs the optimizer.

    Note that the momentum update in PyTorch differs from the one in Caffe2.
    In particular,

        Caffe2:
            V := mu * V + lr * g
            p := p - V

        PyTorch:
            V := mu * V + g
            p := p - lr * V

    where V is the velocity, mu is the momentum factor, lr is the learning rate,
    g is the gradient, and p are the parameters.

    Since V is defined independently of the learning rate in PyTorch,
    when the learning rate is changed there is no need to perform the
    momentum correction by scaling V (unlike in the Caffe2 case).
    """
    # Split parameters into types and get weight decay for each type
    optim, wd, params = cfg.OPTIM, cfg.OPTIM.WEIGHT_DECAY, [[], [], [], []]
    for n, p in model.named_parameters():
        ks = [k for (k, x) in enumerate(["bn", "ln", "bias", ""]) if x in n]
        params[ks[0]].append(p)
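    # For example, a name like "stem.bn.weight" matches "bn" first (index 0),
    # "head.fc.bias" matches "bias" (index 2), and "stem.conv.weight" matches only
    # the empty string, which is a substring of every name, so index 3 is the
    # catch-all group (the names here are illustrative of pycls parameter naming).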
    wds = [
        cfg.BN.CUSTOM_WEIGHT_DECAY if cfg.BN.USE_CUSTOM_WEIGHT_DECAY else wd,
        cfg.LN.CUSTOM_WEIGHT_DECAY if cfg.LN.USE_CUSTOM_WEIGHT_DECAY else wd,
        optim.BIAS_CUSTOM_WEIGHT_DECAY if optim.BIAS_USE_CUSTOM_WEIGHT_DECAY else wd,
        wd,
    ]
    param_wds = [{"params": p, "weight_decay": w} for (p, w) in zip(params, wds) if p]
    # Set up optimizer
    if optim.OPTIMIZER == "sgd":
        optimizer = torch.optim.SGD(
            param_wds,
            lr=optim.BASE_LR,
            momentum=optim.MOMENTUM,
            weight_decay=wd,
            dampening=optim.DAMPENING,
            nesterov=optim.NESTEROV,
        )
    elif optim.OPTIMIZER == "adam":
        optimizer = torch.optim.Adam(
            param_wds,
            lr=optim.BASE_LR,
            betas=(optim.BETA1, optim.BETA2),
            weight_decay=wd,
        )
    elif optim.OPTIMIZER == "adamw":
        optimizer = torch.optim.AdamW(
            param_wds,
            lr=optim.BASE_LR,
            betas=(optim.BETA1, optim.BETA2),
            weight_decay=wd,
        )
    else:
        raise NotImplementedError
    return optimizer
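
A minimal usage sketch (illustrative, not part of optimizer.py): it assumes the cfg node read by the function has already been loaded, sets only the fields referenced above, and uses a plain torch.nn model as a stand-in for a pycls-built one. Because the per-type weight decay groups key off pycls-style parameter names ("bn", "ln", "bias"), a stand-in model may put most parameters in the default group.

# Illustrative sketch only: the cfg fields set here are the ones read above;
# all other optimizer-related fields fall back to the pycls config defaults.
import torch
import torch.nn as nn
import torch.nn.functional as F

from pycls.core.config import cfg
from pycls.core.optimizer import construct_optimizer

cfg.OPTIM.OPTIMIZER = "sgd"      # "sgd", "adam", or "adamw" per the branches above
cfg.OPTIM.BASE_LR = 0.1
cfg.OPTIM.MOMENTUM = 0.9
cfg.OPTIM.WEIGHT_DECAY = 5e-5

# Stand-in model; a real pycls model would be built from the config instead.
model = nn.Sequential(nn.Linear(8, 16), nn.ReLU(), nn.Linear(16, 2))
optimizer = construct_optimizer(model)

# One standard training step with the returned torch.optim optimizer.
x, y = torch.randn(4, 8), torch.randint(0, 2, (4,))
loss = F.cross_entropy(model(x), y)
optimizer.zero_grad()
loss.backward()
optimizer.step()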