in jukebox/utils/fp16.py
def step(self, closure=None, scale=1.0):
    """Performs a single optimization step, dividing gradients by ``scale``.

    Arguments:
        closure (callable, optional): A closure that reevaluates the model
            and returns the loss.
        scale (float, optional): factor to divide gradient tensor values
            by before applying them to the weights. (default: 1.0)
    """
    loss = None
    if closure is not None:
        loss = closure()

    for group in self.param_groups:
        # Integer flag passed through to the fused step (1 = apply bias correction).
        bias_correction = 1 if group["bias_correction"] else 0

        for p in group["params"]:
            if p.grad is None:
                continue
            grad = p.grad.data
            state = self.state[p]

            # State initialization: Adam moments are stored in fp32 even when
            # the parameters themselves are fp16.
            if len(state) == 0:
                state["step"] = 0
                # Exponential moving average of gradient values
                state["exp_avg"] = torch.zeros_like(p.data).float()
                # Exponential moving average of squared gradient values
                state["exp_avg_sq"] = torch.zeros_like(p.data).float()

            exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]
            beta1, beta2 = group["betas"]
            state["step"] += 1
            # Empty placeholder for the fused step's optional output-parameter
            # slot (unused here).
            out_p = torch.tensor([], dtype=torch.float)
            fused_adam_step(
                p.data,
                out_p,
                exp_avg,
                exp_avg_sq,
                grad,
                group["lr"],
                beta1,
                beta2,
                group["eps"],
                scale,
                state["step"],
                self.eps_mode,
                bias_correction,
                group["weight_decay"],
            )
    return loss
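
For context, a minimal loss-scaling sketch of how the ``step(scale=...)`` contract above is typically used. The toy model, the loss-scale value, and the ``FusedAdam(model.parameters(), lr=...)`` constructor call are illustrative assumptions; the only behavior taken from the method above is that gradients are divided by ``scale`` before being applied.

import torch

# Assumes a CUDA device; the toy model and data are illustrative only.
model = torch.nn.Linear(16, 1).cuda().half()          # fp16 parameters
optimizer = FusedAdam(model.parameters(), lr=1e-3)    # assumed constructor signature
loss_scale = 2.0 ** 16                                # static loss scale

for _ in range(10):
    x = torch.randn(8, 16, device="cuda", dtype=torch.half)
    optimizer.zero_grad()
    loss = model(x).float().pow(2).mean()             # reduce loss in fp32
    (loss * loss_scale).backward()                    # scale up to avoid fp16 gradient underflow
    optimizer.step(scale=loss_scale)                  # step() divides gradients back down by scale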