higher/optim.py [347:376]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
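                # First- and second-moment exponential moving averages of the
                # gradient, computed out of place so the optimizer state stays
                # in the autograd graph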
                state['exp_avg'] = exp_avg = (exp_avg * beta1) + (1 - beta1) * g
                state['exp_avg_sq'] = exp_avg_sq = (
                    (exp_avg_sq * beta2) + (1 - beta2) * g * g
                )

                # Deal with stability issues: sqrt has an unbounded gradient at
                # zero, so entries where exp_avg_sq == 0 are flagged and handled
                # by _maybe_mask
                mask = exp_avg_sq == 0.
                _maybe_mask(exp_avg_sq, mask)

                if amsgrad:
                    # Maintain the maximum of all second-moment running averages
                    # seen so far
                    state['max_exp_avg_sq'] = max_exp_avg_sq = _torch.max(
                        max_exp_avg_sq, exp_avg_sq
                    )
                    # Use that maximum to normalize the running average of the
                    # gradient
                    denom = _add(
                        max_exp_avg_sq.sqrt() / _math.sqrt(bias_correction2),
                        group['eps']
                    )
                else:
                    denom = _add(
                        exp_avg_sq.sqrt() / _math.sqrt(bias_correction2),
                        group['eps']
                    )

                # Fold the first-moment bias correction into the step size; the
                # second-moment correction is already applied inside denom
                step_size = group['lr'] / bias_correction1

                # Out-of-place parameter update, equivalent to
                # p - step_size * exp_avg / denom, keeping the step differentiable
                group['params'][p_idx] = _addcdiv(
                    p, -step_size, exp_avg, denom
                )
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



higher/optim.py [425:454]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
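                # First- and second-moment exponential moving averages of the
                # gradient, computed out of place so the optimizer state stays
                # in the autograd graph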
                state['exp_avg'] = exp_avg = (exp_avg * beta1) + (1 - beta1) * g
                state['exp_avg_sq'] = exp_avg_sq = (
                    (exp_avg_sq * beta2) + (1 - beta2) * g * g
                )

                # Deal with stability issues: sqrt has an unbounded gradient at
                # zero, so entries where exp_avg_sq == 0 are flagged and handled
                # by _maybe_mask
                mask = exp_avg_sq == 0.
                _maybe_mask(exp_avg_sq, mask)

                if amsgrad:
                    # Maintain the maximum of all second-moment running averages
                    # seen so far
                    state['max_exp_avg_sq'] = max_exp_avg_sq = _torch.max(
                        max_exp_avg_sq, exp_avg_sq
                    )
                    # Use that maximum to normalize the running average of the
                    # gradient
                    denom = _add(
                        max_exp_avg_sq.sqrt() / _math.sqrt(bias_correction2),
                        group['eps']
                    )
                else:
                    denom = _add(
                        exp_avg_sq.sqrt() / _math.sqrt(bias_correction2),
                        group['eps']
                    )

                # Fold the first-moment bias correction into the step size; the
                # second-moment correction is already applied inside denom
                step_size = group['lr'] / bias_correction1

                # Out-of-place parameter update, equivalent to
                # p - step_size * exp_avg / denom, keeping the step differentiable
                group['params'][p_idx] = _addcdiv(
                    p, -step_size, exp_avg, denom
                )
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



