tensorflow_addons/optimizers/novograd.py [154:177]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        var_device, var_dtype = var.device, var.dtype.base_dtype
        coefficients = (apply_state or {}).get(
            (var_device, var_dtype)
        ) or self._fallback_apply_state(var_device, var_dtype)
        weight_decay = self._get_hyper("weight_decay", var_dtype)
        grad_averaging = self._get_hyper("grad_averaging")

        v = self.get_slot(var, "v")
        g_2 = tf.reduce_sum(tf.square(grad))
        v_t = tf.cond(
            tf.equal(self.iterations, 0),
            lambda: g_2,
            lambda: v * coefficients["beta_2_t"]
            + g_2 * coefficients["one_minus_beta_2_t"],
        )
        v_t = v.assign(v_t, use_locking=self._use_locking)

        if self.amsgrad:
            vhat = self.get_slot(var, "vhat")
            vhat_t = vhat.assign(tf.maximum(vhat, v_t), use_locking=self._use_locking)
            grad = grad / (tf.sqrt(vhat_t) + self.epsilon)
        else:
            grad = grad / (tf.sqrt(v_t) + self.epsilon)
        grad = tf.cond(
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
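
For reference, both excerpts in this report implement the same per-variable second-moment update: v is a single scalar, seeded with ||grad||^2 on the first iteration and thereafter updated as an exponential moving average, v_t = beta_2 * v + (1 - beta_2) * ||grad||^2. A minimal standalone illustration of that step (the constants below are example values of our own, not taken from the library):

import tensorflow as tf

beta_2 = 0.98                                   # example decay rate
grad = tf.constant([0.1, -0.2, 0.3])            # example gradient
g_2 = tf.reduce_sum(tf.square(grad))            # ||grad||^2, a scalar
v_prev = tf.constant(0.05)                      # hypothetical previous v
v_t = beta_2 * v_prev + (1.0 - beta_2) * g_2    # EMA step for iterations > 0
print(float(v_t))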



tensorflow_addons/optimizers/novograd.py [197:221]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        var_device, var_dtype = var.device, var.dtype.base_dtype
        coefficients = (apply_state or {}).get(
            (var_device, var_dtype)
        ) or self._fallback_apply_state(var_device, var_dtype)
        weight_decay = self._get_hyper("weight_decay", var_dtype)
        grad_averaging = self._get_hyper("grad_averaging")

        v = self.get_slot(var, "v")
        g_2 = tf.reduce_sum(tf.square(grad))
        # v is just a scalar and does not need to involve sparse tensors.
        v_t = tf.cond(
            tf.equal(self.iterations, 0),
            lambda: g_2,
            lambda: v * coefficients["beta_2_t"]
            + g_2 * coefficients["one_minus_beta_2_t"],
        )
        v_t = v.assign(v_t, use_locking=self._use_locking)

        if self.amsgrad:
            vhat = self.get_slot(var, "vhat")
            vhat_t = vhat.assign(tf.maximum(vhat, v_t), use_locking=self._use_locking)
            grad = grad / (tf.sqrt(vhat_t) + self.epsilon)
        else:
            grad = grad / (tf.sqrt(v_t) + self.epsilon)
        grad = tf.cond(
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -




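Within the spans shown, the two excerpts differ only in the explanatory comment on the sparse path, so the shared second-moment update and gradient normalization could be factored into one helper. A minimal sketch, assuming it is added as a method of the NovoGrad optimizer class; the helper name _normalized_grad is ours, and self.get_slot, self.iterations, self.amsgrad, self.epsilon and self._use_locking are assumed to be available exactly as in the excerpts:

import tensorflow as tf


def _normalized_grad(self, grad, var, coefficients):
    # Logic duplicated between the dense and sparse apply paths shown above:
    # update the per-variable scalar second moment, then normalize the gradient.
    v = self.get_slot(var, "v")
    g_2 = tf.reduce_sum(tf.square(grad))
    # On step 0, v is seeded with ||grad||^2; afterwards it is an exponential
    # moving average of ||grad||^2.
    v_t = tf.cond(
        tf.equal(self.iterations, 0),
        lambda: g_2,
        lambda: v * coefficients["beta_2_t"]
        + g_2 * coefficients["one_minus_beta_2_t"],
    )
    v_t = v.assign(v_t, use_locking=self._use_locking)

    if self.amsgrad:
        # AMSGrad variant: normalize by the running maximum of v_t instead.
        vhat = self.get_slot(var, "vhat")
        vhat_t = vhat.assign(tf.maximum(vhat, v_t), use_locking=self._use_locking)
        return grad / (tf.sqrt(vhat_t) + self.epsilon)
    return grad / (tf.sqrt(v_t) + self.epsilon)

Each apply method would then reduce to grad = self._normalized_grad(grad, var, coefficients), followed by its own grad-averaging and weight-decay handling.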