rlalgos/ppo/discrete_ppo.py [203:248]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        action_probabilities = replayed["action_probabilities"]
        # We compute the generalized advantage estimate (GAE)
        gae = self.get_gae(
            trajectories["_observation/done"],
            reward,
            replayed["critic"].squeeze(-1),
            replayed["_critic"].squeeze(-1),
            self.config["discount_factor"],
            self.config["gae_coef"],
        )
        # The GAE also serves as the temporal-difference error regressed by the critic
        td = gae

        critic_loss = td ** 2
        avg_critic_loss = critic_loss.mean()

        action_distribution = torch.distributions.Categorical(action_probabilities)
        log_proba = action_distribution.log_prob(trajectories["action/action"])
        # Policy-gradient surrogate; the advantage is detached so only the actor receives this gradient
        a2c_loss = log_proba * td.detach()
        avg_a2c_loss = a2c_loss.mean()

        entropy = action_distribution.entropy()
        avg_entropy = entropy.mean()

        return DictTensor(
            {
                "critic_loss": avg_critic_loss,
                "a2c_loss": avg_a2c_loss,
                "entropy_loss": avg_entropy,
            }
        )

    def get_gae(self, done, reward, critic, _critic, discount_factor=1, _lambda=0):
        """Compute the generalized advantage estimate over a batch of trajectories.

        Args:
            done: [batch, T] episode-termination flags.
            reward: [batch, T] rewards.
            critic: [batch, T] value estimates for the current observations.
            _critic: [batch, T] value estimates for the next observations.
            discount_factor: discount factor gamma.
            _lambda: GAE mixing coefficient lambda.
        """
        r = reward
        d = done.float()
        # Bootstrapped one-step target, with the next-state value masked at episode ends
        target = r + discount_factor * _critic.detach() * (1.0 - d)

        # One-step TD errors: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
        delta = target - critic
        T = done.size(1)
        # Backward recursion: gae_t = delta_t + gamma * lambda * (1 - done_t) * gae_{t+1}
        gae = delta[:, -1]
        gaes = [gae]
        for t in range(T - 2, -1, -1):
            gae = delta[:, t] + discount_factor * _lambda * (1 - d[:, t]) * gae
            gaes.append(gae)
        gaes = [g.unsqueeze(-1) for g in reversed(gaes)]
        return torch.cat(gaes, dim=1)
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
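
For reference, the backward recursion above can be exercised in isolation. The following is a minimal sketch assuming inputs of shape [batch, T], matching how get_gae indexes its arguments; the names gae_sketch, gamma, and lam are illustrative and not taken from the repository.

import torch

def gae_sketch(done, reward, value, next_value, gamma=0.99, lam=0.95):
    # One-step TD errors, with the bootstrap value masked at episode ends
    delta = reward + gamma * next_value * (1.0 - done) - value
    T = done.size(1)
    adv = torch.zeros_like(delta)
    adv[:, -1] = delta[:, -1]
    # Backward pass: adv_t = delta_t + gamma * lam * (1 - done_t) * adv_{t+1}
    for t in range(T - 2, -1, -1):
        adv[:, t] = delta[:, t] + gamma * lam * (1.0 - done[:, t]) * adv[:, t + 1]
    return adv

# Dummy rollout: 4 environments, 8 timesteps
done = torch.zeros(4, 8)
reward = torch.randn(4, 8)
value = torch.randn(4, 8)
next_value = torch.randn(4, 8)
print(gae_sketch(done, reward, value, next_value).shape)  # torch.Size([4, 8])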



tutorial/deprecated/tutorial_recurrent_a2c_gae_s/a2c.py [212:259]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        action_probabilities = replayed["action_probabilities"]
        # We compute the generalized advantage estimate (GAE)
        gae = self.get_gae(
            trajectories["_observation/done"],
            reward,
            replayed["critic"].squeeze(-1),
            replayed["_critic"].squeeze(-1),
            self.config["discount_factor"],
            self.config["gae_coef"],
        )
        td = gae

        critic_loss = td ** 2
        avg_critic_loss = critic_loss.mean()

        action_distribution = torch.distributions.Categorical(action_probabilities)
        log_proba = action_distribution.log_prob(trajectories["action/action"])
        a2c_loss = log_proba * td.detach()
        avg_a2c_loss = a2c_loss.mean()

        entropy = action_distribution.entropy()
        avg_entropy = entropy.mean()

        return DictTensor(
            {
                "critic_loss": avg_critic_loss,
                "a2c_loss": avg_a2c_loss,
                "entropy_loss": avg_entropy,
            }
        )

    def get_gae(self, done, reward, critic, _critic, discount_factor=1, _lambda=0):
        r = reward
        d = done.float()
        target = r + discount_factor * _critic.detach() * (1.0 - d)
        delta = target - critic
        T = done.size(1)
        gae = delta[:, -1]
        gaes = [gae]
        for t in range(T - 2, -1, -1):
            gae = delta[:, t] + discount_factor * _lambda * (1 - d[:, t]) * gae
            gaes.append(gae)
        gaes = [g.unsqueeze(-1) for g in reversed(gaes)]
        return torch.cat(gaes, dim=1)
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
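
The three scalars returned by both snippets are raw, unsigned terms; the surrounding training loop (not shown in this excerpt) is expected to fold them into a single quantity to minimize, increasing the policy-gradient surrogate and the entropy while decreasing the critic error. A minimal sketch under that assumption, with hypothetical coefficient names critic_coef and entropy_coef:

import torch

# Stand-ins for the entries of the DictTensor returned above
losses = {
    "critic_loss": torch.tensor(0.5, requires_grad=True),
    "a2c_loss": torch.tensor(0.1, requires_grad=True),
    "entropy_loss": torch.tensor(1.2, requires_grad=True),
}
critic_coef, entropy_coef = 1.0, 0.01  # hypothetical weights, not repository config keys

# Minimizing this scalar maximizes the policy surrogate and the entropy bonus
total_loss = (
    critic_coef * losses["critic_loss"]
    - losses["a2c_loss"]
    - entropy_coef * losses["entropy_loss"]
)
total_loss.backward()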



