tutorial/deprecated/tutorial_reinforce/reinforce.py [125:194]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        self.logger.update_csv()  # To save as a CSV file in logdir
        self.logger.close()

    def get_loss(self, trajectories):
        # First, we want to compute the cumulated reward per trajectory
        # The reward arrives at t+1 at each iteration (since it is obtained after the action), so we use the '_reward' field of the trajectory
        # The 'reward' field corresponds to the reward at time t
        reward = trajectories["_reward"]

        # We get the mask that tells whether each transition belongs to a trajectory (1) or is padding (0)
        mask = trajectories.mask()

        # We remove the reward values that are not in the trajectories
        reward = reward * mask

        # We compute the future cumulated reward at each timestep (by reverse computation)
        max_length = trajectories.lengths.max().item()
        cumulated_reward = torch.zeros_like(reward)
        cumulated_reward[:, max_length - 1] = reward[:, max_length - 1]
        for t in range(max_length - 2, -1, -1):
            cumulated_reward[:, t] = (
                reward[:, t]
                + self.config["discount_factor"] * cumulated_reward[:, t + 1]
            )
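        # cumulated_reward[:, t] now contains the discounted return R_t = r_t + discount_factor * R_{t+1},
        # i.e. the (masked) sum of discounted future rewards from timestep t to the end of the episode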

        # Now, we want to compute the action probabilities over the trajectories such that we will be able to do 'backward'
        action_probabilities = []
        for t in range(max_length):
            proba = self.learning_model(trajectories["frame"][:, t])
            action_probabilities.append(
                proba.unsqueeze(1)
            )  # We append the probabilities and introduce the temporal dimension (second dimension)
        action_probabilities = torch.cat(
            action_probabilities, dim=1
        )  # Now, we have a B x T x n_actions tensor

        # We compute the baseline
        baseline = []
        for t in range(max_length):
            b = self.baseline_model(trajectories["frame"][:, t])
            baseline.append(b.unsqueeze(1))
        baseline = torch.cat(baseline, dim=1).squeeze(-1)  # Now, we have a B x T tensor

        # We compute the baseline loss
        baseline_loss = (baseline - cumulated_reward) ** 2
        # We average the loss over the valid timesteps of each episode (using the mask)
        baseline_loss = (baseline_loss * mask).sum(1) / mask.sum(1)
        # We average the loss over all the trajectories
        avg_baseline_loss = baseline_loss.mean()

        # We do the same for the REINFORCE loss
        action_distribution = torch.distributions.Categorical(action_probabilities)
        log_proba = action_distribution.log_prob(trajectories["action"])
        reinforce_loss = log_proba * (cumulated_reward - baseline).detach()
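        # The '.detach()' stops gradients from flowing into the baseline through the policy term,
        # so the baseline parameters are updated only through baseline_loss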
        reinforce_loss = (reinforce_loss * mask).sum(1) / mask.sum(1)
        avg_reinforce_loss = reinforce_loss.mean()

        # We compute the entropy loss
        entropy = action_distribution.entropy()
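        # Entropy of the per-timestep action distributions: a higher value means a less
        # deterministic policy, so maximizing it encourages exploration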
        entropy = (entropy * mask).sum(1) / mask.sum(1)
        avg_entropy = entropy.mean()

        return DictTensor(
            {
                "avg_reward": cumulated_reward[:, 0].mean(),
                "baseline_loss": avg_baseline_loss,
                "reinforce_loss": avg_reinforce_loss,
                "entropy_loss": avg_entropy,
            }
        )
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
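
A minimal sketch of how the DictTensor returned by get_loss could be combined into a single scalar loss for an optimizer step. The coefficients entropy_coef and baseline_coef, their values, and the optimizer variable are illustrative assumptions, not part of the tutorial code:

    # Hypothetical training-loop fragment (names and coefficients are illustrative)
    dt = self.get_loss(trajectories)
    entropy_coef = 0.01   # assumed value: weight of the exploration bonus
    baseline_coef = 1.0   # assumed value: weight of the baseline regression term
    total_loss = (
        -dt["reinforce_loss"]                  # maximize the REINFORCE objective
        - entropy_coef * dt["entropy_loss"]    # maximize entropy to keep exploring
        + baseline_coef * dt["baseline_loss"]  # minimize the baseline regression error
    )
    optimizer.zero_grad()
    total_loss.backward()
    optimizer.step()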



tutorial/deprecated/tutorial_reinforce_with_evaluation/reinforce.py [178:247]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        self.logger.update_csv()  # To save as a CSV file in logdir
        self.logger.close()

    def get_loss(self, trajectories):
        # First, we want to compute the cumulated reward per trajectory
        # The reward arrives at t+1 at each iteration (since it is obtained after the action), so we use the '_reward' field of the trajectory
        # The 'reward' field corresponds to the reward at time t
        reward = trajectories["_reward"]

        # We get the mask that tells whether each transition belongs to a trajectory (1) or is padding (0)
        mask = trajectories.mask()

        # We remove the reward values that are not in the trajectories
        reward = reward * mask

        # We compute the future cumulated reward at each timestep (by reverse computation)
        max_length = trajectories.lengths.max().item()
        cumulated_reward = torch.zeros_like(reward)
        cumulated_reward[:, max_length - 1] = reward[:, max_length - 1]
        for t in range(max_length - 2, -1, -1):
            cumulated_reward[:, t] = (
                reward[:, t]
                + self.config["discount_factor"] * cumulated_reward[:, t + 1]
            )

        # Now, we want to compute the action probabilities over the trajectories such that we will be able to do 'backward'
        action_probabilities = []
        for t in range(max_length):
            proba = self.learning_model(trajectories["frame"][:, t])
            action_probabilities.append(
                proba.unsqueeze(1)
            )  # We append the probabilities and introduce the temporal dimension (second dimension)
        action_probabilities = torch.cat(
            action_probabilities, dim=1
        )  # Now, we have a B x T x n_actions tensor

        # We compute the baseline
        baseline = []
        for t in range(max_length):
            b = self.baseline_model(trajectories["frame"][:, t])
            baseline.append(b.unsqueeze(1))
        baseline = torch.cat(baseline, dim=1).squeeze(-1)  # Now, we have a B x T tensor

        # We compute the baseline loss
        baseline_loss = (baseline - cumulated_reward) ** 2
        # We average the loss over the valid timesteps of each episode (using the mask)
        baseline_loss = (baseline_loss * mask).sum(1) / mask.sum(1)
        # We average the loss over all the trajectories
        avg_baseline_loss = baseline_loss.mean()

        # We do the same for the REINFORCE loss
        action_distribution = torch.distributions.Categorical(action_probabilities)
        log_proba = action_distribution.log_prob(trajectories["action"])
        reinforce_loss = log_proba * (cumulated_reward - baseline).detach()
        reinforce_loss = (reinforce_loss * mask).sum(1) / mask.sum(1)
        avg_reinforce_loss = reinforce_loss.mean()

        # We compute the entropy loss
        entropy = action_distribution.entropy()
        entropy = (entropy * mask).sum(1) / mask.sum(1)
        avg_entropy = entropy.mean()

        return DictTensor(
            {
                "avg_reward": cumulated_reward[:, 0].mean(),
                "baseline_loss": avg_baseline_loss,
                "reinforce_loss": avg_reinforce_loss,
                "entropy_loss": avg_entropy,
            }
        )
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



