tutorial/deprecated/tutorial_from_reinforce_to_a2c/a2c.py [127:194]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
                self.train_batcher.execute()
                trajectories = self.train_batcher.get(blocking=True)
            # 3) Now, we compute the loss
            dt = self.get_loss(trajectories)
            for k, v in dt.items():
                self.logger.add_scalar(k, v.item(), self.iteration)

            # Computation of final loss
            ld = self.config["critic_coef"] * dt["critic_loss"]
            lr = self.config["a2c_coef"] * dt["a2c_loss"]
            le = self.config["entropy_coef"] * dt["entropy_loss"]

            floss = ld - le - lr
            floss = floss / n_episodes * trajectories.n_elems()

            optimizer.zero_grad()
            floss.backward()
            optimizer.step()

            # Update the train batcher with the updated model
            self.train_batcher.update(self.learning_model.state_dict())
            self.iteration += 1

            # We check the evaluation batcher
            evaluation_trajectories = self.evaluation_batcher.get(blocking=False)
            if evaluation_trajectories is not None:  # trajectories are available
                # Compute the cumulated reward
                cumulated_reward = (
                    (
                        evaluation_trajectories["_reward"]
                        * evaluation_trajectories.mask()
                    )
                    .sum(1)
                    .mean()
                )
                self.logger.add_scalar(
                    "evaluation_reward",
                    cumulated_reward.item(),
                    self.evaluation_iteration,
                )
                print(
                    "At iteration %d, reward is %f"
                    % (self.evaluation_iteration, cumulated_reward.item())
                )
                # We reexecute the evaluation batcher (with the same agent_info value and the same number of episodes)
                self.evaluation_batcher.update(self.learning_model.state_dict())
                self.evaluation_iteration = self.iteration
                self.evaluation_batcher.reexecute()

        self.train_batcher.close()
        self.evaluation_batcher.get()  # To wait for the last trajectories
        self.evaluation_batcher.close()
        self.logger.update_csv()  # To save as a CSV file in logdir
        self.logger.close()

    def get_loss(self, trajectories):
        # First, we want to compute the cumulated reward per trajectory
        # The reward is at t+1 in each iteration (since it is obtained after the action), so we use the '_reward' field in the trajectory
        # The 'reward' field corresponds to the reward at time t
        reward = trajectories["_reward"]

        # We get the mask that tells which transitions belong to a trajectory (1) and which are padding (0)
        mask = trajectories.mask()

        # We mask out the reward values that fall outside the trajectories
        reward = reward * mask
        max_length = trajectories.lengths.max().item()
        # Now, we compute the action probabilities over the trajectories so that we can backpropagate through them
        action_probabilities = []
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
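
For reference, here is a minimal, self-contained sketch of the final-loss computation and optimizer step performed in the excerpt above. The model, coefficient values, and loss terms are placeholders (not taken from the tutorial's configuration or from get_loss()); only the way the three terms are weighted and combined mirrors the code, i.e. the critic loss is minimized while the a2c and entropy terms are maximized.

# Minimal sketch of the final-loss computation and optimizer step shown above.
# The model, coefficients and loss terms are illustrative stand-ins.
import torch

model = torch.nn.Linear(4, 2)  # stand-in for self.learning_model
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
config = {"critic_coef": 1.0, "a2c_coef": 0.1, "entropy_coef": 0.01}  # illustrative values

out = model(torch.randn(8, 4))
dt = {  # stand-ins for the outputs of get_loss()
    "critic_loss": out.pow(2).mean(),
    "a2c_loss": out.mean(),
    "entropy_loss": out.softmax(-1).mul(out.log_softmax(-1)).sum(-1).neg().mean(),
}

ld = config["critic_coef"] * dt["critic_loss"]
lr = config["a2c_coef"] * dt["a2c_loss"]
le = config["entropy_coef"] * dt["entropy_loss"]
# Minimize the critic loss, maximize the policy (a2c) term and the entropy bonus
floss = ld - le - lr

optimizer.zero_grad()
floss.backward()
optimizer.step()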



tutorial/deprecated/tutorial_recurrent_policy/a2c.py [118:186]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
            self.train_batcher.execute()
            trajectories = self.train_batcher.get(blocking=True)

            # 3) Now, we compute the loss
            dt = self.get_loss(trajectories)
            for k, v in dt.items():
                self.logger.add_scalar(k, v.item(), self.iteration)

            # Computation of final loss
            ld = self.config["critic_coef"] * dt["critic_loss"]
            lr = self.config["a2c_coef"] * dt["a2c_loss"]
            le = self.config["entropy_coef"] * dt["entropy_loss"]

            floss = ld - le - lr
            floss = floss / n_episodes * trajectories.n_elems()

            optimizer.zero_grad()
            floss.backward()
            optimizer.step()

            # Update the train batcher with the updated model
            self.train_batcher.update(self.learning_model.state_dict())
            self.iteration += 1

            # We check the evaluation batcher
            evaluation_trajectories = self.evaluation_batcher.get(blocking=False)
            if evaluation_trajectories is not None:  # trajectories are available
                # Compute the cumulated reward
                cumulated_reward = (
                    (
                        evaluation_trajectories["_reward"]
                        * evaluation_trajectories.mask()
                    )
                    .sum(1)
                    .mean()
                )
                self.logger.add_scalar(
                    "evaluation_reward",
                    cumulated_reward.item(),
                    self.evaluation_iteration,
                )
                print(
                    "At iteration %d, reward is %f"
                    % (self.evaluation_iteration, cumulated_reward.item())
                )
                # We reexecute the evaluation batcher (with the same agent_info value and the same number of episodes)
                self.evaluation_batcher.update(self.learning_model.state_dict())
                self.evaluation_iteration = self.iteration
                self.evaluation_batcher.reexecute()

        self.train_batcher.close()
        self.evaluation_batcher.get()  # To wait for the last trajectories
        self.evaluation_batcher.close()
        self.logger.update_csv()  # To save as a CSV file in logdir
        self.logger.close()

    def get_loss(self, trajectories):
        # First, we want to compute the cumulated reward per trajectory
        # The reward is at t+1 in each iteration (since it is obtained after the action), so we use the '_reward' field in the trajectory
        # The 'reward' field corresponds to the reward at time t
        reward = trajectories["_reward"]

        # We get the mask that tells which transitions belong to a trajectory (1) and which are padding (0)
        mask = trajectories.mask()

        # We mask out the reward values that fall outside the trajectories
        reward = reward * mask
        max_length = trajectories.lengths.max().item()
        # Now, we compute the action probabilities over the trajectories so that we can backpropagate through them
        action_probabilities = []
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
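
Similarly, a small sketch of the masked cumulated-reward computation that both excerpts apply to evaluation trajectories, assuming a (n_episodes, max_length) reward layout with zero-padding beyond each episode's length, which matches the .sum(1).mean() reduction above. The tensors and lengths below are illustrative placeholders for trajectories["_reward"] and trajectories.mask().

# Minimal sketch of the masked cumulated-reward computation, assuming a
# (n_episodes, max_length) layout with zero-padding past each episode's end.
import torch

n_episodes, max_length = 3, 5
reward = torch.randn(n_episodes, max_length)  # stand-in for trajectories["_reward"]
lengths = torch.tensor([5, 3, 4])             # per-episode lengths (illustrative)

# Mask: 1.0 for real transitions, 0.0 for padding (stand-in for trajectories.mask())
mask = (torch.arange(max_length)[None, :] < lengths[:, None]).float()

# Zero out padded rewards, sum over time, then average over episodes
cumulated_reward = (reward * mask).sum(1).mean()
print(cumulated_reward.item())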



