tutorial/deprecated/tutorial_from_reinforce_to_a2c_s/a2c.py [156:209]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
            optimizer.step()

            # Update the train batcher with the updated model
            self.train_batcher.update(self.learning_model.state_dict())
            self.iteration += 1

            # We check the evaluation batcher
            evaluation_trajectories, info = self.evaluation_batcher.get(blocking=False)
            if evaluation_trajectories is not None:  # trajectories are available
                # Compute the cumulated reward
                cumulated_reward = (
                    (
                        evaluation_trajectories["_observation/reward"]
                        * evaluation_trajectories.mask()
                    )
                    .sum(1)
                    .mean()
                )
                self.logger.add_scalar(
                    "evaluation_reward",
                    cumulated_reward.item(),
                    self.evaluation_iteration,
                )
                print(
                    "At iteration %d, reward is %f"
                    % (self.evaluation_iteration, cumulated_reward.item())
                )
                # We reexecute the evaluation batcher (with the same agent_info value and the same number of episodes)
                self.evaluation_batcher.update(self.learning_model.state_dict())
                self.evaluation_iteration = self.iteration
                self.evaluation_batcher.reexecute()

        self.train_batcher.close()
        self.evaluation_batcher.get()  # To wait for the last trajectories
        self.evaluation_batcher.close()
        self.logger.update_csv()  # To save as a CSV file in logdir
        self.logger.close()

    def get_loss(self, trajectories, info):
        # First, we want to compute the cumulated reward per trajectory
        # The reward arrives at t+1 (since it is obtained after the action), so we use the '_observation/reward' field of the trajectory
        # The 'observation/reward' field corresponds to the reward at time t
        reward = trajectories["_observation/reward"]

        # We get the mask that tells which transition is in a trajectory (1) or not (0)
        mask = trajectories.mask()

        # We remove the reward values that are not in the trajectories
        reward = reward * mask
        max_length = trajectories.lengths.max().item()
        replayed = replay_agent(self.agent, trajectories, info)

        # Now, we compute the action probabilities over the trajectories so that we can backpropagate ('backward') through them
        action_probabilities = replayed["action_probabilities"]
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
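
The excerpt cuts off right after `action_probabilities` is extracted. As a rough illustration of where such a `get_loss` typically goes next, here is a minimal, self-contained sketch of turning the masked rewards and replayed action probabilities into a REINFORCE-style policy-gradient loss. This is an assumption for illustration only, not the tutorial's actual continuation; `policy_gradient_loss`, its tensor shapes, and `discount_factor` are hypothetical names.

# Illustrative sketch only -- not the tutorial's get_loss; names and shapes are assumptions.
import torch


def policy_gradient_loss(reward, mask, action_probabilities, action, discount_factor=0.95):
    # reward, mask: (batch, time); action: (batch, time) integer actions
    # action_probabilities: (batch, time, n_actions)
    batch_size, max_length = reward.size()

    # Discounted returns computed backward in time; padded steps are zeroed by the mask
    returns = torch.zeros_like(reward)
    running = torch.zeros(batch_size)
    for t in reversed(range(max_length)):
        running = (reward[:, t] + discount_factor * running) * mask[:, t]
        returns[:, t] = running

    # Log-probability of the actions actually taken
    log_prob = torch.log(
        action_probabilities.gather(2, action.unsqueeze(-1)).squeeze(-1) + 1e-8
    )

    # Negative expected return, averaged over valid (unmasked) timesteps
    return -(log_prob * returns * mask).sum() / mask.sum()


if __name__ == "__main__":
    B, T, A = 4, 6, 3
    reward = torch.rand(B, T)
    mask = torch.ones(B, T)
    action = torch.randint(0, A, (B, T))
    probs = torch.softmax(torch.rand(B, T, A), dim=-1)
    print(policy_gradient_loss(reward, mask, probs, action).item())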



tutorial/deprecated/tutorial_recurrent_a2c_gae_s/a2c.py [159:212]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
            optimizer.step()

            # Update the train batcher with the updated model
            self.train_batcher.update(self.learning_model.state_dict())
            self.iteration += 1

            # We check the evaluation batcher
            evaluation_trajectories, info = self.evaluation_batcher.get(blocking=False)
            if evaluation_trajectories is not None:  # trajectories are available
                # Compute the cumulated reward
                cumulated_reward = (
                    (
                        evaluation_trajectories["_observation/reward"]
                        * evaluation_trajectories.mask()
                    )
                    .sum(1)
                    .mean()
                )
                self.logger.add_scalar(
                    "evaluation_reward",
                    cumulated_reward.item(),
                    self.evaluation_iteration,
                )
                print(
                    "At iteration %d, reward is %f"
                    % (self.evaluation_iteration, cumulated_reward.item())
                )
                # We reexecute the evaluation batcher (with the same agent_info value and the same number of episodes)
                self.evaluation_batcher.update(self.learning_model.state_dict())
                self.evaluation_iteration = self.iteration
                self.evaluation_batcher.reexecute()

        self.train_batcher.close()
        self.evaluation_batcher.get()  # To wait for the last trajectories
        self.evaluation_batcher.close()
        self.logger.update_csv()  # To save as a CSV file in logdir
        self.logger.close()

    def get_loss(self, trajectories, info):
        # First, we want to compute the cumulated reward per trajectory
        # The reward arrives at t+1 (since it is obtained after the action), so we use the '_observation/reward' field of the trajectory
        # The 'observation/reward' field corresponds to the reward at time t
        reward = trajectories["_observation/reward"]

        # We get the mask that tells which transition is in a trajectory (1) or not (0)
        mask = trajectories.mask()

        # We remove the reward values that are not in the trajectories
        reward = reward * mask
        max_length = trajectories.lengths.max().item()

        replayed = replay_agent(self.agent, trajectories, info)
        # Now, we compute the action probabilities over the trajectories so that we can backpropagate ('backward') through them
        action_probabilities = replayed["action_probabilities"]
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
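
Since this second excerpt comes from the recurrent A2C + GAE tutorial, a minimal sketch of Generalized Advantage Estimation over masked trajectories may be useful context. The `gae` function, the critic `value` tensor, `discount_factor`, and `gae_coef` below are assumed names for illustration, not the tutorial's actual variables.

# Illustrative GAE sketch -- assumed names and shapes, not the tutorial's code.
import torch


def gae(reward, value, mask, discount_factor=0.95, gae_coef=0.95):
    # reward, mask: (batch, time); value: (batch, time + 1) critic estimates
    batch_size, max_length = reward.size()
    advantage = torch.zeros_like(reward)
    gae_t = torch.zeros(batch_size)
    for t in reversed(range(max_length)):
        # TD residual at time t, zeroed outside the trajectory
        delta = (reward[:, t] + discount_factor * value[:, t + 1] - value[:, t]) * mask[:, t]
        gae_t = delta + discount_factor * gae_coef * gae_t * mask[:, t]
        advantage[:, t] = gae_t
    return advantage


if __name__ == "__main__":
    B, T = 4, 6
    reward = torch.rand(B, T)
    value = torch.rand(B, T + 1)
    mask = torch.ones(B, T)
    print(gae(reward, value, mask).shape)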



