def run()

in rlalgos/reinforce_diayn/reinforce_diayn.py
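
run() implements the training loop for REINFORCE with DIAYN-style skill discovery: it repeatedly samples trajectories conditioned on randomly drawn skill indices (idx_policy), updates the policy, baseline and discriminator from a combined loss, and periodically evaluates the current model with a separate, asynchronous batcher. For reference, below is a minimal sketch of the DIAYN intrinsic reward that a skill discriminator makes possible; the function name diayn_intrinsic_reward, the assumption that the discriminator outputs logits over skills, and the uniform skill prior are illustrative and not taken from this repository's get_loss.

    import torch
    import torch.nn.functional as F

    def diayn_intrinsic_reward(discriminator, frames, idx_policy, n_policies):
        # Illustrative sketch only: reward each state by how well the discriminator
        # recognizes the skill that generated it, r(s, z) = log q(z | s) - log p(z),
        # with a uniform prior over the n_policies skills.
        logits = discriminator(frames)                            # assumed shape (N, n_policies)
        log_q = F.log_softmax(logits, dim=-1)                     # log q(z | s)
        log_q_z = log_q.gather(-1, idx_policy.unsqueeze(-1)).squeeze(-1)
        log_p_z = -torch.log(torch.tensor(float(n_policies)))     # log of uniform p(z) = 1 / n_policies
        return log_q_z - log_p_z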


    def run(self):
        # Creation of one env instance to get the dimensionality of observations and the number of actions
        env = self._create_env(
            self.config["n_envs"], seed=0, env_name=self.config["env_name"]
        )
        self.n_actions = env.action_space.n
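        # reset() returns the initial observations first; 'frame' is batched over the environments,
        # so dimension 1 is the per-observation feature size (assumption about the observation layout)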
        self.obs_dim = env.reset()[0]["frame"].size()[1]
        del env

        # Create the agent model
        self.learning_model = self._create_model()
        self.discriminator = self._create_discriminator()
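        # In DIAYN, the discriminator learns to recognize which skill (here idx_policy) produced a given
        # observation; its log-probability provides the intrinsic reward (presumably computed in get_loss)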

        # Create one agent for loss computation (see get_loss)
        self.agent = self._create_agent(
            n_actions=self.n_actions, model=self.learning_model
        )

        # We create a batcher dedicated to evaluation
        model = copy.deepcopy(self.learning_model)
        self.evaluation_batcher = RL_Batcher(
            n_timesteps=self.config["max_episode_steps"],
            create_agent=self._create_agent,
            create_env=self._create_env,
            env_args={
                "n_envs": self.config["n_evaluation_envs"],
                "max_episode_steps": self.config["max_episode_steps"],
                "env_name": self.config["env_name"],
            },
            agent_args={"n_actions": self.n_actions, "model": model},
            n_processes=self.config["n_evaluation_processes"],
            seeds=[
                self.config["env_seed"] + k * 10
                for k in range(self.config["n_evaluation_processes"])
            ],
            agent_info=DictTensor(
                {
                    "stochastic": torch.tensor([True]),
                    "idx_policy": torch.zeros(1).long(),
                }
            ),
            env_info=DictTensor({}),
        )
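        # The evaluation batcher runs in separate worker processes with its own copy of the model;
        # weights are refreshed explicitly via evaluation_batcher.update()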

        # Create a batcher to sample learning trajectories
        model = copy.deepcopy(self.learning_model)
        self.train_batcher = RL_Batcher(
            n_timesteps=self.config["max_episode_steps"],
            create_agent=self._create_agent,
            create_env=self._create_env,
            env_args={
                "n_envs": self.config["n_envs"],
                "max_episode_steps": self.config["max_episode_steps"],
                "env_name": self.config["env_name"],
            },
            agent_args={"n_actions": self.n_actions, "model": model},
            n_processes=self.config["n_processes"],
            seeds=[
                self.config["env_seed"] + k * 10
                for k in range(self.config["n_processes"])
            ],
            agent_info=DictTensor(
                {
                    "stochastic": torch.tensor([True]),
                    "idx_policy": torch.zeros(1).long(),
                }
            ),
            env_info=DictTensor({}),
        )
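        # n_timesteps equals max_episode_steps, so each acquisition is expected to return complete
        # episodes (the assert on n_env_running in the training loop relies on this)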

        # Creation of the optimizers (one for the policy/baseline model, one for the discriminator)
        optimizer = torch.optim.RMSprop(
            self.learning_model.parameters(), lr=self.config["lr"]
        )
        optimizer_d = torch.optim.RMSprop(
            self.discriminator.parameters(), lr=self.config["lr_discriminator"]
        )

        # Training Loop:
        _start_time = time.time()
        self.iteration = 0

        # We launch the evaluation batcher so that it starts collecting trajectories with the current model
        n_episodes = (
            self.config["n_evaluation_processes"] * self.config["n_evaluation_envs"]
        )
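        # One entry per evaluation episode: the stochastic flag follows evaluation_mode, and each episode
        # gets a skill index (idx_policy) drawn uniformly, matching DIAYN's uniform skill prior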
        agent_info = DictTensor(
            {
                "stochastic": torch.tensor(
                    [self.config["evaluation_mode"] == "stochastic"]
                ).repeat(n_episodes)
            }
        )
        agent_info.set(
            "idx_policy", torch.randint(self.config["n_policies"], size=(n_episodes,))
        )
        self.evaluation_batcher.reset(agent_info=agent_info)
        self.evaluation_batcher.execute()
        self.evaluation_iteration = self.iteration

        # Update the train batcher with the latest version of the learning model
        self.train_batcher.update(self.learning_model.state_dict())

        n_interactions = 0
        while time.time() - _start_time < self.config["time_limit"]:

            n_episodes = self.config["n_envs"] * self.config["n_processes"]
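            # Learning episodes always act stochastically; each episode is conditioned on a skill index
            # drawn uniformly from [0, n_policies)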
            agent_info = DictTensor(
                {"stochastic": torch.tensor([True]).repeat(n_episodes)}
            )
            agent_info.set(
                "idx_policy",
                torch.randint(self.config["n_policies"], size=(n_episodes,)),
            )

            self.train_batcher.reset(agent_info=agent_info)
            self.train_batcher.execute()

            trajectories, n_env_running = self.train_batcher.get(blocking=True)
            assert n_env_running == 0  # Assert that all trajectories are finished
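            # mask() flags the valid (non-padded) timesteps, so its sum counts the environment interactions acquired in this batch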
            n_interactions += trajectories.trajectories.mask().sum().item()
            self.logger.add_scalar(
                "n_interactions_per_seconds",
                n_interactions / (time.time() - _start_time),
                self.iteration,
            )

            # 3) Compute the loss
            dt = self.get_loss(trajectories)
            for k in dt.keys():
                self.logger.add_scalar("loss/" + k, dt[k].item(), self.iteration)

            # 4) Compute the final loss as a linear combination of the individual losses
            ld = self.config["baseline_coef"] * dt["baseline_loss"]
            lr = self.config["reinforce_coef"] * dt["reinforce_loss"]
            le = self.config["entropy_coef"] * dt["entropy_loss"]
            ldiscr = self.config["discriminator_coef"] * dt["discriminator_loss"]
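            # floss is minimized, so the subtracted terms (REINFORCE, entropy, discriminator) are effectively
            # maximized; this assumes get_loss returns those terms as objectives to be maximized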
            floss = ld - le - lr - ldiscr

            # 5) Update the parameters of the model
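            # A single backward pass on floss fills gradients for both the policy/baseline parameters and the
            # discriminator parameters; each optimizer then updates its own parameter group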
            optimizer.zero_grad()
            optimizer_d.zero_grad()
            floss.backward()
            optimizer.step()
            optimizer_d.step()

            # 6) Update the train batcher with the updated model
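            # Batcher workers act with their own copy of the model (deepcopied above), so the new weights
            # must be pushed explicitly after every gradient step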
            self.train_batcher.update(self.learning_model.state_dict())

            # 7) Print some messages
            print(
                "At iteration %d, avg (discounted) reward is %f"
                % (self.iteration, dt["avg_reward"].item())
            )
            print(
                "\t Avg trajectory length is %f"
                % (trajectories.trajectories.lengths.float().mean().item())
            )
            print(
                "\t Curves can be visualized using 'tensorboard --logdir=%s'"
                % self.config["logdir"]
            )
            self.iteration += 1

            # 8)---- Evaluation
            evaluation_trajectories, n_env_running = self.evaluation_batcher.get(
                blocking=False
            )
            if evaluation_trajectories is not None:  # trajectories are available
                assert n_env_running == 0
                # Compute the cumulated reward
                reward = self.compute_reward(evaluation_trajectories)
                cumulated_reward = (
                    (reward * evaluation_trajectories.trajectories.mask()).sum(1).mean()
                )
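                # Masked sum over the time dimension, then mean over the evaluation episodes: the average return per episode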
                self.logger.add_scalar(
                    "evaluation_reward/" + self.config["evaluation_mode"],
                    cumulated_reward.item(),
                    self.evaluation_iteration,
                )
                print(
                    "-- Iteration ",
                    self.iteration,
                    " Evaluation reward = ",
                    cumulated_reward.item(),
                )
                # We re-execute the evaluation batcher to start acquiring new trajectories
                self.evaluation_batcher.update(self.learning_model.state_dict())
                self.evaluation_iteration = self.iteration
                n_episodes = (
                    self.config["n_evaluation_processes"]
                    * self.config["n_evaluation_envs"]
                )
                agent_info = DictTensor(
                    {
                        "stochastic": torch.tensor(
                            [self.config["evaluation_mode"] == "stochastic"]
                        ).repeat(n_episodes)
                    }
                )
                agent_info.set(
                    "idx_policy",
                    torch.randint(self.config["n_policies"], size=(n_episodes,)),
                )

                self.evaluation_batcher.reset(agent_info=agent_info)
                self.evaluation_batcher.execute()
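                # Evaluation thus runs asynchronously: the training loop only polls for results (blocking=False)
                # and relaunches evaluation as soon as the previous batch has been consumed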

        self.train_batcher.close()
        self.evaluation_batcher.get()  # To wait for the last trajectories
        self.evaluation_batcher.close()
        self.logger.update_csv()  # To save as a CSV file in logdir
        self.logger.close()