def run()

in tutorial/deprecated/tutorial_reinforce_with_evaluation_s/reinforce.py [0:0]


    def run(self):
        # Instantiate the learning model and the baseline model
        action_model = ActionModel(self.obs_dim, self.n_actions, 16)
        baseline_model = BaselineModel(self.obs_dim, 16)
        self.learning_model = Model(action_model, baseline_model)
        self.agent = self._create_agent(
            n_actions=self.n_actions, model=self.learning_model
        )

        # We create a batcher dedicated to evaluation
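        # Each batcher works on its own copy of the model; the copy is kept in
        # sync with the learning model through batcher.update(state_dict) calls.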
        model = copy.deepcopy(self.learning_model)
        self.evaluation_batcher = Batcher(
            n_timesteps=self.config["max_episode_steps"],
            create_agent=self._create_agent,
            create_env=self._create_env,
            env_args={
                "n_envs": self.config["n_evaluation_envs"],
                "max_episode_steps": self.config["max_episode_steps"],
                "env_name": self.config["env_name"],
            },
            agent_args={"n_actions": self.n_actions, "model": model},
            n_threads=self.config["n_evaluation_threads"],
            seeds=[
                self.config["env_seed"] + k * 10
                for k in range(self.config["n_evaluation_threads"])
            ],
            agent_info=DictTensor({"stochastic": torch.tensor([True])}),
            env_info=DictTensor({}),
        )

        # Creation of the batcher for sampling complete episodes (i.e. an Episode Batcher)
        # The batcher will sample n_threads * n_envs trajectories at each call
        # To keep the batcher fast, it is configured with n_timesteps=self.config["max_episode_steps"]
        model = copy.deepcopy(self.learning_model)
        self.train_batcher = Batcher(
            n_timesteps=self.config["max_episode_steps"],
            create_agent=self._create_agent,
            create_env=self._create_env,
            env_args={
                "n_envs": self.config["n_envs"],
                "max_episode_steps": self.config["max_episode_steps"],
                "env_name": self.config["env_name"],
            },
            agent_args={"n_actions": self.n_actions, "model": model},
            n_threads=self.config["n_threads"],
            seeds=[
                self.config["env_seed"] + k * 10
                for k in range(self.config["n_threads"])
            ],
            agent_info=DictTensor({"stochastic": torch.tensor([True])}),
            env_info=DictTensor({}),
        )

        # Creation of the optimizer
        optimizer = torch.optim.RMSprop(
            self.learning_model.parameters(), lr=self.config["lr"]
        )

        # Training Loop:
        _start_time = time.time()
        self.iteration = 0

        # We launch the evaluation batcher (in deterministic mode)
        n_episodes = (
            self.config["n_evaluation_threads"] * self.config["n_evaluation_envs"]
        )
        agent_info = DictTensor(
            {"stochastic": torch.tensor([False]).repeat(n_episodes)}
        )
        self.evaluation_batcher.reset(agent_info=agent_info)
        self.evaluation_batcher.execute()
        self.evaluation_iteration = self.iteration
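        # The evaluation runs asynchronously: its trajectories are polled later
        # with evaluation_batcher.get(blocking=False) inside the training loop.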

        while time.time() - _start_time < self.config["time_limit"]:
            # Update the batcher with the last version of the learning model
            self.train_batcher.update(self.learning_model.state_dict())

            # Call the batcher to get a sample of trajectories
            # 1) The policy will be executed in "stochastic" mode
            n_episodes = self.config["n_envs"] * self.config["n_threads"]
            agent_info = DictTensor(
                {"stochastic": torch.tensor([True]).repeat(n_episodes)}
            )
            self.train_batcher.reset(agent_info=agent_info)
            self.train_batcher.execute()

            # 2) We get the trajectories (and wait until the trajectories have been sampled)
            trajectories, n_env_running = self.train_batcher.get(blocking=True)
            assert n_env_running == 0

            # 3) Now, we compute the loss
            dt = self.get_loss(trajectories)
            for k in dt.keys():
                self.logger.add_scalar(k, dt[k].item(), self.iteration)

            # Computation of final loss
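            # The baseline (value) loss is minimized, while the REINFORCE term and
            # the entropy bonus are maximized, hence their negative sign below.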
            ld = self.config["baseline_coef"] * dt["baseline_loss"]
            lr = self.config["reinforce_coef"] * dt["reinforce_loss"]
            le = self.config["entropy_coef"] * dt["entropy_loss"]

            floss = ld - le - lr

            optimizer.zero_grad()
            floss.backward()
            optimizer.step()

            # Update the train batcher with the updated model
            self.train_batcher.update(self.learning_model.state_dict())
            print(
                "At iteration %d, avg (discounted) reward is %f"
                % (self.iteration, dt["avg_reward"].item())
            )
            print(
                "\t Avg trajectory length is %f"
                % (trajectories.trajectories.lengths.float().mean().item())
            )
            print(
                "\t Curves can be visualized using 'tensorboard --logdir=%s'"
                % self.config["logdir"]
            )
            self.iteration += 1

            # We check the evaluation batcher
            evaluation_trajectories, n_env_running = self.evaluation_batcher.get(
                blocking=False
            )
            if evaluation_trajectories is not None:  # trajectories are available
                assert n_env_running == 0
                # Compute the cumulated reward
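                # (mask() zeroes the padded timesteps, so summing over the time
                # dimension gives the per-episode return, averaged over episodes)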
                cumulated_reward = (
                    (
                        evaluation_trajectories.trajectories["_observation/reward"]
                        * evaluation_trajectories.trajectories.mask()
                    )
                    .sum(1)
                    .mean()
                )
                self.logger.add_scalar(
                    "evaluation_reward",
                    cumulated_reward.item(),
                    self.evaluation_iteration,
                )
                print(
                    "-- Iteration ",
                    self.iteration,
                    " Evaluation reward = ",
                    cumulated_reward.item(),
                )
                # We re-execute the evaluation batcher (with the same agent_info and the same number of episodes)
                self.evaluation_batcher.update(self.learning_model.state_dict())
                self.evaluation_iteration = self.iteration
                n_episodes = (
                    self.config["n_evaluation_threads"]
                    * self.config["n_evaluation_envs"]
                )
                agent_info = DictTensor(
                    {"stochastic": torch.tensor([False]).repeat(n_episodes)}
                )
                self.evaluation_batcher.reset(agent_info=agent_info)
                self.evaluation_batcher.execute()

        self.train_batcher.close()
        self.evaluation_batcher.get()  # To wait for the last trajectories
        self.evaluation_batcher.close()
        self.logger.update_csv()  # To save as a CSV file in logdir
        self.logger.close()
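
For context, a minimal sketch of how this method might be driven. The class name
(ReinforceExperiment) and its constructor are assumptions made for illustration
only; the config keys are exactly the ones read by run() above.

    # Hypothetical driver -- the class name and constructor are assumptions;
    # only the config keys below are taken from run().
    config = {
        "env_name": "CartPole-v0",
        "max_episode_steps": 100,
        "n_envs": 4,
        "n_threads": 4,
        "n_evaluation_envs": 4,
        "n_evaluation_threads": 2,
        "env_seed": 42,
        "lr": 0.001,
        "baseline_coef": 0.1,
        "reinforce_coef": 1.0,
        "entropy_coef": 0.01,
        "time_limit": 600,  # in seconds
        "logdir": "./results",
    }
    experiment = ReinforceExperiment(config)  # hypothetical wrapper exposing run()
    experiment.run()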