def run()

in rlalgos/simple_ddqn/ddqn.py [0:0]


    def run(self):
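        # Build a throwaway environment just to read off the action and
        # observation shapes, then discard it.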
        env = self._create_env(
            self.config["n_envs"],
            seed=0,
            **{k: self.config[k] for k in self.config if k.startswith("environment/")}
        )
        self.n_actions = env.action_space.n
        self.obs_shape = env.reset()[0]["frame"].size()
        del env

        # Create the agent model
        self.learning_model = self._create_model()
        self.target_model = copy.deepcopy(self.learning_model)

        # Create one agent for loss computation (see get_loss)
        self.agent = self._create_agent(
            n_actions=self.n_actions, model=self.learning_model
        )

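        # Batcher collecting training trajectories in worker processes,
        # each worker running its own copy of the model.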
        model = copy.deepcopy(self.learning_model)
        self.train_batcher = RL_Batcher(
            n_timesteps=self.config["batch_timesteps"],
            create_agent=self._create_agent,
            create_env=self._create_env,
            env_args={
                "mode": "train",
                "n_envs": self.config["n_envs"],
                "max_episode_steps": self.config["max_episode_steps"],
                **{
                    k: self.config[k]
                    for k in self.config
                    if k.startswith("environment/")
                },
            },
            agent_args={"n_actions": self.n_actions, "model": model},
            n_processes=self.config["n_processes"],
            seeds=[
                self.config["env_seed"] + k * 10
                for k in range(self.config["n_processes"])
            ],
            agent_info=DictTensor({"epsilon": torch.zeros(1)}),
            env_info=DictTensor({}),
        )

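        # Separate batcher used only for evaluation episodes.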
        model = copy.deepcopy(self.learning_model)
        self.evaluation_batcher = RL_Batcher(
            n_timesteps=self.config["max_episode_steps"],
            create_agent=self._create_agent,
            create_env=self._create_env,
            env_args={
                "mode": "evaluation",
                "max_episode_steps": self.config["max_episode_steps"],
                "n_envs": self.config["n_evaluation_envs"],
                **{
                    k: self.config[k]
                    for k in self.config
                    if k.startswith("environment/")
                },
            },
            agent_args={"n_actions": self.n_actions, "model": model},
            n_processes=self.config["n_evaluation_processes"],
            seeds=[
                self.config["env_seed"] * 10 + k * 10
                for k in range(self.config["n_evaluation_processes"])
            ],
            agent_info=DictTensor({"epsilon": torch.zeros(1)}),
            env_info=DictTensor({}),
        )

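        # Replay buffer, learner device placement and optimizer.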
        self.replay_buffer = ReplayBuffer(self.config["replay_buffer_size"])
        device = torch.device(self.config["learner_device"])
        self.learning_model.to(device)
        self.target_model.to(device)
        optimizer = getattr(torch.optim, self.config["optim"])(
            self.learning_model.parameters(), lr=self.config["lr"]
        )

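        # Push the current learner weights (moved to CPU) to the evaluation workers.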
        self.evaluation_batcher.update(
            self._state_dict(self.learning_model, torch.device("cpu"))
        )

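        # Fill the replay buffer with transitions collected by a fully random
        # policy (epsilon = 1) before learning starts.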
        n_episodes = self.config["n_envs"] * self.config["n_processes"]
        agent_info = DictTensor({"epsilon": torch.ones(n_episodes).float()})
        self.train_batcher.reset(agent_info=agent_info)

        logging.info("Sampling initial transitions")
        for k in range(self.config["initial_buffer_epochs"]):
            self.train_batcher.execute()
            trajectories, n = self.train_batcher.get(blocking=True)
            assert n != 0
            self.replay_buffer.push(trajectories.trajectories)
            print(k, "/", self.config["initial_buffer_epochs"])

        self.iteration = 0

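        # Launch a first evaluation run with a greedy policy (epsilon = 0);
        # its trajectories are collected later.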
        n_episodes = (
            self.config["n_evaluation_envs"] * self.config["n_evaluation_processes"]
        )
        self.evaluation_batcher.reset(
            agent_info=DictTensor({"epsilon": torch.zeros(n_episodes).float()})
        )
        # self.evaluation_batcher.reset(agent_info=DictTensor({"epsilon":torch.zeros(n_episodes)}))
        self.evaluation_batcher.execute()

        logging.info("Starting Learning")
        _start_time = time.time()

        self.target_model.load_state_dict(self.learning_model.state_dict())
        cumulated_reward = torch.zeros(
            self.config["n_envs"] * self.config["n_processes"]
        )

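        # Main loop: collect a batch of trajectories, push it to the replay
        # buffer, take several gradient steps, then evaluate.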
        while time.time() - _start_time < self.config["time_limit"]:
            n_episodes = self.config["n_envs"] * self.config["n_processes"]
            self.train_batcher.update(
                self._state_dict(self.learning_model, torch.device("cpu"))
            )
            self.train_batcher.execute(
                agent_info=DictTensor(
                    {
                        "epsilon": torch.tensor([self.config["epsilon_greedy"]])
                        .repeat(n_episodes)
                        .float()
                    }
                )
            )
            trajectories, n = self.train_batcher.get(blocking=True)

            assert n == self.config["n_envs"] * self.config["n_processes"]
            self.replay_buffer.push(trajectories.trajectories)
            self.logger.add_scalar(
                "stats/replay_buffer_size", self.replay_buffer.size(), self.iteration
            )

            # Take several gradient steps on minibatches sampled from the replay buffer.
            assert self.config["qvalue_epochs"] > 0
            for k in range(self.config["qvalue_epochs"]):
                optimizer.zero_grad()
                transitions = self.replay_buffer.sample(n=self.config["n_batches"])
                dt = self.get_loss(transitions, device)
                _loss = (dt["q_loss"].to(self.config["learner_device"])).mean()
                self.logger.add_scalar("q_loss", _loss.item(), self.iteration)

                _loss.backward()
                if self.config["clip_grad"] > 0:
                    grad_norm = torch.nn.utils.clip_grad_norm_(
                        self.learning_model.parameters(), self.config["clip_grad"]
                    )
                    self.logger.add_scalar("grad_norm", grad_norm.item(), self.iteration)
                self.iteration += 1
                optimizer.step()

                tau = self.config["update_target_tau"]
                self.soft_update_params(self.learning_model, self.target_model, tau)

                if time.time() - _start_time > 600 and self.iteration % 1000 == 0:
                    self.logger.update_csv()

            self.evaluate()
        self.logger.update_csv()  # To save as a CSV file in logdir

        trajectories, n = self.train_batcher.get()  # Drain the last pending training batch

        self.train_batcher.close()
        self.evaluation_batcher.get()  # To wait for the last trajectories
        self.evaluation_batcher.close()
        self.logger.close()
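
The two helpers soft_update_params and get_loss are called above but their bodies are not part of this section. Below is a minimal sketch of what they typically compute in a double DQN agent: a Polyak (soft) update of the target network, and a bootstrapped target in which the online network selects the next action and the target network evaluates it. The function signatures, argument names and the gamma/done handling here are assumptions for illustration, not the actual implementation in rlalgos/simple_ddqn/ddqn.py.

    import torch

    def soft_update_params(source, target, tau):
        # Polyak averaging: target <- tau * source + (1 - tau) * target.
        with torch.no_grad():
            for p_src, p_tgt in zip(source.parameters(), target.parameters()):
                p_tgt.data.mul_(1.0 - tau).add_(p_src.data, alpha=tau)

    def double_dqn_target(reward, done, next_q_online, next_q_target, gamma):
        # Double DQN decoupling: the online network picks the next action,
        # the target network evaluates it.
        with torch.no_grad():
            next_actions = next_q_online.argmax(dim=-1, keepdim=True)
            next_values = next_q_target.gather(-1, next_actions).squeeze(-1)
            return reward + gamma * (1.0 - done.float()) * next_values

The q_loss logged in run() would then typically be a smooth L1 (Huber) or MSE loss between the online network's Q(s, a) for the actions stored in the replay buffer and such a target.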