def run()

in rlalgos/dqn/duelling_dqn.py [0:0]


    def run(self):
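        """Main training loop: probe the environment, build the train and
        evaluation batchers, pre-fill the replay buffer, then alternate
        between trajectory acquisition and Q-learning updates until
        `time_limit` seconds have elapsed."""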
        env = self._create_env(
            self.config["n_envs"],
            seed=0,
            **{k: self.config[k] for k in self.config if k.startswith("environment/")}
        )
        self.n_actions = env.action_space.n
        self.obs_shape = env.reset()[0]["frame"].size()
        del env

        # Create the agent model
        self.learning_model = self._create_model()
        self.target_model = copy.deepcopy(self.learning_model)

        # Create one agent for loss computation (see get_loss)
        self.agent = self._create_agent(
            n_actions=self.n_actions, model=self.learning_model
        )

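        # Train batcher: n_processes workers, each running n_envs environments
        # for batch_timesteps steps per acquisition. Workers get their own CPU
        # copy of the model; fresh weights are pushed to them via update() below.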
        model = copy.deepcopy(self.learning_model)
        self.train_batcher = RL_Batcher(
            n_timesteps=self.config["batch_timesteps"],
            create_agent=self._create_agent,
            create_env=self._create_env,
            env_args={
                "mode": "train",
                "n_envs": self.config["n_envs"],
                "max_episode_steps": self.config["max_episode_steps"],
                **{
                    k: self.config[k]
                    for k in self.config
                    if k.startswith("environment/")
                },
            },
            agent_args={"n_actions": self.n_actions, "model": model},
            n_processes=self.config["n_processes"],
            seeds=[
                self.config["env_seed"] + k * 10
                for k in range(self.config["n_processes"])
            ],
            agent_info=DictTensor({"epsilon": torch.zeros(1)}),
            env_info=DictTensor({}),
        )

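        # Evaluation batcher: runs full episodes (n_timesteps = max_episode_steps)
        # in separate processes, with its own model copy and its own seeds.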
        model = copy.deepcopy(self.learning_model)
        self.evaluation_batcher = RL_Batcher(
            n_timesteps=self.config["max_episode_steps"],
            create_agent=self._create_agent,
            create_env=self._create_env,
            env_args={
                "mode": "evaluation",
                "max_episode_steps": self.config["max_episode_steps"],
                "n_envs": self.config["n_evaluation_envs"],
                **{
                    k: self.config[k]
                    for k in self.config
                    if k.startswith("environment/")
                },
            },
            agent_args={"n_actions": self.n_actions, "model": model},
            n_processes=self.config["n_evaluation_processes"],
            seeds=[
                self.config["env_seed"] * 10 + k * 10
                for k in range(self.config["n_evaluation_processes"])
            ],
            agent_info=DictTensor({"epsilon": torch.zeros(1)}),
            env_info=DictTensor({}),
        )

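        # Replay buffer, learner device, and optimizer (the optimizer class and
        # learning rate are taken from the configuration).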
        self.replay_buffer = ReplayBuffer(self.config["replay_buffer_size"])
        device = torch.device(self.config["learner_device"])
        self.learning_model.to(device)
        self.target_model.to(device)
        optimizer = getattr(torch.optim, self.config["optim"])(
            self.learning_model.parameters(), lr=self.config["lr"]
        )

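        # Send an initial CPU copy of the learner weights to the evaluation workers.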
        self.evaluation_batcher.update(
            self._state_dict(self.learning_model, torch.device("cpu"))
        )

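        # Warm-up: with epsilon = 1.0 the epsilon-greedy agent acts at random
        # while the replay buffer is filled for initial_buffer_epochs acquisitions.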
        n_episodes = self.config["n_envs"] * self.config["n_processes"]
        agent_info = DictTensor({"epsilon": torch.ones(n_episodes).float()})
        self.train_batcher.reset(agent_info=agent_info)

        logging.info("Sampling initial transitions")
        for k in range(self.config["initial_buffer_epochs"]):
            self.train_batcher.execute()
            trajectories, n = self.train_batcher.get(blocking=True)
            assert n != 0
            self.replay_buffer.push(trajectories.trajectories)

        self.iteration = 0

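        # Launch a first asynchronous evaluation with epsilon = 0 (greedy actions);
        # finished episodes are collected later in the loop (see self.evaluate()).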
        n_episodes = (
            self.config["n_evaluation_envs"] * self.config["n_evaluation_processes"]
        )
        self.evaluation_batcher.reset(
            agent_info=DictTensor({"epsilon": torch.zeros(n_episodes).float()})
        )
        self.evaluation_batcher.execute()

        logging.info("Starting Learning")
        _start_time = time.time()

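        # Bookkeeping: 'produced' counts transitions collected by the workers,
        # 'consumed' counts transitions used in gradient steps; n_interactions
        # starts at the size of the pre-filled replay buffer.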
        produced = 0
        consumed = 0
        n_interactions = self.replay_buffer.size()
        self.target_model.load_state_dict(self.learning_model.state_dict())
        cumulated_reward = torch.zeros(
            self.config["n_envs"] * self.config["n_processes"]
        )

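        # Epsilon is annealed linearly from epsilon_greedy_max down to
        # epsilon_greedy_min over epsilon_min_epoch learner iterations; the
        # current weights are then pushed to the workers and a first
        # acquisition is launched.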
        epsilon_step = (
            self.config["epsilon_greedy_max"] - self.config["epsilon_greedy_min"]
        ) / self.config["epsilon_min_epoch"]
        self.epsilon = self.config["epsilon_greedy_max"] - epsilon_step * self.iteration
        self.epsilon = max(self.epsilon, self.config["epsilon_greedy_min"])
        self.logger.add_scalar("epsilon", self.epsilon, self.iteration)
        n_episodes = self.config["n_envs"] * self.config["n_processes"]
        self.train_batcher.update(
            self._state_dict(self.learning_model, torch.device("cpu"))
        )
        self.train_batcher.execute(
            agent_info=DictTensor(
                {"epsilon": torch.tensor([self.epsilon]).repeat(n_episodes).float()}
            )
        )
        print("Go learning...")
        while time.time() - _start_time < self.config["time_limit"]:
            trajectories, n = self.train_batcher.get(
                blocking=not self.config["as_fast_as_possible"]
            )

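            # New trajectories arrived: update epsilon, push fresh weights to the
            # workers, relaunch acquisition, and log training statistics.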
            if trajectories is not None:
                epsilon_step = (
                    self.config["epsilon_greedy_max"]
                    - self.config["epsilon_greedy_min"]
                ) / self.config["epsilon_min_epoch"]
                self.epsilon = (
                    self.config["epsilon_greedy_max"] - epsilon_step * self.iteration
                )
                self.epsilon = max(self.epsilon, self.config["epsilon_greedy_min"])

                self.logger.add_scalar("epsilon", self.epsilon, self.iteration)
                n_episodes = self.config["n_envs"] * self.config["n_processes"]
                self.train_batcher.update(
                    self._state_dict(self.learning_model, torch.device("cpu"))
                )
                self.train_batcher.execute(
                    agent_info=DictTensor(
                        {
                            "epsilon": torch.tensor([self.epsilon])
                            .repeat(n_episodes)
                            .float()
                        }
                    )
                )

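                # Track per-environment cumulated rewards; an episode's total is
                # logged whenever initial_state marks the start of a new episode.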
                reward = trajectories.trajectories["_observation/reward"]
                _is = trajectories.trajectories["observation/initial_state"]
                crs = []
                for t in range(reward.size(1)):
                    # Environments that restart at step t have just finished an
                    # episode: record their cumulated reward, then reset it.
                    cr = cumulated_reward[_is[:, t]]
                    crs.extend(cr.tolist())
                    cumulated_reward = (1.0 - _is[:, t].float()) * cumulated_reward
                    cumulated_reward += reward[:, t]
                if len(crs) > 0:
                    self.logger.add_scalar(
                        "train_cumulated_reward", np.mean(crs), self.iteration
                    )

                assert n == self.config["n_envs"] * self.config["n_processes"]
                self.replay_buffer.push(trajectories.trajectories)
                produced += trajectories.trajectories.lengths.sum().item()
                self.logger.add_scalar(
                    "stats/replay_buffer_size",
                    self.replay_buffer.size(),
                    self.iteration,
                )

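            # Perform qvalue_epochs gradient steps per acquisition. alpha and
            # beta are the prioritized-replay parameters: when both are 0 the
            # loss is a plain mean, otherwise priorities are updated and the
            # loss is weighted by the importance-sampling weights.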
            assert self.config["qvalue_epochs"] > 0
            for k in range(self.config["qvalue_epochs"]):
                optimizer.zero_grad()
                alpha = self.config["buffer/alpha"]
                beta = self.config["buffer/beta"]
                transitions, idx, weights = self.replay_buffer.sample(
                    n=self.config["n_batches"], alpha=alpha, beta=beta
                )
                consumed += transitions.n_elems()
                dt = self.get_loss(transitions, device)
                _loss = None

                if alpha == 0 and beta == 0:
                    _loss = dt["q_loss"].to(self.config["learner_device"]).mean()
                else:
                    self.replay_buffer.update_priorities(
                        idx, dt["q_loss"].sqrt().detach().to("cpu")
                    )
                    _loss = (
                        dt["q_loss"] * weights.to(self.config["learner_device"])
                    ).mean()

                self.logger.add_scalar("q_loss", _loss.item(), self.iteration)

                _loss.backward()
                if self.config["clip_grad"] > 0:
                    n = torch.nn.utils.clip_grad_norm_(
                        self.learning_model.parameters(), self.config["clip_grad"]
                    )
                    self.logger.add_scalar("grad_norm", n.item(), self.iteration)
                self.iteration += 1
                optimizer.step()

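                # Target network update: either a hard copy every
                # update_target_epoch iterations, or a Polyak (soft) update
                # with coefficient update_target_tau.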
                if self.config["update_target_hard"]:
                    if self.iteration % self.config["update_target_epoch"] == 0:
                        self.target_model.load_state_dict(
                            self.learning_model.state_dict()
                        )
                else:
                    tau = self.config["update_target_tau"]
                    self.soft_update_params(self.learning_model, self.target_model, tau)

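                # Flush metrics to CSV every 1000 iterations once training has
                # been running for at least 10 minutes.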
                if time.time() - _start_time > 600 and self.iteration % 1000 == 0:
                    self.logger.update_csv()

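            # Throughput statistics (transitions produced/consumed per second)
            # and a periodic evaluation pass.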
            tt = time.time()
            c_ps = consumed / (tt - _start_time)
            p_ps = produced / (tt - _start_time)
            self.logger.add_scalar("speed/consumed_per_seconds", c_ps, self.iteration)
            self.logger.add_scalar(
                "speed/n_interactions", n_interactions + produced, self.iteration
            )
            self.logger.add_scalar("speed/produced_per_seconds", p_ps, self.iteration)
            self.evaluate()
        self.logger.update_csv()  # To save as a CSV file in logdir

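        # Shutdown: drain the pending acquisitions so the worker processes can
        # terminate cleanly, then close the batchers and the logger.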
        trajectories, n = self.train_batcher.get()

        self.train_batcher.close()
        self.evaluation_batcher.get()  # To wait for the last trajectories
        self.evaluation_batcher.close()
        self.logger.close()
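
The excerpt calls `self.soft_update_params(...)` without showing it. As a reference, here is a minimal sketch of the Polyak-averaging update such a helper typically performs; the body below is an assumption based on the call site (`soft_update_params(learning_model, target_model, tau)`), not the verified implementation:

    import torch

    def soft_update_params(learning_model, target_model, tau):
        # Assumed Polyak averaging: target <- tau * online + (1 - tau) * target.
        # The sign convention for tau may differ in the actual implementation.
        with torch.no_grad():
            for p, tp in zip(
                learning_model.parameters(), target_model.parameters()
            ):
                tp.data.mul_(1.0 - tau).add_(tau * p.data)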