def run()

in rlalgos/sac/sac.py
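
Main training loop of the SAC implementation: it builds the twin Q-networks and their target copies, spawns separate training and evaluation batchers, pre-fills the replay buffer with n_starting_transitions transitions, then alternates between collecting trajectories and running gradient updates on the policy and both Q-networks until the configured time limit expires.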


    def run(self):
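        # Twin Q-networks with target copies initialized to the same weights;
        # the targets are tracked via soft (Polyak) updates further below.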
        self.q1 = self._create_q()
        self.q2 = self._create_q()
        self.target_q1 = self._create_q()
        self.target_q2 = self._create_q()
        self.target_q1.load_state_dict(self.q1.state_dict())
        self.target_q2.load_state_dict(self.q2.state_dict())

        self.learning_model = self._create_model()

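        # Evaluation batcher: runs complete episodes (up to max_episode_steps)
        # on dedicated processes, with its own copy of the current policy.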
        model = copy.deepcopy(self.learning_model)
        self.evaluation_batcher = RL_Batcher(
            n_timesteps=self.config["max_episode_steps"],
            create_agent=self._create_agent,
            create_env=self._create_env,
            env_args={
                "n_envs": self.config["n_evaluation_envs"],
                "max_episode_steps": self.config["max_episode_steps"],
                "env_name": self.config["env_name"],
            },
            agent_args={"action_dim": self.action_dim, "policy": model},
            n_processes=self.config["n_evaluation_processes"],
            seeds=[
                self.config["env_seed"] + k * 10
                for k in range(self.config["n_evaluation_processes"])
            ],
            agent_info=DictTensor({"stochastic": torch.tensor([True])}),
            env_info=DictTensor({}),
        )

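        # Training batcher: collects fixed-length fragments of batch_timesteps
        # steps per environment, which feed the replay buffer.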
        model = copy.deepcopy(self.learning_model)
        self.train_batcher = RL_Batcher(
            n_timesteps=self.config["batch_timesteps"],
            create_agent=self._create_agent,
            create_env=self._create_train_env,
            env_args={
                "n_envs": self.config["n_envs"],
                "max_episode_steps": self.config["max_episode_steps"],
                "env_name": self.config["env_name"],
            },
            agent_args={"action_dim": self.action_dim, "policy": model},
            n_processes=self.config["n_processes"],
            seeds=[
                self.config["env_seed"] + k * 10
                for k in range(self.config["n_processes"])
            ],
            agent_info=DictTensor({"stochastic": torch.tensor([True])}),
            env_info=DictTensor({}),
        )

        self.replay_buffer = ReplayBuffer(self.config["replay_buffer_size"])
        device = torch.device(self.config["learner_device"])
        self.learning_model.to(device)

        self.q1.to(device)
        self.q2.to(device)
        self.target_q1.to(device)
        self.target_q2.to(device)
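        # One Adam optimizer for the policy and one for each Q-network.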
        optimizer = torch.optim.Adam(
            self.learning_model.parameters(), lr=self.config["lr"]
        )
        optimizer_q1 = torch.optim.Adam(self.q1.parameters(), lr=self.config["lr"])
        optimizer_q2 = torch.optim.Adam(self.q2.parameters(), lr=self.config["lr"])

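        # Push the initial policy weights (moved to CPU) to both batchers' workers.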
        self.train_batcher.update(
            self._state_dict(self.learning_model, torch.device("cpu"))
        )
        self.evaluation_batcher.update(
            self._state_dict(self.learning_model, torch.device("cpu"))
        )

        n_episodes = self.config["n_envs"] * self.config["n_processes"]
        # All-True mask: every training episode acts with the stochastic policy.
        self.train_batcher.reset(
            agent_info=DictTensor({"stochastic": torch.ones(n_episodes, dtype=torch.bool)})
        )
        logging.info("Sampling initial transitions")
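        # Each batcher round yields n_episodes * batch_timesteps transitions;
        # run enough rounds to collect roughly n_starting_transitions of them.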
        n_iterations = int(
            self.config["n_starting_transitions"]
            / (n_episodes * self.config["batch_timesteps"])
        )
        for _ in range(n_iterations):
            self.train_batcher.execute()
            trajectories, n = self.train_batcher.get()
            self.replay_buffer.push(trajectories)
        print("replay_buffer_size = ", self.replay_buffer.size())

        n_episodes = (
            self.config["n_evaluation_envs"] * self.config["n_evaluation_processes"]
        )
        stochastic = torch.tensor(
            [self.config["evaluation_mode"] == "stochastic"]
        ).repeat(n_episodes)
        self.evaluation_batcher.reset(agent_info=DictTensor({"stochastic": stochastic}))
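        # Launch evaluation asynchronously; results are polled in the learning
        # loop below with get(blocking=False).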
        self.evaluation_batcher.execute()
        self.evaluation_iteration = 0

        logging.info("Starting Learning")
        _start_time = time.time()

        logging.info("Learning")
        self.iteration = 0
        while time.time() - _start_time < self.config["time_limit"]:
            self.train_batcher.execute()
            trajectories, n = self.train_batcher.get()
            self.replay_buffer.push(trajectories)
            self.logger.add_scalar(
                "replay_buffer_size", self.replay_buffer.size(), self.iteration
            )

            for _ in range(self.config["n_batches_per_epochs"]):
                transitions = self.replay_buffer.sample(n=self.config["size_batches"])

                dt, transitions = self.get_q_loss(transitions, device)
                for key, value in dt.items():
                    self.logger.add_scalar(key, value.item(), self.iteration)
                optimizer_q1.zero_grad()
                dt["q1_loss"].backward()
                optimizer_q1.step()

                optimizer_q2.zero_grad()
                dt["q2_loss"].backward()
                optimizer_q2.step()

                optimizer.zero_grad()
                dt = self.get_policy_loss(transitions)
                for key, value in dt.items():
                    self.logger.add_scalar(key, value.item(), self.iteration)
                dt["policy_loss"].backward()
                optimizer.step()

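                # Polyak-average the online Q-network weights into the targets.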
                tau = self.config["tau"]
                self.soft_update_params(self.q1, self.target_q1, tau)
                self.soft_update_params(self.q2, self.target_q2, tau)

                self.iteration += 1

            self.train_batcher.update(
                self._state_dict(self.learning_model, torch.device("cpu"))
            )

            evaluation_trajectories, n = self.evaluation_batcher.get(blocking=False)
            if evaluation_trajectories is not None:  # trajectories are available
                # Compute the cumulative reward over valid (non-masked) timesteps
                cumulated_reward = (
                    (
                        evaluation_trajectories.trajectories["_observation/reward"]
                        * evaluation_trajectories.trajectories.mask()
                    )
                    .sum(1)
                    .mean()
                )
                self.logger.add_scalar(
                    "evaluation_reward",
                    cumulated_reward.item(),
                    self.evaluation_iteration,
                )
                print(
                    "At iteration %d, reward is %f"
                    % (self.evaluation_iteration, cumulated_reward.item())
                )
                # Relaunch the evaluation batcher with the latest weights on CPU
                # (same agent_info and same number of episodes as before)
                self.evaluation_batcher.update(
                    self._state_dict(self.learning_model, torch.device("cpu"))
                )
                self.evaluation_iteration = self.iteration
                n_episodes = (
                    self.config["n_evaluation_processes"]
                    * self.config["n_evaluation_envs"]
                )
                agent_info = DictTensor(
                    {
                        "stochastic": torch.tensor(
                            [self.config["evaluation_mode"] == "stochastic"]
                        ).repeat(n_episodes)
                    }
                )
                self.evaluation_batcher.reset(agent_info=agent_info)
                self.evaluation_batcher.execute()
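
The soft_update_params helper is not shown in this file; below is a minimal sketch of the usual Polyak-averaging rule it presumably implements (the exact signature in rlalgos may differ):

    def soft_update_params(self, net, target_net, tau):
        # theta_target <- tau * theta_online + (1 - tau) * theta_target
        with torch.no_grad():
            for param, target_param in zip(net.parameters(), target_net.parameters()):
                target_param.copy_(tau * param + (1.0 - tau) * target_param)

With tau close to 0, the target networks move slowly toward the online Q-networks, which keeps the bootstrap target used by the Q-losses stable between updates.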