# rlalgos/simple_ddqn/ddqn.py
def run(self):
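# Probe a temporary environment only to read the number of actions and the
# observation ("frame") shape; it is deleted immediately afterwards.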
env = self._create_env(
self.config["n_envs"],
seed=0,
**{k: self.config[k] for k in self.config if k.startswith("environment/")}
)
self.n_actions = env.action_space.n
self.obs_shape = env.reset()[0]["frame"].size()
del env
# Create the learning model and a deep copy of it used as the target network
self.learning_model = self._create_model()
self.target_model = copy.deepcopy(self.learning_model)
# Create one agent for loss computation (see get_loss)
self.agent = self._create_agent(
n_actions=self.n_actions, model=self.learning_model
)
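# Batcher that collects training trajectories: n_processes worker processes,
# each stepping n_envs environments for batch_timesteps steps, with
# per-process seeds derived from env_seed and a copy of the learning model.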
model = copy.deepcopy(self.learning_model)
self.train_batcher = RL_Batcher(
n_timesteps=self.config["batch_timesteps"],
create_agent=self._create_agent,
create_env=self._create_env,
env_args={
"mode": "train",
"n_envs": self.config["n_envs"],
"max_episode_steps": self.config["max_episode_steps"],
**{
k: self.config[k]
for k in self.config
if k.startswith("environment/")
},
},
agent_args={"n_actions": self.n_actions, "model": model},
n_processes=self.config["n_processes"],
seeds=[
self.config["env_seed"] + k * 10
for k in range(self.config["n_processes"])
],
agent_info=DictTensor({"epsilon": torch.zeros(1)}),
env_info=DictTensor({}),
)
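# Second batcher dedicated to evaluation: n_evaluation_processes x
# n_evaluation_envs environments in "evaluation" mode, rollouts long enough
# (max_episode_steps) to cover full episodes, and its own seeds.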
model = copy.deepcopy(self.learning_model)
self.evaluation_batcher = RL_Batcher(
n_timesteps=self.config["max_episode_steps"],
create_agent=self._create_agent,
create_env=self._create_env,
env_args={
"mode": "evaluation",
"max_episode_steps": self.config["max_episode_steps"],
"n_envs": self.config["n_evaluation_envs"],
**{
k: self.config[k]
for k in self.config
if k.startswith("environment/")
},
},
agent_args={"n_actions": self.n_actions, "model": model},
n_processes=self.config["n_evaluation_processes"],
seeds=[
self.config["env_seed"] * 10 + k * 10
for k in range(self.config["n_evaluation_processes"])
],
agent_info=DictTensor({"epsilon": torch.zeros(1)}),
env_info=DictTensor({}),
)
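# Replay buffer for off-policy updates, model placement on the learner device,
# and an optimizer built from the class name given in config["optim"].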
self.replay_buffer = ReplayBuffer(self.config["replay_buffer_size"])
device = torch.device(self.config["learner_device"])
self.learning_model.to(device)
self.target_model.to(device)
optimizer = getattr(torch.optim, self.config["optim"])(
self.learning_model.parameters(), lr=self.config["lr"]
)
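# Send the current learning weights (as a CPU state dict) to the evaluation workers.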
self.evaluation_batcher.update(
self._state_dict(self.learning_model, torch.device("cpu"))
)
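# Warm-up phase: fill the replay buffer for initial_buffer_epochs batches with
# transitions gathered under epsilon=1, i.e. fully random epsilon-greedy actions.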
n_episodes = self.config["n_envs"] * self.config["n_processes"]
agent_info = DictTensor({"epsilon": torch.ones(n_episodes).float()})
self.train_batcher.reset(agent_info=agent_info)
logging.info("Sampling initial transitions")
for k in range(self.config["initial_buffer_epochs"]):
self.train_batcher.execute()
trajectories, n = self.train_batcher.get(blocking=True)
assert n != 0
self.replay_buffer.push(trajectories.trajectories)
print(k, "/", self.config["initial_buffer_epochs"])
self.iteration = 0
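# Launch a first evaluation pass with epsilon=0 (greedy policy); its
# trajectories are retrieved later.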
n_episodes = (
self.config["n_evaluation_envs"] * self.config["n_evaluation_processes"]
)
self.evaluation_batcher.reset(
agent_info=DictTensor({"epsilon": torch.zeros(n_episodes).float()})
)
self.evaluation_batcher.execute()
logging.info("Starting Learning")
_start_time = time.time()
self.target_model.load_state_dict(self.learning_model.state_dict())
cumulated_reward = torch.zeros(
self.config["n_envs"] * self.config["n_processes"]
)
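# Main loop: until time_limit seconds have elapsed, push the current weights to
# the training workers, collect one batch of trajectories with epsilon-greedy
# exploration, then run qvalue_epochs gradient steps on replayed transitions.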
while time.time() - _start_time < self.config["time_limit"]:
n_episodes = self.config["n_envs"] * self.config["n_processes"]
self.train_batcher.update(
self._state_dict(self.learning_model, torch.device("cpu"))
)
self.train_batcher.execute(
agent_info=DictTensor(
{
"epsilon": torch.tensor([self.config["epsilon_greedy"]])
.repeat(n_episodes)
.float()
}
)
)
trajectories, n = self.train_batcher.get(blocking=True)
assert n == self.config["n_envs"] * self.config["n_processes"]
self.replay_buffer.push(trajectories.trajectories)
self.logger.add_scalar(
"stats/replay_buffer_size", self.replay_buffer.size(), self.iteration
)
assert self.config["qvalue_epochs"] > 0
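# Inner optimization loop: sample n_batches transitions from the replay buffer,
# compute the Q-loss returned by get_loss on the learner device, and take an
# optimizer step, with optional gradient-norm clipping.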
for k in range(self.config["qvalue_epochs"]):
optimizer.zero_grad()
transitions = self.replay_buffer.sample(n=self.config["n_batches"])
dt = self.get_loss(transitions, device)
_loss = (dt["q_loss"].to(self.config["learner_device"])).mean()
self.logger.add_scalar("q_loss", _loss.item(), self.iteration)
_loss.backward()
if self.config["clip_grad"] > 0:
n = torch.nn.utils.clip_grad_norm_(
self.learning_model.parameters(), self.config["clip_grad"]
)
self.logger.add_scalar("grad_norm", n.item(), self.iteration)
self.iteration += 1
optimizer.step()
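# Soft (Polyak) update of the target network towards the learning network.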
tau = self.config["update_target_tau"]
self.soft_update_params(self.learning_model, self.target_model, tau)
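# After the first 10 minutes, flush the logs to CSV and run an evaluation
# pass every 1000 iterations.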
if time.time() - _start_time > 600 and self.iteration % 1000 == 0:
self.logger.update_csv()
self.evaluate()
self.logger.update_csv() # To save as a CSV file in logdir
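# Training is over: drain the pending batcher results, then close both
# batchers and the logger.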
trajectories, n = self.train_batcher.get()
self.train_batcher.close()
self.evaluation_batcher.get() # To wait for the last trajectories
self.evaluation_batcher.close()
self.logger.close()