in rlalgos/dqn/duelling_dqn.py [0:0]
def run(self):
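# Build a throwaway environment to read the action space and observation shape, then discard it.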
env = self._create_env(
self.config["n_envs"],
seed=0,
**{k: self.config[k] for k in self.config if k.startswith("environment/")}
)
self.n_actions = env.action_space.n
self.obs_shape = env.reset()[0]["frame"].size()
del env
# Create the agent model
self.learning_model = self._create_model()
self.target_model = copy.deepcopy(self.learning_model)
# Create one agent for loss computation (see get_loss)
self.agent = self._create_agent(
n_actions=self.n_actions, model=self.learning_model
)
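# Training batcher: n_processes workers, each running n_envs environments and acting with a copy of the learning model.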
model = copy.deepcopy(self.learning_model)
self.train_batcher = RL_Batcher(
n_timesteps=self.config["batch_timesteps"],
create_agent=self._create_agent,
create_env=self._create_env,
env_args={
"mode": "train",
"n_envs": self.config["n_envs"],
"max_episode_steps": self.config["max_episode_steps"],
**{
k: self.config[k]
for k in self.config
if k.startswith("environment/")
},
},
agent_args={"n_actions": self.n_actions, "model": model},
n_processes=self.config["n_processes"],
seeds=[
self.config["env_seed"] + k * 10
for k in range(self.config["n_processes"])
],
agent_info=DictTensor({"epsilon": torch.zeros(1)}),
env_info=DictTensor({}),
)
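# Evaluation batcher: runs full episodes (up to max_episode_steps) on dedicated processes to evaluate the current policy.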
model = copy.deepcopy(self.learning_model)
self.evaluation_batcher = RL_Batcher(
n_timesteps=self.config["max_episode_steps"],
create_agent=self._create_agent,
create_env=self._create_env,
env_args={
"mode": "evaluation",
"max_episode_steps": self.config["max_episode_steps"],
"n_envs": self.config["n_evaluation_envs"],
**{
k: self.config[k]
for k in self.config
if k.startswith("environment/")
},
},
agent_args={"n_actions": self.n_actions, "model": model},
n_processes=self.config["n_evaluation_processes"],
seeds=[
self.config["env_seed"] * 10 + k * 10
for k in range(self.config["n_evaluation_processes"])
],
agent_info=DictTensor({"epsilon": torch.zeros(1)}),
env_info=DictTensor({}),
)
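# Replay buffer, device placement and optimizer (optimizer class and learning rate come from the config).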
self.replay_buffer = ReplayBuffer(self.config["replay_buffer_size"])
device = torch.device(self.config["learner_device"])
self.learning_model.to(device)
self.target_model.to(device)
optimizer = getattr(torch.optim, self.config["optim"])(
self.learning_model.parameters(), lr=self.config["lr"]
)
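# Ship a CPU copy of the current weights to the evaluation workers.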
self.evaluation_batcher.update(
self._state_dict(self.learning_model, torch.device("cpu"))
)
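# Warm up the replay buffer with initial_buffer_epochs rounds of transitions collected with epsilon = 1 (purely random actions).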
n_episodes = self.config["n_envs"] * self.config["n_processes"]
agent_info = DictTensor({"epsilon": torch.ones(n_episodes).float()})
self.train_batcher.reset(agent_info=agent_info)
logging.info("Sampling initial transitions")
for k in range(self.config["initial_buffer_epochs"]):
self.train_batcher.execute()
trajectories, n = self.train_batcher.get(blocking=True)
assert n != 0
self.replay_buffer.push(trajectories.trajectories)
self.iteration = 0
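# Launch a first evaluation run with a greedy policy (epsilon = 0).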
n_episodes = (
self.config["n_evaluation_envs"] * self.config["n_evaluation_processes"]
)
self.evaluation_batcher.reset(
agent_info=DictTensor({"epsilon": torch.zeros(n_episodes).float()})
)
self.evaluation_batcher.execute()
logging.info("Starting Learning")
_start_time = time.time()
produced = 0
consumed = 0
n_interactions = self.replay_buffer.size()
self.target_model.load_state_dict(self.learning_model.state_dict())
cumulated_reward = torch.zeros(
self.config["n_envs"] * self.config["n_processes"]
)
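# Epsilon-greedy schedule: decay epsilon linearly from epsilon_greedy_max to epsilon_greedy_min over epsilon_min_epoch iterations.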
epsilon_step = (
self.config["epsilon_greedy_max"] - self.config["epsilon_greedy_min"]
) / self.config["epsilon_min_epoch"]
self.epsilon = self.config["epsilon_greedy_max"] - epsilon_step * self.iteration
self.epsilon = max(self.epsilon, self.config["epsilon_greedy_min"])
self.logger.add_scalar("epsilon", self.epsilon, self.iteration)
n_episodes = self.config["n_envs"] * self.config["n_processes"]
self.train_batcher.update(
self._state_dict(self.learning_model, torch.device("cpu"))
)
self.train_batcher.execute(
agent_info=DictTensor(
{"epsilon": torch.tensor([self.epsilon]).repeat(n_episodes).float()}
)
)
print("Go learning...")
while time.time() - _start_time < self.config["time_limit"]:
trajectories, n = self.train_batcher.get(
blocking=not self.config["as_fast_as_possible"]
)
if trajectories is not None:
epsilon_step = (
self.config["epsilon_greedy_max"]
- self.config["epsilon_greedy_min"]
) / self.config["epsilon_min_epoch"]
self.epsilon = (
self.config["epsilon_greedy_max"] - epsilon_step * self.iteration
)
self.epsilon = max(self.epsilon, self.config["epsilon_greedy_min"])
self.logger.add_scalar("epsilon", self.epsilon, self.iteration)
n_episodes = self.config["n_envs"] * self.config["n_processes"]
self.train_batcher.update(
self._state_dict(self.learning_model, torch.device("cpu"))
)
self.train_batcher.execute(
agent_info=DictTensor(
{
"epsilon": torch.tensor([self.epsilon])
.repeat(n_episodes)
.float()
}
)
)
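# Track the cumulated reward of each environment and log the return of every episode that just ended.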
reward = trajectories.trajectories["_observation/reward"]
_is = trajectories.trajectories["observation/initial_state"]
crs = []
for t in range(reward.size(1)):
# Environments that just restarted: record the return of the episode that ended
cr = cumulated_reward[_is[:, t]]
crs.extend(cr.tolist())
# Reset the cumulated reward of restarted environments, then accumulate the new rewards
cumulated_reward = (1 - _is[:, t].float()) * cumulated_reward
cumulated_reward += reward[:, t]
if len(crs) > 0:
self.logger.add_scalar(
"train_cumulated_reward", np.mean(crs), self.iteration
)
assert n == self.config["n_envs"] * self.config["n_processes"]
self.replay_buffer.push(trajectories.trajectories)
produced += trajectories.trajectories.lengths.sum().item()
self.logger.add_scalar(
"stats/replay_buffer_size",
self.replay_buffer.size(),
self.iteration,
)
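# Q-learning updates: sample mini-batches from the replay buffer and minimize the Q-loss; priorities and importance weights are used unless buffer/alpha and buffer/beta are both zero.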
assert self.config["qvalue_epochs"] > 0
for k in range(self.config["qvalue_epochs"]):
optimizer.zero_grad()
alpha = self.config["buffer/alpha"]
beta = self.config["buffer/beta"]
transitions, idx, weights = self.replay_buffer.sample(
n=self.config["n_batches"], alpha=alpha, beta=beta
)
consumed += transitions.n_elems()
dt = self.get_loss(transitions, device)
_loss = None
if alpha == 0 and beta == 0:
_loss = dt["q_loss"].to(self.config["learner_device"]).mean()
else:
self.replay_buffer.update_priorities(
idx, dt["q_loss"].sqrt().detach().to("cpu")
)
_loss = (
dt["q_loss"] * weights.to(self.config["learner_device"])
).mean()
self.logger.add_scalar("q_loss", _loss.item(), self.iteration)
_loss.backward()
if self.config["clip_grad"] > 0:
n = torch.nn.utils.clip_grad_norm_(
self.learning_model.parameters(), self.config["clip_grad"]
)
self.logger.add_scalar("grad_norm", n.item(), self.iteration)
self.iteration += 1
optimizer.step()
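# Target-network update: either a periodic hard copy (every update_target_epoch iterations) or a soft Polyak update with coefficient update_target_tau.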
if self.config["update_target_hard"]:
if self.iteration % self.config["update_target_epoch"] == 0:
self.target_model.load_state_dict(
self.learning_model.state_dict()
)
else:
tau = self.config["update_target_tau"]
self.soft_update_params(self.learning_model, self.target_model, tau)
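# Bookkeeping: periodically flush the CSV log, report collection/consumption throughput and trigger an evaluation step.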
if time.time() - _start_time > 600 and self.iteration % 1000 == 0:
self.logger.update_csv()
tt = time.time()
c_ps = consumed / (tt - _start_time)
p_ps = produced / (tt - _start_time)
self.logger.add_scalar("speed/consumed_per_seconds", c_ps, self.iteration)
self.logger.add_scalar(
"speed/n_interactions", n_interactions + produced, self.iteration
)
self.logger.add_scalar("speed/produced_per_seconds", p_ps, self.iteration)
self.evaluate()
self.logger.update_csv() # To save as a CSV file in logdir
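# End of training: drain the pending trajectories and shut the batchers and the logger down.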
trajectories, n = self.train_batcher.get()
self.train_batcher.close()
self.evaluation_batcher.get() # To wait for the last trajectories
self.evaluation_batcher.close()
self.logger.close()