in rlalgos/reinforce_diayn/reinforce_diayn.py [0:0]
def run(self):
# Create one env instance to read the dimensionality of observations and the number of actions
env = self._create_env(
self.config["n_envs"], seed=0, env_name=self.config["env_name"]
)
self.n_actions = env.action_space.n
self.obs_dim = env.reset()[0]["frame"].size()[1]
del env
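# Note: the probe env above is only used to read the observation size and the
# number of actions; "frame" is assumed to be the flat observation field, with
# dim 0 as its env/batch dimension. The batchers below create their own envs.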
# Create the agent model
self.learning_model = self._create_model()
self.discriminator = self._create_discriminator()
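# DIAYN note: the discriminator is trained to predict which skill (idx_policy)
# produced an observation. In DIAYN the intrinsic reward is
#   r(s, z) = log q(z | s) - log p(z),  with p(z) uniform over n_policies,
# so maximizing it pushes skills toward distinguishable states. How the term is
# actually computed here lives in get_loss (see discriminator_loss below).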
# Create one agent for loss computation (see get_loss)
self.agent = self._create_agent(
n_actions=self.n_actions, model=self.learning_model
)
# We create a batcher dedicated to evaluation
model = copy.deepcopy(self.learning_model)
self.evaluation_batcher = RL_Batcher(
n_timesteps=self.config["max_episode_steps"],
create_agent=self._create_agent,
create_env=self._create_env,
env_args={
"n_envs": self.config["n_evaluation_envs"],
"max_episode_steps": self.config["max_episode_steps"],
"env_name": self.config["env_name"],
},
agent_args={"n_actions": self.n_actions, "model": model},
n_processes=self.config["n_evaluation_processes"],
seeds=[
self.config["env_seed"] + k * 10
for k in range(self.config["n_evaluation_processes"])
],
agent_info=DictTensor(
{
"stochastic": torch.tensor([True]),
"idx_policy": torch.zeros(1).long(),
}
),
env_info=DictTensor({}),
)
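# Assumption: RL_Batcher runs the agent/env couples in n_evaluation_processes
# worker processes and returns completed trajectories through get(); the
# agent_info / env_info passed to the constructor act as specimens that fix the
# structure of the exchanged information, while the actual per-episode values
# are provided at reset() below.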
# Create a batcher to sample learning trajectories
model = copy.deepcopy(self.learning_model)
self.train_batcher = RL_Batcher(
n_timesteps=self.config["max_episode_steps"],
create_agent=self._create_agent,
create_env=self._create_env,
env_args={
"n_envs": self.config["n_envs"],
"max_episode_steps": self.config["max_episode_steps"],
"env_name": self.config["env_name"],
},
agent_args={"n_actions": self.n_actions, "model": model},
n_processes=self.config["n_processes"],
seeds=[
self.config["env_seed"] + k * 10
for k in range(self.config["n_processes"])
],
agent_info=DictTensor(
{
"stochastic": torch.tensor([True]),
"idx_policy": torch.zeros(1).long(),
}
),
env_info=DictTensor({}),
)
# Creation of the optimizers
optimizer = torch.optim.RMSprop(
self.learning_model.parameters(), lr=self.config["lr"]
)
optimizer_d = torch.optim.RMSprop(
self.discriminator.parameters(), lr=self.config["lr_discriminator"]
)
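# Two separate optimizers so the discriminator can use its own learning rate
# (lr_discriminator); both are stepped after the shared backward pass below.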
# Training Loop:
_start_time = time.time()
self.iteration = 0
# Launch the evaluation batcher so that it starts collecting trajectories with the current model
n_episodes = (
self.config["n_evaluation_processes"] * self.config["n_evaluation_envs"]
)
agent_info = DictTensor(
{
"stochastic": torch.tensor(
[self.config["evaluation_mode"] == "stochastic"]
).repeat(n_episodes)
}
)
agent_info.set(
"idx_policy", torch.randint(self.config["n_policies"], size=(n_episodes,))
)
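# Each evaluation episode is assigned a skill index sampled uniformly in
# [0, n_policies); "stochastic" switches between sampled and greedy actions
# depending on evaluation_mode (the greedy case is assumed to be the agent's
# deterministic mode).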
self.evaluation_batcher.reset(agent_info=agent_info)
self.evaluation_batcher.execute()
self.evaluation_iteration = self.iteration
# Update the train batcher with the latest weights of the learning model
self.train_batcher.update(self.learning_model.state_dict())
n_interactions = 0
while time.time() - _start_time < self.config["time_limit"]:
n_episodes = self.config["n_envs"] * self.config["n_processes"]
agent_info = DictTensor(
{"stochastic": torch.tensor([True]).repeat(n_episodes)}
)
agent_info.set(
"idx_policy",
torch.randint(self.config["n_policies"], size=(n_episodes,)),
)
self.train_batcher.reset(agent_info=agent_info)
self.train_batcher.execute()
trajectories, n_env_running = self.train_batcher.get(blocking=True)
assert n_env_running == 0 # Assert that all trajectories are finished
n_interactions += trajectories.trajectories.mask().sum().item()
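# mask() is presumably 1 for valid timesteps and 0 for padding, so the sum
# counts the environment transitions gathered in this batch.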
self.logger.add_scalar(
"n_interactions_per_seconds",
n_interactions / (time.time() - _start_time),
self.iteration,
)
# 3) Compute the loss
dt = self.get_loss(trajectories)
for k in dt.keys():
    self.logger.add_scalar("loss/" + k, dt[k].item(), self.iteration)
# 4) Compute the final loss as a linear combination of the individual losses
ld = self.config["baseline_coef"] * dt["baseline_loss"]
lr = self.config["reinforce_coef"] * dt["reinforce_loss"]
le = self.config["entropy_coef"] * dt["entropy_loss"]
ldiscr = self.config["discriminator_coef"] * dt["discriminator_loss"]
floss = ld - le - lr - ldiscr
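# Sign convention (as written above): minimizing floss minimizes the baseline
# (critic) loss while maximizing the REINFORCE term, the entropy bonus and the
# discriminator term, the latter presumably being a log-likelihood returned by
# get_loss.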
# 5) Update the parameters of the model
optimizer.zero_grad()
optimizer_d.zero_grad()
floss.backward()
optimizer.step()
optimizer_d.step()
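# A single backward pass fills the gradients of both networks (the
# discriminator term is assumed to flow through self.discriminator), so the
# two optimizers can be stepped from the same floss.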
# 6) Update the train batcher with the updated model
self.train_batcher.update(self.learning_model.state_dict())
# 7) Print some messages
print(
"At iteration %d, avg (discounted) reward is %f"
% (self.iteration, dt["avg_reward"].item())
)
print(
"\t Avg trajectory length is %f"
% (trajectories.trajectories.lengths.float().mean().item())
)
print(
"\t Curves can be visualized using 'tensorboard --logdir=%s'"
% self.config["logdir"]
)
self.iteration += 1
# 8)---- Evaluation
evaluation_trajectories, n_env_running = self.evaluation_batcher.get(
blocking=False
)
if evaluation_trajectories is not None:  # trajectories are available
assert n_env_running == 0
# Compute the cumulated reward
reward = self.compute_reward(evaluation_trajectories)
cumulated_reward = (
(reward * evaluation_trajectories.trajectories.mask()).sum(1).mean()
)
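# The mask zeroes the padded steps past each episode's end; summing over time
# (dim 1) and averaging over episodes gives the mean cumulated reward reported
# below.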
self.logger.add_scalar(
"evaluation_reward/" + self.config["evaluation_mode"],
cumulated_reward.item(),
self.evaluation_iteration,
)
print(
"-- Iteration ",
self.iteration,
" Evaluation reward = ",
cumulated_reward.item(),
)
# Re-execute the evaluation batcher to start acquiring new trajectories
self.evaluation_batcher.update(self.learning_model.state_dict())
self.evaluation_iteration = self.iteration
n_episodes = (
self.config["n_evaluation_processes"]
* self.config["n_evaluation_envs"]
)
agent_info = DictTensor(
{
"stochastic": torch.tensor(
[self.config["evaluation_mode"] == "stochastic"]
).repeat(n_episodes)
}
)
agent_info.set(
"idx_policy",
torch.randint(self.config["n_policies"], size=(n_episodes,)),
)
self.evaluation_batcher.reset(agent_info=agent_info)
self.evaluation_batcher.execute()
self.train_batcher.close()
self.evaluation_batcher.get() # To wait for the last trajectories
self.evaluation_batcher.close()
self.logger.update_csv() # To save as a CSV file in logdir
self.logger.close()