in rlalgos/sac/sac.py
def run(self):
self.q1 = self._create_q()
self.q2 = self._create_q()
self.target_q1 = self._create_q()
self.target_q2 = self._create_q()
self.target_q1.load_state_dict(self.q1.state_dict())
self.target_q2.load_state_dict(self.q2.state_dict())
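# The target critics start as exact copies of the online critics; they are
# then slowly moved toward them by Polyak averaging (soft_update_params below).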
self.learning_model = self._create_model()
model = copy.deepcopy(self.learning_model)
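# Evaluation batcher: runs full evaluation episodes in separate processes
# with a copy of the policy, refreshed through update() calls.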
self.evaluation_batcher = RL_Batcher(
n_timesteps=self.config["max_episode_steps"],
create_agent=self._create_agent,
create_env=self._create_env,
env_args={
"n_envs": self.config["n_evaluation_envs"],
"max_episode_steps": self.config["max_episode_steps"],
"env_name": self.config["env_name"],
},
agent_args={"action_dim": self.action_dim, "policy": model},
n_processes=self.config["n_evaluation_processes"],
seeds=[
self.config["env_seed"] + k * 10
for k in range(self.config["n_evaluation_processes"])
],
agent_info=DictTensor({"stochastic": torch.tensor([True])}),
env_info=DictTensor({}),
)
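# Training batcher: collects slices of batch_timesteps steps from
# n_envs * n_processes environments; the trajectories feed the replay buffer.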
model = copy.deepcopy(self.learning_model)
self.train_batcher = RL_Batcher(
n_timesteps=self.config["batch_timesteps"],
create_agent=self._create_agent,
create_env=self._create_train_env,
env_args={
"n_envs": self.config["n_envs"],
"max_episode_steps": self.config["max_episode_steps"],
"env_name": self.config["env_name"],
},
agent_args={"action_dim": self.action_dim, "policy": model},
n_processes=self.config["n_processes"],
seeds=[
self.config["env_seed"] + k * 10
for k in range(self.config["n_processes"])
],
agent_info=DictTensor({"stochastic": torch.tensor([True])}),
env_info=DictTensor({}),
)
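# Replay buffer bounded by replay_buffer_size; sample() later returns
# batches of transitions for the critic and policy updates.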
self.replay_buffer = ReplayBuffer(self.config["replay_buffer_size"])
device = torch.device(self.config["learner_device"])
self.learning_model.to(device)
self.q1.to(device)
self.q2.to(device)
self.target_q1.to(device)
self.target_q2.to(device)
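# Three independent Adam optimizers, one for the policy and one per critic,
# all sharing the same learning rate.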
optimizer = torch.optim.Adam(
self.learning_model.parameters(), lr=self.config["lr"]
)
optimizer_q1 = torch.optim.Adam(self.q1.parameters(), lr=self.config["lr"])
optimizer_q2 = torch.optim.Adam(self.q2.parameters(), lr=self.config["lr"])
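# Send an initial CPU copy of the policy weights to the batcher workers.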
self.train_batcher.update(
self._state_dict(self.learning_model, torch.device("cpu"))
)
self.evaluation_batcher.update(
self._state_dict(self.learning_model, torch.device("cpu"))
)
n_episodes = self.config["n_envs"] * self.config["n_processes"]
# All training episodes sample actions stochastically (exploration policy).
self.train_batcher.reset(
agent_info=DictTensor({"stochastic": torch.tensor([True]).repeat(n_episodes)})
)
logging.info("Sampling initial transitions")
n_iterations = int(
self.config["n_starting_transitions"]
/ (n_episodes * self.config["batch_timesteps"])
)
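# Each batcher call returns n_episodes trajectory slices of batch_timesteps
# steps, so this warm-up loop gathers roughly n_starting_transitions
# transitions before learning starts.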
for _ in range(n_iterations):
self.train_batcher.execute()
trajectories, n = self.train_batcher.get()
self.replay_buffer.push(trajectories)
print("replay_buffer_size = ", self.replay_buffer.size())
n_episodes = (
self.config["n_evaluation_envs"] * self.config["n_evaluation_processes"]
)
stochastic = torch.tensor(
[self.config["evaluation_mode"] == "stochastic"]
).repeat(n_episodes)
self.evaluation_batcher.reset(agent_info=DictTensor({"stochastic": stochastic}))
self.evaluation_batcher.execute()
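# Evaluation runs asynchronously; results are fetched in the learning loop
# with get(blocking=False).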
self.evaluation_iteration = 0
logging.info("Starting Learning")
_start_time = time.time()
logging.info("Learning")
self.iteration = 0
while time.time() - _start_time < self.config["time_limit"]:
self.train_batcher.execute()
trajectories, n = self.train_batcher.get()
self.replay_buffer.push(trajectories)
self.logger.add_scalar(
"replay_buffer_size", self.replay_buffer.size(), self.iteration
)
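# For this freshly collected batch, run n_batches_per_epochs gradient steps
# on the critics and the actor, sampling a fresh minibatch from the replay
# buffer for each step.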
for k in range(self.config["n_batches_per_epochs"]):
transitions = self.replay_buffer.sample(n=self.config["size_batches"])
dt, transitions = self.get_q_loss(transitions, device)
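# get_q_loss is expected to return both critic losses. A minimal sketch of
# the usual SAC target, with illustrative names (the actual implementation
# may differ):
#   with torch.no_grad():
#       a2, log_p2 = policy.sample(s2)
#       y = r + gamma * (1 - done) * (
#           torch.min(target_q1(s2, a2), target_q2(s2, a2)) - alpha * log_p2
#       )
#   q1_loss = F.mse_loss(q1(s, a), y)
#   q2_loss = F.mse_loss(q2(s, a), y)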
for key in dt.keys():
    self.logger.add_scalar(key, dt[key].item(), self.iteration)
optimizer_q1.zero_grad()
dt["q1_loss"].backward()
optimizer_q1.step()
optimizer_q2.zero_grad()
dt["q2_loss"].backward()
optimizer_q2.step()
optimizer.zero_grad()
dt = self.get_policy_loss(transitions)
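# get_policy_loss is expected to return the actor loss. Sketch of the usual
# SAC objective, with illustrative names (the actual implementation may
# differ):
#   a_new, log_p = policy.rsample(s)
#   policy_loss = (alpha * log_p - torch.min(q1(s, a_new), q2(s, a_new))).mean()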
for key in dt.keys():
    self.logger.add_scalar(key, dt[key].item(), self.iteration)
dt["policy_loss"].backward()
optimizer.step()
tau = self.config["tau"]
self.soft_update_params(self.q1, self.target_q1, tau)
self.soft_update_params(self.q2, self.target_q2, tau)
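# soft_update_params presumably performs the standard Polyak update, roughly:
#   for p, tp in zip(net.parameters(), target_net.parameters()):
#       tp.data.copy_(tau * p.data + (1.0 - tau) * tp.data)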
self.iteration += 1
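# Refresh the rollout workers with the updated policy weights (CPU copy)
# before the next collection step.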
self.train_batcher.update(
self._state_dict(self.learning_model, torch.device("cpu"))
)
evaluation_trajectories, n = self.evaluation_batcher.get(blocking=False)
if evaluation_trajectories is not None:  # trajectories are available
# Compute the cumulated reward: per-step rewards are masked so that padding
# after episode termination does not count, summed over time, then averaged
# over the evaluation episodes.
cumulated_reward = (
(
evaluation_trajectories.trajectories["_observation/reward"]
* evaluation_trajectories.trajectories.mask()
)
.sum(1)
.mean()
)
self.logger.add_scalar(
"evaluation_reward",
cumulated_reward.item(),
self.evaluation_iteration,
)
print(
"At iteration %d, reward is %f"
% (self.evaluation_iteration, cumulated_reward.item())
)
# Relaunch the evaluation batcher with the latest policy weights (same
# number of episodes); these evaluation episodes use the deterministic
# policy (stochastic=False).
self.evaluation_batcher.update(
self._state_dict(self.learning_model, torch.device("cpu"))
)
self.evaluation_iteration = self.iteration
n_episodes = (
self.config["n_evaluation_processes"]
* self.config["n_evaluation_envs"]
)
agent_info = DictTensor(
{"stochastic": torch.tensor([False]).repeat(n_episodes)}
)
self.evaluation_batcher.reset(agent_info=agent_info)
self.evaluation_batcher.execute()