ss_baselines/av_nav/ppo/ppo_trainer.py
def train(self) -> None:
r"""Main method for training PPO.
Returns:
None
"""
logger.info(f"config: {self.config}")
random.seed(self.config.SEED)
np.random.seed(self.config.SEED)
torch.manual_seed(self.config.SEED)
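# create the vectorized training environments specified by ENV_NAME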
self.envs = construct_envs(
self.config, get_env_class(self.config.ENV_NAME)
)
ppo_cfg = self.config.RL.PPO
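# train on the configured CUDA device if available, otherwise fall back to CPU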
self.device = (
torch.device("cuda", self.config.TORCH_GPU_ID)
if torch.cuda.is_available()
else torch.device("cpu")
)
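# make sure the checkpoint directory exists before training starts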
if not os.path.isdir(self.config.CHECKPOINT_FOLDER):
os.makedirs(self.config.CHECKPOINT_FOLDER)
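# build the actor-critic policy and wrap it in the PPO agent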
self._setup_actor_critic_agent(ppo_cfg)
logger.info(
"agent number of parameters: {}".format(
sum(param.numel() for param in self.agent.parameters())
)
)
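# rollout buffer holding num_steps transitions for every parallel environment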
rollouts = RolloutStorage(
ppo_cfg.num_steps,
self.envs.num_envs,
self.envs.observation_spaces[0],
self.envs.action_spaces[0],
ppo_cfg.hidden_size,
)
rollouts.to(self.device)
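# fill the first slot of the rollout buffer with the initial observations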
observations = self.envs.reset()
batch = batch_obs(observations)
for sensor in rollouts.observations:
rollouts.observations[sensor][0].copy_(batch[sensor])
# batch and observations may contain shared PyTorch CUDA
# tensors. We must explicitly clear them here, otherwise
# they will be kept in memory for the entire duration of training!
batch = None
observations = None
# episode_rewards, episode_spls, episode_steps, and episode_counts accumulate over the entire training run
episode_rewards = torch.zeros(self.envs.num_envs, 1)
episode_spls = torch.zeros(self.envs.num_envs, 1)
episode_steps = torch.zeros(self.envs.num_envs, 1)
episode_counts = torch.zeros(self.envs.num_envs, 1)
current_episode_reward = torch.zeros(self.envs.num_envs, 1)
current_episode_step = torch.zeros(self.envs.num_envs, 1)
window_episode_reward = deque(maxlen=ppo_cfg.reward_window_size)
window_episode_spl = deque(maxlen=ppo_cfg.reward_window_size)
window_episode_step = deque(maxlen=ppo_cfg.reward_window_size)
window_episode_counts = deque(maxlen=ppo_cfg.reward_window_size)
t_start = time.time()
env_time = 0
pth_time = 0
count_steps = 0
count_checkpoints = 0
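# linearly decay the learning rate over NUM_UPDATES (applied only when use_linear_lr_decay is set)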
lr_scheduler = LambdaLR(
optimizer=self.agent.optimizer,
lr_lambda=lambda x: linear_decay(x, self.config.NUM_UPDATES),
)
with TensorboardWriter(
self.config.TENSORBOARD_DIR, flush_secs=self.flush_secs
) as writer:
for update in range(self.config.NUM_UPDATES):
if ppo_cfg.use_linear_lr_decay:
lr_scheduler.step()
if ppo_cfg.use_linear_clip_decay:
self.agent.clip_param = ppo_cfg.clip_param * linear_decay(
update, self.config.NUM_UPDATES
)
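# collect num_steps transitions from every environment before each PPO update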
for step in range(ppo_cfg.num_steps):
delta_pth_time, delta_env_time, delta_steps = self._collect_rollout_step(
rollouts,
current_episode_reward,
current_episode_step,
episode_rewards,
episode_spls,
episode_counts,
episode_steps
)
pth_time += delta_pth_time
env_time += delta_env_time
count_steps += delta_steps
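# update the agent with PPO on the freshly collected rollout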
delta_pth_time, value_loss, action_loss, dist_entropy = self._update_agent(
ppo_cfg, rollouts
)
pth_time += delta_pth_time
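# snapshot the cumulative episode statistics for windowed averaging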
window_episode_reward.append(episode_rewards.clone())
window_episode_spl.append(episode_spls.clone())
window_episode_step.append(episode_steps.clone())
window_episode_counts.append(episode_counts.clone())
losses = [value_loss, action_loss, dist_entropy]
stats = zip(
["count", "reward", "step", 'spl'],
[window_episode_counts, window_episode_reward, window_episode_step, window_episode_spl],
)
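# change of each cumulative statistic over the window (falls back to the single snapshot early in training)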
deltas = {
k: (
(v[-1] - v[0]).sum().item()
if len(v) > 1
else v[0].sum().item()
)
for k, v in stats
}
deltas["count"] = max(deltas["count"], 1.0)
# this reward is averaged over all the episodes that finished during the last reward_window_size updates;
# the approximate number of steps covered is reward_window_size * num_steps
if update % 10 == 0:
writer.add_scalar("Environment/Reward", deltas["reward"] / deltas["count"], count_steps)
writer.add_scalar("Environment/SPL", deltas["spl"] / deltas["count"], count_steps)
writer.add_scalar("Environment/Episode_length", deltas["step"] / deltas["count"], count_steps)
writer.add_scalar("Policy/Value_Loss", value_loss, count_steps)
writer.add_scalar("Policy/Action_Loss", action_loss, count_steps)
writer.add_scalar("Policy/Entropy", dist_entropy, count_steps)
writer.add_scalar("Policy/Learning_Rate", lr_scheduler.get_lr()[0], count_steps)
# log stats
if update > 0 and update % self.config.LOG_INTERVAL == 0:
logger.info(
"update: {}\tfps: {:.3f}\t".format(
update, count_steps / (time.time() - t_start)
)
)
logger.info(
"update: {}\tenv-time: {:.3f}s\tpth-time: {:.3f}s\t"
"frames: {}".format(
update, env_time, pth_time, count_steps
)
)
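# report the windowed average reward only when at least one episode finished in the window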
window_rewards = (
window_episode_reward[-1] - window_episode_reward[0]
).sum()
window_counts = (
window_episode_counts[-1] - window_episode_counts[0]
).sum()
if window_counts > 0:
logger.info(
"Average window size {} reward: {:3f}".format(
len(window_episode_reward),
(window_rewards / window_counts).item(),
)
)
else:
logger.info("No episodes finish in current window")
# checkpoint model
if update % self.config.CHECKPOINT_INTERVAL == 0:
self.save_checkpoint(f"ckpt.{count_checkpoints}.pth")
count_checkpoints += 1
self.envs.close()