in ss_baselines/av_wan/ppo/ppo_trainer.py
def train(self) -> None:
    r"""Main method for training PPO.

    Returns:
        None
    """
    global lr_lambda

    logger.info(f"config: {self.config}")
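
    # seed Python, NumPy and PyTorch RNGs so runs are reproducible for a given SEED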
    random.seed(self.config.SEED)
    np.random.seed(self.config.SEED)
    torch.manual_seed(self.config.SEED)
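
    # build the vectorized training environments; with auto_reset_done=False,
    # finished episodes are not reset automatically by the env wrapper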
    self.envs = construct_envs(
        self.config, get_env_class(self.config.ENV_NAME), auto_reset_done=False
    )

    ppo_cfg = self.config.RL.PPO
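
    # train on the configured GPU if CUDA is available, otherwise fall back to CPU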
    self.device = (
        torch.device("cuda", self.config.TORCH_GPU_ID)
        if torch.cuda.is_available()
        else torch.device("cpu")
    )
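
    # make sure the checkpoint directory exists before any model is saved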
    if not os.path.isdir(self.config.CHECKPOINT_FOLDER):
        os.makedirs(self.config.CHECKPOINT_FOLDER)
    self._setup_actor_critic_agent(ppo_cfg)
    logger.info(
        "agent number of parameters: {}".format(
            sum(param.numel() for param in self.agent.parameters())
        )
    )
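
    # rollout buffer: stores num_steps of transitions for every parallel environment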
    rollouts = RolloutStorage(
        ppo_cfg.num_steps,
        self.envs.num_envs,
        self.envs.observation_spaces[0],
        self.envs.action_spaces[0],
        ppo_cfg.hidden_size
    )
    rollouts.to(self.device)
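
    # reset all environments and copy the initial observations into step 0 of the rollout buffer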
    observations = self.envs.reset()
    batch = batch_obs(observations)

    for sensor in rollouts.observations:
        rollouts.observations[sensor][0].copy_(batch[sensor])

    # batch and observations may contain shared PyTorch CUDA
    # tensors. We must explicitly clear them here otherwise
    # they will be kept in memory for the entire duration of training!
    batch = None
    observations = None

    # episode_rewards and episode_counts accumulate over the entire training run
    episode_rewards = torch.zeros(self.envs.num_envs, 1)
    episode_spls = torch.zeros(self.envs.num_envs, 1)
    episode_steps = torch.zeros(self.envs.num_envs, 1)
    episode_counts = torch.zeros(self.envs.num_envs, 1)
    episode_distances = torch.zeros(self.envs.num_envs, 1)
    current_episode_reward = torch.zeros(self.envs.num_envs, 1)
    current_episode_step = torch.zeros(self.envs.num_envs, 1)
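
    # sliding windows over the most recent reward_window_size updates, used for smoothed logging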
    window_episode_reward = deque(maxlen=ppo_cfg.reward_window_size)
    window_episode_spl = deque(maxlen=ppo_cfg.reward_window_size)
    window_episode_step = deque(maxlen=ppo_cfg.reward_window_size)
    window_episode_counts = deque(maxlen=ppo_cfg.reward_window_size)
    window_episode_distances = deque(maxlen=ppo_cfg.reward_window_size)
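
    # bookkeeping: wall-clock timers and step/checkpoint counters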
    t_start = time.time()
    env_time = 0
    pth_time = 0
    count_steps = 0
    count_checkpoints = 0
    start_update = 0
    prev_time = 0
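
    # select the learning-rate schedule: linear decay, exponential decay, or constant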
    if ppo_cfg.use_linear_lr_decay:
        def lr_lambda(x):
            return linear_decay(x, self.config.NUM_UPDATES)
    elif ppo_cfg.use_exponential_lr_decay:
        def lr_lambda(x):
            return exponential_decay(x, self.config.NUM_UPDATES, ppo_cfg.exp_decay_lambda)
    else:
        def lr_lambda(x):
            return 1

    lr_scheduler = LambdaLR(
        optimizer=self.agent.optimizer,
        lr_lambda=lr_lambda
    )

    with TensorboardWriter(
        self.config.TENSORBOARD_DIR, flush_secs=self.flush_secs
    ) as writer:
        for update in range(start_update, self.config.NUM_UPDATES):
            if ppo_cfg.use_linear_lr_decay or ppo_cfg.use_exponential_lr_decay:
                lr_scheduler.step()

            if ppo_cfg.use_linear_clip_decay:
                self.agent.clip_param = ppo_cfg.clip_param * linear_decay(
                    update, self.config.NUM_UPDATES
                )
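
            # collect ppo_cfg.num_steps of experience from every environment into the rollout buffer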
            for step in range(ppo_cfg.num_steps):
                delta_pth_time, delta_env_time, delta_steps = self._collect_rollout_step(
                    rollouts,
                    current_episode_reward,
                    current_episode_step,
                    episode_rewards,
                    episode_spls,
                    episode_counts,
                    episode_steps,
                    episode_distances
                )
                pth_time += delta_pth_time
                env_time += delta_env_time
                count_steps += delta_steps
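
            # one PPO optimization phase over the freshly collected rollouts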
            delta_pth_time, value_loss, action_loss, dist_entropy = self._update_agent(
                ppo_cfg, rollouts
            )
            pth_time += delta_pth_time

            window_episode_reward.append(episode_rewards.clone())
            window_episode_spl.append(episode_spls.clone())
            window_episode_step.append(episode_steps.clone())
            window_episode_counts.append(episode_counts.clone())
            window_episode_distances.append(episode_distances.clone())

            losses = [value_loss, action_loss, dist_entropy]
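
            # per-window deltas of the accumulated episode statistics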
            stats = zip(
                ["count", "reward", "step", 'spl', 'distance'],
                [window_episode_counts, window_episode_reward, window_episode_step,
                 window_episode_spl, window_episode_distances],
            )
            deltas = {
                k: (
                    (v[-1] - v[0]).sum().item()
                    if len(v) > 1
                    else v[0].sum().item()
                )
                for k, v in stats
            }
            deltas["count"] = max(deltas["count"], 1.0)

            # this reward is averaged over all episodes that finished during the last
            # window_size updates, i.e. roughly window_size * num_steps environment steps
            writer.add_scalar(
                "Environment/Reward", deltas["reward"] / deltas["count"], count_steps
            )
            writer.add_scalar(
                "Environment/SPL", deltas["spl"] / deltas["count"], count_steps
            )
            logging.debug('Number of steps: {}'.format(deltas["step"] / deltas["count"]))
            writer.add_scalar(
                "Environment/Episode_length", deltas["step"] / deltas["count"], count_steps
            )
            writer.add_scalar(
                "Environment/Distance_to_goal", deltas["distance"] / deltas["count"], count_steps
            )

            # writer.add_scalars(
            #     "losses",
            #     {k: l for l, k in zip(losses, ["value", "policy"])},
            #     count_steps,
            # )
            writer.add_scalar(
                'Policy/Value_Loss', value_loss, count_steps
            )
            writer.add_scalar(
                'Policy/Action_Loss', action_loss, count_steps
            )
            writer.add_scalar(
                'Policy/Entropy', dist_entropy, count_steps
            )
            writer.add_scalar(
                'Policy/Learning_Rate', lr_scheduler.get_lr()[0], count_steps
            )

            # log stats
            if update > 0 and update % self.config.LOG_INTERVAL == 0:
                logger.info(
                    "update: {}\tfps: {:.3f}\t".format(
                        update, count_steps / ((time.time() - t_start) + prev_time)
                    )
                )
                logger.info(
                    "update: {}\tenv-time: {:.3f}s\tpth-time: {:.3f}s\t"
                    "frames: {}".format(
                        update, env_time, pth_time, count_steps
                    )
                )

                window_rewards = (
                    window_episode_reward[-1] - window_episode_reward[0]
                ).sum()
                window_counts = (
                    window_episode_counts[-1] - window_episode_counts[0]
                ).sum()

                if window_counts > 0:
                    logger.info(
                        "Average window size {} reward: {:.3f}".format(
                            len(window_episode_reward),
                            (window_rewards / window_counts).item(),
                        )
                    )
                else:
                    logger.info("No episodes finished in the current window")

            # checkpoint model
            if update % self.config.CHECKPOINT_INTERVAL == 0:
                self.save_checkpoint(f"ckpt.{count_checkpoints}.pth")
                count_checkpoints += 1

    self.envs.close()