train_exploration.py [280:340]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
            # Blend the task reward with the exploration reward via a convex
            # combination weighted by args.reward_scale
            overall_reward = (
                reward * (1 - args.reward_scale)
                + reward_exploration * args.reward_scale
            )

            # Track the scaled exploration reward accrued this episode
            episode_expl_rewards += reward_exploration.numpy() * args.reward_scale

            # Update rollouts_policy
            rollouts_policy.insert(
                obs_im,
                obs_sm,
                obs_lm,
                recurrent_hidden_states,
                action,
                action_log_probs,
                value,
                overall_reward,
                masks,
                obs_collns,
            )

            # Cache the latest collision flag and action; they feed the policy's
            # action / collision embeddings on the next step
            prev_collision = obs_collns
            prev_action = action
            episode_collisions += obs_collns.cpu().numpy()

            # Update the RL policy once every num_rl_steps environment steps
            if (step + 1) % args.num_rl_steps == 0:
                # Estimate the value of the final observation to bootstrap returns
                with torch.no_grad():
                    encoder_inputs = [rollouts_policy.obs_im[-1]]
                    if args.encoder_type == "rgb+map":
                        encoder_inputs.append(rollouts_policy.obs_sm[-1])
                        encoder_inputs.append(rollouts_policy.obs_lm[-1])
                    obs_feats = encoder(*encoder_inputs)
                    policy_inputs = {"features": obs_feats}
                    if args.use_action_embedding:
                        policy_inputs["actions"] = prev_action.long()
                    if args.use_collision_embedding:
                        policy_inputs["collisions"] = prev_collision.long()
                    next_value = actor_critic.get_value(
                        policy_inputs,
                        rollouts_policy.recurrent_hidden_states[-1],
                        rollouts_policy.masks[-1],
                    ).detach()
                # Compute returns
                rollouts_policy.compute_returns(
                    next_value, args.use_gae, args.gamma, args.tau
                )
                # Switch to train mode for the gradient update
                encoder.train()
                actor_critic.train()
                # Run the RL update on the collected rollout
                rl_losses = rl_agent.update(rollouts_policy)
                # Refresh rollouts for the next set of steps
                rollouts_policy.after_update()
                # Return to eval mode for rollout collection
                encoder.eval()
                actor_critic.eval()

        # =================== Save model ====================
        if (j + 1) % args.save_interval == 0 and args.save_dir != "":
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
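
The update above bootstraps the return computation with next_value, the critic's estimate for the last observation in the rollout, and then calls rollouts_policy.compute_returns(next_value, args.use_gae, args.gamma, args.tau). The rollout storage itself is not part of this excerpt; the sketch below is a minimal, assumed illustration of the GAE recursion those arguments drive. compute_gae_returns is a hypothetical standalone helper, not the actual method on rollouts_policy, and the tensor shapes are assumptions.

import torch

def compute_gae_returns(rewards, values, masks, next_value, gamma, tau):
    # Assumed shapes: rewards, values -> (T, N, 1); masks -> (T + 1, N, 1),
    # where masks[t + 1] == 0 if the episode ended at step t; next_value -> (N, 1).
    T = rewards.shape[0]
    returns = torch.zeros_like(rewards)
    gae = torch.zeros_like(next_value)
    for t in reversed(range(T)):
        value_next = next_value if t == T - 1 else values[t + 1]
        # One-step TD residual; the mask blocks bootstrapping across episode ends
        delta = rewards[t] + gamma * value_next * masks[t + 1] - values[t]
        gae = delta + gamma * tau * masks[t + 1] * gae
        returns[t] = gae + values[t]
    return returns

When use_gae is disabled, rollout storages of this kind typically fall back to plain discounted returns; only the GAE path is sketched here.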



train_reconstruction_exploration.py [414:475]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
            # Blend the task reward with the exploration reward via a convex
            # combination weighted by args.reward_scale
            overall_reward = (
                reward * (1 - args.reward_scale)
                + reward_exploration * args.reward_scale
            )

            # Track the scaled exploration reward accrued this episode
            episode_expl_rewards += reward_exploration.numpy() * args.reward_scale

            # Update rollouts_policy
            rollouts_policy.insert(
                obs_im,
                obs_sm,
                obs_lm,
                recurrent_hidden_states,
                action,
                action_log_probs,
                value,
                overall_reward,
                masks,
                obs_collns,
            )

            # Cache the latest collision flag and action; they feed the policy's
            # action / collision embeddings on the next step
            prev_collision = obs_collns
            prev_action = action
            episode_collisions += obs_collns.cpu().numpy()

            # Update the RL policy once every num_rl_steps environment steps
            if (step + 1) % args.num_rl_steps == 0:
                # Estimate the value of the final observation to bootstrap returns
                with torch.no_grad():
                    encoder_inputs = [rollouts_policy.obs_im[-1]]
                    if args.encoder_type == "rgb+map":
                        encoder_inputs.append(rollouts_policy.obs_sm[-1])
                        encoder_inputs.append(rollouts_policy.obs_lm[-1])
                    obs_feats = encoder(*encoder_inputs)
                    policy_inputs = {"features": obs_feats}
                    if args.use_action_embedding:
                        policy_inputs["actions"] = prev_action.long()
                    if args.use_collision_embedding:
                        policy_inputs["collisions"] = prev_collision.long()
                    next_value = actor_critic.get_value(
                        policy_inputs,
                        rollouts_policy.recurrent_hidden_states[-1],
                        rollouts_policy.masks[-1],
                    ).detach()
                # Compute returns
                rollouts_policy.compute_returns(
                    next_value, args.use_gae, args.gamma, args.tau
                )

                # Switch to train mode for the gradient update
                encoder.train()
                actor_critic.train()
                # Run the RL update on the collected rollout
                rl_losses = rl_agent.update(rollouts_policy)
                # Refresh rollouts_policy for the next set of steps
                rollouts_policy.after_update()
                # Return to eval mode for rollout collection
                encoder.eval()
                actor_critic.eval()

        # =================== Save model ====================
        if (j + 1) % args.save_interval == 0 and args.save_dir != "":
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
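
In both training scripts, overall_reward interpolates between the task reward and the exploration reward, with args.reward_scale as the mixing weight. The sketch below restates that convex combination as a standalone helper; blend_rewards is a hypothetical name for illustration, as the loops compute it inline.

import torch

def blend_rewards(task_reward, exploration_reward, reward_scale):
    # Convex combination: reward_scale = 0.0 keeps only the task reward,
    # reward_scale = 1.0 keeps only the exploration reward
    return task_reward * (1.0 - reward_scale) + exploration_reward * reward_scale

# Example: reward_scale = 0.25 leaves 75% of the weight on the task reward
task_r = torch.tensor([[1.0], [0.0]])
expl_r = torch.tensor([[0.2], [0.8]])
print(blend_rewards(task_r, expl_r, 0.25))  # tensor([[0.8000], [0.2000]])

The same scale also multiplies the exploration reward accumulated into episode_expl_rewards, so the logged statistic reflects the exploration term actually seen by the agent.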



