in Experiments/PolicyManagers.py [0:0]
def rollout_latent_policy(self, orig_assembled_inputs, orig_subpolicy_inputs):
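# Greedy rollout of the latent policy: at every timestep, query the latent policy for (b, z),
# write them into the assembled / subpolicy inputs, step the subpolicy to get the next
# state-action pair, and finally return the selected b sequence.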
assembled_inputs = orig_assembled_inputs.clone().detach()
subpolicy_inputs = orig_subpolicy_inputs.clone().detach()
# Set delta_t, the number of timesteps since b was last 1, to 0.
delta_t = 0
# For number of rollout timesteps:
for t in range(self.rollout_timesteps-1):
##########################################
#### CODE FOR NEW Z SELECTION ROLLOUT ####
##########################################
# Pick latent_z and latent_b.
selected_b, new_selected_z = self.latent_policy.get_actions(assembled_inputs[:(t+1)].view((t+1,-1)), greedy=True, delta_t=delta_t)
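# At the very first timestep, force b to 1 so that a latent z is always selected at the start of the rollout.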
if t==0:
selected_b = torch.ones_like(selected_b).to(device).float()
if selected_b[-1]==1:
# Copy over ALL z's. This is okay because we're selecting greedily, so the latent policy is deterministic.
selected_z = torch.tensor(new_selected_z).to(device).float()
# Since b was 1, reset delta_t (time since the last b==1) to 0.
delta_t = 0
else:
# Increment counter since last time b was 1.
delta_t += 1
# Set z's to 0.
assembled_inputs[t+1, self.input_size:self.input_size+self.number_policies] = 0.
# Set z and b in assembled input for the future latent policy passes.
if self.args.discrete_z:
assembled_inputs[t+1, self.input_size+selected_z[-1]] = 1.
else:
assembled_inputs[t+1, self.input_size:self.input_size+self.latent_z_dimensionality] = selected_z[-1]
# Write selected_b into the b slot of the assembled input, just past the z block.
assembled_inputs[t+1, self.input_size+self.latent_z_dimensionality] = selected_b[-1]
# Before copying over, set conditional_info from the environment at the current timestep.
if self.conditional_viz_env:
self.set_env_conditional_info()
if self.conditional_info_size>0:
assembled_inputs[t+1, -self.conditional_info_size:] = torch.tensor(self.conditional_information).to(device).float()
# Set z's to 0.
subpolicy_inputs[t, self.input_size:self.input_size+self.number_policies] = 0.
# Set z and b in subpolicy input for the future subpolicy passes.
if self.args.discrete_z:
subpolicy_inputs[t, self.input_size+selected_z[-1]] = 1.
else:
subpolicy_inputs[t, self.input_size:] = selected_z[-1]
# Now pass subpolicy net forward and get action and next state.
action_to_execute, new_state = self.take_rollout_step(subpolicy_inputs[:(t+1)].view((t+1,-1)), t, use_env=self.conditional_viz_env)
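# Concatenate the new state and the executed action into the state-action tuple for the next timestep.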
state_action_tuple = torch.cat([new_state, action_to_execute],dim=1)
# Now update assembled input.
assembled_inputs[t+1, :self.input_size] = state_action_tuple
subpolicy_inputs[t+1, :self.input_size] = state_action_tuple
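# After the rollout loop, store the state portion of the rolled-out trajectory.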
self.latent_trajectory_rollout = copy.deepcopy(subpolicy_inputs[:,:self.state_dim].detach().cpu().numpy())
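# Append a trailing 0 to the greedily selected b sequence before returning it.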
concatenated_selected_b = np.concatenate([selected_b.detach().cpu().numpy(),np.zeros((1))],axis=-1)
if self.args.debug:
print("Embedding in Latent Policy Rollout.")
embed()
# Clear these variables from memory.
del subpolicy_inputs, assembled_inputs
return concatenated_selected_b
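###################################################
#### ILLUSTRATIVE SKETCH (NOT IN ORIGINAL FILE) ####
###################################################
# A minimal, self-contained sketch of the delta_t bookkeeping used in the rollout
# loop above, written in plain Python so it can be tested without the policy
# networks. The helper name _toy_delta_t_trace and the example b sequence are
# hypothetical, not part of the repository.
def _toy_delta_t_trace(b_sequence):
    # Return the delta_t value that would be passed to the latent policy at
    # each timestep: reset to 0 whenever b==1, otherwise incremented.
    trace = []
    delta_t = 0
    for b in b_sequence:
        trace.append(delta_t)
        if b == 1:
            delta_t = 0
        else:
            delta_t += 1
    return trace

# Example: _toy_delta_t_trace([1, 0, 0, 1, 0]) returns [0, 0, 1, 2, 0].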