in DownstreamRL/TrainZPolicyRL.py [0:0]
def run_episode(self, counter):
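"""Roll out one episode with the Z-policy and, if training, update it from the collected rewards."""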
# For each epoch:
# # 1) Given a start and goal (for a reaching task, say).
# # 2) Run Z_Policy on the start and goal to retrieve predicted Zs.
# # 3) Decode the predicted Zs into a trajectory.
# # 4) Retrieve "actions" from the trajectory.
# # 5) Feed the "actions" into the RL environment and collect reward.
# # 6) Train the ZPolicy to maximize cumulative reward with your favorite RL algorithm.
# Reset environment.
state = self.environment.reset()
terminal = False
reward_traj = None
state_traj_torch = None
t_out = 0
stop = False
hidden = None
latent_z_seq = None
stop_prob_seq = None
stop_seq = None
log_prob_seq = None
kld_loss_seq = 0.
previous_state = None
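# The buffers above are filled once per executed skill segment: Z-policy inputs, predicted latent zs, sampled stop flags and stop probabilities, log probabilities, and per-segment rewards.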
while not terminal and not stop:
########################################################
######## 1) Collect input for first timestep. ##########
########################################################
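# Build the Z-policy observation by concatenating the robot and object state vectors of the current environment observation.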
zpolicy_input = np.concatenate([state['robot-state'],state['object-state']]).reshape(1,self.zpolicy_input_size)
########################################################
# 2) Feed into the Z policy to retrieve the predicted Z.
########################################################
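# The recurrent Z-policy returns a sampled latent z, a stop probability and sampled stop flag, the log-probability of the sample (used later in update_networks), a KL term, and its updated hidden state.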
latent_z, stop_probability, stop, log_prob, kld_loss, hidden = self.z_policy.forward(zpolicy_input, hidden=hidden)
latent_z = latent_z.squeeze(1)
########################################################
############## 3) Decode into trajectory. ##############
########################################################
primitive_and_skill_stop_prob = self.evaluator.model.primitive_decoder(latent_z)
traj_seg = primitive_and_skill_stop_prob[0].squeeze(1).detach().cpu().numpy()
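# traj_seg is the decoded state-space trajectory for this skill: one row per timestep, each of dimension self.opts.n_state.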
if previous_state is None:
previous_state = traj_seg[-1].reshape(1,self.opts.n_state)
else:
# Prepend the last state of the previous segment, so that differencing also yields the action that carries us from the previous segment into this one.
traj_seg = np.concatenate([previous_state,traj_seg],axis=0)
previous_state = traj_seg[-1].reshape(-1,self.opts.n_state)
########################################################
## 4) Finite diff along time axis to retrieve actions ##
########################################################
actions = np.diff(traj_seg,axis=0)
actions = self.reorder_actions(actions)
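# reorder_actions (defined elsewhere in this class) presumably remaps the state-difference dimensions into the ordering the environment's action space expects.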
actions_torch = torch.tensor(actions).cuda().float()
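# actions_torch is only used for its length below; the environment itself consumes the numpy actions.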
cummulative_reward_in_segment = 0.
# Step through the environment for all actions in this segment.
t = 0
while t<actions_torch.shape[0] and terminal==False:
# Step.
state, onestep_reward, terminal, success = self.environment.step(actions[t])
# Collect onestep_rewards within this segment.
cummulative_reward_in_segment += float(onestep_reward)
# Assuming we have fixed_ns (i.e. novariable_ns), we can use the fixed decoding length of primitives to assign cumulative reward-to-go values to the various predicted Z variables.
# (This is also why we need the reward history, and not just the cumulative reward obtained over the course of training.)
t+=1
# On the first segment the sequence buffers are still None, so initialize them here;
# on subsequent segments, append to the existing buffers.
if t_out==0:
state_traj_torch = torch.tensor(zpolicy_input).cuda().float().view(-1,self.zpolicy_input_size)
latent_z_seq = latent_z.view(-1,self.opts.nz)
stop_seq = stop.clone().detach().view(-1,1)
stop_prob_seq = stop_probability.view(-1,2)
log_prob_seq = log_prob.view(-1,1)
# reward_traj = torch.tensor(copy.deepcopy(cummulative_reward_in_segment)).cuda().float().view(-1,1)
reward_traj = np.array(cummulative_reward_in_segment).reshape((1,1))
else:
state_traj_torch = torch.cat([state_traj_torch, torch.tensor(zpolicy_input).cuda().float().view(-1,self.zpolicy_input_size)],dim=0)
latent_z_seq = torch.cat([latent_z_seq, latent_z.view(-1,self.opts.nz)], dim=0)
stop_seq = torch.cat([stop_seq, stop.view(-1,1)], dim=0)
stop_prob_seq = torch.cat([stop_prob_seq, stop_probability.view(-1,2)], dim=0)
log_prob_seq = torch.cat([log_prob_seq, log_prob.view(-1,1)], dim=0)
# reward_traj = torch.cat([reward_traj.view(-1,1), torch.tensor(copy.deepcopy(cummulative_reward_in_segment)).cuda().float().view(-1,1)])
reward_traj = np.concatenate([reward_traj, np.array(cummulative_reward_in_segment).reshape((1,1))], axis=0)
# Either way:
kld_loss_seq += kld_loss
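# Accumulate the KL term across segments; the running total is passed to update_networks at the end of the episode.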
t_out += 1
# print(t_out)
# If we are not using a variable number of segments, ignore the predicted stop and only stop once the maximum number of skills has been executed.
if self.opts.variable_nseg==False:
stop = False
if t_out>=self.maximum_skills:
stop = True
# if self.opts.debug==True:
# embed()
if self.opts.train:
# 6) Feed the collected states, rewards, predicted Zs, log probabilities, stop statistics, and KL term to the network update.
# self.update_networks(state_traj_torch, action_torch, reward_traj, latent_zs)
self.update_networks(state_traj_torch, reward_traj, latent_z_seq, log_prob_seq, stop_prob_seq, stop_seq, kld_loss_seq)
self.update_plots(counter)
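# A minimal sketch of the assumed outer loop that drives this method (the attribute self.opts.number_epochs is illustrative, not taken from this file):
#
# for counter in range(self.opts.number_epochs):
#     self.run_episode(counter)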