def run_episode()

in DownstreamRL/TrainZPolicyRL.py [0:0]


	def run_episode(self, counter):

		# For a number of epochs:
		#   1) Given a start and goal (for a reaching task, say).
		#   2) Run the Z policy on the start and goal to retrieve predicted Zs.
		#   3) Decode the predicted Zs into a trajectory.
		#   4) Retrieve "actions" from the trajectory.
		#   5) Feed the "actions" into the RL environment and collect reward.
		#   6) Train the Z policy to maximize cumulative reward with your favorite RL algorithm.
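		# Steps 1-5 happen once per predicted skill segment inside the while-loop below;
		# step 6 happens once per episode via self.update_networks() at the end (when self.opts.train is set).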

		# Reset environment. 
		state = self.environment.reset()
		terminal = False
		reward_traj = None		
		state_traj_torch = None
		t_out = 0
		stop = False
		hidden = None
		latent_z_seq = None
		stop_prob_seq = None
		stop_seq = None
		log_prob_seq = None
		kld_loss_seq = 0.
		previous_state = None

		while terminal==False and stop==False:
			
			########################################################
			###### 1) Collect input for the current timestep. ######
			########################################################
			zpolicy_input = np.concatenate([state['robot-state'],state['object-state']]).reshape(1,self.zpolicy_input_size)

			########################################################
			# 2) Feed into the Z policy to retrieve the predicted Z.
			######################################################## 
			latent_z, stop_probability, stop, log_prob, kld_loss, hidden = self.z_policy.forward(zpolicy_input, hidden=hidden)
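			# Drop the singleton dimension so latent_z can later be viewed as (-1, self.opts.nz).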
			latent_z = latent_z.squeeze(1)

			########################################################
			############## 3) Decode into trajectory. ##############
			########################################################

			primitive_and_skill_stop_prob = self.evaluator.model.primitive_decoder(latent_z)			
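			# primitive_and_skill_stop_prob holds (decoded state trajectory, skill stop probability);
			# index [0] takes the trajectory and moves it to a (T, n_state) numpy array.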
			traj_seg = primitive_and_skill_stop_prob[0].squeeze(1).detach().cpu().numpy()					

			if previous_state is None:
				previous_state = traj_seg[-1].reshape(1,self.opts.n_state)
			else:
				# Prepend the last state of the previous segment, so that differencing also yields the transition action from the previous segment into the current one. 
				traj_seg = np.concatenate([previous_state,traj_seg],axis=0)
				previous_state = traj_seg[-1].reshape(-1,self.opts.n_state)

			########################################################
			## 4) Finite diff along time axis to retrieve actions ##
			########################################################
			actions = np.diff(traj_seg,axis=0)
			actions = self.reorder_actions(actions)
			actions_torch = torch.tensor(actions).cuda().float()
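			# Note: actions_torch is only used for its length in the stepping loop below; the
			# environment itself is stepped with the numpy actions.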
		
			cummulative_reward_in_segment = 0.			
			# Step the environment through every action in this segment. 
			t = 0
			while t<actions_torch.shape[0] and terminal==False:
				
				# Step. 
				state, onestep_reward, terminal, success = self.environment.step(actions[t])		

				# Collect onestep_rewards within this segment. 
				cummulative_reward_in_segment += float(onestep_reward)
				# Assuming a fixed number of segments (fixed_ns, i.e. variable_nseg is False), the fixed decoding length of primitives lets us assign cumulative reward-to-go values to the individual predicted Zs. 
				# (This is also why we need the per-segment reward history, and not just the cumulative reward obtained over the course of training.)
				
				t+=1 

			# On the first outer timestep the accumulators are still None, so initialize them;
			# on subsequent timesteps, append to the existing sequences.
			if t_out==0:
				state_traj_torch = torch.tensor(zpolicy_input).cuda().float().view(-1,self.zpolicy_input_size)
				latent_z_seq = latent_z.view(-1,self.opts.nz)
				stop_seq = stop.clone().detach().view(-1,1)
				stop_prob_seq = stop_probability.view(-1,2)
				log_prob_seq = log_prob.view(-1,1)
				# reward_traj = torch.tensor(copy.deepcopy(cummulative_reward_in_segment)).cuda().float().view(-1,1)
				reward_traj = np.array(cummulative_reward_in_segment).reshape((1,1))
			else:
				state_traj_torch = torch.cat([state_traj_torch, torch.tensor(zpolicy_input).cuda().float().view(-1,self.zpolicy_input_size)],dim=0)
				latent_z_seq = torch.cat([latent_z_seq, latent_z.view(-1,self.opts.nz)], dim=0)				
				stop_seq = torch.cat([stop_seq, stop.view(-1,1)], dim=0)
				stop_prob_seq = torch.cat([stop_prob_seq, stop_probability.view(-1,2)], dim=0)
				log_prob_seq = torch.cat([log_prob_seq, log_prob.view(-1,1)], dim=0)
				# reward_traj = torch.cat([reward_traj.view(-1,1), torch.tensor(copy.deepcopy(cummulative_reward_in_segment)).cuda().float().view(-1,1)])
				reward_traj = np.concatenate([reward_traj, np.array(cummulative_reward_in_segment).reshape((1,1))], axis=0)

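			# At this point each accumulator holds one row per completed segment: state_traj_torch,
			# latent_z_seq, stop_seq, stop_prob_seq and log_prob_seq as torch tensors, and reward_traj
			# as a numpy array of per-segment returns with shape (number of segments, 1).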
			# Either way, accumulate the KL loss and count this segment. 
			kld_loss_seq += kld_loss
			t_out += 1 	
			# print(t_out)			

			# If the number of segments is not variable, ignore the predicted stop signal.
			if self.opts.variable_nseg==False:
				stop = False

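			# Regardless of the predicted stop, cap the number of skill segments per episode.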
			if t_out>=self.maximum_skills:
				stop = True

			# if self.opts.debug==True:
			# 	embed()

		if self.opts.train:
			# 6) Feed the collected states, per-segment rewards, predicted Zs, log probabilities,
			# stop probabilities/decisions and the accumulated KL loss into the update step.
			# self.update_networks(state_traj_torch, action_torch, reward_traj, latent_zs)
			self.update_networks(state_traj_torch, reward_traj, latent_z_seq, log_prob_seq, stop_prob_seq, stop_seq, kld_loss_seq)
			self.update_plots(counter)
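
The comments in the segment loop above hint at how the per-segment returns in reward_traj are meant to be used: as reward-to-go weights for each predicted Z in a policy-gradient update. The body of update_networks is not shown here, so the snippet below is only a minimal sketch of that idea under a REINFORCE-style objective; segment_reward_to_go, reinforce_style_loss, gamma and kld_weight are illustrative names and values, not part of the original code.

	import numpy as np
	import torch

	def segment_reward_to_go(reward_traj, gamma=0.99):
		# reward_traj: (T, 1) numpy array of per-segment returns, as built in run_episode.
		# Entry t of the result is the discounted sum of segment returns from segment t onwards.
		rtg = np.zeros_like(reward_traj)
		running = 0.
		for t in reversed(range(reward_traj.shape[0])):
			running = reward_traj[t, 0] + gamma * running
			rtg[t, 0] = running
		return rtg

	def reinforce_style_loss(log_prob_seq, reward_traj, kld_loss_seq, gamma=0.99, kld_weight=0.01):
		# log_prob_seq: (T, 1) tensor of log probabilities of the predicted Zs.
		# Weight each log probability by its segment's reward-to-go and add the accumulated
		# KL term; minimizing this loss maximizes expected cumulative reward.
		rtg = torch.from_numpy(segment_reward_to_go(reward_traj, gamma)).float().to(log_prob_seq.device)
		policy_loss = -(log_prob_seq * rtg).sum()
		return policy_loss + kld_weight * kld_loss_seq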