in baselines/her/rollout.py [0:0]
def generate_rollouts(self):
    """Performs `rollout_batch_size` rollouts in parallel for time horizon `T`, with the
    current policy acting on each environment.
    """
    self.reset_all_rollouts()

    # compute observations
    o = np.empty((self.rollout_batch_size, self.dims['o']), np.float32)  # observations
    ag = np.empty((self.rollout_batch_size, self.dims['g']), np.float32)  # achieved goals
    o[:] = self.initial_o
    ag[:] = self.initial_ag

    # generate episodes
    obs, achieved_goals, acts, goals, successes = [], [], [], [], []
    dones = []
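    # Only T - 1 steps of per-step `info` are stored: the loop below breaks on
    # `done` before it records the final step's info.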
    info_values = [np.empty((self.T - 1, self.rollout_batch_size, self.dims['info_' + key]),
                            np.float32) for key in self.info_keys]
    Qs = []
    for t in range(self.T):
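        # Exploration: `noise_eps` scales Gaussian noise added to the policy's
        # actions and `random_eps` is the probability of acting uniformly at
        # random; both are disabled in exploit (evaluation) mode so the policy
        # acts deterministically.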
        policy_output = self.policy.get_actions(
            o, ag, self.g,
            compute_Q=self.compute_Q,
            noise_eps=self.noise_eps if not self.exploit else 0.,
            random_eps=self.random_eps if not self.exploit else 0.,
            use_target_net=self.use_target_net)
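        # When `compute_Q` is set, the policy also returns the critic's Q-value
        # estimate for the selected actions; these are averaged into `Q_history`
        # below for logging.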
        if self.compute_Q:
            u, Q = policy_output
            Qs.append(Q)
        else:
            u = policy_output

        if u.ndim == 1:
            # The non-batched case should still have a reasonable shape.
            u = u.reshape(1, -1)
        # compute new states and observations
        obs_dict_new, _, done, info = self.venv.step(u)
        o_new = obs_dict_new['observation']
        ag_new = obs_dict_new['achieved_goal']
        success = np.array([i.get('is_success', 0.0) for i in info])
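        # Goal-based Gym environments report success via `info['is_success']`;
        # default to 0.0 for environments that do not provide it.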
        if any(done):
            # Here we assume all environments finish in ~the same number of steps, so we
            # terminate rollouts whenever any of the envs returns done. The trick when
            # using vecenvs is not to add the obs from environments that are "done",
            # because those are already observations after a reset.
            break
        for i, info_dict in enumerate(info):
            for idx, key in enumerate(self.info_keys):
                info_values[idx][t, i] = info_dict[key]
        if np.isnan(o_new).any():
            self.logger.warn('NaN caught during rollout generation. Trying again...')
            self.reset_all_rollouts()
            return self.generate_rollouts()
        dones.append(done)
        obs.append(o.copy())
        achieved_goals.append(ag.copy())
        successes.append(success.copy())
        acts.append(u.copy())
        goals.append(self.g.copy())
        o[...] = o_new
        ag[...] = ag_new
    obs.append(o.copy())
    achieved_goals.append(ag.copy())
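    # `obs` and `achieved_goals` hold one more entry than `acts`, so transitions
    # (o_t, u_t, o_{t+1}) can be reconstructed when the episode is replayed.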
    episode = dict(o=obs,
                   u=acts,
                   g=goals,
                   ag=achieved_goals)
    for key, value in zip(self.info_keys, info_values):
        episode['info_{}'.format(key)] = value
    # stats
    successful = np.array(successes)[-1, :]
    assert successful.shape == (self.rollout_batch_size,)
    success_rate = np.mean(successful)
    self.success_history.append(success_rate)
    if self.compute_Q:
        self.Q_history.append(np.mean(Qs))
    self.n_episodes += self.rollout_batch_size
    return convert_episode_to_batch_major(episode)
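
# For reference, a minimal sketch of what `convert_episode_to_batch_major` (from
# baselines/her/util.py) is assumed to do: the lists collected above are
# time-major, i.e. (T, rollout_batch_size, dim), while the replay buffer expects
# batch-major arrays of shape (rollout_batch_size, T, dim). The name
# `_batch_major_sketch` below is hypothetical, not the library's implementation.
def _batch_major_sketch(episode):
    episode_batch = {}
    for key, value in episode.items():
        # (T, rollout_batch_size, dim) -> (rollout_batch_size, T, dim)
        val = np.array(value).copy()
        episode_batch[key] = val.swapaxes(0, 1)
    return episode_batch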