in svg/replay_buffer.py [0:0]
def sample_multistep(self, batch_size, T):
    """Sample batch_size contiguous T-step (obs, action, reward) windows
    that never cross an episode boundary."""
    assert batch_size < self.idx or self.full

    # A window of length T must fit before the end of the valid data.
    last_idx = self.capacity if self.full else self.idx
    last_idx -= T

    # raw here means the "coalesced" indices that map to valid
    # indices that are more than T steps away from a done
    done_idxs_sorted = np.array(list(self.done_idxs) + [last_idx])
    n_done = len(done_idxs_sorted)
    done_idxs_raw = done_idxs_sorted - np.arange(1, n_done + 1) * T

    # Sample uniformly in the coalesced index space, then map each sample
    # back to a start index in the buffer via searchsorted below.
    samples_raw = npr.choice(
        last_idx - (T + 1) * n_done, size=batch_size,
        replace=True  # for speed
    )
    samples_raw = sorted(samples_raw)
    js = np.searchsorted(done_idxs_raw, samples_raw)
    offsets = done_idxs_raw[js] - samples_raw + T
    start_idxs = done_idxs_sorted[js] - offsets

    # Gather the T-step windows; assert that no step inside a window is a done.
    obses, actions, rewards = [], [], []
    for t in range(T):
        obses.append(self.obses[start_idxs + t])
        actions.append(self.actions[start_idxs + t])
        rewards.append(self.rewards[start_idxs + t])
        assert np.all(self.not_dones[start_idxs + t])

    # Stack along the time dimension: [T, batch_size, ...].
    obses = np.stack(obses)
    actions = np.stack(actions)
    rewards = np.stack(rewards).squeeze(2)

    if self.normalize_obs:
        mu, sigma = self.get_obs_stats()
        obses = (obses - mu) / sigma

    obses = torch.as_tensor(obses, device=self.device).float()
    actions = torch.as_tensor(actions, device=self.device)
    rewards = torch.as_tensor(rewards, device=self.device)

    return obses, actions, rewards
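
A minimal usage sketch, not from the original file: it assumes `buffer` is an already-populated instance of this replay buffer class, and the batch size and horizon below are illustrative values only. It shows the time-major shapes the method returns after stacking over the T sampled steps.

    obs_seq, act_seq, rew_seq = buffer.sample_multistep(batch_size=128, T=3)
    # obs_seq: float tensor, shape [T, batch_size, *obs_shape], normalized if enabled
    # act_seq: tensor, shape [T, batch_size, *action_shape]
    # rew_seq: tensor, shape [T, batch_size]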