in replay_buffer.py [0:0]
def _sample(self):
    """Sample one n-step transition from a stored episode.

    Attempts to refresh the buffer via ``self._try_fetch()``, picks an
    episode with ``self._sample_episode()``, chooses a random start index
    inside it, and accumulates the discounted reward over ``self._nstep``
    consecutive steps.

    Returns:
        A tuple ``(obs, action, reward, discount, next_obs)`` where

        * ``obs`` is the observation at ``idx - 1``,
        * ``action`` is the action at ``idx``,
        * ``reward`` is the sum over the ``self._nstep`` steps of each
          step's reward weighted by the discount accumulated so far,
        * ``discount`` is the product over those steps of the stored
          per-step discount times ``self._discount``,
        * ``next_obs`` is the observation at ``idx + self._nstep - 1``.

    Raises:
        ValueError: propagated from ``np.random.randint`` when
            ``episode_len(episode) - self._nstep + 1`` is not positive,
            i.e. the sampled episode is too short for an n-step window.
    """
    try:
        self._try_fetch()
    except Exception:
        # Fetching is best-effort: log the failure and keep sampling.
        # Only ``Exception`` is caught so that KeyboardInterrupt and
        # SystemExit still propagate instead of being silently swallowed.
        traceback.print_exc()
    self._samples_since_last_fetch += 1
    episode = self._sample_episode()
    # add +1 for the first dummy transition
    idx = np.random.randint(0, episode_len(episode) - self._nstep + 1) + 1
    obs = episode['observation'][idx - 1]
    action = episode['action'][idx]
    next_obs = episode['observation'][idx + self._nstep - 1]
    # Fresh accumulators shaped like a single stored reward / discount entry,
    # so the in-place updates below never touch the episode's own arrays.
    reward = np.zeros_like(episode['reward'][idx])
    discount = np.ones_like(episode['discount'][idx])
    for i in range(self._nstep):
        step_reward = episode['reward'][idx + i]
        # Weight by the discount accumulated *before* this step, then fold
        # this step's discount into the running product.
        reward += discount * step_reward
        discount *= episode['discount'][idx + i] * self._discount
    return (obs, action, reward, discount, next_obs)