in qlearn/envs/nchain.py [0:0]
def step(self, action):
    """Advance the chain by one step and return the transition tuple.

    A truthy ``action`` moves one state forward (towards ``self.n - 1``);
    a falsy one moves one state backward (towards ``0``). A move past
    either end leaves the state unchanged. The reward is determined by
    the state *before* the move:

    * ``1.0`` for choosing forward (``action == 1``) in the last state,
    * ``0.001`` for choosing backward (``action == 0``) in state ``0``,
    * ``0.0`` otherwise.

    Args:
        action: Action to apply; must be accepted by
            ``self.action_space.contains``.

    Returns:
        A 4-tuple ``(observation, reward, done, info)``:
        ``observation`` is a float32 vector of length ``self.n`` holding
        1.0 at every index ``<= self.state`` (after the move) and 0.0
        elsewhere; ``reward`` is a float as described above; ``done`` is
        true once ``self.nsteps`` has reached ``self.max_nsteps``;
        ``info`` is always ``None``.

    Raises:
        AssertionError: If the action space rejects ``action``.
    """
    # Raised explicitly instead of via `assert` so the check is not stripped
    # under `python -O`; AssertionError keeps the exception type unchanged.
    if not self.action_space.contains(action):
        raise AssertionError('invalid action: %r' % (action,))

    last_state = self.n - 1

    # The reward is computed from the state before the transition.
    if self.state == last_state and action == 1:
        reward = 1.0
    elif self.state == 0 and action == 0:
        reward = 0.001
    else:
        reward = 0.0

    if action:  # forward
        if self.state != last_state:
            self.state += 1
    elif self.state != 0:  # backward
        self.state -= 1

    self.nsteps += 1

    # Every position up to and including the current state is set to 1.0.
    observation = (np.arange(self.n) <= self.state).astype('float32')
    done = self.nsteps >= self.max_nsteps

    # NOTE(review): if this environment is used through Gym wrappers, they
    # may expect `info` to be a dict -- confirm. Kept as None here so the
    # existing return value is unchanged.
    return observation, reward, done, None