in policy.py [0:0]
def act(self, observation, stochastic=True):
    """Run one policy step: sample an action, estimate value, advance LSTM state.

    Args:
        observation: a single observation; broadcast to a (1, 1, ...) batch
            of one timestep for the recurrent graph.
        stochastic: whether to sample the action stochastically (fed to the
            graph's stochastic placeholder).

    Returns:
        A tuple ``(action, extra)`` where ``extra`` is a dict with the value
        estimate ('vpred') and the updated recurrent state ('state').
    """
    fetches = [self.sampled_action, self.vpred, self.state_out]
    feed = {
        self.observation_ph: observation[None, None],
        self.state_in_ph: list(self.state[:, None, :]),
        self.stochastic_ph: stochastic,
    }
    action, value, state_out = tf.get_default_session().run(fetches, feed)
    # Flatten each LSTM cell's (c, h) pair (batch element 0) into one array.
    flat_state = []
    for cell_state in state_out:
        flat_state.extend((cell_state.c[0], cell_state.h[0]))
    self.state = np.array(flat_state)
    return action[0, 0], {'vpred': value[0, 0], 'state': self.state}