in policy.py
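# This excerpt assumes the imports and helpers defined elsewhere in policy.py:
# TensorFlow 1.x (`tf`), `numpy as np`, `gym`, and the module-local
# RunningMeanStd, DiagonalGaussian and switch utilities.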
def __init__(self, scope, *, ob_space, ac_space, hiddens, reuse=False, normalize=False):
    self.recurrent = True
    self.normalized = normalize
    with tf.variable_scope(scope, reuse=reuse):
        self.scope = tf.get_variable_scope().name

        assert isinstance(ob_space, gym.spaces.Box)

        self.observation_ph = tf.placeholder(tf.float32, [None, None] + list(ob_space.shape), name="observation")
        self.stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic")
        self.taken_action_ph = tf.placeholder(dtype=tf.float32, shape=[None, None, ac_space.shape[0]], name="taken_action")
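        # Optional normalization: running mean/std filters for observations and
        # (unless normalize == 'ob') for returns; normalized observations are
        # clipped to [-5, 5].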
        if self.normalized:
            if self.normalized != 'ob':
                self.ret_rms = RunningMeanStd(scope="retfilter")
            self.ob_rms = RunningMeanStd(shape=ob_space.shape, scope="obsfilter")

        obz = self.observation_ph
        if self.normalized:
            obz = tf.clip_by_value((self.observation_ph - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
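        # Value-function branch: MLP over the (possibly normalized) observations,
        # followed by an LSTM ("lstmv") and a linear value head.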
        last_out = obz
        for hidden in hiddens[:-1]:
            last_out = tf.contrib.layers.fully_connected(last_out, hidden)
        self.zero_state = []
        self.state_in_ph = []
        self.state_out = []
        cell = tf.contrib.rnn.BasicLSTMCell(hiddens[-1], reuse=reuse)
        size = cell.state_size
        self.zero_state.append(np.zeros(size.c, dtype=np.float32))
        self.zero_state.append(np.zeros(size.h, dtype=np.float32))
        self.state_in_ph.append(tf.placeholder(tf.float32, [None, size.c], name="lstmv_c"))
        self.state_in_ph.append(tf.placeholder(tf.float32, [None, size.h], name="lstmv_h"))
        initial_state = tf.contrib.rnn.LSTMStateTuple(self.state_in_ph[-2], self.state_in_ph[-1])
        last_out, state_out = tf.nn.dynamic_rnn(cell, last_out, initial_state=initial_state, scope="lstmv")
        self.state_out.append(state_out)
        self.vpredz = tf.contrib.layers.fully_connected(last_out, 1, activation_fn=None)[:, :, 0]
        self.vpred = self.vpredz
        if self.normalized and self.normalized != 'ob':
            self.vpred = self.vpredz * self.ret_rms.std + self.ret_rms.mean  # raw = not standardized
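        # Policy branch: a second MLP + LSTM ("lstmp") producing the mean of a
        # diagonal Gaussian over actions, with a learned state-independent logstd.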
        last_out = obz
        for hidden in hiddens[:-1]:
            last_out = tf.contrib.layers.fully_connected(last_out, hidden)
        cell = tf.contrib.rnn.BasicLSTMCell(hiddens[-1], reuse=reuse)
        size = cell.state_size
        self.zero_state.append(np.zeros(size.c, dtype=np.float32))
        self.zero_state.append(np.zeros(size.h, dtype=np.float32))
        self.state_in_ph.append(tf.placeholder(tf.float32, [None, size.c], name="lstmp_c"))
        self.state_in_ph.append(tf.placeholder(tf.float32, [None, size.h], name="lstmp_h"))
        initial_state = tf.contrib.rnn.LSTMStateTuple(self.state_in_ph[-2], self.state_in_ph[-1])
        last_out, state_out = tf.nn.dynamic_rnn(cell, last_out, initial_state=initial_state, scope="lstmp")
        self.state_out.append(state_out)
        mean = tf.contrib.layers.fully_connected(last_out, ac_space.shape[0], activation_fn=None)
        logstd = tf.get_variable(name="logstd", shape=[1, ac_space.shape[0]], initializer=tf.zeros_initializer())
        self.pd = DiagonalGaussian(mean, logstd)
        self.sampled_action = switch(self.stochastic_ph, self.pd.sample(), self.pd.mode())
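        # Pack the recurrent state, expose the zero state as the initial state, and
        # add an L2 term for every trainable variable to the regularization collection.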
        self.zero_state = np.array(self.zero_state)
        self.state_in_ph = tuple(self.state_in_ph)
        self.state = self.zero_state

        for p in self.get_trainable_variables():
            tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, tf.reduce_sum(tf.square(p)))
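# A minimal sketch of stepping the recurrent policy (assuming an instance `policy`,
# a TF session `sess`, and a single observation `obs` of shape ob_space.shape):
#
#     feed = {policy.observation_ph: obs[None, None],   # [batch=1, time=1, obs_dim]
#             policy.stochastic_ph: True}
#     for ph, s in zip(policy.state_in_ph, policy.state):
#         feed[ph] = s[None, :]                          # each LSTM state vector as [1, size]
#     action, state_out = sess.run([policy.sampled_action, policy.state_out], feed)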