# policy.py
import tensorflow as tf
import numpy as np
import gym
import logging
import copy
from tensorflow.contrib import layers


class Policy(object):
    def reset(self, **kwargs):
        pass

    def act(self, observation):
        # should return act, info
        raise NotImplementedError()


class RunningMeanStd(object):
    def __init__(self, scope="running", reuse=False, epsilon=1e-2, shape=()):
        with tf.variable_scope(scope, reuse=reuse):
            self._sum = tf.get_variable(
                dtype=tf.float32,
                shape=shape,
                initializer=tf.constant_initializer(0.0),
                name="sum", trainable=False)
            self._sumsq = tf.get_variable(
                dtype=tf.float32,
                shape=shape,
                initializer=tf.constant_initializer(epsilon),
                name="sumsq", trainable=False)
            self._count = tf.get_variable(
                dtype=tf.float32,
                shape=(),
                initializer=tf.constant_initializer(epsilon),
                name="count", trainable=False)
            self.shape = shape

            self.mean = tf.to_float(self._sum / self._count)
            var_est = tf.to_float(self._sumsq / self._count) - tf.square(self.mean)
            self.std = tf.sqrt(tf.maximum(var_est, 1e-2))
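

# Note: RunningMeanStd only defines the filter variables and the derived
# mean/std tensors; updating _sum/_sumsq/_count is left to the caller. A
# minimal sketch of such an update op (an assumption for illustration, not
# part of this file) could look like:
#
#   def make_rms_update_op(rms, x):
#       """x: float32 tensor of shape [batch] + list(rms.shape)."""
#       return tf.group(
#           tf.assign_add(rms._sum, tf.reduce_sum(x, axis=0)),
#           tf.assign_add(rms._sumsq, tf.reduce_sum(tf.square(x), axis=0)),
#           tf.assign_add(rms._count, tf.to_float(tf.shape(x)[0])))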


def dense(x, size, name, weight_init=None, bias=True):
    w = tf.get_variable(name + "/w", [x.get_shape()[1], size], initializer=weight_init)
    ret = tf.matmul(x, w)
    if bias:
        b = tf.get_variable(name + "/b", [size], initializer=tf.zeros_initializer())
        return ret + b
    else:
        return ret


def switch(condition, if_exp, else_exp):
    x_shape = copy.copy(if_exp.get_shape())
    x = tf.cond(tf.cast(condition, 'bool'),
                lambda: if_exp,
                lambda: else_exp)
    x.set_shape(x_shape)
    return x


class DiagonalGaussian(object):
    def __init__(self, mean, logstd):
        self.mean = mean
        self.logstd = logstd
        self.std = tf.exp(logstd)

    def sample(self):
        return self.mean + self.std * tf.random_normal(tf.shape(self.mean))

    def mode(self):
        return self.mean


class MlpPolicyValue(Policy):
    def __init__(self, scope, *, ob_space, ac_space, hiddens, convs=[], reuse=False, normalize=False):
        self.recurrent = False
        self.normalized = normalize
        self.zero_state = np.zeros(1)
        with tf.variable_scope(scope, reuse=reuse):
            self.scope = tf.get_variable_scope().name
            assert isinstance(ob_space, gym.spaces.Box)

            self.observation_ph = tf.placeholder(tf.float32, [None] + list(ob_space.shape), name="observation")
            self.stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic")
            self.taken_action_ph = tf.placeholder(dtype=tf.float32, shape=[None, ac_space.shape[0]], name="taken_action")

            if self.normalized:
                if self.normalized != 'ob':
                    self.ret_rms = RunningMeanStd(scope="retfilter")
                self.ob_rms = RunningMeanStd(shape=ob_space.shape, scope="obsfilter")

            obz = self.observation_ph
            if self.normalized:
                obz = tf.clip_by_value((self.observation_ph - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

            # Value function head.
            last_out = obz
            for i, hid_size in enumerate(hiddens):
                last_out = tf.nn.tanh(dense(last_out, hid_size, "vffc%i" % (i + 1)))
            self.vpredz = dense(last_out, 1, "vffinal")[:, 0]

            self.vpred = self.vpredz
            if self.normalized and self.normalized != 'ob':
                self.vpred = self.vpredz * self.ret_rms.std + self.ret_rms.mean  # raw = not standardized

            # Policy head.
            last_out = obz
            for i, hid_size in enumerate(hiddens):
                last_out = tf.nn.tanh(dense(last_out, hid_size, "polfc%i" % (i + 1)))
            mean = dense(last_out, ac_space.shape[0], "polfinal")
            logstd = tf.get_variable(name="logstd", shape=[1, ac_space.shape[0]], initializer=tf.zeros_initializer())

            self.pd = DiagonalGaussian(mean, logstd)
            self.sampled_action = switch(self.stochastic_ph, self.pd.sample(), self.pd.mode())

    def make_feed_dict(self, observation, taken_action):
        return {
            self.observation_ph: observation,
            self.taken_action_ph: taken_action
        }

    def act(self, observation, stochastic=True):
        outputs = [self.sampled_action, self.vpred]
        a, v = tf.get_default_session().run(outputs, {
            self.observation_ph: observation[None],
            self.stochastic_ph: stochastic})
        return a[0], {'vpred': v[0]}

    def get_variables(self):
        return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)

    def get_trainable_variables(self):
        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)
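

# Usage sketch (illustrative assumption, not part of the original file):
# build an MlpPolicyValue for a Box observation/action space and sample a
# single action. Requires an active TF1 session with variables initialized;
# "Hopper-v2" and the [64, 64] hidden sizes are placeholders.
#
#   env = gym.make("Hopper-v2")
#   policy = MlpPolicyValue("mlp_policy", ob_space=env.observation_space,
#                           ac_space=env.action_space, hiddens=[64, 64],
#                           normalize=True)
#   with tf.Session() as sess:
#       sess.run(tf.global_variables_initializer())
#       action, info = policy.act(env.reset(), stochastic=True)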


class LSTMPolicy(Policy):
    def __init__(self, scope, *, ob_space, ac_space, hiddens, reuse=False, normalize=False):
        self.recurrent = True
        self.normalized = normalize
        with tf.variable_scope(scope, reuse=reuse):
            self.scope = tf.get_variable_scope().name
            assert isinstance(ob_space, gym.spaces.Box)

            self.observation_ph = tf.placeholder(tf.float32, [None, None] + list(ob_space.shape), name="observation")
            self.stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic")
            self.taken_action_ph = tf.placeholder(dtype=tf.float32, shape=[None, None, ac_space.shape[0]], name="taken_action")

            if self.normalized:
                if self.normalized != 'ob':
                    self.ret_rms = RunningMeanStd(scope="retfilter")
                self.ob_rms = RunningMeanStd(shape=ob_space.shape, scope="obsfilter")

            obz = self.observation_ph
            if self.normalized:
                obz = tf.clip_by_value((self.observation_ph - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

            # Value function head: feedforward layers followed by an LSTM.
            last_out = obz
            for hidden in hiddens[:-1]:
                last_out = tf.contrib.layers.fully_connected(last_out, hidden)

            self.zero_state = []
            self.state_in_ph = []
            self.state_out = []
            cell = tf.contrib.rnn.BasicLSTMCell(hiddens[-1], reuse=reuse)
            size = cell.state_size
            self.zero_state.append(np.zeros(size.c, dtype=np.float32))
            self.zero_state.append(np.zeros(size.h, dtype=np.float32))
            self.state_in_ph.append(tf.placeholder(tf.float32, [None, size.c], name="lstmv_c"))
            self.state_in_ph.append(tf.placeholder(tf.float32, [None, size.h], name="lstmv_h"))
            initial_state = tf.contrib.rnn.LSTMStateTuple(self.state_in_ph[-2], self.state_in_ph[-1])
            last_out, state_out = tf.nn.dynamic_rnn(cell, last_out, initial_state=initial_state, scope="lstmv")
            self.state_out.append(state_out)

            self.vpredz = tf.contrib.layers.fully_connected(last_out, 1, activation_fn=None)[:, :, 0]
            self.vpred = self.vpredz
            if self.normalized and self.normalized != 'ob':
                self.vpred = self.vpredz * self.ret_rms.std + self.ret_rms.mean  # raw = not standardized

            # Policy head: same structure, separate weights.
            last_out = obz
            for hidden in hiddens[:-1]:
                last_out = tf.contrib.layers.fully_connected(last_out, hidden)
            cell = tf.contrib.rnn.BasicLSTMCell(hiddens[-1], reuse=reuse)
            size = cell.state_size
            self.zero_state.append(np.zeros(size.c, dtype=np.float32))
            self.zero_state.append(np.zeros(size.h, dtype=np.float32))
            self.state_in_ph.append(tf.placeholder(tf.float32, [None, size.c], name="lstmp_c"))
            self.state_in_ph.append(tf.placeholder(tf.float32, [None, size.h], name="lstmp_h"))
            initial_state = tf.contrib.rnn.LSTMStateTuple(self.state_in_ph[-2], self.state_in_ph[-1])
            last_out, state_out = tf.nn.dynamic_rnn(cell, last_out, initial_state=initial_state, scope="lstmp")
            self.state_out.append(state_out)

            mean = tf.contrib.layers.fully_connected(last_out, ac_space.shape[0], activation_fn=None)
            logstd = tf.get_variable(name="logstd", shape=[1, ac_space.shape[0]], initializer=tf.zeros_initializer())

            self.pd = DiagonalGaussian(mean, logstd)
            self.sampled_action = switch(self.stochastic_ph, self.pd.sample(), self.pd.mode())
            self.zero_state = np.array(self.zero_state)

            self.state_in_ph = tuple(self.state_in_ph)
            self.state = self.zero_state

            for p in self.get_trainable_variables():
                tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, tf.reduce_sum(tf.square(p)))

    def make_feed_dict(self, observation, state_in, taken_action):
        return {
            self.observation_ph: observation,
            self.state_in_ph: list(np.transpose(state_in, (1, 0, 2))),
            self.taken_action_ph: taken_action
        }

    def act(self, observation, stochastic=True):
        outputs = [self.sampled_action, self.vpred, self.state_out]
        a, v, s = tf.get_default_session().run(outputs, {
            self.observation_ph: observation[None, None],
            self.state_in_ph: list(self.state[:, None, :]),
            self.stochastic_ph: stochastic})
        self.state = []
        for x in s:
            self.state.append(x.c[0])
            self.state.append(x.h[0])
        self.state = np.array(self.state)
        return a[0, 0], {'vpred': v[0, 0], 'state': self.state}

    def get_variables(self):
        return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)

    def get_trainable_variables(self):
        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)

    def reset(self):
        self.state = self.zero_state
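

# Usage sketch for the recurrent policy (illustrative assumption, not part
# of the original file): reset() restores the LSTM state to zeros, and each
# act() call consumes and updates self.state, so observations must be fed
# one step at a time in episode order. `env` and the [128, 128] hidden sizes
# are placeholders, as in the MlpPolicyValue sketch above.
#
#   policy = LSTMPolicy("lstm_policy", ob_space=env.observation_space,
#                       ac_space=env.action_space, hiddens=[128, 128],
#                       normalize=True)
#   with tf.Session() as sess:
#       sess.run(tf.global_variables_initializer())
#       obs, done = env.reset(), False
#       policy.reset()
#       while not done:
#           action, info = policy.act(obs, stochastic=True)
#           obs, reward, done, _ = env.step(action)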