spinup/algos/tf1/sac/core.py
import numpy as np
import tensorflow as tf

EPS = 1e-8

def placeholder(dim=None):
    return tf.placeholder(dtype=tf.float32, shape=(None,dim) if dim else (None,))

def placeholders(*args):
    return [placeholder(dim) for dim in args]

def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None):
    for h in hidden_sizes[:-1]:
        x = tf.layers.dense(x, units=h, activation=activation)
    return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation)

def get_vars(scope):
    return [x for x in tf.global_variables() if scope in x.name]

def count_vars(scope):
    v = get_vars(scope)
    return sum([np.prod(var.shape.as_list()) for var in v])
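
# Log-likelihood of x under a diagonal Gaussian with mean mu and log standard
# deviation log_std, summed over dimensions:
#   log p(x) = -0.5 * sum_i [ ((x_i - mu_i) / std_i)^2 + 2*log_std_i + log(2*pi) ]
# EPS guards against division by a vanishing standard deviation.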
def gaussian_likelihood(x, mu, log_std):
    pre_sum = -0.5 * (((x-mu)/(tf.exp(log_std)+EPS))**2 + 2*log_std + np.log(2*np.pi))
    return tf.reduce_sum(pre_sum, axis=1)
"""
Policies
"""
LOG_STD_MAX = 2
LOG_STD_MIN = -20
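
# mlp_gaussian_policy builds an MLP that outputs the mean and log standard
# deviation of a Gaussian over actions. log_std is clipped to
# [LOG_STD_MIN, LOG_STD_MAX] to keep the standard deviation in a numerically
# sane range, and actions are sampled with the reparameterization trick:
# pi = mu + std * noise, with noise ~ N(0, I).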
def mlp_gaussian_policy(x, a, hidden_sizes, activation, output_activation):
    act_dim = a.shape.as_list()[-1]
    net = mlp(x, list(hidden_sizes), activation, activation)
    mu = tf.layers.dense(net, act_dim, activation=output_activation)
    log_std = tf.layers.dense(net, act_dim, activation=None)
    log_std = tf.clip_by_value(log_std, LOG_STD_MIN, LOG_STD_MAX)
    std = tf.exp(log_std)
    pi = mu + tf.random_normal(tf.shape(mu)) * std
    logp_pi = gaussian_likelihood(pi, mu, log_std)
    return mu, pi, logp_pi

def apply_squashing_func(mu, pi, logp_pi):
    # Adjustment to log prob
    # NOTE: This formula is a little bit magic. To get an understanding of where it
    # comes from, check out the original SAC paper (arXiv 1801.01290) and look in
    # appendix C. This is a more numerically-stable equivalent to Eq 21.
    # Try deriving it yourself as a (very difficult) exercise. :)
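    # Concretely: squashing with tanh changes the density, so we subtract
    # sum_i log(1 - tanh(pi_i)^2); the identity
    #   log(1 - tanh(u)^2) = 2*(log(2) - u - softplus(-2*u))
    # gives the numerically stable form used on the next line.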
    logp_pi -= tf.reduce_sum(2*(np.log(2) - pi - tf.nn.softplus(-2*pi)), axis=1)

    # Squash those unbounded actions!
    mu = tf.tanh(mu)
    pi = tf.tanh(pi)
    return mu, pi, logp_pi
"""
Actor-Critics
"""
def mlp_actor_critic(x, a, hidden_sizes=(256,256), activation=tf.nn.relu,
                     output_activation=None, policy=mlp_gaussian_policy, action_space=None):
    # policy
    with tf.variable_scope('pi'):
        mu, pi, logp_pi = policy(x, a, hidden_sizes, activation, output_activation)
        mu, pi, logp_pi = apply_squashing_func(mu, pi, logp_pi)

    # make sure actions are in correct range
    action_scale = action_space.high[0]
    mu *= action_scale
    pi *= action_scale

    # vfs
    vf_mlp = lambda x : tf.squeeze(mlp(x, list(hidden_sizes)+[1], activation, None), axis=1)
    with tf.variable_scope('q1'):
        q1 = vf_mlp(tf.concat([x,a], axis=-1))
    with tf.variable_scope('q2'):
        q2 = vf_mlp(tf.concat([x,a], axis=-1))
    return mu, pi, logp_pi, q1, q2
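

# A minimal usage sketch, assuming a gym Box action space bounded in [-1, 1]
# and hypothetical dimensions obs_dim=11, act_dim=3; these values are
# illustrative only and not taken from the code above.
if __name__ == '__main__':
    from gym.spaces import Box

    obs_dim, act_dim = 11, 3  # hypothetical sizes, for illustration only
    x_ph, a_ph = placeholders(obs_dim, act_dim)
    action_space = Box(low=-np.ones(act_dim, dtype=np.float32),
                       high=np.ones(act_dim, dtype=np.float32))
    with tf.variable_scope('main'):
        mu, pi, logp_pi, q1, q2 = mlp_actor_critic(x_ph, a_ph, action_space=action_space)
    print('vars in scope main:', count_vars('main'))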