spinup/algos/tf1/sac/core.py

import numpy as np
import tensorflow as tf

EPS = 1e-8

def placeholder(dim=None):
    return tf.placeholder(dtype=tf.float32, shape=(None,dim) if dim else (None,))

def placeholders(*args):
    return [placeholder(dim) for dim in args]

def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None):
    for h in hidden_sizes[:-1]:
        x = tf.layers.dense(x, units=h, activation=activation)
    return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation)

def get_vars(scope):
    return [x for x in tf.global_variables() if scope in x.name]

def count_vars(scope):
    v = get_vars(scope)
    return sum([np.prod(var.shape.as_list()) for var in v])

def gaussian_likelihood(x, mu, log_std):
    # Log-density of a diagonal Gaussian, summed over the action dimensions.
    pre_sum = -0.5 * (((x-mu)/(tf.exp(log_std)+EPS))**2 + 2*log_std + np.log(2*np.pi))
    return tf.reduce_sum(pre_sum, axis=1)


"""
Policies
"""

LOG_STD_MAX = 2
LOG_STD_MIN = -20

def mlp_gaussian_policy(x, a, hidden_sizes, activation, output_activation):
    act_dim = a.shape.as_list()[-1]
    net = mlp(x, list(hidden_sizes), activation, activation)
    mu = tf.layers.dense(net, act_dim, activation=output_activation)
    log_std = tf.layers.dense(net, act_dim, activation=None)
    # Clip log_std so the policy's standard deviation stays in a sane range.
    log_std = tf.clip_by_value(log_std, LOG_STD_MIN, LOG_STD_MAX)
    std = tf.exp(log_std)
    # Reparameterized sample from the (pre-squash) Gaussian policy.
    pi = mu + tf.random_normal(tf.shape(mu)) * std
    logp_pi = gaussian_likelihood(pi, mu, log_std)
    return mu, pi, logp_pi

def apply_squashing_func(mu, pi, logp_pi):
    # Adjustment to log prob
    # NOTE: This formula is a little bit magic. To get an understanding of where it
    # comes from, check out the original SAC paper (arXiv 1801.01290) and look in
    # appendix C. This is a more numerically-stable equivalent to Eq 21.
    # Try deriving it yourself as a (very difficult) exercise. :)
    logp_pi -= tf.reduce_sum(2*(np.log(2) - pi - tf.nn.softplus(-2*pi)), axis=1)

    # Squash those unbounded actions!
    mu = tf.tanh(mu)
    pi = tf.tanh(pi)
    return mu, pi, logp_pi


"""
Actor-Critics
"""
def mlp_actor_critic(x, a, hidden_sizes=(256,256), activation=tf.nn.relu,
                     output_activation=None, policy=mlp_gaussian_policy, action_space=None):
    # policy
    with tf.variable_scope('pi'):
        mu, pi, logp_pi = policy(x, a, hidden_sizes, activation, output_activation)
        mu, pi, logp_pi = apply_squashing_func(mu, pi, logp_pi)

    # make sure actions are in correct range
    action_scale = action_space.high[0]
    mu *= action_scale
    pi *= action_scale

    # vfs
    vf_mlp = lambda x : tf.squeeze(mlp(x, list(hidden_sizes)+[1], activation, None), axis=1)

    with tf.variable_scope('q1'):
        q1 = vf_mlp(tf.concat([x,a], axis=-1))
    with tf.variable_scope('q2'):
        q2 = vf_mlp(tf.concat([x,a], axis=-1))
    return mu, pi, logp_pi, q1, q2
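
# ----------------------------------------------------------------------------
# Illustrative sketch (not part of the original core.py): a quick NumPy check of
# the "magic" correction used in apply_squashing_func. Squashing u -> tanh(u)
# changes the log-density by -sum_i log(1 - tanh(u_i)**2), and the claim is that
#     2*(log 2 - u - softplus(-2u)) == log(1 - tanh(u)**2)
# in an overflow-free form. The helper names below are made up for this check.
# ----------------------------------------------------------------------------
import numpy as np

def _softplus(z):
    # Numerically stable log(1 + exp(z)).
    return np.logaddexp(0.0, z)

def _stable_log_one_minus_tanh_sq(u):
    # The expression used in apply_squashing_func above.
    return 2.0 * (np.log(2.0) - u - _softplus(-2.0 * u))

if __name__ == '__main__':
    # 1) The two forms agree wherever the naive form is still accurate.
    u = np.linspace(-6.0, 6.0, 1201)
    naive = np.log(1.0 - np.tanh(u)**2)
    assert np.allclose(_stable_log_one_minus_tanh_sq(u), naive)

    # 2) The naive form saturates to -inf once tanh(u)**2 rounds to 1.0 in
    #    floating point, while the stable form keeps returning finite values.
    big = np.array([25.0, 50.0, 100.0])
    with np.errstate(divide='ignore'):
        print(np.log(1.0 - np.tanh(big)**2))       # [-inf -inf -inf]
    print(_stable_log_one_minus_tanh_sq(big))       # approx. [-48.6, -98.6, -198.6]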
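
# ----------------------------------------------------------------------------
# Illustrative sketch (not part of the original core.py): one way to wire the
# pieces above together, assuming TensorFlow 1.x (as this file does) and a
# gym-style Box action space. Dimensions and hidden sizes are arbitrary.
# ----------------------------------------------------------------------------
if __name__ == '__main__':
    import numpy as np
    import tensorflow as tf
    from gym.spaces import Box

    obs_dim, act_dim = 8, 2
    action_space = Box(low=-1.0, high=1.0, shape=(act_dim,), dtype=np.float32)

    # Observation and action placeholders, then the actor-critic graph.
    x_ph, a_ph = placeholders(obs_dim, act_dim)
    with tf.variable_scope('main'):
        mu, pi, logp_pi, q1, q2 = mlp_actor_critic(
            x_ph, a_ph, hidden_sizes=(64, 64), action_space=action_space)

    print('params in main/pi:', count_vars('main/pi'))

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    obs = np.zeros((1, obs_dim), dtype=np.float32)
    # mu is the deterministic (mean) action and pi a sampled action; both are
    # already squashed by tanh and rescaled to the action space bounds.
    a_det, a_samp = sess.run([mu, pi], feed_dict={x_ph: obs})
    print(a_det.shape, a_samp.shape)   # (1, 2) (1, 2)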