in baselines/deepq/build_graph.py [0:0]
def build_act_with_param_noise(make_obs_ph, q_func, num_actions, scope="deepq", reuse=None, param_noise_filter_func=None):
"""Creates the act function with support for parameter space noise exploration (https://arxiv.org/abs/1706.01905):
Parameters
----------
make_obs_ph: str -> tf.placeholder or TfInput
a function that take a name and creates a placeholder of input with that name
q_func: (tf.Variable, int, str, bool) -> tf.Variable
the model that takes the following inputs:
observation_in: object
the output of observation placeholder
num_actions: int
number of actions
scope: str
reuse: bool
should be passed to outer variable scope
and returns a tensor of shape (batch_size, num_actions) with values of every action.
num_actions: int
number of actions.
scope: str or VariableScope
optional scope for variable_scope.
reuse: bool or None
whether or not the variables should be reused. To be able to reuse the scope must be given.
param_noise_filter_func: tf.Variable -> bool
function that decides whether or not a variable should be perturbed. Only applicable
if param_noise is True. If set to None, default_param_noise_filter is used by default.
Returns
-------
act: (tf.Variable, bool, float, bool, float, bool) -> tf.Variable
function to select and action given observation.
` See the top of the file for details.
"""
if param_noise_filter_func is None:
param_noise_filter_func = default_param_noise_filter
with tf.variable_scope(scope, reuse=reuse):
observations_ph = make_obs_ph("observation")
stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic")
update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps")
update_param_noise_threshold_ph = tf.placeholder(tf.float32, (), name="update_param_noise_threshold")
update_param_noise_scale_ph = tf.placeholder(tf.bool, (), name="update_param_noise_scale")
reset_ph = tf.placeholder(tf.bool, (), name="reset")
eps = tf.get_variable("eps", (), initializer=tf.constant_initializer(0))
param_noise_scale = tf.get_variable("param_noise_scale", (), initializer=tf.constant_initializer(0.01), trainable=False)
param_noise_threshold = tf.get_variable("param_noise_threshold", (), initializer=tf.constant_initializer(0.05), trainable=False)
# Unmodified Q.
q_values = q_func(observations_ph.get(), num_actions, scope="q_func")
# Perturbable Q used for the actual rollout.
q_values_perturbed = q_func(observations_ph.get(), num_actions, scope="perturbed_q_func")
# We have to wrap this code into a function due to the way tf.cond() works. See
# https://stackoverflow.com/questions/37063952/confused-by-the-behavior-of-tf-cond for
# a more detailed discussion.
def perturb_vars(original_scope, perturbed_scope):
all_vars = scope_vars(absolute_scope_name(original_scope))
all_perturbed_vars = scope_vars(absolute_scope_name(perturbed_scope))
assert len(all_vars) == len(all_perturbed_vars)
perturb_ops = []
for var, perturbed_var in zip(all_vars, all_perturbed_vars):
if param_noise_filter_func(perturbed_var):
# Perturb this variable.
op = tf.assign(perturbed_var, var + tf.random_normal(shape=tf.shape(var), mean=0., stddev=param_noise_scale))
else:
# Do not perturb, just assign.
op = tf.assign(perturbed_var, var)
perturb_ops.append(op)
assert len(perturb_ops) == len(all_vars)
return tf.group(*perturb_ops)
# Set up functionality to re-compute `param_noise_scale`. This perturbs yet another copy
# of the network and measures the effect of that perturbation in action space. If the perturbation
# is too big, reduce scale of perturbation, otherwise increase.
q_values_adaptive = q_func(observations_ph.get(), num_actions, scope="adaptive_q_func")
perturb_for_adaption = perturb_vars(original_scope="q_func", perturbed_scope="adaptive_q_func")
kl = tf.reduce_sum(tf.nn.softmax(q_values) * (tf.log(tf.nn.softmax(q_values)) - tf.log(tf.nn.softmax(q_values_adaptive))), axis=-1)
mean_kl = tf.reduce_mean(kl)
def update_scale():
with tf.control_dependencies([perturb_for_adaption]):
update_scale_expr = tf.cond(mean_kl < param_noise_threshold,
lambda: param_noise_scale.assign(param_noise_scale * 1.01),
lambda: param_noise_scale.assign(param_noise_scale / 1.01),
)
return update_scale_expr
# Functionality to update the threshold for parameter space noise.
update_param_noise_threshold_expr = param_noise_threshold.assign(tf.cond(update_param_noise_threshold_ph >= 0,
lambda: update_param_noise_threshold_ph, lambda: param_noise_threshold))
# Put everything together.
deterministic_actions = tf.argmax(q_values_perturbed, axis=1)
batch_size = tf.shape(observations_ph.get())[0]
random_actions = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=num_actions, dtype=tf.int64)
chose_random = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps
stochastic_actions = tf.where(chose_random, random_actions, deterministic_actions)
output_actions = tf.cond(stochastic_ph, lambda: stochastic_actions, lambda: deterministic_actions)
update_eps_expr = eps.assign(tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps))
updates = [
update_eps_expr,
tf.cond(reset_ph, lambda: perturb_vars(original_scope="q_func", perturbed_scope="perturbed_q_func"), lambda: tf.group(*[])),
tf.cond(update_param_noise_scale_ph, lambda: update_scale(), lambda: tf.Variable(0., trainable=False)),
update_param_noise_threshold_expr,
]
_act = U.function(inputs=[observations_ph, stochastic_ph, update_eps_ph, reset_ph, update_param_noise_threshold_ph, update_param_noise_scale_ph],
outputs=output_actions,
givens={update_eps_ph: -1.0, stochastic_ph: True, reset_ph: False, update_param_noise_threshold_ph: False, update_param_noise_scale_ph: False},
updates=updates)
def act(ob, reset=False, update_param_noise_threshold=False, update_param_noise_scale=False, stochastic=True, update_eps=-1):
return _act(ob, stochastic, update_eps, reset, update_param_noise_threshold, update_param_noise_scale)
return act