in maddpg/trainer/maddpg.py
# Relies on this file's module-level imports: tensorflow as tf,
# maddpg.common.tf_util as U, and make_pdtype from maddpg.common.distributions;
# make_update_exp is defined earlier in the same file.
def q_train(make_obs_ph_n, act_space_n, q_index, q_func, optimizer, grad_norm_clipping=None, local_q_func=False, scope="trainer", reuse=None, num_units=64):
    with tf.variable_scope(scope, reuse=reuse):
        # create distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [act_pdtype_n[i].sample_placeholder([None], name="action" + str(i)) for i in range(len(act_space_n))]
        target_ph = tf.placeholder(tf.float32, [None], name="target")

        # centralized critic input: every agent's observation and action
        q_input = tf.concat(obs_ph_n + act_ph_n, 1)
        if local_q_func:
            # DDPG-style ablation: the critic only sees this agent's obs/action
            q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1)
        q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:, 0]
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        # mean squared Bellman error against the externally supplied TD target
        q_loss = tf.reduce_mean(tf.square(q - target_ph))

        # viscosity solution to Bellman differential equation in place of an initial condition
        q_reg = tf.reduce_mean(tf.square(q))
        loss = q_loss  # + 1e-3 * q_reg  (regularizer disabled)

        optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars, grad_norm_clipping)

        # create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph], outputs=loss, updates=[optimize_expr])
        q_values = U.function(obs_ph_n + act_ph_n, q)

        # target network, updated by polyak averaging via make_update_exp
        target_q = q_func(q_input, 1, scope="target_q_func", num_units=num_units)[:, 0]
        target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars)
        target_q_values = U.function(obs_ph_n + act_ph_n, target_q)

        return train, update_target_q, {'q_values': q_values, 'target_q_values': target_q_values}
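
For context, below is a minimal sketch of how the three returned handles might be wired together for one critic update. Everything in it is illustrative rather than code from this file: mlp_model, the two-agent setup, gamma, and the fake batch are assumptions, and the repo's actual trainer draws its batch from a replay buffer and masks terminal transitions with (1 - done).

import numpy as np
import tensorflow as tf
from gym import spaces
import maddpg.common.tf_util as U

# Hypothetical two-layer MLP critic; the repo keeps its own model in train.py.
def mlp_model(input, num_outputs, scope, reuse=False, num_units=64):
    with tf.variable_scope(scope, reuse=reuse):
        out = tf.layers.dense(input, num_units, activation=tf.nn.relu)
        out = tf.layers.dense(out, num_units, activation=tf.nn.relu)
        return tf.layers.dense(out, num_outputs, activation=None)

# Assumed setup: two agents, 4-dim observations, 5-way (soft) discrete actions.
n, obs_dim, n_act, gamma = 2, 4, 5, 0.95
act_space_n = [spaces.Discrete(n_act) for _ in range(n)]
obs_ph_n = [U.BatchInput((obs_dim,), name="observation" + str(i)).get() for i in range(n)]

train, update_target_q, q_debug = q_train(
    obs_ph_n, act_space_n, q_index=0, q_func=mlp_model,
    optimizer=tf.train.AdamOptimizer(1e-2), grad_norm_clipping=0.5)

with U.single_threaded_session():
    U.initialize()
    # Fake batch of 32 transitions standing in for replay-buffer samples.
    obs_n = [np.random.rand(32, obs_dim).astype(np.float32) for _ in range(n)]
    act_n = [np.random.rand(32, n_act).astype(np.float32) for _ in range(n)]
    rew = np.random.rand(32).astype(np.float32)
    # TD target y = r + gamma * Q_target(o', a'); next obs/actions faked here.
    target_q_next = q_debug['target_q_values'](*(obs_n + act_n))
    y = rew + gamma * target_q_next
    loss = train(*(obs_n + act_n + [y]))  # one gradient step on the critic
    update_target_q()  # polyak-average the trained weights into the target net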