in maddpg/trainer/maddpg.py [0:0]
def p_train(make_obs_ph_n, act_space_n, p_index, p_func, q_func, optimizer, grad_norm_clipping=None, local_q_func=False, num_units=64, scope="trainer", reuse=None):
    """Build the policy-side graph for the agent at position ``p_index``.

    Everything is created inside ``tf.variable_scope(scope, reuse=reuse)``.

    Args:
        make_obs_ph_n: Used directly as the list of per-agent observation
            placeholders (it is indexed and list-concatenated, not called).
        act_space_n: Per-agent action spaces; each is passed to
            ``make_pdtype``.
        p_index: Position of the agent being trained in the lists above.
        p_func: Policy network builder, called as
            ``p_func(input, num_outputs, scope=..., num_units=...)``.
        q_func: Critic network builder, called with ``reuse=True``.
            NOTE(review): this presumably relies on the "q_func" variables
            having been created earlier under the same scope -- confirm
            against the caller.
        optimizer: Forwarded to ``U.minimize_and_clip``.
        grad_norm_clipping: Forwarded to ``U.minimize_and_clip``.
        local_q_func: When true, the critic input is only this agent's
            observation and action instead of every agent's.
        num_units: Forwarded to ``p_func`` and ``q_func``.
        scope: Name of the enclosing variable scope.
        reuse: ``reuse`` flag for the enclosing variable scope.

    Returns:
        A tuple ``(act, train, update_target_p, debug)`` where ``debug`` is
        a dict with the keys ``'p_values'`` and ``'target_act'``.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # One action-distribution type per agent.
        pdtypes = [make_pdtype(space) for space in act_space_n]
        own_pdtype = pdtypes[p_index]

        # Placeholders: observations are supplied by the caller, actions are
        # created here, one per agent.
        obs_phs = make_obs_ph_n
        action_phs = [
            pdtype.sample_placeholder([None], name="action" + str(idx))
            for idx, pdtype in enumerate(pdtypes)
        ]
        own_obs = obs_phs[p_index]

        # Policy network over this agent's own observation.
        policy_out = p_func(
            own_obs,
            int(own_pdtype.param_shape()[0]),
            scope="p_func",
            num_units=num_units,
        )
        policy_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        # Wrap the flat policy output in a distribution object.
        policy_pd = own_pdtype.pdfromflat(policy_out)
        sampled_action = policy_pd.sample()
        # Mean squared magnitude of the flat distribution parameters.
        reg_term = tf.reduce_mean(tf.square(policy_pd.flatparam()))

        # Critic inputs: the fed actions of every agent, except that this
        # agent's slot takes a fresh sample from its current policy.
        critic_actions = list(action_phs)
        critic_actions[p_index] = policy_pd.sample()

        # The centralized input is always built first; it is then replaced by
        # the local one when requested (graph construction order preserved).
        critic_in = tf.concat(obs_phs + critic_actions, 1)
        if local_q_func:
            critic_in = tf.concat([obs_phs[p_index], critic_actions[p_index]], 1)
        q_values = q_func(critic_in, 1, scope="q_func", reuse=True, num_units=num_units)
        q = q_values[:, 0]

        # Minimizing the negated mean Q-value, plus a small weight on the
        # parameter-magnitude term.
        pg_loss = -tf.reduce_mean(q)
        loss = pg_loss + reg_term * 1e-3

        # Only the policy variables are handed to the optimizer.
        optimize_expr = U.minimize_and_clip(optimizer, loss, policy_vars, grad_norm_clipping)

        # Callable wrappers around the graph.
        train = U.function(inputs=obs_phs + action_phs, outputs=loss, updates=[optimize_expr])
        act = U.function(inputs=[obs_phs[p_index]], outputs=sampled_action)
        p_values = U.function(inputs=[obs_phs[p_index]], outputs=policy_out)

        # Target policy network: same builder and input, separate scope.
        target_out = p_func(
            own_obs,
            int(own_pdtype.param_shape()[0]),
            scope="target_p_func",
            num_units=num_units,
        )
        target_vars = U.scope_vars(U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(policy_vars, target_vars)

        target_pd = own_pdtype.pdfromflat(target_out)
        target_sample = target_pd.sample()
        target_act = U.function(inputs=[obs_phs[p_index]], outputs=target_sample)

        debug = {'p_values': p_values, 'target_act': target_act}
        return act, train, update_target_p, debug