in mlsh_code/learner.py [0:0]
# Assumed module-level imports (not shown in this excerpt): tensorflow as tf,
# a baselines-style tf_util as U, MpiAdam, and zipsame.
def __init__(self, env, policy, old_policy, sub_policies, old_sub_policies, comm,
             clip_param=0.2, entcoeff=0, optim_epochs=10, optim_stepsize=3e-4,
             optim_batchsize=64):
    self.policy = policy
    self.clip_param = clip_param
    self.entcoeff = entcoeff
    self.optim_epochs = optim_epochs
    self.optim_stepsize = optim_stepsize
    self.optim_batchsize = optim_batchsize
    self.num_subpolicies = len(sub_policies)
    self.sub_policies = sub_policies
    ob_space = env.observation_space
    ac_space = env.action_space

    # inputs for training the master policy (theta)
    ob = U.get_placeholder_cached(name="ob")
    ac = policy.pdtype.sample_placeholder([None])
    atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # target advantage function
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # empirical return

    # clipped surrogate objective for the master policy; master_loss returns the
    # flat gradient of that objective w.r.t. the master policy parameters
    total_loss = self.policy_loss(policy, old_policy, ob, ac, atarg, ret, clip_param)
    self.master_policy_var_list = policy.get_trainable_variables()
    self.master_loss = U.function([ob, ac, atarg, ret],
                                  U.flatgrad(total_loss, self.master_policy_var_list))
    self.master_adam = MpiAdam(self.master_policy_var_list, comm=comm)
    # copy current master parameters into old_policy (the fixed policy in the PPO ratio)
    self.assign_old_eq_new = U.function([], [], updates=[tf.assign(oldv, newv)
        for (oldv, newv) in zipsame(old_policy.get_variables(), policy.get_variables())])

    # per-subpolicy optimizers, gradient functions, and old <- new parameter copies
    self.assign_subs = []
    self.change_subs = []
    self.adams = []
    self.losses = []
    self.sp_ac = sub_policies[0].pdtype.sample_placeholder([None])
    for i in range(self.num_subpolicies):
        varlist = sub_policies[i].get_trainable_variables()
        self.adams.append(MpiAdam(varlist))
        # surrogate loss for subpolicy i (same clipped objective as the master policy)
        loss = self.policy_loss(sub_policies[i], old_sub_policies[i], ob, self.sp_ac, atarg, ret, clip_param)
        self.losses.append(U.function([ob, self.sp_ac, atarg, ret], U.flatgrad(loss, varlist)))
        self.assign_subs.append(U.function([], [], updates=[tf.assign(oldv, newv)
            for (oldv, newv) in zipsame(old_sub_policies[i].get_variables(), sub_policies[i].get_variables())]))
    # flat zero gradient (via self.nograd), same shape as a subpolicy gradient
    self.zerograd = U.function([], self.nograd(varlist))

    U.initialize()
    # synchronize Adam state across MPI workers
    self.master_adam.sync()
    for i in range(self.num_subpolicies):
        self.adams[i].sync()
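
A minimal sketch of how the objects built above are typically driven in a baselines-style PPO update. The minibatch arrays (ob_batch, ac_batch, sp_ac_batch, adv_batch, ret_batch) and the `learner` instance are placeholders for data and objects produced elsewhere in the repo, not names defined in this file:

# Hypothetical driver loop for the master policy.
learner.assign_old_eq_new()  # snapshot current params into old_policy before optimizing
for _ in range(learner.optim_epochs):
    g = learner.master_loss(ob_batch, ac_batch, adv_batch, ret_batch)  # flat gradient of the clipped surrogate
    learner.master_adam.update(g, learner.optim_stepsize)              # MPI-averaged Adam step

# Subpolicy i is updated the same way with its own gradient function and optimizer:
gi = learner.losses[i](ob_batch, sp_ac_batch, adv_batch, ret_batch)
learner.adams[i].update(gi, learner.optim_stepsize)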