in ma_policy/ma_policy.py [0:0]
def _init_policy_out(self, pi, taken_actions):
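    """Build the policy's output heads: per-action-key distribution parameters,
    the corresponding probability distributions, sampled actions with their
    log-probability, the summed entropy, and the log-probability of `taken_actions`."""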
    with tf.variable_scope('policy_out'):
        self.pdparams = {}
        for k in self.pdtypes.keys():
            with tf.variable_scope(k):
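                # Box (continuous) actions with gaussian_fixed_var: a dense layer on the
                # shared 'main' embedding predicts the mean, while logstd is a single learned
                # variable broadcast to the mean's shape via `mean * 0.0 + logstd`.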
                if self.gaussian_fixed_var and isinstance(self.ac_space.spaces[k], gym.spaces.Box):
                    mean = tf.layers.dense(pi["main"],
                                           self.pdtypes[k].param_shape()[0] // 2,
                                           kernel_initializer=normc_initializer(0.01),
                                           activation=None)
                    logstd = tf.get_variable(name="logstd",
                                             shape=[1, self.pdtypes[k].param_shape()[0] // 2],
                                             initializer=tf.zeros_initializer())
                    self.pdparams[k] = tf.concat([mean, mean * 0.0 + logstd], axis=2)
                elif k in pi:
                    # This is just for the case of entity specific actions
                    if isinstance(self.ac_space.spaces[k], (gym.spaces.Discrete)):
                        assert pi[k].get_shape()[-1] == 1
                        self.pdparams[k] = pi[k][..., 0]
                    elif isinstance(self.ac_space.spaces[k], (gym.spaces.MultiDiscrete)):
                        assert np.prod(pi[k].get_shape()[-2:]) == self.pdtypes[k].param_shape()[0],\
                            f"policy had shape {pi[k].get_shape()} for action {k}, but required {self.pdtypes[k].param_shape()}"
                        new_shape = shape_list(pi[k])[:-2] + [np.prod(pi[k].get_shape()[-2:]).value]
                        self.pdparams[k] = tf.reshape(pi[k], shape=new_shape)
                    else:
                        assert False
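                # Fallback: the network provided no per-key head, so predict the flat
                # distribution parameters for this action directly from the 'main' embedding.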
                else:
                    self.pdparams[k] = tf.layers.dense(pi["main"],
                                                       self.pdtypes[k].param_shape()[0],
                                                       kernel_initializer=normc_initializer(0.01),
                                                       activation=None)
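    # Wrap each flat parameter tensor in the matching probability distribution object.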
    with tf.variable_scope('pds'):
        self.pds = {k: pdtype.pdfromflat(self.pdparams[k])
                    for k, pdtype in self.pdtypes.items()}
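    # Sample an action for each key, or take the distribution mode when acting deterministically.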
    with tf.variable_scope('sampled_action'):
        self.sampled_action = {k: pd.sample() if self.stochastic else pd.mode()
                               for k, pd in self.pds.items()}
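    # Joint log-probability of the sampled action: per-key log-probs are summed, i.e. the
    # action keys are treated as independent given the observation.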
    with tf.variable_scope('sampled_action_logp'):
        self.sampled_action_logp = sum([self.pds[k].logp(self.sampled_action[k])
                                        for k in self.pdtypes.keys()])
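    # Total entropy of the factored policy: sum of the per-key entropies.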
    with tf.variable_scope('entropy'):
        self.entropy = sum([pd.entropy() for pd in self.pds.values()])
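    # Log-probability of externally supplied actions (e.g. actions replayed from a rollout
    # buffer), typically used when computing a surrogate objective during optimization.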
    with tf.variable_scope('taken_action_logp'):
        self.taken_action_logp = sum([self.pds[k].logp(taken_actions[k])
                                      for k in self.pdtypes.keys()])
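

# The block below is not part of ma_policy.py. It is a minimal, hypothetical sketch of how the
# tensors built above (taken_action_logp, entropy) are typically consumed by a PPO-style
# surrogate loss, assuming TensorFlow 1.x as in the code above. `policy` stands for any object
# exposing those attributes; `old_logp` and `adv` would be tensors fed from a rollout buffer.
import tensorflow as tf


def ppo_policy_loss(policy, old_logp, adv, clip_range=0.2, ent_coef=0.01):
    # Probability ratio between the current policy and the policy that collected the data.
    ratio = tf.exp(policy.taken_action_logp - old_logp)
    # Clipped surrogate objective: take the pessimistic (minimum) of the two terms.
    unclipped = ratio * adv
    clipped = tf.clip_by_value(ratio, 1.0 - clip_range, 1.0 + clip_range) * adv
    surrogate = tf.reduce_mean(tf.minimum(unclipped, clipped))
    # Entropy bonus encourages exploration; `policy.entropy` already sums over action keys.
    entropy_bonus = ent_coef * tf.reduce_mean(policy.entropy)
    # Return a quantity suitable for minimization.
    return -(surrogate + entropy_bonus)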