in ma_policy/ma_policy.py [0:0]
def act(self, observation, extra_feed_dict={}):
    # Tensors to fetch from the TF graph for this step.
    outputs = {
        'ac': self.sampled_action,
        'ac_logp': self.sampled_action_logp,
        'vpred': self.scaled_value_tensor,
        'state': self.state_out}

    # Work on a copy so the caller's observation dict is not modified.
    obs = deepcopy(observation)
    n_agents = observation['observation_self'].shape[0]

    # Make sure that there are as many states as there are agents.
    # This should only happen with the zero state.
    for k, v in self.state.items():
        assert v.shape[0] == 1 or v.shape[0] == n_agents
        if v.shape[0] == 1 and v.shape[0] != n_agents:
            self.state[k] = np.repeat(v, n_agents, 0)

    # Add a timestep dimension to each observation.
    for k, v in obs.items():
        obs[k] = np.expand_dims(v, 1)

    # Build the feed dict from the time-expanded observations and recurrent state.
    inputs = self.prepare_input(observation=obs, state_in=self.state)
    feed_dict = {self.phs[k]: v for k, v in inputs.items()}
    feed_dict.update(extra_feed_dict)

    # Run one forward pass and carry the recurrent state over to the next call.
    outputs = tf.get_default_session().run(outputs, feed_dict)
    self.state = outputs['state']

    # Remove the timestep dimension from outputs.
    def preprocess_act_output(act_output):
        if isinstance(act_output, dict):
            return {k: np.squeeze(v, 1) for k, v in act_output.items()}
        else:
            return np.squeeze(act_output, 1)

    info = {'vpred': preprocess_act_output(outputs['vpred']),
            'ac_logp': preprocess_act_output(outputs['ac_logp']),
            'state': outputs['state']}

    return preprocess_act_output(outputs['ac']), info
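
A minimal sketch of how act() might be driven from a rollout loop. The policy and env objects, the horizon constant, and the gym-style reset()/step() interface are assumptions for illustration and are not defined in this file; act() itself only requires that observation['observation_self'] carry a leading per-agent dimension.

# Hypothetical rollout loop; `policy` is an instance of the class that owns
# act() above, `env` is a multi-agent environment returning dict observations.
obs = env.reset()
for t in range(horizon):
    # act() adds/removes the timestep dimension and carries the recurrent
    # state internally, so the caller just passes raw per-agent observations.
    action, info = policy.act(obs)
    obs, rew, done, _ = env.step(action)
    # info['vpred'] and info['ac_logp'] can be stored in a rollout buffer for
    # a policy-gradient update; info['state'] mirrors the recurrent state that
    # act() already keeps on the policy object.
    if done:
        obs = env.reset()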