in src_code/controllers/basic_controller_influence.py [0:0]
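# NOTE: `th` below is PyTorch, imported at module level as `import torch as th`.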
def get_alone_q(self, ep_batch, t, test_mode=False, env=None):
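    """Compute per-agent outputs for the full (interactive) agent and for the
    "alone" branch evaluated by the target agent.

    For `pi_logits` outputs, unavailable actions are masked before the softmax
    and an epsilon floor is mixed in at training time. Returns a pair
    (alone_outs, agent_outs), each of shape [batch_size, n_agents, n_actions].
    """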
    agent_inputs = self._build_inputs(ep_batch, t)
    agent_alone_inputs = self._build_alone_inputs(ep_batch, t)
    avail_actions = ep_batch["avail_actions"][:, t]
    agent_outs, self.hidden_states, self.hidden_states_alone = self.agent.forward(
        agent_inputs, agent_alone_inputs, self.hidden_states, self.hidden_states_alone)
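    # The forward pass above also advances both recurrent states (interactive and alone).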
    # TODO: need support from sc2
    # _, self.target_hidden_states = self.target_agent.get_interactive_q(agent_inputs, self.target_hidden_states)
    # agent_inputs_simulated, num_actions = self._build_simulated_states_inputs(ep_batch, t, env)
    # agent_outs_interactive, _ = self.target_agent.get_interactive_q(agent_inputs_simulated, self.target_hidden_states.repeat_interleave((self.n_agents-1)*num_actions, 0))
    # agent_outs_interactive = th.sum(agent_outs_interactive.reshape(self.n_agents, num_actions, -1), dim=-1)
    agent_outs_alone, self.target_hidden_states_alone = self.target_agent.get_alone_q(
        agent_alone_inputs, self.target_hidden_states_alone)
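    # The alone-branch outputs come from the target agent, which carries its own recurrent state.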
    # Softmax the agent outputs if they're policy logits
    if self.agent_output_type == "pi_logits":
        if getattr(self.args, "mask_before_softmax", True):
            # Make the logits for unavailable actions very negative to minimise their effect on the softmax
            reshaped_avail_actions = avail_actions.reshape(ep_batch.batch_size * self.n_agents, -1)
            agent_outs_alone[reshaped_avail_actions == 0] = -1e10
            agent_outs[reshaped_avail_actions == 0] = -1e10
        agent_outs_alone = th.nn.functional.softmax(agent_outs_alone, dim=-1)
        agent_outs = th.nn.functional.softmax(agent_outs, dim=-1)
        if not test_mode:
            # Epsilon floor
            epsilon_action_num = agent_outs.size(-1)
            if getattr(self.args, "mask_before_softmax", True):
                # With probability epsilon, we will pick an available action uniformly
                epsilon_action_num = reshaped_avail_actions.sum(dim=1, keepdim=True).float()

            agent_outs_alone = ((1 - self.action_selector.epsilon) * agent_outs_alone
                                + th.ones_like(agent_outs_alone) * self.action_selector.epsilon / epsilon_action_num)
            agent_outs = ((1 - self.action_selector.epsilon) * agent_outs
                          + th.ones_like(agent_outs) * self.action_selector.epsilon / epsilon_action_num)

            if getattr(self.args, "mask_before_softmax", True):
                # Zero out the unavailable actions
                agent_outs_alone[reshaped_avail_actions == 0] = 0.0
                agent_outs[reshaped_avail_actions == 0] = 0.0
    return (agent_outs_alone.view(ep_batch.batch_size, self.n_agents, -1),
            agent_outs.view(ep_batch.batch_size, self.n_agents, -1))
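
# A minimal usage sketch (illustrative only; `mac`, `batch`, and `t_ep` are
# hypothetical names for the controller instance, an EpisodeBatch, and a timestep):
#
#     mac.init_hidden(batch.batch_size)
#     alone_outs, agent_outs = mac.get_alone_q(batch, t_ep, test_mode=False)
#     # each output has shape [batch_size, n_agents, n_actions]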