in src_code/controllers/basic_controller_interactive.py [0:0]
def forward(self, ep_batch, t, test_mode=False):
agent_inputs = self._build_inputs(ep_batch, t)
agent_alone_inputs = self._build_alone_inputs(ep_batch, t)
avail_actions = ep_batch["avail_actions"][:, t]
agent_outs, self.hidden_states, self.hidden_states_alone = self.agent(agent_inputs, agent_alone_inputs, self.hidden_states, self.hidden_states_alone)
# Softmax the agent outputs if they're policy logits
if self.agent_output_type == "pi_logits":
if getattr(self.args, "mask_before_softmax", True):
# Make the logits for unavailable actions very negative to minimise their affect on the softmax
reshaped_avail_actions = avail_actions.reshape(ep_batch.batch_size * self.n_agents, -1)
agent_outs[reshaped_avail_actions == 0] = -1e10
agent_outs = th.nn.functional.softmax(agent_outs, dim=-1)
if not test_mode:
# Epsilon floor
epsilon_action_num = agent_outs.size(-1)
if getattr(self.args, "mask_before_softmax", True):
# With probability epsilon, we will pick an available action uniformly
epsilon_action_num = reshaped_avail_actions.sum(dim=1, keepdim=True).float()
agent_outs = ((1 - self.action_selector.epsilon) * agent_outs
+ th.ones_like(agent_outs) * self.action_selector.epsilon/epsilon_action_num)
if getattr(self.args, "mask_before_softmax", True):
# Zero out the unavailable actions
agent_outs[reshaped_avail_actions == 0] = 0.0
return agent_outs.view(ep_batch.batch_size, self.n_agents, -1)