in nevergrad/functions/rl/envs.py [0:0]
def step(self, action_dict: tp.Dict[str, int]) -> base.StepReturn:
    """Advance the environment by one step using both players' actions.

    Parameters
    ----------
    action_dict: dict
        Mapping from agent id ("player_0" / "player_1") to the index of the
        chosen action in JamesBond.actions.

    Returns
    -------
    obs (dict): New observations for each ready agent.
    rewards (dict): Reward values for each ready agent. If the
        episode is just started, the value will be None.
    dones (dict): Done values for each ready agent. The special key
        "__all__" (required) is used to indicate env termination.
    infos (dict): Optional info values for each agent id.
    """
    # Decode the requested actions once; the verbose trace below reuses them
    # (it intentionally shows the *requested* action, before any correction).
    actions = [JamesBond.actions[action_dict[f"player_{k}"]] for k in range(2)]
    if self.verbose:
        strings = [
            f"Player {k} {self.players[k].get_state()}: {actions[k]}" for k in range(2)
        ]
        print(" - ".join(strings))
    self._step += 1
    info: tp.Dict[tp.Any, tp.Any] = {}
    rew = {"player_0": 0, "player_1": 0}
    # change impossible actions: firing without ammunition becomes a reload
    actions = [
        "reload" if a == "fire" and not p.ammunitions else a for p, a in zip(self.players, actions)
    ]
    # update players' internal state with the (corrected) actions
    for player, action in zip(self.players, actions):
        player.update_with_action(action)
    # main way to win: fire while the opponent is reloading
    if actions[0] == "fire" and actions[1] == "reload":
        rew = {"player_0": 1, "player_1": 0}
    elif actions[0] == "reload" and actions[1] == "fire":
        rew = {"player_0": 0, "player_1": 1}
    # lose if you keep protecting: whoever has the longer protect streak loses
    if any(p.consecutive_protect > JamesBond.max_consecutive_protect for p in self.players):
        if self.players[0].consecutive_protect > self.players[1].consecutive_protect:
            rew = {"player_0": 0, "player_1": 1}
        elif self.players[1].consecutive_protect > self.players[0].consecutive_protect:
            rew = {"player_0": 1, "player_1": 0}
        # if both keep protecting... well, it goes on...
    obs = self._make_observations()
    # episode ends at the 100-step cap or as soon as either player scores
    # (rew values are only ever 0 or 1 here, so any() suffices)
    done = {"__all__": self._step == 100 or any(rew.values())}
    return obs, rew, done, info