in randomized_uncertain_social_preferences/rusp/env_prisoners_buddy.py [0:0]
def step(self, action):
    """Step the wrapped env and record each agent's buddy choice; on choosing
    rounds, resolve mutual choices into teams, update rewards, and accumulate
    team statistics (reported in `info` when the episode ends)."""
    obs, rew, done, info = self.env.step(action)
    self._chose_me = np.zeros((self.n_agents, self.n_agents), dtype=bool)
    targets = np.ones(self.n_agents, dtype=int) * -1
    for i in range(self.n_agents):
        target = self._get_target_actor(i, action)
        if len(target):
            targets[i] = target[0]
            self._chose_me[target[0], i] = 1
    self._previous_choice_identity = obs['agent_identity'][targets]
    self._previous_choice_identity[targets == -1] = 0

    # Reward rounds
    if self._t % self.choosing_period == 0:
        self._both_chose = self._chose_me * self._chose_me.T
        self._chose_me_rew = self._chose_me.copy()
        self._teams = np.argmax(self._both_chose, axis=1)  # Index of each agent's teammate
        self._teams[np.all(self._both_chose == 0, axis=1)] = -1  # Agents without a team get -1 instead of 0
        rew = self._prisoners_buddy_reward_update(rew)

        # Track stats
        self._n_times_not_chosen[np.sum(self._chose_me, 1) == 0] += 1
        # Since both_chose is symmetric, just take the indices of nonzero entries in the upper triangle
        current_team_indices = np.c_[np.nonzero(np.triu(self._both_chose))]
        current_team_tuples = list(map(tuple, current_team_indices))
        teams_done = [k for k in self._current_team_lengths.keys() if k not in current_team_tuples]
        for team_done in teams_done:
            self._team_lengths.append(self._current_team_lengths[team_done])
            del self._current_team_lengths[team_done]
        for current_team_tuple in current_team_tuples:
            self._current_team_lengths[current_team_tuple] += 1
        self._i_chose_any_rew_obs = np.any(self._chose_me_rew, 0)[:, None]

        if self._first_choice:
            self._first_choice = False
        else:
            all_teams_didnt_change = np.all(self._previous_teams == self._teams)
            max_number_of_teams_filled = np.sum(self._teams != -1) == ((self.n_agents // 2) * 2)
            self._perfect_game = self._perfect_game and all_teams_didnt_change and max_number_of_teams_filled
        self._previous_teams = self._teams

    self._t += 1

    if done:
        self._team_lengths += list(self._current_team_lengths.values())
        info['average_team_length'] = np.mean(self._team_lengths) if len(self._team_lengths) else 0
        info['n_times_team_changed'] = np.sum(self._n_times_team_changed)
        info['n_agents_on_team_per_step'] = np.mean(self._n_agents_on_team)
        info['number_decisions'] = self._t / self.choosing_period
        info['n_unique_not_chosen'] = np.sum(self._n_times_not_chosen > 0)
        info['n_successful_defections'] = self._n_successful_defections
        info['perfect_game'] = self._perfect_game

    return self.observation(obs), rew, done, info
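
# --- Illustration (not part of the environment file) ---
# A minimal standalone sketch of the mutual-choice team resolution used in the
# reward rounds above. The 4-agent `chose_me` matrix below is a made-up example
# for illustration only, not data from the environment.
import numpy as np

# chose_me[j, i] = True means agent i chose agent j this round.
chose_me = np.zeros((4, 4), dtype=bool)
chose_me[1, 0] = True  # agent 0 chose agent 1
chose_me[0, 1] = True  # agent 1 chose agent 0 (reciprocated)
chose_me[3, 2] = True  # agent 2 chose agent 3 (not reciprocated)

# Mutual choices only: both_chose[i, j] is True iff i and j chose each other.
both_chose = chose_me * chose_me.T

# Teammate index per agent; agents without a mutual choice get -1.
teams = np.argmax(both_chose, axis=1)
teams[np.all(both_chose == 0, axis=1)] = -1
print(teams)  # [ 1  0 -1 -1]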