in randomized_uncertain_social_preferences/rusp/env_indirect_reciprocity.py [0:0]
def step(self, action):
    obs, _, done, info = self.env.step(action)
    # Update statistics for agents that are playing (p1 and p2)
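    # self._youre_playing is a boolean mask over all agents; exactly two agents are
    # expected to be flagged each step, so np.where yields the playing pair (p1, p2).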
    p1, p2 = np.where(self._youre_playing)[0]
    self.num_defects[p1, p2] += action['action_defect'][p1]
    self.num_defects[p2, p1] += action['action_defect'][p2]
    self.num_coops[p1, p2] += 1 - action['action_defect'][p1]
    self.num_coops[p2, p1] += 1 - action['action_defect'][p2]
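    # The "_sls" counters track the same statistics but only from the point at which the
    # last agent (index n_agents - 1) first plays (presumably "since last started"); they
    # feed the Prior Rapport metrics reported when the episode ends.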
    if p1 == self.n_agents - 1 or p2 == self.n_agents - 1 or self.last_started:
        self.last_started = True
        self.num_defects_sls[p1, p2] += action['action_defect'][p1]
        self.num_defects_sls[p2, p1] += action['action_defect'][p2]
        self.num_coops_sls[p1, p2] += 1 - action['action_defect'][p1]
        self.num_coops_sls[p2, p1] += 1 - action['action_defect'][p2]
    self.previous_action = action['action_defect'].copy()
    self.previous_action[~self._youre_playing] = -1  # agents that weren't playing don't reveal what they chose
    self.previous_action_while_playing[self._youre_playing] = action['action_defect'][self._youre_playing].copy()
    rew = np.zeros(self.n_agents)
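    # payoff_matrix is indexed by (p1's action, p2's action); given the assignment below it
    # presumably has shape (2, 2, 2), with the last axis holding the rewards for [p1, p2].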
    rew[[p1, p2]] = self.payoff_matrix[action['action_defect'][p1], action['action_defect'][p2]]
    assert np.all(rew[~self._youre_playing] == 0)
    # obs['youre_playing'] from the wrapped step already reflects the next round's players,
    # so only overwrite self._youre_playing after this round's reward has been computed.
    self._youre_playing = np.squeeze(obs['youre_playing'].copy(), -1)
    if done:
        info.update({f'actor{i}_n_coops': n_coops for i, n_coops in enumerate(np.sum(self.num_coops, 1))})
        info.update({f'actor{i}_n_defects': n_defects for i, n_defects in enumerate(np.sum(self.num_defects, 1))})
        # Compute the fraction of actions that were defects
        #   (a) among all agents except the last ('frac_defects_all_minus_last')
        #   (b) by the other agents against the last agent ('frac_defects_against_last')
        # These statistics are useful during evaluation, where the last agent may be scripted, e.g. to
        # compare the fraction of defects against an all-defect versus an all-cooperate policy.
        num_actions = self.num_coops + self.num_defects
        frac_defects_against_each_other = np.sum(self.num_defects[:-1, :-1]) / np.sum(num_actions[:-1, :-1])
        frac_defects_against_last = np.sum(self.num_defects[:-1, -1]) / np.sum(num_actions[:-1, -1])
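        # Note: these fractions are NaN if the corresponding agents never interacted (zero
        # denominators); unlike the "_sls" variants below, they are logged unconditionally.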
        info.update({'frac_defects_all_minus_last': frac_defects_against_each_other})
        info.update({'frac_defects_against_last': frac_defects_against_last})
        # In the Prior Rapport setting (see paper), we want to measure the fraction of defects against
        # the last agent AFTER the last agent has started acting, which is after the period in which
        # the other agents have been able to gain rapport.
        num_actions_sls = self.num_coops_sls + self.num_defects_sls
        frac_defects_against_each_other_sls = np.sum(self.num_defects_sls[:-1, :-1]) / np.sum(num_actions_sls[:-1, :-1])
        frac_defects_against_last_sls = np.sum(self.num_defects_sls[:-1, -1]) / np.sum(num_actions_sls[:-1, -1])
        if not (np.isnan(frac_defects_against_each_other_sls) or np.isnan(frac_defects_against_last_sls)):
            info.update({'frac_defects_all_minus_last_sls': frac_defects_against_each_other_sls})
            info.update({'frac_defects_against_last_sls': frac_defects_against_last_sls})
    return self.observation(obs), rew, done, info