def step()

in randomized_uncertain_social_preferences/rusp/env_indirect_reciprocity.py [0:0]


    def step(self, action):
        obs, _, done, info = self.env.step(action)

        # Update statistics for agents that are playing (p1 and p2)
        p1, p2 = np.where(self._youre_playing)[0]
        self.num_defects[p1, p2] += action['action_defect'][p1]
        self.num_defects[p2, p1] += action['action_defect'][p2]
        self.num_coops[p1, p2] += 1 - action['action_defect'][p1]
        self.num_coops[p2, p1] += 1 - action['action_defect'][p2]
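        # The '_sls' counters only start accumulating once the last agent (index n_agents - 1) has begun playing.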
        if p1 == self.n_agents - 1 or p2 == self.n_agents - 1 or self.last_started:
            self.last_started = True
            self.num_defects_sls[p1, p2] += action['action_defect'][p1]
            self.num_defects_sls[p2, p1] += action['action_defect'][p2]
            self.num_coops_sls[p1, p2] += 1 - action['action_defect'][p1]
            self.num_coops_sls[p2, p1] += 1 - action['action_defect'][p2]

        self.previous_action = action['action_defect'].copy()
        self.previous_action[~self._youre_playing] = -1  # agents that weren't playing this step don't reveal what they chose
        self.previous_action_while_playing[self._youre_playing] = action['action_defect'][self._youre_playing].copy()

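        # payoff_matrix[a1, a2] gives the (p1, p2) reward pair for the chosen actions; non-playing agents receive 0.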
        rew = np.zeros(self.n_agents)
        rew[[p1, p2]] = self.payoff_matrix[action['action_defect'][p1], action['action_defect'][p2]]
        assert np.all(rew[~self._youre_playing] == 0)

        # Calling step will update the next players, so update this after computing reward.
        self._youre_playing = np.squeeze(obs['youre_playing'].copy(), -1)

        if done:
            info.update({f'actor{i}_n_coops': n_coops for i, n_coops in enumerate(np.sum(self.num_coops, 1))})
            info.update({f'actor{i}_n_defects': n_defects for i, n_defects in enumerate(np.sum(self.num_defects, 1))})

            # Compute the fraction of actions that were defects
            #    (a) among all agents excluding the last one, i.e. defects they commit against each other
            #    (b) by those agents against the last agent
            # These statistics are useful because for evaluation the last agent may be scripted, e.g. when
            #   comparing the fraction of defects against an all-defect versus an all-cooperate scripted policy.
            num_actions = self.num_coops + self.num_defects
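            # num_coops/num_defects[i, j] count agent i's actions when paired with agent j,
            #   so dropping the last row/column excludes the last agent.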
            frac_defects_against_each_other = np.sum(self.num_defects[:-1, :-1]) / np.sum(num_actions[:-1, :-1])
            frac_defects_against_last = np.sum(self.num_defects[:-1, -1]) / np.sum(num_actions[:-1, -1])
            info.update({'frac_defects_all_minus_last': frac_defects_against_each_other})
            info.update({'frac_defects_against_last': frac_defects_against_last})

            # In the Prior Rapport setting (see paper), we want to measure the fraction of defects against the last agent
            #   AFTER the last agent has started acting, which is after the period in which the other agents have been able to
            #   gain rapport.
            num_actions_sls = self.num_coops_sls + self.num_defects_sls
            frac_defects_against_each_other_sls = np.sum(self.num_defects_sls[:-1, :-1]) / np.sum(num_actions_sls[:-1, :-1])
            frac_defects_against_last_sls = np.sum(self.num_defects_sls[:-1, -1]) / np.sum(num_actions_sls[:-1, -1])
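            # These fractions are NaN if no '_sls' actions were recorded (e.g. the last agent never started playing).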
            if not (np.isnan(frac_defects_against_each_other_sls) or np.isnan(frac_defects_against_last_sls)):
                info.update({'frac_defects_all_minus_last_sls': frac_defects_against_each_other_sls})
                info.update({'frac_defects_against_last_sls': frac_defects_against_last_sls})
        return self.observation(obs), rew, done, info
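
For reference, the reward lookup above relies on self.payoff_matrix being indexed by the two players' defect actions and returning the (p1, p2) reward pair. Below is a minimal sketch of that convention, using a hypothetical prisoner's-dilemma payoff table (the actual values are configured by the environment and are not shown here):

import numpy as np

# Hypothetical payoff matrix: payoff_matrix[a1, a2] -> (reward for p1, reward for p2),
# where an action of 1 means "defect" and 0 means "cooperate".
payoff_matrix = np.array([
    [[2, 2],    # both cooperate
     [-1, 3]],  # p1 cooperates, p2 defects
    [[3, -1],   # p1 defects, p2 cooperates
     [0, 0]],   # both defect
])

n_agents = 4
action_defect = np.array([1, 0, 0, 0])  # only agents 0 and 1 play this step
p1, p2 = 0, 1

rew = np.zeros(n_agents)
rew[[p1, p2]] = payoff_matrix[action_defect[p1], action_defect[p2]]
print(rew)  # [ 3. -1.  0.  0.] -- the defector exploits the cooperator; non-players get 0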