def estimate()

in reagent/evaluation/sequential_doubly_robust_estimator.py

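Summary (from the code below): this method computes the sequential doubly robust (DR) off-policy estimate of Jiang & Li (https://arxiv.org/pdf/1511.03722.pdf). For each logged episode it runs a backward recursion that combines the model's state values and Q-values with importance-weighted logged rewards, then returns the mean over episodes (with a bootstrapped standard error) as a CpeEstimate, normalized by the logged policy's average discounted return when that return is positive.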

    def estimate(self, edp: EvaluationDataPage) -> CpeEstimate:
        # For details, visit https://arxiv.org/pdf/1511.03722.pdf
        logged_rewards = edp.logged_rewards.squeeze()
        logged_propensities = edp.logged_propensities.squeeze()

        num_examples = edp.logged_rewards.shape[0]

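        # V_hat(s): expected model value of each state under the target policy,
        # i.e. the sum over actions of model_propensities * model_values.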
        estimated_state_values = torch.sum(
            edp.model_propensities * edp.model_values, dim=1
        )

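        # Q_hat(s, a_logged): model Q-value of the action actually taken in the logs.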
        estimated_q_values_for_logged_action = torch.sum(
            edp.model_values * edp.action_mask, dim=1
        )

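        # pi(a_logged | s): the target policy's propensity for the logged action.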
        target_propensity_for_action = torch.sum(
            edp.model_propensities * edp.action_mask, dim=1
        )

        assert target_propensity_for_action.shape == logged_propensities.shape, (
            "Invalid shape: "
            + str(target_propensity_for_action.shape)
            + " != "
            + str(logged_propensities.shape)
        )
        assert (
            target_propensity_for_action.shape
            == estimated_q_values_for_logged_action.shape
        ), (
            "Invalid shape: "
            + str(target_propensity_for_action.shape)
            + " != "
            + str(estimated_q_values_for_logged_action.shape)
        )
        assert target_propensity_for_action.shape == logged_rewards.shape, (
            "Invalid shape: "
            + str(target_propensity_for_action.shape)
            + " != "
            + str(logged_rewards.shape)
        )
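        # Per-step importance ratio rho = pi(a_logged | s) / mu(a_logged | s).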
        importance_weight = target_propensity_for_action / logged_propensities

        doubly_robusts: List[float] = []
        episode_values: List[float] = []

        assert edp.mdp_id is not None
        i = 0
        last_episode_end = -1
        while i < num_examples:
            # calculate the doubly-robust Q-value for one episode
            # pyre-ignore [16]: Optional type has no attribute `__getitem__`
            if i == num_examples - 1 or edp.mdp_id[i] != edp.mdp_id[i + 1]:
                episode_end = i
                episode_value = 0.0
                doubly_robust = 0.0
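                # Backward recursion over the episode:
                # DR_j = V_hat(s_j) + rho_j * (r_j + gamma * DR_{j+1} - Q_hat(s_j, a_j)).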
                for j in range(episode_end, last_episode_end, -1):
                    doubly_robust = estimated_state_values[j] + importance_weight[j] * (
                        logged_rewards[j]
                        + self.gamma * doubly_robust
                        - estimated_q_values_for_logged_action[j]
                    )
                    episode_value *= self.gamma
                    episode_value += logged_rewards[j]

                doubly_robusts.append(float(doubly_robust))
                episode_values.append(float(episode_value))
                last_episode_end = episode_end
            i += 1

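        # doubly_robusts is empty only if the page contained no samples; dump the
        # inputs to make the failure easier to debug.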
        if len(doubly_robusts) == 0:
            torch.set_printoptions(profile="full")
            zipped_data = list(
                zip(
                    *map(
                        lambda x: x.tolist(),
                        [
                            edp.mdp_id,
                            logged_rewards,
                            estimated_state_values,
                            estimated_q_values_for_logged_action,
                            importance_weight,
                        ],
                    )
                )
            )
            raise RuntimeError(
                f"No valid doubly robusts data is generated.\n"
                f"mdp_ids x logged_rewards x estimated_state_values x "
                f"estimated_q_values_for_logged_action x importance_weight:\n"
                f"{zipped_data};\n"
                f"gamma={self.gamma};\n"
                f"Did you specify wrong metric names?"
            )

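        # Point estimate: mean DR value across episodes, with a bootstrapped standard error.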
        # pyre-fixme[9]: doubly_robusts has type `List[float]`; used as `ndarray`.
        doubly_robusts = np.array(doubly_robusts)
        dr_score = float(np.mean(doubly_robusts))
        dr_score_std_error = bootstrapped_std_error_of_mean(doubly_robusts)

        # pyre-fixme[9]: episode_values has type `List[float]`; used as `ndarray`.
        episode_values = np.array(episode_values)
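        # Normalize by the logged policy's empirical average discounted episode return.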
        logged_policy_score = np.mean(episode_values)
        if logged_policy_score < 1e-6:
            logger.warning(
                "Can't normalize SDR-CPE because of small"
                f" or negative logged_policy_score ({logged_policy_score})."
                f"Episode values: {episode_values}."
            )
            return CpeEstimate(
                raw=dr_score,
                normalized=0.0,
                raw_std_error=dr_score_std_error,
                normalized_std_error=0.0,
            )
        return CpeEstimate(
            raw=dr_score,
            normalized=dr_score / logged_policy_score,
            raw_std_error=dr_score_std_error,
            normalized_std_error=dr_score_std_error / logged_policy_score,
        )
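
For reference, the per-episode backward recursion implemented above (the sequential DR estimator of Jiang & Li) can be written in isolation. The sketch below is illustrative only: sequential_dr_episode and its argument names are made up for this example, not part of the ReAgent API, and the numbers are arbitrary.

def sequential_dr_episode(rewards, importance_weights, state_values, q_values_logged, gamma):
    # DR_j = V_hat(s_j) + rho_j * (r_j + gamma * DR_{j+1} - Q_hat(s_j, a_j)),
    # evaluated backwards through the episode with DR = 0 after the final step.
    dr = 0.0
    for r, rho, v, q in zip(
        reversed(rewards),
        reversed(importance_weights),
        reversed(state_values),
        reversed(q_values_logged),
    ):
        dr = v + rho * (r + gamma * dr - q)
    return dr

# Tiny two-step episode with arbitrary numbers, purely to show the recursion.
print(sequential_dr_episode(
    rewards=[1.0, 0.0],
    importance_weights=[1.2, 0.8],
    state_values=[0.5, 0.3],
    q_values_logged=[0.6, 0.2],
    gamma=0.9,
))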