def rank_compute_bootstrap_ci()

in leaderboard/plots.py


def rank_compute_bootstrap_ci(data_path: str, n_trials: int = 1000, fold: str = "dev"):
    """Given stability experiment, compute bootstrapped
    confidence intervals, and check if correlations are above 95%
    interval.

    Args:
        data_path (str): Path to dataframe stored in feather format with experiment
    """
    df = pd.read_feather(data_path)
    size = df["size"].iloc[0]
    trial_id = df["trial_id"].iloc[0]
    if fold == "test":
        get_test_irt, get_test_classical = load_test_irt()
        df["b_irt"] = df["subject_id"].map(get_test_irt)
        df["b_classical"] = df["subject_id"].map(get_test_classical)
        df = df.dropna(0)

    real_corr = df.corr(method="kendall")

    # Due to not implementing identifiability, IRT scores may be flipped
    # Detect that and adjust as necessary
    if real_corr["a_irt"].a_classical < 0:
        df["a_irt"] = -df["a_irt"]

    if real_corr["b_irt"].b_classical < 0:
        df["b_irt"] = -df["b_irt"]

    real_corr = df.corr(method="kendall")

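    # Observed statistic: how much higher the IRT a/b rank correlation is
    # than the classical a/b rank correlation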
    corr_diff = real_corr["a_irt"].b_irt - real_corr["a_classical"].b_classical
    a_classical_scores = df.a_classical.to_numpy()
    a_irt_scores = df.a_irt.to_numpy()
    indices = np.arange(0, len(a_classical_scores))
    # Build up a distribution of score differences
    diff_dist = []
    # Simulate a bunch of times
    n_subjects = len(a_classical_scores)

    for _ in range(n_trials):
        # Create a new similar DF, except sample with replacement one set of rankings
        # Be sure to keep pairs of irt/classical scores together
        sample_indices = np.random.choice(indices, n_subjects, replace=True)
        sample_classical = a_classical_scores[sample_indices]
        sample_irt = a_irt_scores[sample_indices]
        sample_df = pd.DataFrame(
            {
                "subject_id": df["subject_id"],
                # I'm not sure doing replacement is correct
                # Also not sure if n=161 is correct, seems odd,
                # but I'd be worried if I did only 20 that
                # the distribution of differences might be different
                "a_classical": sample_classical,
                "a_irt": sample_irt,
                # Keep one ranking the same
                "b_classical": df["b_classical"],
                "b_irt": df["b_irt"],
            }
        )
        sample_corr = sample_df.corr(method="kendall")

        # Grab correlations
        irt_corr = sample_corr.loc["a_irt"].b_irt
        classical_corr = sample_corr.loc["a_classical"].b_classical

        # Record the difference
        diff_dist.append(irt_corr - classical_corr)
    diff_df = pd.DataFrame({"diff": diff_dist})
    # One-sided test at the 95% level: the observed difference is significant
    # if it exceeds the upper quantile of the bootstrap distribution
    alpha = 1 - 0.95

    lower, upper = diff_df["diff"].quantile([alpha, 1 - alpha])
    # Two-sided alternative:
    # significant = bool(corr_diff < lower or upper < corr_diff)
    significant = bool(upper < corr_diff)
    # One-sided p-value: share of bootstrap differences at least as large as the observed one
    p_value = 1 - ((diff_df["diff"] < corr_diff).sum() / n_trials)
    return {
        "significant": significant,
        "p_value": float(p_value),
        "diff": float(corr_diff),
        "irt_corr": float(real_corr["a_irt"].b_irt),
        "classical_corr": float(real_corr["a_classical"].b_classical),
        "trial_size": int(size),
        "trial_id": int(trial_id),
        "lower": float(lower),
        "upper": float(upper),
        "alpha": alpha,
        "diff_dist": diff_dist,
    }
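
A minimal usage sketch, assuming the module-level imports in leaderboard/plots.py (pandas as pd, numpy as np) and the load_test_irt helper are available; the feather path below is hypothetical:

import json

result = rank_compute_bootstrap_ci(
    "stability_experiment.feather",  # hypothetical path to one experiment dataframe
    n_trials=1000,
    fold="dev",
)
# Print the summary, leaving out the raw bootstrap distribution
print(json.dumps({k: v for k, v in result.items() if k != "diff_dist"}, indent=2))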