in leaderboard/plots.py [0:0]
import numpy as np
import pandas as pd


def rank_compute_bootstrap_ci(data_path: str, n_trials: int = 1000, fold: str = "dev"):
"""Given stability experiment, compute bootstrapped
confidence intervals, and check if correlations are above 95%
interval.
Args:
data_path (str): Path to dataframe stored in feather format with experiment
"""
df = pd.read_feather(data_path)
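    # "size" and "trial_id" are constant within an experiment file, so the
    # first row is representative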
size = df["size"].iloc[0]
trial_id = df["trial_id"].iloc[0]
if fold == "test":
get_test_irt, get_test_classical = load_test_irt()
df["b_irt"] = df["subject_id"].map(get_test_irt)
df["b_classical"] = df["subject_id"].map(get_test_classical)
        # Subjects without test-fold scores map to NaN above; drop those rows
        df = df.dropna(axis=0)
    # numeric_only excludes non-numeric columns such as subject_id
    # (the keyword is required on pandas >= 2.0)
    real_corr = df.corr(method="kendall", numeric_only=True)
# Due to not implementing identifiability, IRT scores may be flipped
# Detect that and adjust as necessary
if real_corr["a_irt"].a_classical < 0:
df["a_irt"] = -df["a_irt"]
if real_corr["b_irt"].b_classical < 0:
df["b_irt"] = -df["b_irt"]
    real_corr = df.corr(method="kendall", numeric_only=True)
corr_diff = real_corr["a_irt"].b_irt - real_corr["a_classical"].b_classical
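    # Extract the paired scores as arrays so rows can be resampled by index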
a_classical_scores = df.a_classical.to_numpy()
a_irt_scores = df.a_irt.to_numpy()
indices = np.arange(0, len(a_classical_scores))
# Build up a distribution of score differences
diff_dist = []
# Simulate a bunch of times
n_subjects = len(a_classical_scores)
for _ in range(n_trials):
# Create a new similar DF, except sample with replacement one set of rankings
# Be sure to keep pairs of irt/classical scores together
sample_indices = np.random.choice(indices, n_subjects, replace=True)
sample_classical = a_classical_scores[sample_indices]
sample_irt = a_irt_scores[sample_indices]
sample_df = pd.DataFrame(
{
"subject_id": df["subject_id"],
                # Sampling with replacement, with the resample size equal
                # to the number of subjects (n=161 here), is the standard
                # nonparametric bootstrap; a much smaller resample (e.g., 20)
                # could change the spread of the difference distribution
"a_classical": sample_classical,
"a_irt": sample_irt,
                # Keep the b ranking in its original row order; only the
                # a ranking is resampled
"b_classical": df["b_classical"],
"b_irt": df["b_irt"],
}
)
        sample_corr = sample_df.corr(method="kendall", numeric_only=True)
# Grab correlations
irt_corr = sample_corr.loc["a_irt"].b_irt
classical_corr = sample_corr.loc["a_classical"].b_classical
# Record the difference
diff_dist.append(irt_corr - classical_corr)
diff_df = pd.DataFrame({"diff": diff_dist})
    # One-sided test at level alpha: the observed difference is significant
    # if it exceeds the (1 - alpha) quantile of the bootstrap distribution.
    # A two-tailed variant would split alpha between the tails and also
    # check corr_diff < lower.
    alpha = 1 - 0.95
    lower, upper = diff_df["diff"].quantile([alpha, 1 - alpha])
    significant = bool(upper < corr_diff)
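    # One-sided empirical p-value: the fraction of bootstrap differences
    # at least as large as the observed difference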
p_value = 1 - ((diff_df["diff"] < corr_diff).sum() / n_trials)
return {
"significant": significant,
"p_value": float(p_value),
"diff": float(corr_diff),
"irt_corr": float(real_corr["a_irt"].b_irt),
"classical_corr": float(real_corr["a_classical"].b_classical),
"trial_size": int(size),
"trial_id": int(trial_id),
"lower": float(lower),
"upper": float(upper),
"alpha": alpha,
"diff_dist": diff_dist,
}
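

# Minimal usage sketch, assuming an experiment file with the columns read
# above (size, trial_id, subject_id, a_classical, a_irt, b_classical, b_irt);
# the feather path here is hypothetical.
if __name__ == "__main__":
    ci = rank_compute_bootstrap_ci("data/stability/trial_0.feather", n_trials=1000)
    print(ci["significant"], round(ci["p_value"], 3), round(ci["diff"], 3))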