# in leaderboard/power.py
def simulate(p_correct: float, n_trials: int, n_points: int, delta: float, alpha: float = 0.05):
    """Monte-Carlo estimate of the power and anti-power of McNemar's test.

    Runs `n_trials` paired evaluations of a baseline model (per-point
    accuracy `p_correct`) against an alternative model (accuracy
    `p_correct + delta`) on `n_points` independent points each, and tallies
    how often the test detects a significant effect in the right direction.

    Returns a dict with the averaged power, anti-power, and agreement rate,
    plus the `delta` and `p_correct` inputs for bookkeeping.
    """
    agreement_rates = []
    power_hits = []
    antipower_hits = []
    other_p = p_correct + delta
    for _ in range(n_trials):
        # 2x2 paired-outcome table: row = baseline, col = other model;
        # index 0 means "correct", index 1 means "incorrect".
        table = np.zeros((2, 2))
        baseline_correct = 0
        other_correct = 0
        for _ in range(n_points):
            # Draw the baseline outcome first, then the other model's,
            # mirroring a per-point paired evaluation of both models.
            baseline_hit = np.random.random() < p_correct
            if baseline_hit:
                baseline_correct += 1
            other_hit = np.random.random() < other_p
            if other_hit:
                other_correct += 1
            row = 0 if baseline_hit else 1
            col = 0 if other_hit else 1
            table[row][col] += 1
        # Agreement = fraction of points where both models got the same result.
        agreement_rates.append((table[0][0] + table[1][1]) / n_points)
        _, p_value = evaluate.mcnemar(ary=table)
        effect = other_correct / n_points - baseline_correct / n_points
        # Power: the test is significant AND the effect points the right way.
        power_hits.append(1 if effect > 0 and p_value <= alpha else 0)
        # Anti-power: the effect points the wrong way AND the test is not significant.
        antipower_hits.append(1 if effect <= 0 and p_value >= alpha else 0)
    return {
        "delta": delta,
        "power": np.mean(power_hits),
        "antipower": np.mean(antipower_hits),
        "agreement": np.mean(agreement_rates),
        "p_correct": p_correct,
    }