leaderboard/stats.py
import numpy as np
from mlxtend import evaluate


def mcnemar_test(model_a_array: np.ndarray, model_b_array: np.ndarray):
    """
    McNemar's test operates on a 2x2 contingency table, which we need to build first.

    Both arrays hold per-example correctness scores: 1.0 for a correct
    prediction, 0.0 for an incorrect one.
    """
    both_correct = 0
    both_wrong = 0
    a_correct_b_wrong = 0
    a_wrong_b_correct = 0
    for a, b in zip(model_a_array, model_b_array):
        if a == 1.0 and b == 1.0:
            both_correct += 1
        elif a == 0.0 and b == 0.0:
            both_wrong += 1
        elif a == 1.0 and b == 0.0:
            a_correct_b_wrong += 1
        elif a == 0.0 and b == 1.0:
            a_wrong_b_correct += 1
        else:
            raise ValueError(f"Invalid predictions: {a}, {b}")
    # Table layout: rows index model A (correct/wrong), columns index model B.
    # Only the off-diagonal (discordant) cells drive the test statistic.
    contingency_table = np.array(
        [[both_correct, a_correct_b_wrong], [a_wrong_b_correct, both_wrong]]
    )
    # corrected=True applies the continuity correction to the chi-squared statistic.
    return evaluate.mcnemar(ary=contingency_table, corrected=True)
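

# A minimal usage sketch; the score arrays below are made up for illustration,
# not taken from any leaderboard run. mlxtend's mcnemar returns a
# (chi2, p_value) pair; a small p-value suggests the two models' error
# patterns differ beyond what chance would explain.
if __name__ == "__main__":
    model_a_scores = np.array([1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0])
    model_b_scores = np.array([1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0])
    chi2, p_value = mcnemar_test(model_a_scores, model_b_scores)
    print(f"chi2={chi2:.3f}, p-value={p_value:.3f}")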