in src/lighteval/tasks/extended/hle/main.py [0:0]
def calib_err(confidence, correct, p="2", beta=100):
    """Compute a binned calibration error between confidences and correctness.

    Samples are sorted by confidence and split into consecutive bins of
    ``beta`` samples each; the final bin additionally absorbs the
    ``len(confidence) % beta`` leftover samples. For every bin the absolute
    gap between the mean confidence and the mean correctness is computed,
    and the gaps are aggregated according to ``p``.

    NOTE: every bin, including the last one, contributes to the result.
    Earlier versions of this function iterated over ``len(bins) - 1`` bins
    and therefore ignored the highest-confidence samples (and returned 0
    whenever only a single bin existed).

    Args:
        confidence: Array-like of per-sample confidences (treated as a
            1-D sequence).
        correct: Array-like of per-sample correctness indicators, aligned
            with ``confidence``.
        p: Aggregation mode. ``"1"`` gives the weighted mean of the gaps,
            ``"2"`` the square root of the weighted mean of squared gaps,
            and ``"infty"`` / ``"infinity"`` / ``"max"`` the largest gap.
        beta: Target bin size (number of samples per bin).

    Returns:
        The calibration error, or ``-1`` (after logging a warning) when
        there are too few samples to form a single bin.

    Raises:
        AssertionError: If ``p`` is not one of the supported values.
    """
    # Accept plain sequences as well as arrays; fancy indexing below needs arrays.
    confidence = np.asarray(confidence)
    correct = np.asarray(correct)

    order = np.argsort(confidence)
    confidence = confidence[order]
    correct = correct[order]

    total_examples = len(confidence)
    num_bins = total_examples // beta
    if num_bins <= 0:
        logger.warning("Error when computing the bins for calibration error")
        return -1

    max_modes = ("infty", "infinity", "max")
    if p not in ("1", "2") and p not in max_modes:
        # Raised explicitly (rather than via `assert`) so the check survives
        # `python -O`; the exception type is kept for backward compatibility.
        raise AssertionError("p must be '1', '2', or 'infty'")
    use_max = p in max_modes

    # Bin boundaries: `num_bins` bins of `beta` samples, with the last bin
    # stretched to the end so the remainder samples are not lost.
    edges = [i * beta for i in range(num_bins)] + [total_examples]

    cerr = 0
    # Each bin holds at least `beta` >= 1 samples here, so none is empty.
    for start, stop in zip(edges, edges[1:]):
        bin_confidence = confidence[start:stop]
        bin_correct = correct[start:stop]
        difference = np.abs(np.nanmean(bin_confidence) - np.nanmean(bin_correct))

        if use_max:
            cerr = np.maximum(cerr, difference)
        else:
            weight = len(bin_confidence) / total_examples
            cerr += weight * (np.square(difference) if p == "2" else difference)

    if p == "2":
        cerr = np.sqrt(cerr)
    return cerr