in fastchat/llm_judge/compute_agreement.py [0:0]
def get_mt_bench_agreement(data, judge1, judge2, ban):
    """Count how often two judges agree over the votes stored in ``data``.

    Two judge combinations are supported:

    * ``judge1`` starts with ``"gpt4"`` and ``judge2 == "human"``: for each
      entry, the single ``judge1`` vote is compared against every ``judge2``
      vote of that entry.
    * ``judge1 == "human"`` and ``judge2 == "human"``: for each entry, every
      unordered pair of human votes is compared.

    A vote is ignored when ``convertvote(vote)`` is contained in ``ban``; a
    comparison is only counted when neither of its two votes is ignored.

    Args:
        data: Mapping whose values are themselves mappings from a judge name
            to that judge's sequence of votes. Entries that lack one of the
            requested judges are skipped.
        judge1: Name of the first judge.
        judge2: Name of the second judge.
        ban: Container of converted vote values to exclude.

    Returns:
        A tuple ``(agreements, comparisons)``: the accumulated result of
        ``equalvote`` over all counted comparisons, and the number of
        comparisons counted.

    Raises:
        ValueError: If the judge combination is not one of the two supported
            ones. (``ValueError`` is an ``Exception`` subclass, so handlers
            written for ``Exception`` keep working.)
        AssertionError: If, in the gpt4-vs-human case, an entry does not hold
            exactly one ``judge1`` vote.
    """
    agreements = 0
    comparisons = 0

    if judge1.startswith("gpt4") and judge2 == "human":
        for votes in data.values():
            if judge1 not in votes or judge2 not in votes:
                continue
            # Invariant of the data layout: one model-judge vote per entry.
            assert len(votes[judge1]) == 1
            model_vote = votes[judge1][0]
            if convertvote(model_vote) in ban:
                continue
            for human_vote in votes[judge2]:
                if convertvote(human_vote) in ban:
                    continue
                comparisons += 1
                agreements += equalvote(model_vote, human_vote)
        return agreements, comparisons

    if judge1 == "human" and judge2 == "human":
        for votes in data.values():
            if "human" not in votes:
                continue
            # Drop banned votes once up front instead of re-checking each vote
            # for every pair it takes part in. This assumes convertvote has no
            # side effects -- TODO confirm against its definition.
            kept = [v for v in votes["human"] if convertvote(v) not in ban]
            # Every unordered pair, earlier vote first (same order as before).
            for idx, first in enumerate(kept):
                for second in kept[idx + 1:]:
                    comparisons += 1
                    agreements += equalvote(first, second)
        return agreements, comparisons

    raise ValueError("Unsupported judges.")