in fastchat/llm_judge/common.py [0:0]
def load_pairwise_model_judgments(filename: str):
    """Load pairwise model judgments from a JSON-lines file.

    Each non-blank line of ``filename`` is parsed as one JSON object that
    must provide the keys ``judge``, ``question_id``, ``model_1``,
    ``model_2``, ``g1_judgment`` and ``g2_judgment``, plus either

    * ``winner``, used as-is, or
    * both ``g1_winner`` and ``g2_winner``: the shared value is used when
      they agree, otherwise the winner is recorded as ``"inconsistent"``.

    The return value is a dict of type:
        Dict[judge: tuple -> Dict[game_key: tuple -> game_result: dict]]

    where each ``game_result`` has the keys ``"winners"`` (a 1-tuple),
    ``"g1_judgment"`` and ``"g2_judgment"``. Before being returned, every
    per-judge dict is passed through ``normalize_game_key_dict``.

    Raises:
        ValueError: if a record has neither ``winner`` nor the
            ``g1_winner``/``g2_winner`` pair.
        KeyError: if a record lacks one of the other required keys.
    """
    judge_dict = {}

    # Use a context manager so the file handle is closed deterministically,
    # including when a malformed record raises part-way through the file.
    with open(filename) as fin:
        for line in fin:
            # Tolerate blank lines (e.g. a trailing newline at end of file),
            # which would otherwise make json.loads raise.
            if not line.strip():
                continue

            obj = json.loads(line)
            judge = tuple(obj["judge"])
            qid, model_1, model_2 = (
                obj["question_id"],
                obj["model_1"],
                obj["model_2"],
            )

            if "winner" in obj:
                winner = obj["winner"]
            elif "g1_winner" in obj and "g2_winner" in obj:
                g1_winner, g2_winner = obj["g1_winner"], obj["g2_winner"]
                # The two games must agree for the verdict to count.
                winner = g1_winner if g1_winner == g2_winner else "inconsistent"
            else:
                raise ValueError(f"Invalid keys: {list(obj.keys())}")

            gamekey = (qid, model_1, model_2)
            # A later record with the same judge and game key overwrites an
            # earlier one.
            judge_dict.setdefault(judge, {})[gamekey] = {
                "winners": (winner,),
                "g1_judgment": obj["g1_judgment"],
                "g2_judgment": obj["g2_judgment"],
            }

    # Make the model names sorted in the game keys (delegated to
    # normalize_game_key_dict, which is defined elsewhere in this module).
    return {
        judge: normalize_game_key_dict(games)
        for judge, games in judge_dict.items()
    }