in fastchat/llm_judge/common.py [0:0]
def run_judge_pair(question, answer_a, answer_b, judge, ref_answer, multi_turn=False):
    """Have a judge model compare two answers and extract which one wins.

    The judge prompt is rendered from ``judge.prompt_template``, handed to
    ``chat_compeletion_openai`` or ``chat_compeletion_anthropic`` depending on
    ``judge.model_name``, and the verdict is parsed out of the returned text.

    Args:
        question: Mapping whose ``"turns"`` list holds the question text;
            index 0 is always read, index 1 only when ``multi_turn`` is set.
        answer_a: Mapping read as ``["choices"][0]["turns"][i]`` for the
            first candidate's answer to turn ``i``.
        answer_b: Same layout as ``answer_a``, for the second candidate.
        judge: Object exposing ``model_name`` and a ``prompt_template``
            mapping with the keys ``"system_prompt"``, ``"prompt_template"``
            and ``"output_format"``.
        ref_answer: Optional reference answer with the same layout as the
            candidate answers, or ``None``. When given it is passed to the
            template as ``ref_answer_1`` (and ``ref_answer_2`` if
            ``multi_turn``).
        multi_turn: When true, both turns of the question and of each answer
            are substituted into the template.

    Returns:
        A ``(winner, user_prompt, judgment)`` tuple. ``winner`` is ``"A"``,
        ``"B"``, ``"tie"`` or ``"error"`` (verdict could not be parsed).
        ``user_prompt`` is the prompt text as sent, including the
        ``[Instruction]`` prefix added for Anthropic models. ``judgment`` is
        the raw text returned by the completion helper.

    Raises:
        ValueError: If the template's output format is not ``"[[A]]"`` or
            ``"[[rating_a,rating_b]]"``, or if the judge model is neither an
            OpenAI model handled here nor in ``ANTHROPIC_MODEL_LIST``.
    """
    template = judge.prompt_template
    output_format = template["output_format"]
    # Fail fast: an unsupported format can never be parsed, so reject it
    # before a judge request is issued rather than after.
    if output_format not in ("[[A]]", "[[rating_a,rating_b]]"):
        raise ValueError(f"invalid output format: {output_format}")

    model = judge.model_name
    system_prompt = template["system_prompt"]

    # Reference answers are optional template fields.
    kwargs = {}
    if ref_answer is not None:
        kwargs["ref_answer_1"] = ref_answer["choices"][0]["turns"][0]
        if multi_turn:
            kwargs["ref_answer_2"] = ref_answer["choices"][0]["turns"][1]

    if multi_turn:
        user_prompt = template["prompt_template"].format(
            question_1=question["turns"][0],
            question_2=question["turns"][1],
            answer_a_1=answer_a["choices"][0]["turns"][0],
            answer_b_1=answer_b["choices"][0]["turns"][0],
            answer_a_2=answer_a["choices"][0]["turns"][1],
            answer_b_2=answer_b["choices"][0]["turns"][1],
            **kwargs,
        )
    else:
        user_prompt = template["prompt_template"].format(
            question=question["turns"][0],
            answer_a=answer_a["choices"][0]["turns"][0],
            answer_b=answer_b["choices"][0]["turns"][0],
            **kwargs,
        )

    conv = get_conversation_template(model)
    conv.append_message(conv.roles[0], user_prompt)
    conv.append_message(conv.roles[1], None)

    if model in ["gpt-3.5-turbo", "gpt-4"]:
        conv.set_system_message(system_prompt)
        judgment = chat_compeletion_openai(model, conv, temperature=0, max_tokens=2048)
    elif model in ANTHROPIC_MODEL_LIST:
        # No system message is set on this path; a non-default system prompt
        # is instead prepended to the user message already in the conversation.
        if system_prompt != "You are a helpful assistant.":
            user_prompt = "[Instruction]\n" + system_prompt + "\n\n" + user_prompt
            conv.messages[0][1] = user_prompt
        judgment = chat_compeletion_anthropic(
            model, conv, temperature=0, max_tokens=1024
        )
    else:
        raise ValueError(f"Invalid judge model name: {model}")

    if output_format == "[[A]]":
        # Tags are checked in A, B, C order; the first one present wins.
        if "[[A]]" in judgment:
            winner = "A"
        elif "[[B]]" in judgment:
            winner = "B"
        elif "[[C]]" in judgment:
            winner = "tie"
        else:
            winner = "error"
        return winner, user_prompt, judgment

    # output_format == "[[rating_a,rating_b]]" (guaranteed by the guard above).
    match = re.search(two_score_pattern, judgment)
    if not match:
        match = re.search(two_score_pattern_backup, judgment)
    if not match:
        return "error", user_prompt, judgment

    try:
        scores = [ast.literal_eval(s.strip()) for s in match.groups()]
    except (ValueError, SyntaxError):
        # The captured text is not a valid Python literal (e.g. a
        # zero-padded integer such as "07"); report it as an unparseable
        # judgment instead of aborting the caller.
        return "error", user_prompt, judgment

    if abs(scores[0] - scores[1]) <= TIE_DELTA:
        winner = "tie"
    elif scores[0] > scores[1]:
        winner = "A"
    else:
        winner = "B"
    return winner, user_prompt, judgment