in fastchat/llm_judge/common.py [0:0]
def run_judge_single(question, answer, judge, ref_answer, multi_turn=False):
kwargs = {}
model = judge.model_name
if ref_answer is not None:
kwargs["ref_answer_1"] = ref_answer["choices"][0]["turns"][0]
if multi_turn:
kwargs["ref_answer_2"] = ref_answer["choices"][0]["turns"][1]
if multi_turn:
user_prompt = judge.prompt_template["prompt_template"].format(
question_1=question["turns"][0],
question_2=question["turns"][1],
answer_1=answer["choices"][0]["turns"][0],
answer_2=answer["choices"][0]["turns"][1],
**kwargs,
)
else:
user_prompt = judge.prompt_template["prompt_template"].format(
question=question["turns"][0],
answer=answer["choices"][0]["turns"][0],
**kwargs,
)
rating = -1
system_prompt = judge.prompt_template["system_prompt"]
conv = get_conversation_template(model)
conv.set_system_message(system_prompt)
conv.append_message(conv.roles[0], user_prompt)
conv.append_message(conv.roles[1], None)
if model in ["gpt-3.5-turbo", "gpt-4"]:
judgment = chat_compeletion_openai(model, conv, temperature=0, max_tokens=2048)
elif model in ANTHROPIC_MODEL_LIST:
judgment = chat_compeletion_anthropic(
model, conv, temperature=0, max_tokens=1024
)
else:
raise ValueError(f"Invalid judge model name: {model}")
if judge.prompt_template["output_format"] == "[[rating]]":
match = re.search(one_score_pattern, judgment)
if not match:
match = re.search(one_score_pattern_backup, judgment)
if match:
rating = ast.literal_eval(match.groups()[0])
else:
rating = -1
else:
raise ValueError(
f"invalid output format: {judge.prompt_template['output_format']}"
)
return rating, user_prompt, judgment