in lmms_eval/tasks/mmvet/utils.py [0:0]
def mmvet_process_results(doc, results):
    # Get the model prediction and the ground-truth answer for this sample.
    pred = results[0]
    question = doc["question"]
    answer = doc["answer"]
    # Build the judge prompt as "question | ground truth | prediction |", padding the
    # <AND>/<OR> markers in the ground truth so they remain distinct tokens.
    gpt_query_prompt = f"{MM_VET_PROMPT}\n{question} | {answer.replace('<AND>', ' <AND> ').replace('<OR>', ' <OR> ')} | {pred} |"
    grade_sample_run_complete = False
    temperature = 0.0

    while not grade_sample_run_complete:
        content, model_name = get_chat_response(
            gpt_query_prompt,
            temperature=temperature,
        )
        if content:
            try:
                # The judge model is expected to reply with a single score in [0, 1].
                score = float(content.split(" ")[0].strip())
                if 0.0 <= score <= 1.0:
                    grade_sample_run_complete = True
                else:
                    raise ValueError(f"Score {score} is outside [0, 1].")
            except ValueError:
                # Unparsable or out-of-range reply: back off, then let the loop
                # re-query the judge with a higher temperature.
                time.sleep(5)
                temperature += 0.5
                eval_logger.info(f"Slept 5 secs; retrying {doc['question_id']} with increased temperature {temperature}.")
                if temperature >= 2:  # maximum temperature threshold before giving up
                    score = 0.0
                    grade_sample_run_complete = True
                    eval_logger.info(f"Reached max retries, {doc['question_id']} failed to get a score.")
        else:
            # Empty response from the judge model: record a zero score and stop.
            score = 0.0
            grade_sample_run_complete = True
            eval_logger.info(f"{doc['question_id']} failed to get a score.")
    return {
        "gpt_eval_score": {
            "question_id": doc["question_id"],
            "question": doc["question"],
            "gt_answer": doc["answer"],
            "capabilities": doc["capability"],
            "pred_answer": pred,
            "score": score,
            "eval_model": model_name,
        }
    }
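
# Illustrative usage sketch (not part of the module): the field values below are
# hypothetical stand-ins; in the real pipeline `doc` comes from the MM-Vet dataset
# and `results` holds the evaluated model's generation for that sample.
#
#   doc = {
#       "question_id": "v1_0",
#       "question": "What occasion is the cake for?",
#       "answer": "a birthday <OR> someone's birthday",
#       "capability": ["ocr", "rec"],
#   }
#   results = ["The cake is for a birthday."]
#   parsed = mmvet_process_results(doc, results)
#   parsed["gpt_eval_score"]["score"]  # float in [0.0, 1.0] assigned by the judge model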