in lmms_eval/tasks/hallusion_bench/utils.py [0:0]
def evaluate_by_chatgpt(data, output_entry, correctness_entry, gpt_model="gpt-4", load_json=False, save_json_path="./hallusion_output.json", retries=3):
    """Grade each sample's prediction with a chat-completion model and checkpoint the results.

    For every sample a prompt is built from the question, the reference answer
    and the prediction, and POSTed to ``API_URL`` (with ``headers``; both are
    module-level names defined outside this function). The reply is mapped to
    a verdict string that is stored back on the sample.

    Args:
        data: Sequence of sample dicts. Each sample is read at ``"question"``,
            ``"gt_answer_details"`` and ``output_entry``; all three values are
            concatenated into the prompt, so they must be strings.
        output_entry: Key under which the prediction text is stored.
        correctness_entry: Key under which the verdict is written:
            ``"0"`` (reply contains "incorrect"), ``"1"`` (reply contains
            "correct") or ``"2"`` (anything else, including request failure).
        gpt_model: Model name; only added to the payload when
            ``API_TYPE == "openai"``.
        load_json: When true and ``save_json_path`` exists, previously saved
            results are loaded and evaluation resumes after them.
        save_json_path: Checkpoint file, rewritten after every sample.
        retries: Maximum number of request attempts per sample.

    Returns:
        The list of evaluated samples (loaded ones first, then the newly
        graded ones). Samples from ``data`` are mutated in place: they gain
        ``correctness_entry`` and ``"gpt_answer"`` (prompt + model reply).
    """
    if load_json and os.path.exists(save_json_path):
        with open(save_json_path, "r", encoding="utf-8") as f:
            output = json.load(f)
    else:
        output = []

    # Resume: the first len(output) samples are taken to be already graded.
    for sample in tqdm(data[len(output) :], desc="Eval by GPT"):
        # str.join (like the original `+=` chain) raises TypeError on non-string fields.
        prompt = "".join(
            [
                "Imagine you are an intelligent teacher. Thoroughly read the question, reference answer and the prediction answer to ensure a clear understanding of the information provided. Assess the correctness of the predictions. ",
                'If the prediction answer does not conflict with the reference answer, please generate “correct”. If the prediction answer conflict with the reference answer, please generate “incorrect”. If the prediction answer is unclear about the answer, please generate "unclear". \n\n Question:',
                sample["question"],
                "\nReference answer: ",
                sample["gt_answer_details"],
                "\nPrediction answer:",
                sample[output_entry],
                "\nOutput:",
            ]
        )

        # The payload does not change between attempts, so build it once per sample.
        payload = {
            "messages": [{"role": "user", "content": prompt}],
            "max_tokens": 16,
        }
        # set model when using openai api_key. Azure api_key does not need model since the endpoint fixed the model.
        if API_TYPE == "openai":
            payload["model"] = gpt_model

        # Reset for every sample: otherwise a request that fails on all
        # attempts would silently reuse the previous sample's reply.
        response = None
        # https://github.com/openai/openai-python/issues/322#issuecomment-1767841683
        for attempt in range(retries):
            try:
                http_response = requests.post(API_URL, headers=headers, json=payload, timeout=30)
                http_response.raise_for_status()
                response = http_response.json()
                break
            except Exception as e:
                eval_logger.info(f"Attempt {attempt + 1} failed with error: {str(e)}")
                if attempt < retries - 1:  # Retries left: back off, then try again
                    time.sleep(5)
                else:  # Last attempt: log and fall through; the sample is graded "unclear"
                    eval_logger.error(f"All {retries} attempts failed. Last error message: {str(e)}")

        try:
            output_text = response["choices"][0]["message"]["content"]
        except (KeyError, IndexError, TypeError) as e:
            # TypeError covers response being None (no successful request).
            eval_logger.info(f"Get error {str(e)} when extracting response")
            output_text = "unclear"
        if not isinstance(output_text, str):
            # The API may return a null content; treat it like an unusable reply.
            eval_logger.info(f"Get non-text content {output_text!r} when extracting response")
            output_text = "unclear"

        # "incorrect" must be tested first because it contains "correct".
        verdict_text = output_text.lower()
        if "incorrect" in verdict_text:
            gpt_correctness = "0"
        elif "correct" in verdict_text:
            gpt_correctness = "1"
        else:
            gpt_correctness = "2"

        sample[correctness_entry] = gpt_correctness
        sample["gpt_answer"] = prompt + output_text
        output.append(sample)

        # Checkpoint after every sample so an interrupted run can resume via load_json.
        with open(save_json_path, "w", encoding="utf-8") as f:
            json.dump(output, f, indent=4)

    return output