in lmms_eval/tasks/hallusion_bench/utils.py [0:0]
def check_same_by_chatgpt(data, output_entry, gpt_model="gpt-4", load_json=False, save_json_path="./hallusion_output.json", retries=3):
    # Collect the responses of the original samples (figure_id == "0"), keyed by
    # category/subcategory/set_id/question_id, so each counterpart sample can be
    # compared against the response of its original.
    orig_response = {}
    for r in data:
        if str(r["figure_id"]) == "0":
            key = "_".join([r["category"], r["subcategory"], str(r["set_id"]), str(r["question_id"])])
            orig_response[key] = r[output_entry]
    for sample in tqdm(data, desc="Check same by GPT"):
        if "same" not in sample.keys():
            key = "_".join([sample["category"], sample["subcategory"], str(sample["set_id"]), str(sample["question_id"])])
            response2 = orig_response[key]
            # Build the consistency-check prompt: the judge model sees only the two
            # responses and must decide whether they logically conflict.
            prompt = "Imagine you are an intelligent teacher. Thoroughly read the two responses to two different questions. Assess the consistency of the information provided within those two responses. "
            prompt += "You do not know the specific questions, but you can assess the consistency among the two responses by checking for logical conflicts if both responses are correct. "
            prompt += 'If response1 does not conflict with response2, please generate "same". Otherwise, generate "different". \n\n response1:'
            prompt += sample[output_entry]
            prompt += "\nresponse2: "
            prompt += response2
            prompt += "\nOutput:"
            # Retry loop for transient API failures, see:
            # https://github.com/openai/openai-python/issues/322#issuecomment-1767841683
            response = None  # Reset so a failed request cannot silently reuse the previous sample's response.
            for attempt in range(retries):
                try:
                    headers = {
                        "api-key": API_KEY,
                        "Content-Type": "application/json",
                    }
                    messages = [{"role": "user", "content": prompt}]
                    payload = {
                        "model": gpt_model,
                        "messages": messages,
                        "max_tokens": 16,
                    }
                    response = requests.post(API_URL, headers=headers, json=payload)
                    response.raise_for_status()
                    response = response.json()
                    break
                except Exception as e:
                    eval_logger.info(f"Attempt {attempt + 1} failed with error: {str(e)}")
                    if attempt < retries - 1:  # Retries left: back off briefly before the next attempt.
                        time.sleep(5)
                    else:  # Last attempt: log and fall through with no usable response.
                        eval_logger.error(f"All {retries} attempts failed. Last error message: {str(e)}")
            # Extract the judge's verdict; any failure (including no response at all)
            # defaults the pair to "different".
            try:
                output_text = response["choices"][0]["message"]["content"]
            except Exception as e:
                eval_logger.info(f"Get error {str(e)} when extracting response")
                output_text = "different"
            # Map the free-form verdict to a binary flag: "1" = consistent, "0" = inconsistent.
            gpt_same = "0"
            if "same" in output_text.lower():
                gpt_same = "1"
            elif "different" in output_text.lower():
                gpt_same = "0"
            sample["same"] = gpt_same
            # Checkpoint after every judged sample so partial progress survives interruptions.
            with open(save_json_path, "w") as f:
                json.dump(data, f, indent=4)
    return data
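

# --- Usage sketch (illustrative, not part of the original utils.py) ---
# A minimal example of how check_same_by_chatgpt could be driven, assuming API_URL
# and API_KEY are configured at module level and that each record carries the
# HallusionBench fields used above (category, subcategory, set_id, question_id,
# figure_id) plus the model-output field named by `output_entry`. The records and
# the "gpt4v_output" field name below are hypothetical placeholders, and a live
# judge endpoint is required for the request to succeed.
if __name__ == "__main__":
    demo_data = [
        {"category": "VD", "subcategory": "illusion", "set_id": "1", "question_id": "1", "figure_id": "0", "gpt4v_output": "The two lines are the same length."},
        {"category": "VD", "subcategory": "illusion", "set_id": "1", "question_id": "1", "figure_id": "1", "gpt4v_output": "The top line is clearly longer."},
    ]
    checked = check_same_by_chatgpt(demo_data, output_entry="gpt4v_output", retries=2)
    for rec in checked:
        # "1" means the judge found the two responses consistent, "0" means inconsistent.
        print(rec["figure_id"], rec["same"])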