def check_same_by_chatgpt()

in lmms_eval/tasks/hallusion_bench/utils.py
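Pairs each record's response with the response given for the original figure (figure_id == "0") in the same category/subcategory/set/question group, then asks a GPT judge whether the two responses are logically consistent, writing "1" (same) or "0" (different) into the record's "same" key. Records that already carry a "same" key are skipped, and the full dataset is re-serialized to save_json_path after every judgment, so an interrupted run can be resumed.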


import json
import time

import requests
from tqdm import tqdm

# API_KEY, API_URL, and eval_logger are defined at module level in utils.py.


def check_same_by_chatgpt(data, output_entry, gpt_model="gpt-4", load_json=False, save_json_path="./hallusion_output.json", retries=3):
    # Note: load_json is accepted but unused in this function.
    # Responses for the original figures (figure_id == "0"), keyed by
    # category/subcategory/set/question.
    orig_response = {}

    for r in data:
        if str(r["figure_id"]) == "0":
            key = "_".join([r["category"], r["subcategory"], str(r["set_id"]), str(r["question_id"])])
            orig_response[key] = r[output_entry]

    for sample in tqdm(data, desc="Check same by GPT"):
        # Skip samples already judged on a previous (possibly interrupted) run.
        if "same" not in sample:
            key = "_".join([sample["category"], sample["subcategory"], str(sample["set_id"]), str(sample["question_id"])])
            response2 = orig_response[key]

            prompt = "Imagine you are an intelligent teacher. Thoroughly read the two responses to two different questions. Assess the consistency of the information provided within those two responses. "
            prompt += "You do not know the specific questions, but you can asssess the consistency among the two responses by checking for logical conflicts if both responses are correct. "
            prompt += 'If response1 does not conflict with response2, please generate “same”. Otherwise, generate "different". \n\n response1:'
            prompt += sample[output_entry]
            prompt += "\nresponse2: "
            prompt += response2
            prompt += "\nOutput:"

            # Retry transient API failures; see
            # https://github.com/openai/openai-python/issues/322#issuecomment-1767841683
            response = None
            for attempt in range(retries):
                try:
                    headers = {
                        "api-key": API_KEY,
                        "Content-Type": "application/json",
                    }

                    messages = [{"role": "user", "content": prompt}]

                    payload = {
                        "model": gpt_model,
                        "messages": messages,
                        "max_tokens": 16,
                    }
                    response = requests.post(API_URL, headers=headers, json=payload)
                    response.raise_for_status()
                    response = response.json()

                    break
                except Exception as e:
                    eval_logger.info(f"Attempt {attempt + 1} failed with error: {str(e)}")
                    if attempt < retries - 1:  # Retries left: back off before the next attempt
                        time.sleep(5)
                    else:  # Last attempt: log the failure; response stays None
                        eval_logger.error(f"All {retries} attempts failed. Last error message: {str(e)}")

            try:
                output_text = response["choices"][0]["message"]["content"]
            except Exception as e:
                eval_logger.info(f"Got error {str(e)} when extracting response")
                output_text = "different"

            gpt_same = "0"

            if "same" in output_text.lower():
                gpt_same = "1"

            elif "different" in output_text.lower():
                gpt_same = "0"

            sample["same"] = gpt_same

            # Checkpoint after every sample so partial progress survives
            # interruptions (resumed via the "same"-key check above).
            with open(save_json_path, "w") as f:
                json.dump(data, f, indent=4)

    return data
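
A minimal usage sketch, assuming API_KEY and API_URL are configured at module level and that the records follow the HallusionBench layout read by the function; the field name model_prediction and the record values are illustrative:

data = [
    # Original figure (figure_id == 0): provides the reference response.
    {"category": "VD", "subcategory": "illusion", "set_id": 0, "question_id": 0,
     "figure_id": 0, "model_prediction": "The two lines are the same length."},
    # Edited figure: its response is judged against the reference above.
    {"category": "VD", "subcategory": "illusion", "set_id": 0, "question_id": 0,
     "figure_id": 1, "model_prediction": "The left line is longer."},
]

data = check_same_by_chatgpt(data, output_entry="model_prediction")
for r in data:
    # "1": the judge saw no logical conflict with the reference response.
    print(r["figure_id"], r["same"])

Each judgment is also checkpointed to save_json_path (./hallusion_output.json by default).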