def evaluate_by

def evaluate_by_chatgpt()

in lmms_eval/tasks/hallusion_bench/utils.py [0:0]
51 lines of code
13 McCabe index (conditional complexity)

def evaluate_by_chatgpt(data, output_entry, correctness_entry, gpt_model="gpt-4", load_json=False, save_json_path="./hallusion_output.json", retries=3):
    """Grade predictions with a GPT judge, checkpointing after every sample.

    For each sample a prompt is built from ``sample["question"]``,
    ``sample["gt_answer_details"]`` and ``sample[output_entry]`` and posted to
    the module-level ``API_URL``. The judge's reply is mapped to a string code
    stored in ``sample[correctness_entry]``:

    * ``"0"`` - the reply contains "incorrect"
    * ``"1"`` - the reply contains "correct"
    * ``"2"`` - anything else, including a failed or malformed API response

    The prompt concatenated with the judge's reply is stored in
    ``sample["gpt_answer"]``. Samples are mutated in place.

    Args:
        data: Sequence of sample dicts to grade.
        output_entry: Key under which each sample holds the model prediction.
        correctness_entry: Key under which the verdict code is written.
        gpt_model: Model name sent in the payload when ``API_TYPE == "openai"``.
        load_json: If true and ``save_json_path`` exists, resume from it; the
            first ``len(checkpoint)`` items of ``data`` are skipped.
        save_json_path: Checkpoint file, rewritten after every graded sample.
        retries: Number of request attempts per sample (5 s pause in between).

    Returns:
        The list of graded samples (checkpointed ones first, if resumed).
    """
    if load_json and os.path.exists(save_json_path):
        with open(save_json_path, "r") as f:
            output = json.load(f)
    else:
        output = []

    # Resume positionally: anything already in the checkpoint is skipped.
    remaining = data[len(output) :]
    if len(remaining) == 0:
        # Nothing left to grade; avoid spinning up a progress bar for no work.
        return output

    for sample in tqdm(remaining, desc="Eval by GPT"):
        prompt = "Imagine you are an intelligent teacher. Thoroughly read the question, reference answer and the prediction answer to ensure a clear understanding of the information provided. Assess the correctness of the predictions. "
        prompt += 'If the prediction answer does not conflict with the reference answer, please generate “correct”. If the prediction answer conflict with the reference answer, please generate “incorrect”. If the prediction answer is unclear about the answer, please generate "unclear". \n\n Question:'
        prompt += sample["question"]
        prompt += "\nReference answer: "
        prompt += sample["gt_answer_details"]
        prompt += "\nPrediction answer:"
        prompt += sample[output_entry]
        prompt += "\nOutput:"

        # The payload does not change between attempts, so build it once.
        payload = {
            "messages": [{"role": "user", "content": prompt}],
            "max_tokens": 16,
        }
        # set model when using openai api_key. Azure api_key does not need model since the endpoint fixed the model.
        if API_TYPE == "openai":
            payload["model"] = gpt_model

        # Reset per sample: without this, a sample whose every attempt fails
        # would silently reuse the previous sample's parsed response.
        result = None

        # https://github.com/openai/openai-python/issues/322#issuecomment-1767841683
        for attempt in range(retries):
            try:
                response = requests.post(API_URL, headers=headers, json=payload, timeout=30)
                response.raise_for_status()
                result = response.json()
                break
            except Exception as e:
                eval_logger.info(f"Attempt {attempt + 1} failed with error: {str(e)}")
                if attempt < retries - 1:  # If we have retries left, sleep and then continue to next attempt
                    time.sleep(5)
                else:  # If this was the last attempt, log and fall through to "unclear"
                    eval_logger.error(f"All {retries} attempts failed. Last error message: {str(e)}")

        if result is None:
            # Every attempt failed (or retries <= 0): best-effort verdict.
            output_text = "unclear"
        else:
            try:
                output_text = result["choices"][0]["message"]["content"]
            except (KeyError, IndexError, TypeError) as e:
                eval_logger.info(f"Get error {str(e)} when extracting response")
                output_text = "unclear"
            # The content field can be null/non-text; treat that as no verdict
            # instead of crashing on .lower() and losing the rest of the run.
            if not isinstance(output_text, str):
                output_text = "unclear"

        # "incorrect" must be tested first because it contains "correct".
        verdict = output_text.lower()
        if "incorrect" in verdict:
            gpt_correctness = "0"
        elif "correct" in verdict:
            gpt_correctness = "1"
        else:
            gpt_correctness = "2"

        sample[correctness_entry] = gpt_correctness
        sample["gpt_answer"] = prompt + output_text

        output.append(sample)

        # Checkpoint after every sample so an interrupted run can resume.
        with open(save_json_path, "w") as f:
            json.dump(output, f, indent=4)

    return output