def eval_result()

in lmms_eval/tasks/mmbench/mmbench_evals.py [0:0]


    def eval_result(self, results, eval_method):
        """Score predictions per question and print per-category accuracy.

        Rows whose ``index`` is below 1e6 are the "main" rows. For each main
        index, every row whose ``index`` modulo 1e6 equals it (the main row
        plus any rows offset by a multiple of 1e6 -- presumably the circular /
        shuffled variants of the same question, confirm against the dataset)
        is passed to ``self.eval_sub_data`` and the returned value is stored as
        that question's ``hit``.

        Args:
            results: Anything ``pd.DataFrame`` accepts. Must provide the
                columns ``index`` and ``prediction`` (already lower-case) plus
                ``answer`` and ``category`` (any letter case); ``l2-category``
                is optional. All column names are lower-cased except the
                option columns ``A``-``D``.
            eval_method: Must be ``"openai"``.

        Returns:
            The tuple produced by ``self.calculate_hit_rates`` on the main
            rows: ``(overall_hit_rate, category_hit_rate,
            l2_category_hit_rate)``.

        Raises:
            AssertionError: If ``eval_method`` is not ``"openai"``.
        """
        # Fixed seed so that any randomness used during evaluation is repeatable.
        rd.seed(2680)
        assert eval_method == "openai", f"unsupported eval_method: {eval_method!r}"

        main_index_limit = int(1e6)
        option_columns = {"A", "B", "C", "D"}

        data = pd.DataFrame(results)
        data = data.sort_values(by="index")
        data["prediction"] = [str(x) for x in data["prediction"]]
        # Exact membership (not a substring test against "ABCD") so that only
        # the four option columns keep their upper-case names.
        data = data.rename(
            columns={
                col: col.lower()
                for col in data.columns
                if isinstance(col, str) and col not in option_columns
            }
        )

        # Explicit copy: columns are added and overwritten on this frame below,
        # which must not be done on a slice of ``data``.
        data_main = data[data["index"] < main_index_limit].copy()
        data_main["hit"] = 0

        cate_map = dict(zip(data["index"], data["category"]))
        answer_map = dict(zip(data["index"], data["answer"]))
        has_l2_category = "l2-category" in data.columns
        if has_l2_category:
            l2_cate_map = dict(zip(data["index"], data["l2-category"]))

        # Loop-invariant: the main index every row belongs to.
        base_index = data["index"] % main_index_limit

        # One evaluation per distinct main index, in sorted index order.
        result = {}
        for idx in data_main["index"]:
            if idx in result:
                assert result[idx] in [0, 1]
                continue
            result[idx] = self.eval_sub_data(data[base_index == idx], answer_map)

        if result:
            data_main["hit"] = [result[idx] for idx in data_main["index"]]

        indices = data_main["index"]
        data_main = data_main.set_index("index")
        # The NaN guard is purely defensive: a NaN index never satisfies the
        # ``< 1e6`` filter used to build ``data_main``.
        data_main["category"] = [cate_map[i] if not math.isnan(i) else "uncategorized" for i in indices]
        if has_l2_category:
            data_main["l2-category"] = [l2_cate_map[i] if not math.isnan(i) else "uncategorized" for i in indices]

        overall_hit_rate, category_hit_rate, l2_category_hit_rate = self.calculate_hit_rates(data_main)

        reports = [("Category Acc. (dev):", category_hit_rate)]
        if has_l2_category:
            reports.append(("L2-category Acc. (dev):", l2_category_hit_rate))

        for title, hit_rates in reports:
            print(title)
            for key in hit_rates:
                # "split" is a bookkeeping entry, not a category to report.
                if key == "split":
                    continue
                percentage = hit_rates[key] * 100
                print(f"\t{key}: {percentage:.3f}")

        return overall_hit_rate, category_hit_rate, l2_category_hit_rate