in lmms_eval/tasks/mmbench/mmbench_evals.py [0:0]
def eval_result(self, results, eval_method):
    """Score predictions per question and print per-category accuracy.

    ``results`` is turned into a DataFrame that must provide the columns
    ``index``, ``prediction``, ``category`` and ``answer``; an
    ``l2-category`` column is optional.  Rows whose ``index`` is below
    1e6 are the "main" rows.  For each main row, every row whose
    ``index % 1e6`` equals that main index is collected and handed to
    ``self.eval_sub_data`` (presumably the circular-evaluation variants
    of the same question -- confirm against the dataset builder).  The
    returned value is stored in the main row's ``hit`` column.

    Args:
        results: Anything ``pd.DataFrame`` accepts, e.g. a list of
            per-sample dicts.
        eval_method: Must be ``"openai"``; anything else trips the assert.

    Returns:
        The ``(overall_hit_rate, category_hit_rate, l2_category_hit_rate)``
        tuple produced by ``self.calculate_hit_rates``.
    """
    rd.seed(2680)
    assert eval_method == "openai"

    data = pd.DataFrame(results)
    data = data.sort_values(by="index")
    data["prediction"] = [str(x) for x in data["prediction"]]

    # Lower-case every column name except the option columns.  Exact
    # membership is used (not a substring test against "ABCD"), and the
    # column list is snapshotted because the frame is mutated in the loop.
    option_columns = {"A", "B", "C", "D"}
    for col in list(data.columns):
        new_name = col if col in option_columns else col.lower()
        data[new_name] = data.pop(col)

    main_limit = int(1e6)
    # Explicit copy: data_main gets new columns below and must not be a
    # view-like slice of ``data`` (avoids SettingWithCopyWarning).
    data_main = data[data["index"] < main_limit].copy()

    cate_map = dict(zip(data["index"], data["category"]))
    answer_map = dict(zip(data["index"], data["answer"]))
    has_l2_category = "l2-category" in data.columns
    if has_l2_category:
        l2_cate_map = dict(zip(data["index"], data["l2-category"]))

    # Loop-invariant: maps every row to the main index it belongs to.
    base_index = data["index"] % main_limit

    result = {}
    hits = []
    for idx in data_main["index"]:
        if idx in result:
            # Duplicate main index: reuse the verdict already computed.
            correct = result[idx]
            assert correct in [0, 1]
        else:
            sub_data = data[base_index == idx]
            correct = self.eval_sub_data(sub_data, answer_map)
            result[idx] = correct
        hits.append(correct)
    # One column assignment instead of a full-frame mask write per row.
    data_main["hit"] = hits

    indices = data_main["index"]
    data_main = data_main.set_index("index")
    data_main["category"] = [
        cate_map[i] if not math.isnan(i) else "uncategorized" for i in indices
    ]
    if has_l2_category:
        data_main["l2-category"] = [
            l2_cate_map[i] if not math.isnan(i) else "uncategorized" for i in indices
        ]

    overall_hit_rate, category_hit_rate, l2_category_hit_rate = self.calculate_hit_rates(data_main)

    # "category" is always present at this point (assigned just above).
    print("Category Acc. (dev):")
    for category_key in category_hit_rate:
        if category_key == "split":
            continue
        category_percentage = category_hit_rate[category_key] * 100
        print(f"\t{category_key}: {category_percentage:.3f}")

    if has_l2_category:
        print("L2-category Acc. (dev):")
        for l2_category_key in l2_category_hit_rate:
            if l2_category_key == "split":
                continue
            l2_category_percentage = l2_category_hit_rate[l2_category_key] * 100
            print(f"\t{l2_category_key}: {l2_category_percentage:.3f}")

    return overall_hit_rate, category_hit_rate, l2_category_hit_rate