in scripts_mgenre/evaluate_mel.py [0:0]
def evaluate(guess_path, gold_path):
    """Score multilingual entity-linking predictions against gold answers.

    Both directories are expected to contain identically named JSON-lines
    files (one per language, filename starting with a 2-letter language
    code). Each gold/pred record pair must share the same ``"id"``.

    Args:
        guess_path: Directory with prediction files; each record has an
            ``"id"`` and a ranked ``"predictions"`` list.
        gold_path: Directory with gold files; each record has an ``"id"``
            and ``"output"[0]["answer"]`` (a list of acceptable answers).

    Returns:
        dict mapping language code (plus ``"micro-avg"``/``"macro-avg"``)
        to a dict with ``"R@1"`` and ``"R@10"`` recall scores.
    """
    results = defaultdict(lambda: defaultdict(list))
    for fname in tqdm(sorted(os.listdir(guess_path))):
        with jsonlines.open(os.path.join(guess_path, fname)) as f:
            pred = list(f)
        with jsonlines.open(os.path.join(gold_path, fname)) as f:
            gold = list(f)

        recalls = []
        for dg, dp in zip(gold, pred):
            assert dg["id"] == dp["id"]
            # Predictions may be either plain strings or beam entries of
            # the form {"answer": [best, ...]}; normalize to strings.
            dp["predictions"] = [
                e["answer"][0] if isinstance(e["answer"], list) else e
                for e in dp["predictions"]
            ]
            # Drop empty beam slots (PEP 8: identity check against None).
            dp["predictions"] = [e for e in dp["predictions"] if e is not None]
            if len(dg["output"][0]["answer"]) != 1:
                # Ambiguous gold (not exactly one answer): count as a miss.
                recalls.append(math.inf)
            else:
                # 1-based rank of the first correct prediction; inf if none.
                recalls.append(
                    1
                    + min(
                        [
                            i
                            for i, e in enumerate(dp["predictions"])
                            if e in dg["output"][0]["answer"]
                        ]
                        + [math.inf]
                    )
                )

        # Guard: an empty file would otherwise divide by zero below.
        if not recalls:
            continue

        lang = fname[:2]
        results["R@1"][lang] = [sum(e <= 1 for e in recalls) / len(recalls)]
        results["R@10"][lang] = [sum(e <= 10 for e in recalls) / len(recalls)]
        # Micro-average pools per-example hits; macro-average pools
        # per-language scores.
        results["R@1"]["micro-avg"] += [e <= 1 for e in recalls]
        results["R@1"]["macro-avg"] += results["R@1"][lang]
        results["R@10"]["micro-avg"] += [e <= 10 for e in recalls]
        results["R@10"]["macro-avg"] += results["R@10"][lang]

    # Pivot from metric -> lang -> scores into lang -> metric -> mean score.
    results_final = defaultdict(dict)
    for metric, per_lang in results.items():
        for lang_key, scores in per_lang.items():
            results_final[lang_key][metric] = sum(scores) / len(scores)
    return results_final