def evaluate()

in scripts_mgenre/evaluate_mel.py [0:0]


def evaluate(guess_path, gold_path):
    """Compute R@1 and R@10 retrieval scores for multilingual EL predictions.

    Iterates over the jsonlines files in *guess_path*, pairing each with the
    same-named gold file in *gold_path*. The language code is taken from the
    first two characters of the file name. For every example, the recall rank
    is the 1-based position of the first prediction contained in the gold
    answer set (``math.inf`` when absent, or when the gold example has more
    than one answer).

    Args:
        guess_path: directory of jsonlines prediction files; each record has
            an ``"id"`` and a ``"predictions"`` list.
        gold_path: directory of matching jsonlines gold files; each record has
            an ``"id"`` and ``"output"`` with an ``"answer"`` list.

    Returns:
        Dict mapping each language code (plus ``"micro-avg"`` and
        ``"macro-avg"``) to ``{"R@1": score, "R@10": score}``.

    Raises:
        ValueError: if a guess record's id does not match its gold record's.
    """
    results = defaultdict(lambda: defaultdict(list))
    for fname in tqdm(sorted(os.listdir(guess_path))):

        with jsonlines.open(os.path.join(guess_path, fname)) as f:
            pred = [e for e in f]

        with jsonlines.open(os.path.join(gold_path, fname)) as f:
            gold = [e for e in f]

        recalls = []
        for dg, dp in zip(gold, pred):
            # Explicit check (not `assert`, which is stripped under -O):
            # guess and gold files must be aligned record-for-record.
            if dg["id"] != dp["id"]:
                raise ValueError(
                    "id mismatch in {}: {!r} != {!r}".format(
                        fname, dg["id"], dp["id"]
                    )
                )
            # Normalize predictions: some entries wrap the answer in a list
            # under an "answer" key; unwrap those, keep the rest as-is.
            dp["predictions"] = [
                e["answer"][0] if isinstance(e["answer"], list) else e
                for e in dp["predictions"]
            ]
            dp["predictions"] = [e for e in dp["predictions"] if e is not None]

            if len(dg["output"][0]["answer"]) != 1:
                # Ambiguous gold (not exactly one answer): count as a miss.
                recalls.append(math.inf)
            else:
                # 1-based rank of the first correct prediction; enumerate
                # yields indices in increasing order, so the first hit is
                # already the minimum rank. inf when no prediction matches.
                answers = dg["output"][0]["answer"]
                recalls.append(
                    next(
                        (
                            rank
                            for rank, e in enumerate(dp["predictions"], start=1)
                            if e in answers
                        ),
                        math.inf,
                    )
                )

        lang = fname[:2]
        results["R@1"][lang] = [sum(e <= 1 for e in recalls) / len(recalls)]
        results["R@10"][lang] = [sum(e <= 10 for e in recalls) / len(recalls)]

        # Micro-average pools every example; macro-average pools per-language
        # scores (one entry per file).
        results["R@1"]["micro-avg"] += [e <= 1 for e in recalls]
        results["R@1"]["macro-avg"] += results["R@1"][lang]

        results["R@10"]["micro-avg"] += [e <= 10 for e in recalls]
        results["R@10"]["macro-avg"] += results["R@10"][lang]

    # Pivot {metric: {lang: [values]}} into {lang: {metric: mean}}.
    results_final = defaultdict(dict)
    for k1, v1 in results.items():
        for k2, v2 in v1.items():
            results_final[k2][k1] = sum(v2) / len(v2)

    return results_final