def evaluation()

in scripts/evaluate_hybrid.py
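
The function below relies on module-level imports of glob and json, and on a
_determine_judge helper defined elsewhere in the script. It treats each input
line as a JSON record with a 'passages' list (whose first entry's 'judge'
field is used to sniff the annotation format) and a ranked result list stored
under the key passed as eval_type. A minimal sketch of one such record, with
hypothetical field values and a hypothetical 'retrieved' key standing in for
eval_type (the real records may carry additional fields):

{"passages": [{"judge": {"judge_contain_some": 1}}],
 "retrieved": [{"judge": {"judge_contain_some": 1}},
               {"judge": {"judge_contain_some": 0}}]}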


def evaluation(in_prefix, *, eval_type, rank_ulimit):
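    """Compute Top-k recall, MAP, and MRR over the ranked lists stored under
    eval_type in every JSONL shard matching in_prefix, truncating each ranked
    list to rank_ulimit entries; prints a report and returns Top-1 accuracy."""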
    k_values = [1, 2, 3, 4, 5, 10, 25, 50, 100, 200]
    total = 0
    correct = {k: 0 for k in k_values}
    map_result = 0.0
    mrr_result = 0.0

    # Sentinel; overwritten once the judge annotation format is detected below.
    judge_key = 'NOT_INITIALIZED'

    # Every file whose path starts with in_prefix is a JSONL shard of questions.
    for inpath in sorted(glob.glob(in_prefix + "*")):
        print(inpath)
        infile = open(inpath, 'rt', encoding='utf-8')
        for i, line in enumerate(infile):
            line = line.strip()
            question = json.loads(line)

            # Detect the judge annotation format from the first record of each
            # file: a dict-valued 'judge' means the judgments live under the
            # 'judge_contain_some' key.
            if i == 0:
                if isinstance(question['passages'][0]['judge'], dict):
                    judge_key = 'judge_contain_some'
                else:
                    judge_key = None

            total += 1
            if len(question[eval_type]) == 0:
                print("NONE")
                continue
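            # Pair each retrieved doc with its relevance label; the first slot
            # is an unused placeholder score.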
            rank_list = [("", _determine_judge(doc, judge_key)) for doc in question[eval_type]]
            rank_list = rank_list[:rank_ulimit]
            # calculate recall
            for k in k_values:
                if k > rank_ulimit:
                    break
                for j in range(k):
                    if j >= len(rank_list): break
                    if rank_list[j][1] == 1:
                        correct[k] += 1
                        break
            # calculate MAP and MRR
            cur_map = 0.0
            cur_map_total = 0.0
            cur_mrr = 0.0
            scores = []
            for j, (cur_dist, cur_label) in enumerate(rank_list):
                scores.append(cur_dist)
                if cur_label == 1:
                    cur_map_total += 1
                    cur_map += cur_map_total / (1 + j)
                    if cur_mrr == 0.0:
                        cur_mrr = 1 / float(1 + j)

            # Normalize by the number of relevant docs found to get average precision.
            if cur_map_total != 0.0: cur_map = cur_map / cur_map_total
            map_result += cur_map
            mrr_result += cur_mrr
        infile.close()

    # output accuracy
    out_report = ''
    for k in k_values:
        cur_accuracy = correct[k] / float(total) * 100
        out_report += 'Top-%d:\t%.2f\n' % (k, cur_accuracy)
    map_result = map_result / float(total) * 100
    mrr_result = mrr_result / float(total) * 100
    out_report += 'MAP:\t%.2f\n' % (map_result)
    out_report += 'MRR:\t%.2f\n' % (mrr_result)
    out_report += 'Total:\t%d\n' % (total)
    out_report += 'Size of rank list:\t%d\n' % (rank_ulimit)
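    # The returned value is Top-1 accuracy as a fraction (not a percentage).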
    acc = correct[1] / float(total)

    print(f'eval_type: {eval_type}')
    print(f'judge_key: {judge_key}')
    print(out_report)
    print()

    return acc
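
_determine_judge is defined elsewhere in scripts/evaluate_hybrid.py and is not
shown here; the code above only relies on it returning 1 for a relevant
document. A minimal sketch of a compatible helper, written purely as an
assumption about that contract rather than the actual implementation:

def _determine_judge(doc, judge_key):
    # Hypothetical sketch: with a dict-style annotation, read the flag stored
    # under judge_key ('judge_contain_some'); otherwise treat the annotation
    # itself as the 0/1 relevance label.
    judge = doc['judge']
    if judge_key is not None and isinstance(judge, dict):
        return int(judge[judge_key])
    return int(judge)

A call might then look like the following, where the path prefix and the
eval_type value are placeholders, not names taken from the repository:

top1_accuracy = evaluation('runs/hybrid_dev.', eval_type='retrieved', rank_ulimit=100)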