def evaluate_hypo()

in preprocess/evaluate_hypo.py [0:0]
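This excerpt assumes the module-level imports used in the body (os, re, subprocess, numpy as np, spacy, tqdm, and likely rouge_scorer from the rouge_score package) plus the helpers count_lines_in_text_file, ent_count_match, and fix_empty_lines and a parsed args namespace, all defined elsewhere in preprocess/evaluate_hypo.py.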


def evaluate_hypo(source_file, target_file, hypo_file, output_file, eval_rouge=True, rouge_package='files2rouge',
                  no_prec_recall=False):
    n_s = count_lines_in_text_file(source_file)
    n_t = count_lines_in_text_file(target_file)
    n_h = count_lines_in_text_file(hypo_file)

    rouge1, rouge2, rougeL = None, None, None

    assert n_s == n_t == n_h, \
        "Number of lines not consistent: {}, {}, {}".format(n_s, n_t, n_h)

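    # Per-example metrics: raw entity counts plus the entity-level precision
    # (against source and target) and recall derived from them.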
    metric_names = [
        'ent_count_hypo', 'ent_count_hypo_source',
        'ent_count_target', 'ent_count_target_hypo',
        'ent_count_hypo_target',
        'precision_source',
        'precision_target',
        'recall',
    ]
    if eval_rouge and rouge_package == "rouge_scorer":
        metric_names += ['rouge1', 'rouge2', 'rougeL', ]

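    # files2rouge path: tokenize target and hypothesis with Stanford CoreNLP's
    # PTBTokenizer (the CLASSPATH below is environment-specific), then score the
    # tokenized files and parse the Average_F values out of the report.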
    if eval_rouge and rouge_package == "files2rouge":
        os.environ["CLASSPATH"] = "/home/ec2-user/stanford-corenlp-full-2018-10-05/stanford-corenlp-3.9.2.jar"

        if not os.path.exists(target_file + ".tokenized"):
            # remove the line separator u'\u2028' so each record stays on a single line
            with open(target_file, 'r') as f:
                lines = f.readlines()
            with open(target_file + '.tmp', 'w') as fout:
                for line in lines:
                    fout.write(" ".join(line.strip().split(u'\u2028')) + '\n')
            assert n_t == count_lines_in_text_file(target_file + '.tmp')
            print("Tokenizing:", target_file)
            cmd = "cat {} | java edu.stanford.nlp.process.PTBTokenizer -ioFileList -preserveLines > {}".format(
                target_file + '.tmp',
                target_file + ".tokenized"
            )
            # print(cmd)
            with subprocess.Popen(
                    cmd,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.STDOUT,
                    shell=True
            ) as p:
                stdout, stderr = p.communicate()
                print(stdout)

        if not os.path.exists(hypo_file + ".tokenized"):
            print("Tokenizing:", hypo_file)
            cmd = "cat {} | java edu.stanford.nlp.process.PTBTokenizer -ioFileList -preserveLines > {}".format(
                hypo_file,
                hypo_file + ".tokenized"
            )
            # print(cmd)
            with subprocess.Popen(
                    cmd,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.STDOUT,
                    shell=True
            ) as p:
                stdout, stderr = p.communicate()
                print(stdout)

        fix_empty_lines(hypo_file + ".tokenized")

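        # Score the tokenized hypothesis against the tokenized target and pull the
        # ROUGE-1/2/L Average_F values out of the files2rouge report.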
        cmd = "files2rouge {} {}".format(
            hypo_file + ".tokenized",
            target_file + ".tokenized"
        )
        rouge1_re = re.compile(r"ROUGE-1 Average_F: ([0-9.]+)")
        rouge2_re = re.compile(r"ROUGE-2 Average_F: ([0-9.]+)")
        rougeL_re = re.compile(r"ROUGE-L Average_F: ([0-9.]+)")

        with subprocess.Popen(
                cmd.split(),
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT
        ) as p:
            stdout, stderr = p.communicate()
            out_text = stdout.decode()
            print(out_text)
            rouge1 = rouge1_re.findall(out_text)[0]
            rouge2 = rouge2_re.findall(out_text)[0]
            rougeL = rougeL_re.findall(out_text)[0]

    result = {}
    for m in metric_names:
        result[m] = []

    if eval_rouge and rouge_package == "rouge_scorer":
        scorer = rouge_scorer.RougeScorer([name for name in metric_names if name.startswith("rouge")],
                                          use_stemmer=True)

    # step = n_h // num_workers
    # offsets = [i * step for i in range(num_workers)]
    # offsets.append(n_h)
    if no_prec_recall:
        return
    if args.scispacy:
        nlp = spacy.load("en_ner_bc5cdr_md")
        print("Using scispacy!")
    else:
        nlp = spacy.load("en_core_web_lg")
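    # Stream source/target/hypothesis line by line (the files are line-aligned, as
    # asserted above) and accumulate per-example metrics.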
    with open(source_file, 'r') as s_f, \
            open(target_file, 'r') as t_f, \
            open(hypo_file, 'r') as h_f:
        for _ in tqdm(range(n_h)):
            sline = s_f.readline().strip()
            tline = t_f.readline().strip()
            hline = h_f.readline().strip()
            if eval_rouge and rouge_package == "rouge_scorer":
                rouge = scorer.score(tline, hline)
                for m in rouge.keys():
                    result[m].append(rouge[m].fmeasure)

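            # ent_count_match(nlp, a, b, ...) appears to return (number of entities in a,
            # number of those entities also found in b); the first value is identical in
            # the first two calls since both count entities in the hypothesis line.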
            ent_count_hypo, ent_count_hypo_source = ent_count_match(nlp, hline, sline, args.scispacy)
            ent_count_hypo, ent_count_hypo_target = ent_count_match(nlp, hline, tline, args.scispacy)
            ent_count_target, ent_count_target_hypo = ent_count_match(nlp, tline, hline, args.scispacy)

            result['ent_count_hypo'].append(ent_count_hypo)
            result['ent_count_hypo_source'].append(ent_count_hypo_source)
            result['ent_count_target'].append(ent_count_target)
            result['ent_count_target_hypo'].append(ent_count_target_hypo)
            result['ent_count_hypo_target'].append(ent_count_hypo_target)
            if ent_count_hypo == 0:
                result['precision_source'].append(np.nan)
                result['precision_target'].append(np.nan)
            else:
                result['precision_source'].append(ent_count_hypo_source * 1.0 / ent_count_hypo)
                result['precision_target'].append(ent_count_hypo_target * 1.0 / ent_count_hypo)
            if ent_count_target == 0:
                result['recall'].append(np.nan)
            else:
                result['recall'].append(ent_count_target_hypo * 1.0 / ent_count_target)

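    # Average each metric over examples; nanmean skips examples where precision or
    # recall was undefined (no entities in the hypothesis or target).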
    avg_metrics = {}
    for k in metric_names:
        avg_metrics[k] = np.nanmean(np.array(result[k]))
        print("average {}={}".format(k, avg_metrics[k]))

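    # One-line summary (only when files2rouge produced scores): ROUGE, entity counts,
    # macro- and micro-averaged precision/recall against the target, and their F1 scores.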
    if rouge1 and rouge2 and rougeL and not no_prec_recall:
        macro_recall = avg_metrics['ent_count_target_hypo'] / avg_metrics['ent_count_target']
        macro_prec_target = avg_metrics['ent_count_hypo_target'] / avg_metrics['ent_count_hypo']
        micro_recall = avg_metrics['recall']
        micro_prec_target = avg_metrics['precision_target']
        display_text = f"{rouge1} {rouge2} {rougeL} " \
                       f"{avg_metrics['ent_count_hypo']} {avg_metrics['ent_count_hypo_source']} " \
                       f"{avg_metrics['ent_count_hypo_source'] / avg_metrics['ent_count_hypo']} " \
                       f"{avg_metrics['ent_count_target']} {avg_metrics['ent_count_target_hypo']} " \
                       f"{macro_recall} " \
                       f"{avg_metrics['ent_count_hypo_target']} " \
                       f"{macro_prec_target} " \
                       f"{avg_metrics['precision_source']} {micro_prec_target} " \
                       f"{micro_recall} " \
                       f"{2 * macro_recall * macro_prec_target / (macro_recall + macro_prec_target)} " \
                       f"{2 * micro_recall * micro_prec_target / (micro_recall + micro_prec_target)}"
        print(display_text)

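    # Persist the per-example metrics, one space-separated row per input line.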
    with open(output_file, 'w') as outfile:
        for i in range(n_h):
            text = ""
            for k in metric_names:
                text += "{} ".format(result[k][i])
            outfile.write(text + '\n')
    print("Output saved: ", output_file)