in preprocess/evaluate_hypo.py [0:0]
import os
import re
import subprocess

import numpy as np
import spacy
from rouge_score import rouge_scorer
from tqdm import tqdm

# count_lines_in_text_file, fix_empty_lines, ent_count_match, and the
# module-level `args` namespace (providing args.scispacy) are assumed to be
# defined elsewhere in this script.


def evaluate_hypo(source_file, target_file, hypo_file, output_file,
                  eval_rouge=True, rouge_package='files2rouge', no_prec_recall=False):
    n_s = count_lines_in_text_file(source_file)
    n_t = count_lines_in_text_file(target_file)
    n_h = count_lines_in_text_file(hypo_file)
    rouge1, rouge2, rougeL = None, None, None
    assert n_s == n_t == n_h, \
        "Number of lines not consistent: {}, {}, {}".format(n_s, n_t, n_h)
    metric_names = [
        'ent_count_hypo', 'ent_count_hypo_source',
        'ent_count_target', 'ent_count_target_hypo',
        'ent_count_hypo_target',
        'precision_source',
        'precision_target',
        'recall',
    ]
    if eval_rouge and rouge_package == "rouge_scorer":
        metric_names += ['rouge1', 'rouge2', 'rougeL']
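    # Two ROUGE backends are supported: "files2rouge" shells out to the
    # files2rouge CLI on PTB-tokenized files and parses the scores from its
    # stdout, while "rouge_scorer" computes scores in-process with Google's
    # rouge-score package inside the per-line loop below.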
    if eval_rouge and rouge_package == "files2rouge":
        # PTBTokenizer needs the Stanford CoreNLP jar on the Java classpath.
        os.environ["CLASSPATH"] = "/home/ec2-user/stanford-corenlp-full-2018-10-05/stanford-corenlp-3.9.2.jar"
        if not os.path.exists(target_file + ".tokenized"):
            # Remove Unicode line separators (u'\u2028') so the line count is
            # preserved through tokenization.
            with open(target_file, 'r') as f:
                lines = f.readlines()
            with open(target_file + '.tmp', 'w') as fout:
                for line in lines:
                    fout.write(" ".join(line.strip().split(u'\u2028')) + '\n')
            assert n_t == count_lines_in_text_file(target_file + '.tmp')
            print("Tokenizing:", target_file)
            cmd = "cat {} | java edu.stanford.nlp.process.PTBTokenizer -ioFileList -preserveLines > {}".format(
                target_file + '.tmp',
                target_file + ".tokenized"
            )
            with subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                shell=True
            ) as p:
                # stderr is merged into stdout above, so only stdout is read.
                stdout, _ = p.communicate()
            print(stdout)
        if not os.path.exists(hypo_file + ".tokenized"):
            print("Tokenizing:", hypo_file)
            cmd = "cat {} | java edu.stanford.nlp.process.PTBTokenizer -ioFileList -preserveLines > {}".format(
                hypo_file,
                hypo_file + ".tokenized"
            )
            with subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                shell=True
            ) as p:
                stdout, _ = p.communicate()
            print(stdout)
            fix_empty_lines(hypo_file + ".tokenized")
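        # Run files2rouge on the tokenized pair and scrape the three average
        # F-scores from its report, which the regexes below expect to contain
        # lines of the form "ROUGE-N Average_F: <score>".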
cmd = "files2rouge {} {}".format(
hypo_file + ".tokenized",
target_file + ".tokenized"
)
rouge1_re = re.compile(r"ROUGE-1 Average_F: ([0-9\\.]+)")
rouge2_re = re.compile(r"ROUGE-2 Average_F: ([0-9\\.]+)")
rougeL_re = re.compile(r"ROUGE-L Average_F: ([0-9\\.]+)")
with subprocess.Popen(
cmd.split(),
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT
) as p:
stdout, stderr = p.communicate()
out_text = stdout.decode()
print(out_text)
rouge1 = rouge1_re.findall(out_text)[0]
rouge2 = rouge2_re.findall(out_text)[0]
rougeL = rougeL_re.findall(out_text)[0]
    result = {}
    for m in metric_names:
        result[m] = []
    if eval_rouge and rouge_package == "rouge_scorer":
        scorer = rouge_scorer.RougeScorer(
            [name for name in metric_names if name.startswith("rouge")],
            use_stemmer=True
        )
    if no_prec_recall:
        return
    # `args` is the module-level namespace parsed elsewhere in this script.
    if args.scispacy:
        # Biomedical NER model from scispaCy.
        nlp = spacy.load("en_ner_bc5cdr_md")
        print("Using scispacy!")
    else:
        nlp = spacy.load("en_core_web_lg")
    with open(source_file, 'r') as s_f, \
            open(target_file, 'r') as t_f, \
            open(hypo_file, 'r') as h_f:
        for _ in tqdm(range(n_h)):
            sline = s_f.readline().strip()
            tline = t_f.readline().strip()
            hline = h_f.readline().strip()
            if eval_rouge and rouge_package == "rouge_scorer":
                rouge = scorer.score(tline, hline)
                for m in rouge.keys():
                    result[m].append(rouge[m].fmeasure)
            ent_count_hypo, ent_count_hypo_source = ent_count_match(nlp, hline, sline, args.scispacy)
            # The first return value (the hypothesis entity count) is the same
            # as in the call above, so it is discarded here.
            _, ent_count_hypo_target = ent_count_match(nlp, hline, tline, args.scispacy)
            ent_count_target, ent_count_target_hypo = ent_count_match(nlp, tline, hline, args.scispacy)
            result['ent_count_hypo'].append(ent_count_hypo)
            result['ent_count_hypo_source'].append(ent_count_hypo_source)
            result['ent_count_target'].append(ent_count_target)
            result['ent_count_target_hypo'].append(ent_count_target_hypo)
            result['ent_count_hypo_target'].append(ent_count_hypo_target)
            if ent_count_hypo == 0:
                # No entities in the hypothesis: precision is undefined.
                result['precision_source'].append(np.nan)
                result['precision_target'].append(np.nan)
            else:
                result['precision_source'].append(ent_count_hypo_source * 1.0 / ent_count_hypo)
                result['precision_target'].append(ent_count_hypo_target * 1.0 / ent_count_hypo)
            if ent_count_target == 0:
                # No entities in the target: recall is undefined.
                result['recall'].append(np.nan)
            else:
                result['recall'].append(ent_count_target_hypo * 1.0 / ent_count_target)
    avg_metrics = {}
    for k in metric_names:
        avg_metrics[k] = np.nanmean(np.array(result[k]))
        print("average {}={}".format(k, avg_metrics[k]))
    if rouge1 and rouge2 and rougeL and not no_prec_recall:
        macro_recall = avg_metrics['ent_count_target_hypo'] / avg_metrics['ent_count_target']
        macro_prec_target = avg_metrics['ent_count_hypo_target'] / avg_metrics['ent_count_hypo']
        micro_recall = avg_metrics['recall']
        micro_prec_target = avg_metrics['precision_target']
        # One space-separated row, in order: ROUGE-1/2/L; ent_count_hypo,
        # ent_count_hypo_source, and their ratio (macro source-precision);
        # ent_count_target, ent_count_target_hypo, macro_recall;
        # ent_count_hypo_target, macro_prec_target; micro source-precision,
        # micro_prec_target, micro_recall; macro F1; micro F1.
        display_text = f"{rouge1} {rouge2} {rougeL} " \
                       f"{avg_metrics['ent_count_hypo']} {avg_metrics['ent_count_hypo_source']} " \
                       f"{avg_metrics['ent_count_hypo_source'] / avg_metrics['ent_count_hypo']} " \
                       f"{avg_metrics['ent_count_target']} {avg_metrics['ent_count_target_hypo']} " \
                       f"{macro_recall} " \
                       f"{avg_metrics['ent_count_hypo_target']} " \
                       f"{macro_prec_target} " \
                       f"{avg_metrics['precision_source']} {micro_prec_target} " \
                       f"{micro_recall} " \
                       f"{2 * macro_recall * macro_prec_target / (macro_recall + macro_prec_target)} " \
                       f"{2 * micro_recall * micro_prec_target / (micro_recall + micro_prec_target)}"
        print(display_text)
    with open(output_file, 'w') as outfile:
        for i in range(n_h):
            text = ""
            for k in metric_names:
                text += "{} ".format(result[k][i])
            outfile.write(text + '\n')
    print("Output saved:", output_file)