in low_rank_comparisons/eval/eval.py [0:0]
def parse(refs_path, hyps_path, num_refs, lng='en'):
    """Load reference and hypothesis texts and build tokenized copies.

    Args:
        refs_path: Path of the reference file. When ``num_refs`` is greater
            than 1 this is a prefix and the files read are
            ``refs_path + '0'``, ``refs_path + '1'``, ... up to
            ``num_refs - 1``.
        hyps_path: Path of the hypothesis file.
        num_refs: Number of reference files to read.
        lng: Language code. ``'ru'`` tokenizes with ``tokenize`` (joining the
            ``.text`` of each token); any other value tokenizes with
            ``nltk.word_tokenize``.

    Returns:
        A 4-tuple ``(references, references_tok, hypothesis, hypothesis_tok)``:
        ``references`` is a list with one entry per line, each entry being
        the list of that line's text across the reference files;
        ``references_tok`` has the same shape with every text re-joined from
        its tokens using single spaces; ``hypothesis`` is the list of lines
        of the hypothesis file and ``hypothesis_tok`` its tokenized
        counterpart.

    Note:
        Files are split on ``'\\n'`` only, so a file that ends with a newline
        contributes a trailing empty string.
    """

    def _space_joined_tokens(sentence):
        # Language-dependent tokenization, re-joined with single spaces.
        if lng == 'ru':
            return ' '.join([token.text for token in tokenize(sentence)])
        return ' '.join(nltk.word_tokenize(sentence))

    logging.info('STARTING TO PARSE INPUTS...')
    print('STARTING TO PARSE INPUTS...')

    # Raw references: line number -> texts collected from every reference file.
    references = []
    for file_idx in range(num_refs):
        # A single reference file is addressed by the bare path; several
        # files are addressed by appending their index to the path.
        if num_refs > 1:
            ref_file = refs_path + str(file_idx)
        else:
            ref_file = refs_path
        with codecs.open(ref_file, 'r', 'utf-8') as handle:
            lines = handle.read().split('\n')
        for line_no, line in enumerate(lines):
            if line_no < len(references):
                references[line_no].append(line)
            else:
                references.append([line])

    # Tokenized references (fresh lists; the raw references stay untouched).
    references_tok = [
        [_space_joined_tokens(ref) for ref in group]
        for group in references
    ]

    # Raw hypotheses, one per line.
    with codecs.open(hyps_path, 'r', 'utf-8') as handle:
        hypothesis = handle.read().split('\n')

    # Tokenized hypotheses.
    hypothesis_tok = [_space_joined_tokens(hyp) for hyp in hypothesis]

    logging.info('FINISHING TO PARSE INPUTS...')
    print('FINISHING TO PARSE INPUTS...')
    return references, references_tok, hypothesis, hypothesis_tok