in access/resources/prepare.py [0:0]
def prepare_turkcorpus():
    """Download and prepare the TurkCorpus simplification dataset.

    Clones the cocoxu/simplification repository into a temp directory, reads the
    truecased 8-reference TSV files for the 'test' and 'tune' (renamed 'valid')
    phases, tokenizes/normalizes each sentence, and writes one complex file plus
    eight reference files per phase under the local dataset directory.

    Because the Turk references are shuffled per sample in the truecased TSV,
    each reference is realigned against the already-prepared lowercased
    turkcorpus ('turkcorpus_lower') using Levenshtein similarity.

    Returns:
        str: the dataset name, 'turkcorpus'.

    Raises:
        AssertionError: if a truecased reference cannot be confidently matched
            (similarity <= 0.98) to a lowercased reference, or if the reference
            counts are inconsistent.
    """
    dataset = 'turkcorpus'
    # No-op (skips the body) when the dataset directory already exists.
    with create_directory_or_skip(get_dataset_dir(dataset)):
        # Import here to avoid circular imports
        from access.feature_extraction import get_levenshtein_similarity
        # The lowercased variant must exist first: it provides the reference
        # ordering used to realign the shuffled truecased references below.
        prepare_turkcorpus_lower()
        url = 'https://github.com/cocoxu/simplification.git'
        output_dir = Path(tempfile.mkdtemp())
        git_clone(url, output_dir)
        print('Processing...')
        # Only rename files and put them in local directory architecture
        turkcorpus_truecased_dir = output_dir / 'data/turkcorpus/truecased'
        for (old_phase, new_phase) in [('test', 'test'), ('tune', 'valid')]:
            # (1) read the .tsv for which each line is tab separated:
            #     `idx, complex_sentence, *turk_sentences = line.split('\t')`
            # (2) replace lrb and rrb, tokenize
            # (3) Turk sentences are shuffled for each sample so need to realign
            #     them with turkcorpus lower
            tsv_filepath = turkcorpus_truecased_dir / f'{old_phase}.8turkers.organized.tsv'
            output_complex_filepath = get_data_filepath(dataset, new_phase, 'complex')
            output_ref_filepaths = [get_data_filepath(dataset, new_phase, 'simple.turk', i) for i in range(8)]
            # These files will be used to reorder the shuffled ref sentences
            ordered_ref_filepaths = [
                get_data_filepath('turkcorpus_lower', new_phase, 'simple.turk', i) for i in range(8)
            ]
            with write_lines_in_parallel([output_complex_filepath] + output_ref_filepaths) as files:
                input_filepaths = [tsv_filepath] + ordered_ref_filepaths
                for tsv_line, *ordered_ref_sentences in yield_lines_in_parallel(input_filepaths):
                    sample_id, complex_sentence, *shuffled_ref_sentences = [
                        word_tokenize(normalize_quotes(replace_lrb_rrb(s))) for s in tsv_line.split('\t')
                    ]
                    reordered_sentences = []
                    for ordered_ref_sentence in ordered_ref_sentences:
                        # Hoist the space-stripped form out of the comprehension:
                        # it is invariant over the shuffled candidates.
                        squeezed_ordered = ordered_ref_sentence.replace(' ', '')
                        # Find the position of the ref_sentence in the shuffled
                        # sentences (compare lowercased, whitespace-free forms).
                        similarities = [
                            get_levenshtein_similarity(squeezed_ordered,
                                                       shuffled_ref_sentence.lower().replace(' ', ''))
                            for shuffled_ref_sentence in shuffled_ref_sentences
                        ]
                        # Cast the NumPy integer to a plain int for list indexing.
                        idx = int(np.argmax(similarities))
                        # A few sentences have differing punctuation marks
                        assert similarities[idx] > 0.98, \
                            f'{ordered_ref_sentence} != {shuffled_ref_sentences[idx].lower()} {similarities[idx]:.2f}'
                        # Pop the matched candidate so it cannot be matched twice.
                        reordered_sentences.append(shuffled_ref_sentences.pop(idx))
                    # Every shuffled reference must have been consumed exactly once.
                    assert len(shuffled_ref_sentences) == 0
                    assert len(reordered_sentences) == 8
                    files.write([complex_sentence] + reordered_sentences)
    return dataset