in access/resources/prepare.py [0:0]
def prepare_turkcorpus_lower():
    """Fetch TurkCorpus and install it as the ``turkcorpus_lower`` dataset.

    Clones https://github.com/cocoxu/simplification.git into a temporary
    directory and copies the files found under ``data/turkcorpus`` to the
    locations given by ``get_data_filepath``:

    * upstream phases ``test`` / ``tune`` are stored as ``test`` / ``valid``;
    * ``norm`` / ``simp`` files are stored as ``complex`` / ``simple``;
    * the eight ``turk.{i}`` files are stored as ``simple.turk`` with ``i=i``.

    The whole body runs under ``create_directory_or_skip`` on the dataset
    directory.  The temporary clone is deleted when processing ends, whether
    it succeeded or raised.

    Returns:
        str: the dataset name, ``'turkcorpus_lower'``.
    """
    dataset = 'turkcorpus_lower'
    with create_directory_or_skip(get_dataset_dir(dataset)):
        url = 'https://github.com/cocoxu/simplification.git'
        output_dir = Path(tempfile.mkdtemp())
        try:
            git_clone(url, output_dir)
            print(output_dir)
            print('Processing...')
            # Only rename files and put them in local directory architecture
            turkcorpus_lower_dir = output_dir / 'data/turkcorpus'
            print(turkcorpus_lower_dir)
            for old_phase, new_phase in [('test', 'test'), ('tune', 'valid')]:
                # Every upstream file of a phase shares this name prefix.
                prefix = f'{old_phase}.8turkers.tok'
                for old_language_name, new_language_name in [('norm', 'complex'), ('simp', 'simple')]:
                    _install_turkcorpus_file(
                        turkcorpus_lower_dir / f'{prefix}.{old_language_name}',
                        get_data_filepath(dataset, new_phase, new_language_name),
                    )
                for i in range(8):
                    _install_turkcorpus_file(
                        turkcorpus_lower_dir / f'{prefix}.turk.{i}',
                        get_data_filepath(dataset, new_phase, 'simple.turk', i=i),
                    )
            print('Done.')
        finally:
            # mkdtemp() never cleans up after itself; without this every run
            # leaves a full repository checkout in the system temp directory.
            # Best-effort so a cleanup failure cannot mask the real error.
            shutil.rmtree(output_dir, ignore_errors=True)
    return dataset


def _install_turkcorpus_file(old_path, new_path):
    """Copy ``old_path`` to ``new_path`` and post-process the copy in place."""
    shutil.copyfile(old_path, new_path)
    add_newline_at_end_of_file(new_path)
    # replace_lrb_rrb_file returns the path of a processed file (presumably
    # with -LRB-/-RRB- tokens rewritten -- confirm against its definition);
    # move that result back over the destination.
    shutil.move(replace_lrb_rrb_file(new_path), new_path)