in muss/resources/datasets.py [0:0]
def create_preprocessed_dataset_one_preprocessor(dataset, preprocessor, n_jobs):
    new_dataset = get_preprocessed_dataset_name(dataset, preprocessor)
    with create_directory_or_skip(get_dataset_dir(new_dataset)):
        print(f'Creating preprocessed dataset with {preprocessor}: {dataset} -> {new_dataset}')
        new_dataset_dir = get_dataset_dir(new_dataset)
        filepaths_dict = get_filepaths_dict(dataset)
        new_filepaths_dict = get_filepaths_dict(new_dataset)
        for phase in PHASES:
            # Skip phases whose complex or simple source file is missing
            if not filepaths_dict[phase, 'complex'].exists() or not filepaths_dict[phase, 'simple'].exists():
                continue
            # Encode the complex/simple file pair in parallel over n_jobs workers
            parallel_file_pair_preprocessor = get_parallel_file_pair_preprocessor(
                preprocessor.encode_file_pair,
                n_jobs=n_jobs,
            )
            parallel_file_pair_preprocessor(
                filepaths_dict[phase, 'complex'],
                filepaths_dict[phase, 'simple'],
                new_filepaths_dict[phase, 'complex'],
                new_filepaths_dict[phase, 'simple'],
            )
        # Record the full chain of preprocessors applied to obtain this dataset,
        # along with the name of the original dataset it was derived from
        previous_preprocessors = load_preprocessors(get_dataset_dir(dataset))
        if previous_preprocessors is not None:
            preprocessors = previous_preprocessors + [preprocessor]
        else:
            preprocessors = [preprocessor]
        dump_preprocessors(preprocessors, new_dataset_dir)
        with open(new_dataset_dir / 'original_dataset', 'w') as f:
            f.write(dataset + '\n')
        # Copy SentencePiece model files into the new dataset directory when the preprocessor provides them
        if hasattr(preprocessor, 'copy_sentencepiece_files_to_dir'):
            preprocessor.copy_sentencepiece_files_to_dir(new_dataset_dir)
    return new_dataset
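
For context, a caller would typically apply several preprocessors one after the other, each step consuming the dataset produced by the previous step. The sketch below is an assumption for illustration: the wrapper name create_preprocessed_dataset and its n_jobs default are not taken from the excerpt above; only create_preprocessed_dataset_one_preprocessor is.

def create_preprocessed_dataset(dataset, preprocessors, n_jobs=1):
    # Hypothetical wrapper: chain the preprocessors in order, feeding the
    # output dataset name of one step as the input of the next.
    for preprocessor in preprocessors:
        dataset = create_preprocessed_dataset_one_preprocessor(dataset, preprocessor, n_jobs)
    return dataset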