in muss/fairseq/base.py [0:0]
def fairseq_preprocess(dataset, dict_path=None, source_lang='complex', target_lang='simple'):
    """Run fairseq's preprocessing CLI on ``dataset`` and return the output directory.

    The output directory is ``<dataset_dir>/fairseq_preprocessed_<source_lang>-<target_lang>``.
    The work is done while holding ``lock_directory(dataset_dir)`` and inside
    ``create_directory_or_skip(preprocessed_dir)``.
    NOTE(review): presumably the latter skips its body when the directory
    already exists -- confirm against the helper's implementation.

    Args:
        dataset: Dataset identifier, passed to ``get_dataset_dir`` and
            ``get_data_filepath``.
        dict_path: Optional dictionary path forwarded as ``--srcdict``.
            Nothing is forwarded when it is ``None``.
        source_lang: Value passed as ``--source-lang``; also used as the
            name of the source-side symlinks.
        target_lang: Value passed as ``--target-lang``; also used as the
            name of the target-side symlinks.

    Returns:
        The path of the preprocessed directory.
    """
    dataset_dir = get_dataset_dir(dataset)
    with lock_directory(dataset_dir):
        preprocessed_dir = dataset_dir / f'fairseq_preprocessed_{source_lang}-{target_lang}'
        with create_directory_or_skip(preprocessed_dir):
            # HACK: make the files of each entry in LANGUAGES reachable under
            # the requested language names by symlinking them, so that the
            # prefixes given to fairseq below resolve for source_lang/target_lang.
            for phase in PHASES:
                for language, new_language in zip(LANGUAGES, [source_lang, target_lang]):
                    symlink_path = get_data_filepath(dataset, phase, new_language)
                    if not symlink_path.exists():
                        symlink_path.symlink_to(get_data_filepath(dataset, phase, language))
            # Build each split's path for a placeholder language and strip the
            # placeholder again to obtain the language-independent prefix.
            trainpref = str(get_data_filepath(dataset, 'train', 'dummy')).replace('.dummy', '')
            validpref = str(get_data_filepath(dataset, 'valid', 'dummy')).replace('.dummy', '')
            testpref = str(get_data_filepath(dataset, 'test', 'dummy')).replace('.dummy', '')
            # Build argv as a list rather than formatting one string and
            # re-splitting it: paths containing whitespace, quotes or
            # backslashes would otherwise be broken apart or altered.
            args = [
                '--source-lang', str(source_lang),
                '--target-lang', str(target_lang),
                '--trainpref', trainpref,
                '--validpref', validpref,
                '--testpref', testpref,
                '--destdir', str(preprocessed_dir),
                '--bpe', 'sentencepiece',
                '--joined-dictionary',
                '--workers', '32',
            ]
            if dict_path is not None:
                args += ['--srcdict', str(dict_path)]
            # Quote only for display so the printed command can be copy-pasted
            # into a shell; ordinary paths are printed unchanged.
            printable_args = ' '.join(shlex.quote(arg) for arg in args)
            print(f'fairseq-preprocess {printable_args}')
            with mock_cli_args(args):
                preprocess.cli_main()
        return preprocessed_dir