def fairseq_preprocess()

in muss/fairseq/base.py [0:0]


def fairseq_preprocess(dataset, dict_path=None, source_lang='complex', target_lang='simple'):
    """Run the fairseq preprocessing CLI on ``dataset`` and return the output directory.

    The work is done while holding ``lock_directory`` on the dataset directory and
    inside ``create_directory_or_skip(preprocessed_dir)``.
    NOTE(review): presumably the latter skips its body when the directory already
    exists -- confirm against its definition.

    Args:
        dataset: Dataset identifier, forwarded to ``get_dataset_dir`` and
            ``get_data_filepath``.
        dict_path: Optional dictionary path passed as ``--srcdict``; the option is
            omitted when this is ``None``.
        source_lang: Value passed as ``--source-lang``; also used as the file suffix
            of the source-side symlinks.
        target_lang: Value passed as ``--target-lang``; also used as the file suffix
            of the target-side symlinks.

    Returns:
        The path ``dataset_dir / f'fairseq_preprocessed_{source_lang}-{target_lang}'``.
    """
    dataset_dir = get_dataset_dir(dataset)
    with lock_directory(dataset_dir):
        preprocessed_dir = dataset_dir / f'fairseq_preprocessed_{source_lang}-{target_lang}'
        with create_directory_or_skip(preprocessed_dir):
            # HACK: make the data files reachable under the requested language
            # suffixes by symlinking them to the files stored under LANGUAGES.
            for phase in PHASES:
                for language, new_language in zip(LANGUAGES, [source_lang, target_lang]):
                    symlink_path = get_data_filepath(dataset, phase, new_language)
                    if not symlink_path.exists():
                        symlink_path.symlink_to(get_data_filepath(dataset, phase, language))
            # The prefixes are the data file paths with the language suffix removed;
            # a placeholder language is used so that it can be stripped again.
            trainpref, validpref, testpref = (
                str(get_data_filepath(dataset, phase, 'dummy')).replace('.dummy', '')
                for phase in ('train', 'valid', 'test')
            )
            # Build the argument vector directly rather than formatting one string
            # and re-splitting it: every value stays exactly one argument even when
            # a path contains whitespace, quotes or backslashes.
            args = [
                '--source-lang', str(source_lang),
                '--target-lang', str(target_lang),
                '--trainpref', trainpref,
                '--validpref', validpref,
                '--testpref', testpref,
                '--destdir', str(preprocessed_dir),
                '--bpe', 'sentencepiece',
                '--joined-dictionary',
                '--workers', '32',
            ]
            if dict_path is not None:
                args += ['--srcdict', str(dict_path)]
            # Log a copy-pasteable command line; quoting is only added to arguments
            # that actually need it.
            printable_args = ' '.join(shlex.quote(arg) for arg in args)
            print(f'fairseq-preprocess {printable_args}')
            # NOTE(review): mock_cli_args presumably substitutes the process CLI
            # arguments while cli_main() runs -- confirm against its definition.
            with mock_cli_args(args):
                preprocess.cli_main()
        return preprocessed_dir