def create_preprocessed_dataset_one_preprocessor()

in access/resources/datasets.py [0:0]


def create_preprocessed_dataset_one_preprocessor(dataset, preprocessor, n_jobs):
    """Create a preprocessed copy of ``dataset`` using a single preprocessor.

    For every phase in ``PHASES`` whose complex *and* simple source files both
    exist, the file pair is encoded with ``preprocessor.encode_file_pair`` and
    written to the matching paths of the new dataset. The preprocessors already
    recorded for the source dataset (if any), followed by ``preprocessor``, are
    then dumped into the new dataset directory, along with an
    ``original_dataset`` file holding the source dataset name.

    Args:
        dataset: Name of the source dataset.
        preprocessor: Object exposing ``encode_file_pair``; also used to derive
            the name of the new dataset.
        n_jobs: Forwarded as ``n_jobs`` to
            ``get_parallel_file_pair_preprocessor``.

    Returns:
        The name of the preprocessed dataset, as computed by
        ``get_preprocessed_dataset_name(dataset, preprocessor)``.
    """
    new_dataset = get_preprocessed_dataset_name(dataset, preprocessor)
    new_dataset_dir = get_dataset_dir(new_dataset)
    # NOTE(review): presumably the body is skipped when the directory already
    # exists -- confirm against create_directory_or_skip.
    with create_directory_or_skip(new_dataset_dir):
        print(f'Creating preprocessed dataset with {preprocessor}: {dataset} -> {new_dataset}')
        filepaths_dict = get_filepaths_dict(dataset)
        new_filepaths_dict = get_filepaths_dict(new_dataset)
        for phase in PHASES:
            complex_filepath = filepaths_dict[phase, 'complex']
            simple_filepath = filepaths_dict[phase, 'simple']
            # Both sides of the pair are required; skip phases missing either.
            if not complex_filepath.exists() or not simple_filepath.exists():
                continue
            parallel_file_pair_preprocessor = get_parallel_file_pair_preprocessor(
                preprocessor.encode_file_pair,
                n_jobs=n_jobs,
            )
            parallel_file_pair_preprocessor(
                complex_filepath,
                simple_filepath,
                new_filepaths_dict[phase, 'complex'],
                new_filepaths_dict[phase, 'simple'],
            )
        # Loaded once, outside the loop, so it is always bound even when every
        # phase was skipped.
        previous_preprocessors = load_preprocessors(get_dataset_dir(dataset))
        if previous_preprocessors is not None:
            preprocessors = previous_preprocessors + [preprocessor]
        else:
            preprocessors = [preprocessor]
        dump_preprocessors(preprocessors, new_dataset_dir)
        # Record which dataset this one was derived from.
        with open(new_dataset_dir / 'original_dataset', 'w') as f:
            f.write(dataset + '\n')

    return new_dataset