# mix_datasets() — extracted from muss/resources/datasets.py

def mix_datasets(datasets, props=None, new_dataset=None):
    """Mix several datasets into a new one according to given proportions.

    For each (phase, language) combination, the source files are mixed with
    `mix_files` using the (renormalized) proportions of the datasets that
    actually provide that file, then shuffled.

    Args:
        datasets: Names of the datasets to mix (must not be empty).
        props: Mixing proportion per dataset; defaults to uniform proportions.
        new_dataset: Name of the resulting dataset; when None, a deterministic
            name is derived from the sorted dataset names and proportions.

    Returns:
        The name of the mixed dataset, or the single input dataset name when
        all inputs are identical (nothing to mix).

    Raises:
        ValueError: If `datasets` is empty, `props` has a mismatched length,
            or a source dataset directory does not exist.
    """
    if not datasets:
        raise ValueError('datasets must not be empty')
    # Nothing to mix when every entry is the same dataset
    if len(set(datasets)) == 1:
        return datasets[0]
    if props is None:
        props = [1 / len(datasets)] * len(datasets)
    # Explicit raises instead of asserts: asserts are stripped under `python -O`
    if len(props) != len(datasets):
        raise ValueError('props must have the same length as datasets')
    missing = [dataset for dataset in datasets if not get_dataset_dir(dataset).exists()]
    if missing:
        raise ValueError(f'Dataset directories do not exist: {missing}')
    # Sort in unison according to dataset names so the auto-generated name is deterministic
    datasets, props = zip(*sorted(zip(datasets, props)))
    if new_dataset is None:
        new_dataset = 'mix-' + '-'.join([f'{dataset}_{prop:.2f}' for dataset, prop in zip(datasets, props)])
    with create_directory_or_skip(get_dataset_dir(new_dataset)):
        print('Mixing datasets...')
        for phase, language in product(PHASES, LANGUAGES):
            input_files = [get_data_filepath(dataset, phase, language) for dataset in datasets]
            # If one of the input files does not exist, we remove it and its prop and renormalize
            existing = [(input_file, prop) for input_file, prop in zip(input_files, props) if input_file.exists()]
            if not existing:
                # No dataset provides this (phase, language) combination:
                # skip instead of crashing on `zip(*[])`
                continue
            input_files, current_props = zip(*existing)
            current_props = np.array(current_props) / np.sum(current_props)
            output_file = get_data_filepath(new_dataset, phase, language)
            # TODO: Jointly mix files
            # The seed is set every time mix is called, therefore they should be mixed in the same order
            mix_files(input_files, current_props, output_file)
            shuffle_file_lines(output_file)
    return new_dataset