in muss/resources/datasets.py [0:0]
def mix_datasets(datasets, props=None, new_dataset=None):
    """Mix several datasets into a new dataset according to given proportions.

    Args:
        datasets: Sequence of dataset names; each dataset directory must
            already exist on disk.
        props: Optional sequence of mixing proportions, one per dataset.
            Defaults to uniform proportions (1 / len(datasets) each).
        new_dataset: Optional name for the mixed dataset. Defaults to a
            canonical name derived from the sorted dataset names and their
            proportions (e.g. ``mix-foo_0.50-bar_0.50``).

    Returns:
        The name of the mixed dataset, or the single input dataset name when
        all inputs are identical (no mixing needed).

    Raises:
        ValueError: If ``props`` and ``datasets`` differ in length, or if a
            dataset directory does not exist.
    """
    # Nothing to mix when every entry is the same dataset.
    if len(set(datasets)) == 1:
        return datasets[0]
    if props is None:
        # Uniform mixing by default
        props = [1 / len(datasets)] * len(datasets)
    # Validate inputs with explicit exceptions: `assert` statements are
    # stripped under `python -O` and give no useful message.
    if len(props) != len(datasets):
        raise ValueError(f'props and datasets must have the same length: {len(props)} != {len(datasets)}')
    missing = [dataset for dataset in datasets if not get_dataset_dir(dataset).exists()]
    if missing:
        raise ValueError(f'Dataset directories do not exist: {missing}')
    # Sort in unison according to dataset names so the derived mix name is
    # canonical regardless of the input order.
    datasets, props = zip(*sorted(zip(datasets, props)))
    if new_dataset is None:
        new_dataset = 'mix-' + '-'.join([f'{dataset}_{prop:.2f}' for dataset, prop in zip(datasets, props)])
    with create_directory_or_skip(get_dataset_dir(new_dataset)):
        print('Mixing datasets...')
        for phase, language in product(PHASES, LANGUAGES):
            input_files = [get_data_filepath(dataset, phase, language) for dataset in datasets]
            # If one of the input files does not exist, we remove it and its
            # prop and renormalize the remaining proportions.
            existing = [(input_file, prop) for input_file, prop in zip(input_files, props) if input_file.exists()]
            if not existing:
                # No dataset provides this (phase, language) combination:
                # nothing to mix. (Previously zip(*[]) raised a ValueError
                # "not enough values to unpack" here.)
                continue
            input_files, current_props = zip(*existing)
            current_props = np.array(current_props) / np.sum(current_props)
            output_file = get_data_filepath(new_dataset, phase, language)
            # TODO: Jointly mix files
            # The seed is set every time mix is called, therefore they should
            # be mixed in the same order.
            mix_files(input_files, current_props, output_file)
            shuffle_file_lines(output_file)
    return new_dataset