# def prepare_dataset()
#
# in src/scripts/gen_embeddings.py [0:0]


def prepare_dataset(dataset_name, entity, indices_path):
    """Load the dataset entries that are selected and not yet finished.

    The input file is ``./data/<dataset_name>/<entity>/completions.json``
    when *entity* is truthy, otherwise ``./data/<dataset_name>/dataset.json``.
    Only entries whose key is among the indices returned by
    ``create_indices(dataset_name, indices_path)`` are kept. If a
    ``filenames.json`` file exists in the same directory, the indices it
    lists are treated as finished and removed from the selection.

    Args:
        dataset_name: Name of the dataset directory under ``./data``.
        entity: Optional sub-directory of the dataset; a falsy value selects
            the dataset-level files instead.
        indices_path: Forwarded unchanged to ``create_indices``.

    Returns:
        A dict mapping each selected key to a list; values that are not
        already lists are wrapped in a one-element list. An empty dict is
        returned if a ``KeyError`` is raised while loading the input file.
    """
    data_dir = f'./data/{dataset_name}/{entity}' if entity else f'./data/{dataset_name}'
    input_filename = f'{data_dir}/completions.json' if entity else f'{data_dir}/dataset.json'
    calculated_embeddings_indices_path = f'{data_dir}/filenames.json'

    # A set gives O(1) membership tests while filtering the JSON keys below.
    # NOTE(review): assumes the indices are hashable values comparable to the
    # JSON keys (i.e. strings) - confirm against create_indices.
    pending = set(create_indices(dataset_name, indices_path))

    if os.path.isfile(calculated_embeddings_indices_path):
        with open(calculated_embeddings_indices_path, 'r', encoding='utf-8') as f:
            finished_indices = json.load(f)
        # Plain difference, not symmetric difference: a finished index that
        # was never requested must not be pulled back into the selection.
        pending -= set(finished_indices)

    try:
        with open(input_filename, 'r', encoding='utf-8') as fp:
            raw = json.load(fp)
        dataset = {
            key: value if isinstance(value, list) else [value]
            for key, value in raw.items()
            if key in pending
        }
    except KeyError:
        # Handler kept as-is so callers relying on the empty-dict fallback
        # keep working.
        print(f"The file {input_filename} doesn't contain necessary keys.")
        return {}
    return dataset