def get_datasets()

in create_only_with_pdfs/upload_data.py [0:0]


def get_datasets(*, base_dir: str = '/fsx/m4/datasets/docmatix_pdf', num_shards: int = 200):
    """Return the concatenated dataset, building and caching it if needed.

    If ``<base_dir>/concatenated`` already exists as a directory, it is loaded
    with ``load_from_disk`` and returned directly. Otherwise the shards
    ``<base_dir>/shard_0`` .. ``<base_dir>/shard_<num_shards - 1>`` are loaded
    one by one, concatenated, saved to ``<base_dir>/concatenated`` and
    returned.

    Args:
        base_dir: Directory holding the ``shard_<n>`` directories and the
            ``concatenated`` cache directory.
        num_shards: Number of shard indices to try, starting at 0.

    Returns:
        The object produced by ``load_from_disk`` (cache hit) or by
        ``concatenate_datasets`` (cache miss).

    Raises:
        ValueError: If none of the shards could be loaded.
    """
    concatenated_dir = f'{base_dir}/concatenated'
    if os.path.isdir(concatenated_dir):
        return load_from_disk(concatenated_dir)

    hf_datasets = []
    for shard_nr in tqdm(range(num_shards)):
        try:
            hf_datasets.append(load_from_disk(f'{base_dir}/shard_{shard_nr}'))
        except Exception as e:
            # Best effort: a shard that cannot be loaded is reported and
            # skipped so that the remaining shards are still used.
            print(f"Error loading dataset from: {shard_nr}")
            print(e)

    if not hf_datasets:
        # Fail with an explicit message instead of handing an empty list to
        # concatenate_datasets.
        raise ValueError(f"No dataset shards could be loaded from: {base_dir}")

    hf_data = concatenate_datasets(hf_datasets)
    hf_data.save_to_disk(concatenated_dir)
    return hf_data