create_only_with_pdfs/upload_data.py (22 lines of code) (raw):
from datasets import load_from_disk, concatenate_datasets
from tqdm import tqdm
import os
def get_datasets():
if os.path.isdir('/fsx/m4/datasets/docmatix_pdf/concatenated'):
return load_from_disk('/fsx/m4/datasets/docmatix_pdf/concatenated')
hf_datasets = []
for shard_nr in tqdm(range(200)):
try:
hf_datasets.append(load_from_disk(f'/fsx/m4/datasets/docmatix_pdf/shard_{shard_nr}'))
except Exception as e:
# if os.path.isdir(f'/fsx/m4/datasets/docmatix_pdf/shard_{shard_nr}'):
# shutil.rmtree(f'/fsx/m4/datasets/docmatix_pdf/shard_{shard_nr}')
print(f"Error loading dataset from: {shard_nr}")
print(e)
hf_data = concatenate_datasets(hf_datasets)
hf_data.save_to_disk('/fsx/m4/datasets/docmatix_pdf/concatenated')
return hf_data
data = get_datasets()
print(data.features)
print(data[0]['texts'])
print(data[0]['pdf'][:10])
print(len(data))
data.push_to_hub('HuggingFaceM4/Docmatix', 'pdf')