zero_shot_exp/zero_shot.py (29 lines of code) (raw):

# Build a small train/test subset of the Docmatix "images" configuration and
# push both splits to the Hub under the "zero-shot-exp" configuration.
#
# The first TEST_SUBSET_LEN streamed samples become the test split and the
# next TRAIN_SUBSET_LEN samples become the train split. The two windows are
# disjoint, so no sample is shared between the splits.

from itertools import islice

from datasets import Dataset, Features, Image, Sequence, Value, load_dataset

REPO_ID = "HuggingFaceM4/Docmatix"
SOURCE_CONFIG = "images"
TARGET_CONFIG = "zero-shot-exp"

TEST_SUBSET_LEN = 200
TRAIN_SUBSET_LEN = 1700

# Explicit schema used when materialising the streamed samples, so both
# pushed splits are built with the same feature types.
FEATURES = Features(
    {
        "images": Sequence(Image(decode=True)),
        "texts": [
            {
                "user": Value("string"),
                "assistant": Value("string"),
                "source": Value("string"),
            }
        ],
    }
)

# streaming=True: samples are iterated lazily rather than loading the whole
# dataset up front; only the first TEST_SUBSET_LEN + TRAIN_SUBSET_LEN samples
# are consumed below.
ds = load_dataset(REPO_ID, SOURCE_CONFIG, streaming=True)

# Use ONE iterator for both slices: the second islice resumes exactly where
# the first stopped, which guarantees the splits do not overlap. (The earlier
# index-based loop started the train window at TEST_SUBSET_LEN - 1, so the
# last test sample was also pushed as the first train sample.)
sample_stream = iter(ds["train"])
test_subset = list(islice(sample_stream, TEST_SUBSET_LEN))
train_subset = list(islice(sample_stream, TRAIN_SUBSET_LEN))

new_test_data = Dataset.from_list(test_subset, features=FEATURES)
new_train_data = Dataset.from_list(train_subset, features=FEATURES)

new_test_data.push_to_hub(REPO_ID, config_name=TARGET_CONFIG, split="test")
new_train_data.push_to_hub(REPO_ID, config_name=TARGET_CONFIG, split="train")