in distilvit/pexels.py [0:0]
def generate():
    """Caption the local Pexels images and publish them as a dataset.

    Every ``.jpg`` file found in the ``images`` directory is handed, in
    batches of five, to ``process_images`` running on a pool of ten threads.
    Each payload returned by a batch becomes one row (``image_id``,
    ``alt_text``, ``objects``, ``image``, ``source``). The rows are shuffled
    with a fixed seed, saved to ``new_pexels_dataset`` and pushed to the
    ``tarekziade/pexels-gpt4o`` Hub repository.
    """
    batch_size = 5
    max_workers = 10

    entries = []
    generator = AltGenerator({})
    image_files = [image for image in os.listdir("images") if image.endswith(".jpg")]

    # Nothing in this function writes to image_by_id; it is passed to the
    # workers and read back below, so process_images is expected to fill it.
    image_by_id = {}

    # Size the bar from the real workload instead of a fixed guess.
    # NOTE(review): assumes process_images advances the bar once per image;
    # confirm against its implementation.
    pbar = tqdm(total=len(image_files))
    try:
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [
                executor.submit(
                    process_images,
                    image_files[start : start + batch_size],
                    generator,
                    image_by_id,
                    pbar,
                )
                for start in range(0, len(image_files), batch_size)
            ]

            # Row ids follow completion order, not file order.
            idx = 0
            for future in concurrent.futures.as_completed(futures):
                cached_images = future.result()
                for image_id, payload in cached_images.items():
                    entries.append(
                        {
                            "image_id": idx,
                            "alt_text": payload["alt_text"],
                            "objects": payload["detected_objects"],
                            "image": image_by_id[image_id],
                            "source": "pexels",
                        }
                    )
                    idx += 1
    finally:
        # Close the bar even when a worker raised, so the terminal is not
        # left with a dangling progress line.
        pbar.close()

    features = Features(
        {
            "image_id": Value("int32"),
            "alt_text": Value("string"),
            "objects": Sequence(Value("string")),
            "image": DImage(),
            "source": Value("string"),
        }
    )
    new_entries_df = pd.DataFrame(entries)
    new_dataset = Dataset.from_pandas(new_entries_df, features=features)
    new_dataset = new_dataset.shuffle(seed=42)
    new_dataset.save_to_disk("new_pexels_dataset")
    new_dataset.push_to_hub("tarekziade/pexels-gpt4o")