def generate()

in distilvit/pexels.py [0:0]


def generate() -> None:
    """Build the Pexels alt-text dataset from local JPEGs and publish it.

    Lists the ``.jpg`` files in the ``images`` directory, submits them to
    ``process_images`` in batches of five on a ten-worker thread pool, and
    turns every ``(image_id, payload)`` pair of each returned mapping into
    one dataset row. The rows are shuffled with a fixed seed, saved to
    ``new_pexels_dataset`` on disk and pushed to the
    ``tarekziade/pexels-gpt4o`` Hub repository.

    Raises:
        ValueError: If no row was produced (no ``.jpg`` files, or every
            batch came back empty), so an empty dataset is never published.
        Exception: Anything raised inside a worker is re-raised here by
            ``Future.result()``.
    """
    batch_size = 5
    generator = AltGenerator({})

    # Sorted so batch composition does not depend on the arbitrary order in
    # which os.listdir returns directory entries.
    image_files = sorted(
        name for name in os.listdir("images") if name.endswith(".jpg")
    )
    # Shared with the workers and read back below by image id.
    # NOTE(review): presumably process_images stores each loaded image here
    # under its file name -- confirm against process_images.
    image_by_id = {}
    entries = []

    # The context manager closes the bar even when a worker raises.
    # NOTE(review): total assumes process_images advances the bar once per
    # image -- confirm against process_images.
    with tqdm(total=len(image_files)) as pbar:
        with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
            futures = [
                executor.submit(
                    process_images,
                    image_files[start : start + batch_size],
                    generator,
                    image_by_id,
                    pbar,
                )
                for start in range(0, len(image_files), batch_size)
            ]

            # Results are consumed in submission order (not completion order)
            # so the image_id numbering does not depend on thread timing.
            # The pool keeps working on later batches while we wait.
            for future in futures:
                for image_id, payload in future.result().items():
                    entries.append(
                        {
                            # Running 0-based row number.
                            "image_id": len(entries),
                            "alt_text": payload["alt_text"],
                            "objects": payload["detected_objects"],
                            "image": image_by_id[image_id],
                            "source": "pexels",
                        }
                    )

    if not entries:
        raise ValueError(
            'no entries were generated from the ".jpg" files in "images"; '
            "refusing to build and publish an empty dataset"
        )

    features = Features(
        {
            "image_id": Value("int32"),
            "alt_text": Value("string"),
            "objects": Sequence(Value("string")),
            "image": DImage(),
            "source": Value("string"),
        }
    )

    new_entries_df = pd.DataFrame(entries)
    new_dataset = Dataset.from_pandas(new_entries_df, features=features)
    new_dataset = new_dataset.shuffle(seed=42)
    new_dataset.save_to_disk("new_pexels_dataset")
    new_dataset.push_to_hub("tarekziade/pexels-gpt4o")