mlebench/competitions/herbarium-2020-fgvc7/prepare.py [53:118]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        old_train_metadata = json.load(f)

    # Organize data by category so that we can split per-category later.
    # Maps category_id -> list of {"annotation": ..., "image": ...} records.
    annotations_images_by_category = {}  # We'll collect both `annotations` and `images` here
    # `zip` stops at the shorter input, so a length mismatch would silently drop
    # samples instead of failing; verify the 1:1 pairing up front.
    assert len(old_train_metadata["annotations"]) == len(old_train_metadata["images"]), (
        "Mismatching number of annotations and images: "
        f"{len(old_train_metadata['annotations'])} vs {len(old_train_metadata['images'])}"
    )
    # Iterate the zip lazily; materializing it into a list only costs memory
    for annotation, image in zip(old_train_metadata["annotations"], old_train_metadata["images"]):
        assert (
            annotation["image_id"] == image["id"]
        ), f"Mismatching image_id in annotation and image: {annotation['image_id']} vs {image['id']}"
        category_id = annotation["category_id"]
        annotations_images_by_category.setdefault(category_id, []).append(
            {
                "annotation": annotation,
                "image": image,
            }
        )

    # Split train/test
    train_sample_count = 0  # Useful for tqdm later
    train_annotations_images_by_category = {}
    test_annotations_images_by_category = {}
    test_fraction = 0.2  # Target 80:20 train/test ratio within each category

    for category_id, annotations_images in tqdm(
        annotations_images_by_category.items(), desc="Assigning train/test splits"
    ):
        # Split per "category" (class) so that, wherever the category has more
        # than one sample, it is represented in both train and test
        category_size = len(annotations_images)
        if category_size >= 5:
            # Enough samples for a seeded random 80:20 split
            train_part, test_part = train_test_split(
                annotations_images, test_size=test_fraction, random_state=0
            )
        elif category_size == 1:
            # A single sample cannot be shared: keep it in train
            train_part = annotations_images
            test_part = []
        else:
            # Too few samples for the fraction to yield a test sample, so hold
            # out at least one from the end of the list
            held_out = max(1, int(category_size * test_fraction))
            train_part = annotations_images[:-held_out]
            test_part = annotations_images[-held_out:]

        train_sample_count += len(train_part)
        train_annotations_images_by_category[category_id] = train_part
        test_annotations_images_by_category[category_id] = test_part

    # Add to train set
    # Shallow copy of the top-level metadata (keeps 'categories', 'info', 'licenses',
    # 'regions') with `annotations` and `images` reset to empty lists
    new_train_metadata = {
        **old_train_metadata,
        "annotations": [],
        "images": [],
    }
    with tqdm(
        desc="Creating new train dataset",
        total=train_sample_count,
    ) as pbar:
        for category_id, annotations_images in train_annotations_images_by_category.items():
            # Create a nested directory from category_id, e.g. 15504 -> "155/04" or 3 -> "000/03"
            category_subdir = f"{category_id // 100:03d}/{category_id % 100:02d}"
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



mlebench/competitions/herbarium-2021-fgvc8/prepare.py [53:118]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        old_train_metadata = json.load(f)

    # Organize data by category so that we can split per-category later.
    # Maps category_id -> list of {"annotation": ..., "image": ...} records.
    annotations_images_by_category = {}  # We'll collect both `annotations` and `images` here
    # `zip` stops at the shorter input, so a length mismatch would silently drop
    # samples instead of failing; verify the 1:1 pairing up front.
    assert len(old_train_metadata["annotations"]) == len(old_train_metadata["images"]), (
        "Mismatching number of annotations and images: "
        f"{len(old_train_metadata['annotations'])} vs {len(old_train_metadata['images'])}"
    )
    # Iterate the zip lazily; materializing it into a list only costs memory
    for annotation, image in zip(old_train_metadata["annotations"], old_train_metadata["images"]):
        assert (
            annotation["image_id"] == image["id"]
        ), f"Mismatching image_id in annotation and image: {annotation['image_id']} vs {image['id']}"
        category_id = annotation["category_id"]
        annotations_images_by_category.setdefault(category_id, []).append(
            {
                "annotation": annotation,
                "image": image,
            }
        )

    # Split train/test
    train_sample_count = 0  # Useful for tqdm later
    train_annotations_images_by_category = {}
    test_annotations_images_by_category = {}
    test_fraction = 0.2  # Target 80:20 train/test ratio within each category
    for category_id, annotations_images in tqdm(
        annotations_images_by_category.items(), desc="Assigning train/test splits"
    ):
        # Split per "category" (class) so that, wherever the category has more
        # than one sample, it is represented in both train and test
        category_size = len(annotations_images)
        if category_size >= 5:
            # Enough samples for a seeded random 80:20 split
            train_part, test_part = train_test_split(
                annotations_images, test_size=test_fraction, random_state=0
            )
        elif category_size == 1:
            # A single sample cannot be shared: keep it in train
            train_part = annotations_images
            test_part = []
        else:
            # Too few samples for the fraction to yield a test sample, so hold
            # out at least one from the end of the list
            held_out = max(1, int(category_size * test_fraction))
            train_part = annotations_images[:-held_out]
            test_part = annotations_images[-held_out:]

        train_sample_count += len(train_part)
        train_annotations_images_by_category[category_id] = train_part
        test_annotations_images_by_category[category_id] = test_part

    # Add to train set
    # Shallow copy of the top-level metadata (keeps 'categories', 'info', 'licenses',
    # 'institutions') with `annotations` and `images` reset to empty lists
    new_train_metadata = {
        **old_train_metadata,
        "annotations": [],
        "images": [],
    }
    with tqdm(
        desc="Creating new train dataset",
        total=train_sample_count,
    ) as pbar:
        for category_id, annotations_images in train_annotations_images_by_category.items():
            # Create a nested directory from category_id, e.g. 15504 -> "155/04" or 3 -> "000/03"
            category_subdir = f"{category_id // 100:03d}/{category_id % 100:02d}"
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



