mlebench/competitions/herbarium-2020-fgvc7/prepare.py

import json
import random
import shutil
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm

from mlebench.utils import get_logger

logger = get_logger(__name__)


def prepare(raw: Path, public: Path, private: Path):
    """
    Splits the raw data into public and private datasets with appropriate test/train splits.

    `train/metadata.json` is the "table of contents" for our data, with the following structure:
    (More details at https://www.kaggle.com/competitions/herbarium-2020-fgvc7/data)
    ```
    {
        "annotations" : [annotation],
        "categories" : [category],
        "images" : [image],
        "info" : info,
        "licenses" : [license],
        "regions" : [region]
    }
    ```
    - `images` and `annotations` are both N-length lists corresponding to the N samples.
      We'll need to split each of these lists into train and test.
    - The other fields are dataset-wide metadata that we don't need to touch.
    - test/metadata.json is the same structure as train/metadata.json, but without
      "annotations", "categories", "regions"

    Other notes:
    - train/test splits need to occur per category (each category should be in both train and test).
    - The `test/images` and `train/images` folders have nested subdirs to make it easier to browse
        - `train/images` is structured as `{category_id[:3]}/{category_id[3:]}/{image_id}.jpg`
        - `test/images` is structured as `{image_idx[:3]}/{image_idx}.jpg` (to not reveal the category)
    - When we create the new splits, we re-assign image indices so that we don't give away labels
      based on the index
        - train images are indexed within their own category
        - test images follow a flat index after shuffling the categories
    """
    dev_mode = False
    dev_count = 2  # Copy over n images per category when in dev mode

    # Create train, test from train split
    json_path = raw / "nybg2020/train/metadata.json"
    with open(json_path, "r", encoding="latin-1") as f:  # utf-8 fails
        old_train_metadata = json.load(f)

    # Organize data by category so that we can split per-category later
    annotations_images_by_category = {}  # We'll collect both `annotations` and `images` here
    for annotation, image in list(
        zip(old_train_metadata["annotations"], old_train_metadata["images"])
    ):
        assert (
            annotation["image_id"] == image["id"]
        ), f"Mismatching image_id in annotation and image: {annotation['image_id']} vs {image['id']}"
        category_id = annotation["category_id"]
        if category_id not in annotations_images_by_category:
            annotations_images_by_category[category_id] = []
        annotations_images_by_category[category_id].append(
            {
                "annotation": annotation,
                "image": image,
            }
        )

    # Split train/test
    train_sample_count = 0  # Useful for tqdm later
    train_annotations_images_by_category = {}
    test_annotations_images_by_category = {}
    for category_id, annotations_images in tqdm(
        annotations_images_by_category.items(), desc="Assigning train/test splits"
    ):
        # Create split by "category" (class): Each category needs to be in both train and test
        # (80:20) as per original ratio
        test_size = 0.2
        n_samples = len(annotations_images)
        if n_samples == 1:
            # If only one sample, put it in train
            train_annotations_images = annotations_images
            test_annotations_images = []
        elif n_samples < 5:  # Minimum 5 samples to ensure at least 1 in test
            # Ensure at least 1 sample in test
            test_size = max(1, int(n_samples * test_size))
            train_annotations_images = annotations_images[:-test_size]
            test_annotations_images = annotations_images[-test_size:]
        else:
            train_annotations_images, test_annotations_images = train_test_split(
                annotations_images, test_size=test_size, random_state=0
            )

        train_annotations_images_by_category[category_id] = train_annotations_images
        test_annotations_images_by_category[category_id] = test_annotations_images
        train_sample_count += len(train_annotations_images)

    # Add to train set
    new_train_metadata = (
        old_train_metadata.copy()
    )  # Keep 'categories', 'info', 'licenses', 'regions'
    new_train_metadata.update(
        {
            "annotations": [],
            "images": [],
        }
    )
    with tqdm(
        desc="Creating new train dataset",
        total=train_sample_count,
    ) as pbar:
        for category_id, annotations_images in train_annotations_images_by_category.items():
            # Create a nested directory from category_id, e.g. 15504 -> "155/04" or 3 -> "000/03"
            category_subdir = f"{category_id // 100:03d}/{category_id % 100:02d}"
            (public / "nybg2020/train/images" / category_subdir).mkdir(exist_ok=True, parents=True)
            for idx, annotation_image in enumerate(annotations_images):
                new_annotation = annotation_image["annotation"].copy()
                new_train_metadata["annotations"].append(new_annotation)

                new_image = annotation_image["image"].copy()
                new_train_metadata["images"].append(new_image)

                # Copy file from raw to public
                if (
                    not dev_mode or idx < dev_count
                ):  # if dev_mode, only copy the first dev_count images
                    src_path = raw / "nybg2020/train" / annotation_image["image"]["file_name"]
                    dst_path = public / "nybg2020/train" / annotation_image["image"]["file_name"]
                    shutil.copyfile(src=src_path, dst=dst_path)
                pbar.update(1)

    with open(public / "nybg2020/train/metadata.json", "w") as f:
        json.dump(new_train_metadata, f, indent=4, sort_keys=True)

    if not dev_mode:
        assert len(list((public / "nybg2020/train/images").glob("**/*.jpg"))) == len(
            new_train_metadata["images"]
        ), f"Mismatching number of images in train_images, got {len(list((public / 'nybg2020/train/images').glob('**/*.jpg')))}"
    assert len(new_train_metadata["annotations"]) == len(
        new_train_metadata["images"]
    ), f"Mismatching number of annotations in train_metadata, got {len(new_train_metadata['annotations'])}"

    # Add to test set
    new_test_metadata = old_train_metadata.copy()
    del new_test_metadata["categories"]
    del new_test_metadata["regions"]
    new_test_metadata.update(
        {
            "annotations": [],
            "images": [],
        }
    )
    # Flatten and shuffle test set so that we don't have all the same categories in a row
    test_annotations_images = [
        item for sublist in test_annotations_images_by_category.values() for item in sublist
    ]
    random.Random(0).shuffle(test_annotations_images)
    for idx, annotation_image in tqdm(
        enumerate(test_annotations_images),
        desc="Creating new test dataset",
        total=len(test_annotations_images),
    ):
        # Make new image id, for test set this is just the index
        new_image_id = str(idx)
        # Make new filename from image id e.g. "000/0.jpg"
        new_file_name = f"images/{idx // 1000:03d}/{idx}.jpg"

        new_annotation = annotation_image["annotation"].copy()
        new_annotation["image_id"] = new_image_id
        new_test_metadata["annotations"].append(new_annotation)

        new_image = annotation_image["image"].copy()
        new_image["id"] = new_image_id
        new_image["file_name"] = new_file_name
        new_test_metadata["images"].append(new_image)

        # Copy file from raw to public
        if not dev_mode or idx < dev_count:  # if dev_mode, only copy the first dev_count images
            src_path = raw / "nybg2020/train" / annotation_image["image"]["file_name"]
            dst_path = public / "nybg2020/test" / new_file_name
            dst_path.parent.mkdir(exist_ok=True, parents=True)
            shutil.copyfile(src=src_path, dst=dst_path)

    # Save new test metadata
    with open(public / "nybg2020/test/metadata.json", "w") as f:
        # The public test data, of course, doesn't have annotations
        public_new_test = new_test_metadata.copy()
        del public_new_test["annotations"]
        assert public_new_test.keys() == {
            "images",
            "info",
            "licenses",
        }, f"Public test metadata keys should be 'images', 'info', 'licenses', but found {public_new_test.keys()}"
        json.dump(public_new_test, f, indent=4, sort_keys=True)

    if not dev_mode:
        assert len(list((public / "nybg2020/test/images").glob("**/*.jpg"))) == len(
            new_test_metadata["images"]
        ), f"Mismatching number of images in test_images, got {len(list((public / 'nybg2020/test/images').glob('**/*.jpg')))}"
    assert len(new_test_metadata["annotations"]) == len(
        new_test_metadata["images"]
    ), f"Mismatching number of annotations in test_metadata, got {len(new_test_metadata['annotations'])}"
    assert len(new_train_metadata["annotations"]) + len(
        new_test_metadata["annotations"]
    ) == len(old_train_metadata["annotations"]), (
        f"Expected {len(old_train_metadata['annotations'])} annotations in total, but found "
        f"{len(new_train_metadata['annotations'])} in train and {len(new_test_metadata['annotations'])} in test"
    )

    # Save private test answers
    answers_rows = []
    for image, annotation in zip(new_test_metadata["images"], new_test_metadata["annotations"]):
        assert (
            image["id"] == annotation["image_id"]
        ), f"Mismatching image_id in image and annotation: {image['id']} vs {annotation['image_id']}"
        answers_rows.append(
            {
                "Id": image["id"],
                "Predicted": annotation["category_id"],
            }
        )
    answers_df = pd.DataFrame(answers_rows)
    answers_df.to_csv(private / "answers.csv", index=False)

    # Create new sample submission that matches raw/sample_submission.csv, but for the new test set
    sample_rows = []
    for image in new_test_metadata["images"]:
        sample_rows.append(
            {
                "Id": image["id"],
                "Predicted": 0,
            }
        )
    sample_df = pd.DataFrame(sample_rows)
    sample_df.to_csv(public / "sample_submission.csv", index=False)

    assert len(answers_df) == len(
        new_test_metadata["images"]
    ), f"Expected {len(new_test_metadata['images'])} rows in answers, but found {len(answers_df)}"
    assert len(sample_df) == len(
        answers_df
    ), f"Expected {len(answers_df)} rows in sample submission, but found {len(sample_df)}"
    assert answers_df["Id"].equals(
        sample_df["Id"]
    ), "Mismatched 'Id' columns between answers and sample submission"