def prepare()

in mlebench/competitions/iwildcam-2020-fgvc7/prepare.py


import json
import shutil
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm


def prepare(raw: Path, public: Path, private: Path):
    """
    Splits the data in raw into public and private datasets with appropriate test/train splits.
    """

    dev_mode = False

    # Create new train and test splits from the original train split

    # Load old train
    with open(raw / "iwildcam2020_train_annotations.json", "r") as file:
        old_train_json = json.load(file)
    old_train_annotations = pd.DataFrame(old_train_json["annotations"])
    old_train_images = pd.DataFrame(old_train_json["images"])
    old_train_categories = pd.DataFrame(old_train_json["categories"])
    # old_train_info = pd.DataFrame(old_train_json["info"])

    # Load old test
    with open(raw / "iwildcam2020_test_information.json", "r") as file:
        old_test_json = json.load(file)
    old_test_categories = pd.DataFrame(old_test_json["categories"])

    # Split the train images by 'location' so each camera location lands in exactly one split
    test_size = 0.22  # original test fraction: 62894 / (217959 + 62894) ≈ 0.22
    train_image_locations = old_train_images["location"].unique()
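    # Splitting by location keeps each camera's images entirely in train or entirely in
    # test (the disjoint-locations assertion below enforces this)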
    locations_new_train, locations_new_test = train_test_split(
        train_image_locations, test_size=test_size, random_state=0
    )

    # Filter old train to new train and new test based on location
    new_train_images = old_train_images[old_train_images["location"].isin(locations_new_train)]
    new_test_images = old_train_images[old_train_images["location"].isin(locations_new_test)]

    # Adjust the split so that roughly a test_size fraction of the samples ends up in the
    # new test set; whole locations are moved so train and test stay location-disjoint
    while len(new_test_images) / (len(old_train_images) + len(new_test_images)) < test_size:
        # Move one location from train to test
        location_to_move = locations_new_train[-1]
        locations_new_train = locations_new_train[:-1]
        locations_new_test = np.append(locations_new_test, location_to_move)
        new_train_images = old_train_images[old_train_images["location"].isin(locations_new_train)]
        new_test_images = old_train_images[old_train_images["location"].isin(locations_new_test)]

    while len(new_test_images) / (len(old_train_images) + len(new_test_images)) > test_size:
        # Move one location from test to train
        location_to_move = locations_new_test[-1]
        locations_new_test = locations_new_test[:-1]
        locations_new_train = np.append(locations_new_train, location_to_move)
        new_train_images = old_train_images[old_train_images["location"].isin(locations_new_train)]
        new_test_images = old_train_images[old_train_images["location"].isin(locations_new_test)]

    # Get the image ids for new train and new test
    new_train_ids = new_train_images["id"].unique()
    new_test_ids = new_test_images["id"].unique()

    # Filter annotations based on new train and new test image ids
    new_train_annotations = old_train_annotations[
        old_train_annotations["image_id"].isin(new_train_ids)
    ]
    new_test_annotations = old_train_annotations[
        old_train_annotations["image_id"].isin(new_test_ids)
    ]
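    # Category tables are reused as-is: new train keeps the original train categories and
    # new test mirrors the original test-information file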
    new_train_categories = old_train_categories.copy()
    new_test_categories = old_test_categories.copy()

    # Private answers: ground-truth category for each new test image
    answer_annotations = new_test_annotations[["image_id", "category_id"]].copy()
    answer_annotations.rename(columns={"image_id": "Id", "category_id": "Category"}, inplace=True)

    # Create a sample submission file
    sample_submission = answer_annotations.copy()
    np.random.seed(0)
    sample_submission["Category"] = np.random.randint(
        0, 676, size=len(sample_submission)
    )  # Uniform between 0 and 675
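    # The sample submission keeps the same Id/Category columns as the private answers,
    # just with random category labels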

    # Checks
    assert set(new_train_annotations["image_id"]).isdisjoint(
        set(new_test_images["id"])
    ), "Train should not contain annotations of test images"
    assert len(new_train_ids) + len(new_test_ids) == len(
        old_train_images["id"]
    ), "The combined length of new_train_ids and new_test_ids should equal the length of old_train_images"
    # Assert that new_train_images and new_test_images have disjoint locations
    assert set(new_train_images["location"]).isdisjoint(
        set(new_test_images["location"])
    ), "Train and test images should not share locations"

    # Reform JSON files
    new_train_json = {
        "annotations": new_train_annotations.to_dict(orient="records"),
        "images": new_train_images.to_dict(orient="records"),
        "categories": new_train_categories.to_dict(orient="records"),
        "info": old_train_json["info"],
    }

    new_test_json = {
        "images": new_test_images.to_dict(orient="records"),
        "categories": new_test_categories.to_dict(orient="records"),
        "info": old_test_json["info"],
    }
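    # Note: like the original test-information file, the new test JSON has no
    # "annotations" key; the test labels go only to private/answers.csv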

    # Checks on JSONs
    assert (
        new_train_json.keys() == old_train_json.keys()
    ), "new_train_json and old_train_json should have the same keys"
    assert (
        new_test_json.keys() == old_test_json.keys()
    ), "new_test_json and old_test_json should have the same keys"

    # Write files
    answer_annotations.to_csv(private / "answers.csv", index=False)
    sample_submission.to_csv(public / "sample_submission.csv", index=False)
    with open(public / "iwildcam2020_train_annotations.json", "w") as f:
        json.dump(new_train_json, f)
    with open(public / "iwildcam2020_test_information.json", "w") as f:
        json.dump(new_test_json, f)

    # Copy over megadetector results
    shutil.copyfile(
        raw / "iwildcam2020_megadetector_results.json",
        public / "iwildcam2020_megadetector_results.json",
    )

    # Reduce the number of images copied over to 100 for dev mode
    if dev_mode:
        new_train_ids = new_train_ids[:100]
        new_test_ids = new_test_ids[:100]

    # Copy over image files
    (public / "train").mkdir(exist_ok=True)
    (public / "test").mkdir(exist_ok=True)

    for file_id in tqdm(new_train_ids, desc="Copying train images", unit="file"):
        shutil.copyfile(
            src=raw / "train" / f"{file_id}.jpg",
            dst=public / "train" / f"{file_id}.jpg",
        )

    # The new test images are also copied from raw/"train": the new test split was carved
    # out of the original train set
    for file_id in tqdm(new_test_ids, desc="Copying test images", unit="file"):
        shutil.copyfile(
            src=raw / "train" / f"{file_id}.jpg",
            dst=public / "test" / f"{file_id}.jpg",
        )

    # Check integrity of the files copied
    assert len(list(public.glob("train/*.jpg"))) == len(
        new_train_ids
    ), "Number of train images should be equal to the number of unique image_id in the train set"
    assert len(list(public.glob("test/*.jpg"))) == len(
        new_test_ids
    ), "Number of test images should be equal to the number of unique image_id in the test set"