def prepare()

in mlebench/competitions/whale-categorization-playground/prepare.py [0:0]


def prepare(raw: Path, public: Path, private: Path) -> None:
    """
    Splits the data in raw into public and private datasets with appropriate test/train splits.

    Roughly 10% of the rows of `raw/train.csv` are held out as the new test set. Every whale
    Id with at least two images is guaranteed one image on each side of the split; Ids with a
    single image land on one side at random. Test rows whose Id ends up absent from the new
    train set are relabelled as "new_whale".

    Args:
        raw: Directory holding the original `train.csv` and the `train/` image folder.
        public: Output directory that receives `train.csv`, `sample_submission.csv` and the
            `train/` and `test/` image folders.
        private: Output directory that receives the held-out answers as `test.csv`.

    Raises:
        AssertionError: If any sanity check on the produced split or the copied files fails.
    """

    # Create train, test from train split
    old_train = read_csv(raw / "train.csv")

    # Every row starts unassigned; the steps below move rows to "train" or "test".
    old_train["split"] = "undecided"
    target_test_size = 0.1

    # seeded random generator for numpy (keeps the single-image assignment reproducible)
    np_rng = np.random.default_rng(0)

    # ensure each id occurs in train and test set at least once
    # when there's only one image for an id, goes randomly to train or test
    # NOTE: the iteration order and the per-Id sampling calls determine the final split, so
    # they are deliberately left exactly as they are.
    whale_ids = old_train["Id"].unique()
    for whale_id in whale_ids:
        whale_images = old_train[old_train["Id"] == whale_id]
        if len(whale_images) >= 2:
            # randomly assign one of these to train and one to test
            selected = whale_images.sample(2, random_state=0)
            old_train.loc[selected.index[0], "split"] = "train"
            old_train.loc[selected.index[1], "split"] = "test"
        else:
            # randomly assign this one image to train or test, weighted by the target ratio
            old_train.loc[whale_images.index[0], "split"] = np_rng.choice(
                ["train", "test"], replace=False, p=[1 - target_test_size, target_test_size]
            )

    # split the remaining data
    remaining_data = old_train[old_train["split"] == "undecided"]
    train, test = train_test_split(remaining_data, test_size=target_test_size, random_state=0)
    old_train.loc[train.index, "split"] = "train"
    old_train.loc[test.index, "split"] = "test"

    # finally, can split out into separate dataframes
    new_train = old_train[old_train["split"] == "train"].drop(columns=["split"]).copy()
    answers = old_train[old_train["split"] == "test"].drop(columns=["split"]).copy()

    # If a whale Id is only in the test set, it should be labeled as new_whale instead
    ids_in_test_but_not_train = set(answers["Id"]) - set(new_train["Id"])
    answers.loc[answers["Id"].isin(ids_in_test_but_not_train), "Id"] = "new_whale"

    # Create sample submission: same rows as the answers, with a constant placeholder prediction
    sample_submission = answers.copy()
    sample_submission["Id"] = "new_whale w_1287fbc w_98baff9 w_7554f44 w_1eafe46"

    # Checks
    assert len(answers) == len(
        sample_submission
    ), "Answers and sample submission should have the same length"
    assert new_train.shape[1] == 2, "Train should have exactly 2 columns"
    assert sample_submission.shape[1] == 2, "Sample submission should have exactly 2 columns"
    assert answers.shape[1] == 2, "Answers should have exactly 2 columns"
    assert (
        "new_whale" in answers["Id"].values
    ), "Answers should contain at least some values with 'new_whale' in the 'Id' column"
    assert len(new_train) + len(answers) == len(
        old_train
    ), "The combined length of new_train and answers should equal the length of old_train"

    # Write CSVs (answers stay private; train and the sample submission are public)
    answers.to_csv(private / "test.csv", index=False)
    new_train.to_csv(public / "train.csv", index=False)
    sample_submission.to_csv(public / "sample_submission.csv", index=False)

    # Copy over files
    (public / "test").mkdir(exist_ok=True)
    (public / "train").mkdir(exist_ok=True)

    for file_id in tqdm(new_train["Image"], desc="Copying train images"):
        shutil.copyfile(
            src=raw / "train" / f"{file_id}",
            dst=public / "train" / f"{file_id}",
        )

    # Test images also originate from the raw train folder, since the test set is carved
    # out of the original training data.
    for file_id in tqdm(answers["Image"], desc="Copying test images"):
        shutil.copyfile(
            src=raw / "train" / f"{file_id}",
            dst=public / "test" / f"{file_id}",
        )

    # File checks
    train_files = list(public.glob("train/*.jpg"))
    test_files = list(public.glob("test/*.jpg"))
    assert len(train_files) == len(
        new_train
    ), "Train dir should have the same number of images as the length of train set"
    assert len(test_files) == len(
        answers
    ), "Test dir should have the same number of images as the length of test set"
    # Compare by file name: the full paths always differ (train/ vs test/), so intersecting
    # the Path objects themselves could never detect an image present in both splits.
    train_names = {path.name for path in train_files}
    test_names = {path.name for path in test_files}
    assert train_names.isdisjoint(test_names), "Train and test files should be distinct"