# prepare()
#
# Defined in mlebench/competitions/invasive-species-monitoring/prepare.py


def prepare(raw: Path, public: Path, private: Path) -> None:
    """
    Splits the data in raw into public and private datasets with appropriate test/train splits.

    Steps performed:
      1. Extract ``train.7z`` and ``train_labels.csv.zip`` from ``raw`` into ``raw``.
      2. Split the labelled rows into a new train set and a held-out test set
         (``answers``) using a fixed random seed.
      3. Build a sample submission from ``answers`` with every ``invasive``
         value set to 0.5.
      4. Write ``answers.csv`` and ``sample_submission.csv`` to ``private``;
         write ``train_labels.csv`` and ``sample_submission.csv`` to ``public``.
      5. Copy the corresponding images into ``public/train`` and ``public/test``.
      6. Archive the public CSVs as ``.zip`` and the image folders as ``.7z``,
         then delete the uncompressed copies from ``public``.

    Args:
        raw: Directory containing ``train.7z`` and ``train_labels.csv.zip``.
        public: Output directory for the files written in steps 4-6.
        private: Output directory for ``answers.csv`` and a copy of the
            sample submission.

    Raises:
        AssertionError: If any of the sanity checks on the split or on the
            copied image counts fails.
    """
    # extract only what we need
    extract(raw / "train.7z", raw)
    extract(raw / "train_labels.csv.zip", raw)

    # Create train, test from train split
    # Original ratio is 1531/(1531+2295) = 0.4
    # NOTE(review): the ratio used below (0.2) differs from the original 0.4
    # quoted above; confirm this is intentional.
    test_ratio = 0.2
    old_train = read_csv(raw / "train_labels.csv")
    new_train, answers = train_test_split(old_train, test_size=test_ratio, random_state=0)

    # Sample submission: same rows as the answers, with a constant prediction
    sample_submission = answers.copy()
    sample_submission["invasive"] = 0.5

    # Checks
    assert new_train["name"].is_unique, "new_train should have unique names"
    assert answers["name"].is_unique, "answers should have unique names"
    assert set(new_train["name"]).isdisjoint(
        set(answers["name"])
    ), "new_train and answers should be disjoint"
    assert len(new_train) + len(answers) == len(
        old_train
    ), "new_train and answers together should have the same number of rows as old_train"
    assert (
        new_train.columns.tolist() == old_train.columns.tolist()
    ), "new_train should have the same columns as old_train"
    assert (
        answers.columns.tolist() == old_train.columns.tolist()
    ), "answers should have the same columns as old_train"
    assert (
        sample_submission.columns.tolist() == old_train.columns.tolist()
    ), "sample_submission should have the same columns as old_train"

    # Write CSVs
    answers.to_csv(private / "answers.csv", index=False)
    new_train.to_csv(public / "train_labels.csv", index=False)
    sample_submission.to_csv(private / "sample_submission.csv", index=False)
    sample_submission.to_csv(public / "sample_submission.csv", index=False)

    # Copy files
    (public / "train").mkdir(exist_ok=True)
    (public / "test").mkdir(exist_ok=True)

    for file_id in tqdm(new_train["name"], desc="Copying Train Images"):
        shutil.copyfile(
            src=raw / "train" / f"{file_id}.jpg",
            dst=public / "train" / f"{file_id}.jpg",
        )

    # Test images also come from raw/train, since the test set is carved out
    # of the original training data.
    for file_id in tqdm(answers["name"], desc="Copying Test Images"):
        shutil.copyfile(
            src=raw / "train" / f"{file_id}.jpg",
            dst=public / "test" / f"{file_id}.jpg",
        )

    # Checks
    assert len(list((public / "train").glob("*.jpg"))) == len(
        new_train
    ), "public/train should have the same number of files as new_train"
    assert len(list((public / "test").glob("*.jpg"))) == len(
        answers
    ), "public/test should have the same number of files as answers"

    # Zip
    shutil.make_archive(
        str(public / "sample_submission.csv"),
        "zip",
        root_dir=public,
        base_dir="sample_submission.csv",
    )
    shutil.make_archive(
        str(public / "train_labels.csv"), "zip", root_dir=public, base_dir="train_labels.csv"
    )
    # Use writeall (recursive) rather than write (single entry): the latter
    # would store only the directory entry and the images would be lost once
    # the folders are removed below. arcname keeps the archive paths relative
    # ("train/..." / "test/...") instead of embedding the full public path.
    with py7zr.SevenZipFile(public / "train.7z", "w") as z:
        z.writeall(public / "train", arcname="train")
    with py7zr.SevenZipFile(public / "test.7z", "w") as z:
        z.writeall(public / "test", arcname="test")

    # Delete the uncompressed copies; only the archives remain in public
    shutil.rmtree(public / "train")
    shutil.rmtree(public / "test")
    (public / "sample_submission.csv").unlink()
    (public / "train_labels.csv").unlink()