def prepare()

in mlebench/competitions/kuzushiji-recognition/prepare.py [0:0]


def _copy_images(image_ids, src_dir: Path, dst_dir: Path, desc: str) -> None:
    """Copy ``<image_id>.jpg`` from *src_dir* to *dst_dir* for every id in *image_ids*."""
    for image_id in tqdm(image_ids, desc=desc):
        filename = f"{image_id}.jpg"
        shutil.copyfile(src=src_dir / filename, dst=dst_dir / filename)


def _to_gold_label(label) -> str:
    """
    Convert one ground-truth label string into a submission-style label string.

    Ground-truth labels have the form "class x y w h class x y w h ...". The result
    keeps one "class x+1 y+1" triple per box; the +1 moves the coord into the bbox,
    so that the metric picks it up.

    A value that is not a string (e.g. NaN, which is what a missing/empty CSV cell
    typically parses to) is treated as "no boxes" and yields an empty string rather
    than raising AttributeError on `.split()`.
    """
    if not isinstance(label, str):
        return ""

    fields = label.split()
    assert len(fields) % 5 == 0

    # Every 5th token starting at offsets 0, 1, 2 is the class, x and y respectively;
    # w and h (offsets 3, 4) are not needed for a point prediction.
    classes, xs, ys = fields[0::5], fields[1::5], fields[2::5]
    return " ".join(f"{c} {int(x) + 1} {int(y) + 1}" for c, x, y in zip(classes, xs, ys))


def prepare(raw: Path, public: Path, private: Path) -> None:
    """
    Splits the data in raw into public and private datasets with appropriate test/train splits.

    Reads `raw/train.csv` and `raw/train_images.zip`, holds out 10% of the rows
    (fixed `random_state=0`) as the new test set, and writes:

    - `public/train.csv`, `public/train_images.zip`, `public/test_images.zip`
    - `public/unicode_translation.csv` (copied unchanged from raw)
    - `public/sample_submission.csv` (same constant guess for every test row)
    - `private/test.csv` (held-out rows, with labels)
    - `private/gold_submission.csv` (point predictions derived from the true boxes)

    Side effect: the train images are extracted to `raw/train` and left there.
    `public` and `private` are expected to exist already.
    """
    # Extract images so we can split the train images
    raw_images = raw / "train"
    extract(raw / "train_images.zip", raw_images)

    # Create train, test from train split
    old_train = read_csv(raw / "train.csv")
    new_train, new_test = train_test_split(old_train, test_size=0.1, random_state=0)

    # Save the new train and test
    new_train.to_csv(public / "train.csv", index=False)
    new_test.to_csv(private / "test.csv", index=False)

    # Copy images into temporary per-split directories
    public_train = public / "train"
    public_test = public / "test"
    public_train.mkdir(exist_ok=True)
    public_test.mkdir(exist_ok=True)

    _copy_images(new_train["image_id"], raw_images, public_train, desc="Copying train images")
    _copy_images(new_test["image_id"], raw_images, public_test, desc="Copying test images")

    assert len(list(public.glob("train/*.jpg"))) == len(new_train)
    assert len(list(public.glob("test/*.jpg"))) == len(new_test)
    assert len(new_train) + len(new_test) == len(old_train)

    # Create zips of the images
    logger.info("Re-zipping up new image directories...")
    shutil.make_archive(str(public / "train_images"), "zip", public_train)
    shutil.make_archive(str(public / "test_images"), "zip", public_test)
    # Remove the directories for consistency with the kaggle data
    shutil.rmtree(public_train)
    shutil.rmtree(public_test)

    # Copy unicode_translation
    shutil.copyfile(
        src=raw / "unicode_translation.csv",
        dst=public / "unicode_translation.csv",
    )

    assert (public / "train_images.zip").is_file()
    assert (public / "test_images.zip").is_file()
    assert (public / "unicode_translation.csv").is_file()

    # Make sample submission for new test set
    sample_submission = new_test.copy()
    # Same guess for all, as in original sample submission
    sample_submission["labels"] = "U+003F 1 1 U+FF2F 2 2"
    sample_submission.to_csv(public / "sample_submission.csv", index=False)

    # Make a gold submission in private/ (useful for testing)
    # This submission takes the "class x y w h" labels from test and converts them to
    # "class x+1 y+1" labels (the +1 moves the coord into the bbox, so that the metric picks it up)
    gold_submission = new_test.copy()
    gold_submission["labels"] = [_to_gold_label(label) for label in new_test["labels"]]
    gold_submission.to_csv(private / "gold_submission.csv", index=False)

    assert len(gold_submission) == len(new_test)
    assert len(gold_submission) == len(sample_submission)