in mlebench/competitions/petfinder-pawpularity-score/prepare.py [0:0]
def prepare(raw: Path, public: Path, private: Path):
    """Build the public/private competition split from the raw training data.

    Reads ``raw/train.csv``, holds out 10% of its rows as the new test set and
    writes:

    * ``public/train.csv`` and ``public/train/*.jpg`` -- the remaining rows,
    * ``public/test.csv`` and ``public/test/*.jpg`` -- the held-out rows with
      the ``Pawpularity`` column removed,
    * ``public/sample_submission.csv`` -- held-out ids with random scores,
    * ``private/test.csv`` -- the held-out rows including ``Pawpularity``.

    Args:
        raw: Directory holding the original ``train.csv`` and ``train/`` images.
        public: Output directory for the files visible to participants.
        private: Output directory for the labelled test set.

    Raises:
        AssertionError: If any of the sanity checks at the end fails.
    """

    def copy_images(image_ids, dest_dir: Path, progress_label: str) -> None:
        # Every image, including the new test images, lives under raw/train,
        # because the new test set is carved out of the original train set.
        dest_dir.mkdir(exist_ok=True)
        for image_id in tqdm(image_ids, desc=progress_label, total=len(image_ids)):
            file_name = f"{image_id}.jpg"
            shutil.copy(raw / "train" / file_name, dest_dir / file_name)

    full_df = read_csv(raw / "train.csv")
    rng = np.random.default_rng(0)

    # The original competition used 6800 / (9912 + 6800) ~= 0.41 as test share.
    # A 0.1 share is used here so the train set does not lose too many samples.
    train_df, test_df = train_test_split(full_df, test_size=0.1, random_state=0)
    public_test_df = test_df.drop(columns=["Pawpularity"])

    # Placeholder predictions: floats drawn uniformly from [1, 100) and rounded
    # to 2 decimal places.
    random_scores = rng.uniform(1, 100, len(test_df)).round(2)
    submission_df = test_df[["Id", "Pawpularity"]].assign(Pawpularity=random_scores)

    train_df.to_csv(public / "train.csv", index=False)
    test_df.to_csv(private / "test.csv", index=False)
    public_test_df.to_csv(public / "test.csv", index=False)
    submission_df.to_csv(public / "sample_submission.csv", index=False)

    copy_images(train_df["Id"], public / "train", "Copying train images")
    copy_images(public_test_df["Id"], public / "test", "Copying test images")

    # checks
    n_train, n_test = len(train_df), len(test_df)
    assert (
        n_train + n_test == len(full_df)
    ), "Train and test length should sum to the original train length"
    assert (
        len(submission_df) == n_test
    ), "Sample submission should have the same length as the test set"

    train_columns = train_df.columns.tolist()
    assert train_columns == full_df.columns.tolist(), "Old and new train columns should match"
    # Relies on the target column being the last column of the train frame.
    assert (
        public_test_df.columns.tolist() == train_columns[:-1]
    ), "Public test columns should match train columns, minus the target column"
    assert (
        test_df.columns.tolist() == train_columns
    ), "Private test columns should match train columns"
    assert submission_df.columns.tolist() == [
        "Id",
        "Pawpularity",
    ], "Sample submission columns should be Id, Pawpularity"

    shared_ids = set(train_df["Id"]) & set(test_df["Id"])
    assert not shared_ids, "Train and test ids should not overlap"

    # check copy was successful
    copied_train = sum(1 for _ in (public / "train").glob("*.jpg"))
    assert copied_train == n_train, "Train images should match the train set"
    copied_test = sum(1 for _ in (public / "test").glob("*.jpg"))
    assert copied_test == n_test, "Test images should match the test set"