mlebench/competitions/dog-breed-identification/prepare.py (34 lines of code) (raw):

import shutil
from pathlib import Path

from sklearn.model_selection import train_test_split

from mlebench.competitions.utils import df_to_one_hot
from mlebench.utils import read_csv

from .dogs import DOGS_LIST


def to_one_hot(df):
    """Return *df* one-hot encoded on its ``breed`` column, keyed by ``id``.

    Thin wrapper around ``df_to_one_hot`` that fixes the id column, the
    target column and the class list (``DOGS_LIST``) for this competition.
    """
    return df_to_one_hot(df, id_column="id", target_column="breed", classes=DOGS_LIST)


def _copy_images(file_ids, src_dir: Path, dst_dir: Path) -> None:
    """Copy ``<file_id>.jpg`` from *src_dir* to *dst_dir* for every id, creating *dst_dir* first."""
    dst_dir.mkdir(exist_ok=True)
    for file_id in file_ids:
        filename = f"{file_id}.jpg"
        shutil.copyfile(src=src_dir / filename, dst=dst_dir / filename)


def prepare(raw: Path, public: Path, private: Path):
    """Build the public/private competition files from the raw download.

    Reads ``raw / "labels.csv"``, splits it 90/10 into a new train and test
    set (fixed ``random_state=0``), and writes:

    * ``public / "labels.csv"`` -- labels of the new train split.
    * ``private / "test.csv"`` -- one-hot encoded labels of the new test split.
    * ``public / "train"`` and ``public / "test"`` -- the matching ``.jpg``
      files, all taken from ``raw / "train"``.
    * ``public / "sample_submission.csv"`` -- the test ids with a uniform
      probability in every breed column.

    Args:
        raw: Directory holding the original ``labels.csv`` and ``train`` images.
        public: Existing output directory for participant-visible files.
        private: Existing output directory for the held-out answers.

    Raises:
        AssertionError: If the number of copied images or the shape of the
            sample submission does not match the split.
    """
    # The uniform probability and the expected column count both follow from
    # the class list handed to `df_to_one_hot`, so derive them from it rather
    # than repeating the literal count.
    num_breeds = len(DOGS_LIST)

    # Create train, test from train split
    old_train = read_csv(raw / "labels.csv")
    new_train, new_test = train_test_split(old_train, test_size=0.1, random_state=0)

    # one hot the private test because we will one-hot the submission, as per kaggle.com
    new_test = to_one_hot(new_test)

    # Copy over files
    new_train.to_csv(public / "labels.csv", index=False)
    new_test.to_csv(private / "test.csv", index=False)

    # Both splits come from the raw "train" folder: the new test set is carved
    # out of the original training data.
    _copy_images(new_train["id"], src_dir=raw / "train", dst_dir=public / "train")
    _copy_images(new_test["id"], src_dir=raw / "train", dst_dir=public / "test")

    # Check integrity of the files copied
    num_train_images = len(list(public.glob("train/*.jpg")))
    num_test_images = len(list(public.glob("test/*.jpg")))
    assert num_train_images == len(
        new_train
    ), f"Expected {len(new_train)} train images, found {num_train_images}"
    assert num_test_images == len(
        new_test
    ), f"Expected {len(new_test)} test images, found {num_test_images}"

    # Create a sample submission file
    submission_df = new_test.copy()
    # Cast the breed columns to float64 before assigning the fractional value
    # (presumably the one-hot columns are not float -- verify against
    # `df_to_one_hot`).
    for col in submission_df.columns[1:]:
        submission_df[col] = submission_df[col].astype("float64")
    submission_df.iloc[:, 1:] = 1 / num_breeds
    submission_df.to_csv(public / "sample_submission.csv", index=False)

    expected_shape = (len(new_test), 1 + num_breeds)  # 1 id column + one column per breed
    assert (
        submission_df.shape == expected_shape
    ), f"Expected sample submission of shape {expected_shape}, got {submission_df.shape}"