mlebench/competitions/uw-madison-gi-tract-image-segmentation/prepare.py (89 lines of code) (raw):

import shutil from pathlib import Path import pandas as pd from sklearn.model_selection import train_test_split from tqdm.auto import tqdm from mlebench.utils import read_csv def verify_directory_sync(df: pd.DataFrame, expected_dir: Path, unexpected_dir: Path): """ Checks that the dataframe contents match the directory structure. """ for _, row in tqdm( df.iterrows(), desc=f"Verifying directory sync for {expected_dir.name}", total=len(df) ): case_day_path = expected_dir / row["case"] / f"{row['case']}_{row['day']}" assert ( case_day_path.exists() ), f"Directory {case_day_path} does not exist but is listed in the dataframe." non_existent_path = unexpected_dir / row["case"] / f"{row['case']}_{row['day']}" assert ( not non_existent_path.exists() ), f"Directory {non_existent_path} exists but is not listed in the dataframe." def prepare(raw: Path, public: Path, private: Path): old_train = read_csv(raw / "train.csv") # ----------------------- Splitting # Extract case and day from 'id' old_train["case"] = old_train["id"].apply(lambda x: x.split("_")[0]) old_train["day"] = old_train["id"].apply(lambda x: x.split("_")[1]) old_train["slice"] = old_train["id"].apply(lambda x: x.split("_")[-1]) # Split cases into train and test unique_cases = old_train["case"].unique() train_cases, test_cases = train_test_split(unique_cases, test_size=0.1, random_state=42) # Initially assign entire cases to train or test set old_train["set"] = old_train["case"].apply(lambda x: "test" if x in test_cases else "train") # Then mark some days from train to be test, to match competition test description days_df = old_train[old_train["set"] == "train"].groupby("case")["day"].apply(set).reset_index() for _, row in days_df.iterrows(): # if theres more than 4 days, we will move any days past the 4th to the test set days = row["day"] if len(days) > 4: days = sorted(days, key=lambda x: int(x[len("day") :])) days_to_move = days[4:] # change their set to "test" old_train.loc[ old_train["case"].eq(row["case"]) & old_train["day"].isin(days_to_move), "set" ] = "test" # ----------------------- Move the files to the correct new locations old_train_dir = raw / "train" new_train_dir = public / "train" new_test_dir = public / "test" # Create new directories if they don't exist new_train_dir.mkdir(parents=True, exist_ok=True) new_test_dir.mkdir(parents=True, exist_ok=True) # Move directories based on the set assignment for case in tqdm(unique_cases, desc="Splitting by case"): original_path = old_train_dir / case if case in train_cases: new_path = new_train_dir / case else: new_path = new_test_dir / case # new_path.mkdir(parents=True, exist_ok=True) shutil.copytree(original_path, new_path, dirs_exist_ok=True) # Move specific days from public/train/ to public/test/ for marked case-days for _, row in tqdm( old_train.iterrows(), desc="Handling additional day-based splits", total=len(old_train) ): if row["set"] == "test": source_day_path = new_train_dir / row["case"] / f"{row['case']}_{row['day']}" target_day_path = new_test_dir / row["case"] / f"{row['case']}_{row['day']}" if source_day_path.exists(): target_day_path.parent.mkdir(parents=True, exist_ok=True) shutil.move(source_day_path.as_posix(), target_day_path.as_posix()) # ------------------------ Saving splits new_train = old_train[old_train["set"] == "train"].copy() new_test = old_train[old_train["set"] == "test"].copy() # some asserts before we drop columns verify_directory_sync(new_train, expected_dir=new_train_dir, unexpected_dir=new_test_dir) verify_directory_sync(new_test, expected_dir=new_test_dir, unexpected_dir=new_train_dir) # get image height and image width for the test set, since this is needed for the metric for _, row in tqdm( new_test.iterrows(), desc="Getting image dimensions for test set", total=len(new_test) ): case, day, day_slice = row["case"], row["day"], row["slice"] image_paths = list( (old_train_dir / case / f"{case}_{day}" / "scans").glob(f"slice_{day_slice}_*.png") ) assert len(image_paths) == 1, f"Expected 1 image, found {len(image_paths)}" image_path = image_paths[0] width, height = (int(length) for length in image_path.stem.split("_")[2:4]) new_test.loc[row.name, "image_width"] = width new_test.loc[row.name, "image_height"] = height # dont need these anymore, and werent part of the original data new_train.drop(columns=["set", "case", "day", "slice"], inplace=True) new_test.drop(columns=["set", "case", "day", "slice"], inplace=True) # create sample submission sample_submission = new_test.copy() sample_submission["segmentation"] = "1 1 5 2" # these are just metadata for the private test set necessary for the metric sample_submission.drop(columns=["image_height", "image_width"], inplace=True) # rename 'segmentation' to 'predicted' to match kaggle.com sample_submission.rename(columns={"segmentation": "predicted"}, inplace=True) sample_submission.to_csv(public / "sample_submission.csv", index=False, na_rep="") # create private files # rename 'segmentation' to 'predicted' to match sample_submission format new_test.rename(columns={"segmentation": "predicted"}, inplace=True) new_test.to_csv(private / "test.csv", index=False, na_rep="") # create public files new_train.to_csv(public / "train.csv", index=False, na_rep="") # including this because we are converting this from code to csv competition # and we need to point the model to the ids it needs to produce labels for new_test_without_labels = new_test.drop(columns=["predicted", "image_width", "image_height"]) new_test_without_labels.to_csv(public / "test.csv", index=False, na_rep="") # ------------------------ checks assert new_test_without_labels.shape[1] == 2, "Public test should have 2 columns." assert new_train.shape[1] == 3, "Public train should have 3 columns." assert len(new_train) + len(new_test) == len( old_train ), "Train and test should sum up to the original data."