# mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare.py
import random
import shutil
from pathlib import Path
from tqdm.auto import tqdm
from mlebench.utils import read_csv
def prepare(raw: Path, public: Path, private: Path):
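    """
    Build the MLE-bench split for the RSNA 2022 cervical spine fracture detection competition.

    Reads the original training data from `raw`, carves out a new train split of
    TRAIN_RATIO of the studies (preserving the proportion of bounding-box annotated
    and segmentation annotated studies, and their overlap), and treats the remaining
    studies as the held-out test set: `public` receives train.csv,
    train_bounding_boxes.csv, segmentations, images, test.csv and
    sample_submission.csv, while `private` receives answers.csv.
    """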
rng = random.Random(0)
    # there are two annotated subsets of the training data:
    # 1. studies (StudyInstanceUIDs) that have bounding boxes
    # 2. studies that have segmentations
    # we need to preserve the ratio of each subset's size to the total number of train samples
    # additionally, the two subsets overlap, and we need to preserve that overlap as well
DEV = False
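    # when DEV is True, the raw tables and segmentation list are subsampled by DEV_RATIO below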
old_train = read_csv(raw / "train.csv")
num_old_train = len(old_train)
if DEV:
DEV_RATIO = 0.175
num_old_train = round(DEV_RATIO * num_old_train)
    # 2019 train folders (StudyInstanceUIDs), 1500 test folders, 2019 / (1500 + 2019) ~ 0.57 original train ratio
# each folder has ~ 300 images
# We use 0.1 ratio to avoid taking too many samples out of train
TRAIN_RATIO = 0.1
num_train_samples = round(num_old_train * TRAIN_RATIO)
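    # e.g. with the 2019 original train studies noted above, this gives round(2019 * 0.1) = 202 new train studies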
# bboxes
old_train_bboxes = read_csv(raw / "train_bounding_boxes.csv")
if DEV:
old_train_bboxes = old_train_bboxes.sample(frac=DEV_RATIO, random_state=0)
old_train_bbox_ids = sorted(old_train_bboxes["StudyInstanceUID"].unique())
old_num_train_bbox_ids = len(old_train_bbox_ids) # 235
new_num_train_bbox_ids = round(old_num_train_bbox_ids * TRAIN_RATIO)
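    # ~24 of the 235 bounding-box studies stay in the new train split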
# segmentations
old_train_segmentation_path = raw / "segmentations"
old_train_segmentation_ids = sorted([f.stem for f in old_train_segmentation_path.glob("*.nii")])
if DEV:
old_train_segmentation_ids = rng.sample(
old_train_segmentation_ids, round(DEV_RATIO * len(old_train_segmentation_ids))
)
old_num_train_segmentation_ids = len(old_train_segmentation_ids) # 87
new_num_train_segmentation_ids = round(old_num_train_segmentation_ids * TRAIN_RATIO)
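    # ~9 of the 87 segmentation studies stay in the new train split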
# overlap: list of StudyInstanceUIDs that have both bounding boxes and segmentations
old_overlap_ids = [uid for uid in old_train_bbox_ids if uid in old_train_segmentation_ids]
old_num_overlap = len(old_overlap_ids) # 40
new_num_overlap = round(old_num_overlap * TRAIN_RATIO)
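    # ~4 of the 40 overlapping studies stay in the new train split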
# start populating new train by picking the overlap instances
# sample new_num_overlap instances from the overlap randomly
new_overlap_ids = rng.sample(old_overlap_ids, new_num_overlap)
new_bboxes_ids = new_overlap_ids.copy()
new_segmentations_ids = new_overlap_ids.copy()
new_train_ids = new_overlap_ids.copy()
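    # the sampled overlap seeds all three id lists; the remaining bbox/segmentation studies are drawn from outside the overlap below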
    # add `new_num_train_segmentation_ids - new_num_overlap` segmentation studies that are not in the overlap
additional_segmentation_ids = rng.sample(
[uid for uid in old_train_segmentation_ids if uid not in old_overlap_ids],
new_num_train_segmentation_ids - new_num_overlap,
)
new_segmentations_ids += additional_segmentation_ids
new_train_ids += additional_segmentation_ids
    # add `new_num_train_bbox_ids - new_num_overlap` bounding-box studies that are not in the overlap
additional_bbox_ids = rng.sample(
[uid for uid in old_train_bbox_ids if uid not in old_overlap_ids],
new_num_train_bbox_ids - new_num_overlap,
)
new_bboxes_ids += additional_bbox_ids
new_train_ids += additional_bbox_ids
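    # new_train_ids now holds the sampled overlap, segmentation-only and bbox-only studies,
    # with no duplicates since the additional ids were drawn from outside the overlap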
if DEV:
        # the DEV version of old train keeps whatever is currently in new_train_ids
        # plus a random sample of the rest, so that it totals DEV_RATIO of the original train
dev_old_train_ids = new_train_ids + rng.sample(
[uid for uid in old_train["StudyInstanceUID"] if uid not in new_train_ids],
num_old_train - len(new_train_ids),
)
old_train = old_train[old_train["StudyInstanceUID"].isin(dev_old_train_ids)].copy()
# then, fill the rest of the new train.
new_train_ids += rng.sample(
[uid for uid in old_train["StudyInstanceUID"] if uid not in new_train_ids],
num_train_samples - len(new_train_ids),
)
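    # new_train_ids now has exactly num_train_samples studies; every other study becomes part of the test split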
train = old_train[old_train["StudyInstanceUID"].isin(new_train_ids)].copy()
train.to_csv(public / "train.csv", index=False)
train_bboxes = old_train_bboxes[
old_train_bboxes["StudyInstanceUID"].isin(new_bboxes_ids)
].copy()
train_bboxes.to_csv(public / "train_bounding_boxes.csv", index=False)
answers = old_train[~old_train["StudyInstanceUID"].isin(new_train_ids)].copy()
    # the label columns become rows in the test set and sample submission, so do the same for the answers
answers = answers.melt(
id_vars="StudyInstanceUID", var_name="prediction_type", value_name="fractured"
)
answers["row_id"] = answers["StudyInstanceUID"] + "_" + answers["prediction_type"]
answers.to_csv(private / "answers.csv", index=False)
sample_submission = answers[["row_id", "fractured"]].copy()
sample_submission["fractured"] = 0.5
sample_submission.to_csv(public / "sample_submission.csv", index=False)
public_test = answers.drop(columns=["fractured"]).copy()
public_test.to_csv(public / "test.csv", index=False)
# assert that the melting worked
assert answers["StudyInstanceUID"].nunique() * 8 == len(
answers
), "Melting failed, incorrect length"
assert answers.columns.tolist() == [
"StudyInstanceUID",
"prediction_type",
"fractured",
"row_id",
], "Melting went wrong, columns are wrong"
# column checks
train_cols = ["StudyInstanceUID", "patient_overall", "C1", "C2", "C3", "C4", "C5", "C6", "C7"]
assert train.columns.tolist() == train_cols, "Train columns are wrong"
bbox_cols = ["StudyInstanceUID", "x", "y", "width", "height", "slice_number"]
assert train_bboxes.columns.tolist() == bbox_cols, "Bounding box columns are wrong"
test_cols = ["StudyInstanceUID", "prediction_type", "row_id"]
assert public_test.columns.tolist() == test_cols, "Test columns are wrong"
submission_cols = ["row_id", "fractured"]
assert sample_submission.columns.tolist() == submission_cols, "Submission columns are wrong"
# Check that the correct number of training samples is selected
assert len(new_train_ids) == round(len(old_train) * TRAIN_RATIO), (
"Incorrect number of training samples."
" The number of `new_train_ids` doesn't match the expected number given the `TRAIN_RATIO`."
)
assert len(train) + answers["StudyInstanceUID"].nunique() == len(old_train), (
"Incorrect number of training samples."
" New train and test splits don't sum to the length of the original train set."
)
# Check that the correct number of bounding box samples is selected
assert len(new_bboxes_ids) == round(
len(old_train_bbox_ids) * TRAIN_RATIO
), "Incorrect number of bounding box samples"
# Check that the correct number of segmentation samples is selected
assert len(new_segmentations_ids) == round(
len(old_train_segmentation_ids) * TRAIN_RATIO
), "Incorrect number of segmentation samples"
# Check that the overlap is preserved
assert len(new_overlap_ids) == round(
len(old_overlap_ids) * TRAIN_RATIO
), "Incorrect overlap preservation"
    # check that the test and train splits don't share StudyInstanceUIDs
assert set(train["StudyInstanceUID"]).isdisjoint(
set(public_test["StudyInstanceUID"].unique())
), "Train and test share study instance ids"
# Now that splitting is done, copy over images accordingly
(public / "segmentations").mkdir(exist_ok=True)
for file_id in tqdm(
new_segmentations_ids, desc="Copying segmentations", total=len(new_segmentations_ids)
):
shutil.copyfile(
src=old_train_segmentation_path / f"{file_id}.nii",
dst=public / "segmentations" / f"{file_id}.nii",
)
(public / "train_images").mkdir(exist_ok=True)
for study_id in tqdm(
train["StudyInstanceUID"],
desc="Copying train images",
total=len(train),
unit="StudyInstance",
):
shutil.copytree(
src=raw / "train_images" / study_id,
dst=public / "train_images" / study_id,
dirs_exist_ok=True,
)
(public / "test_images").mkdir(exist_ok=True)
for study_id in tqdm(
public_test["StudyInstanceUID"].unique(),
desc="Copying test images",
total=public_test["StudyInstanceUID"].nunique(),
unit="StudyInstance",
):
shutil.copytree(
src=raw / "train_images" / study_id,
dst=public / "test_images" / study_id,
dirs_exist_ok=True,
)
assert len(list((public / "segmentations").glob("*.nii"))) == len(
new_segmentations_ids
), "Incorrect number of segmentations copied"
# check that all the right image directories are copied
assert len(list((public / "train_images").glob("*"))) == len(
train
), "Incorrect number of train images copied"
assert (
len(list((public / "test_images").glob("*"))) == public_test["StudyInstanceUID"].nunique()
), "Incorrect number of test images copied"