def prepare()

in mlebench/competitions/rsna-2022-cervical-spine-fracture-detection/prepare.py [0:0]


def prepare(raw: Path, public: Path, private: Path) -> None:
    """Split the raw competition train set into a new train set and a held-out test set.

    Reads ``train.csv``, ``train_bounding_boxes.csv``, ``segmentations/*.nii`` and
    ``train_images/<StudyInstanceUID>/`` from ``raw``. The split is done per
    ``StudyInstanceUID`` with a fixed seed, and keeps (up to rounding) the same
    fraction ``TRAIN_RATIO`` of bounding-box studies, segmentation studies and
    studies that have both.

    Writes to ``public``: ``train.csv``, ``train_bounding_boxes.csv``,
    ``test.csv``, ``sample_submission.csv``, ``segmentations/``,
    ``train_images/`` and ``test_images/``.
    Writes to ``private``: ``answers.csv``.

    Args:
        raw: Directory holding the original competition files.
        public: Output directory for the files made available to participants.
        private: Output directory for the held-out answers.

    Raises:
        AssertionError: If any of the sanity checks on the produced split fails.
    """
    rng = random.Random(0)

    # there are two subsets of training data:

    # 1. one of instances that have bounding boxes
    # 2. one of instances that have segmentations

    # we need to preserve the ratios of the sizes of these subsets to the total train samples

    # additionally, there is an overlap between the two subsets
    # we need to preserve this overlap

    DEV = False
    old_train = read_csv(raw / "train.csv")

    num_old_train = len(old_train)
    if DEV:
        DEV_RATIO = 0.175
        num_old_train = round(DEV_RATIO * num_old_train)

    # 2019 train folders (StudyInstanceUIDs), 1500 test folders, 2019 / (1500 + 2019) ~ 0.60 original train ratio
    # each folder has ~ 300 images
    # We use 0.1 ratio to avoid taking too many samples out of train
    TRAIN_RATIO = 0.1
    num_train_samples = round(num_old_train * TRAIN_RATIO)

    # bboxes
    old_train_bboxes = read_csv(raw / "train_bounding_boxes.csv")
    if DEV:
        old_train_bboxes = old_train_bboxes.sample(frac=DEV_RATIO, random_state=0)

    old_train_bbox_ids = sorted(old_train_bboxes["StudyInstanceUID"].unique())
    old_num_train_bbox_ids = len(old_train_bbox_ids)  # 235
    new_num_train_bbox_ids = round(old_num_train_bbox_ids * TRAIN_RATIO)

    # segmentations
    old_train_segmentation_path = raw / "segmentations"
    old_train_segmentation_ids = sorted([f.stem for f in old_train_segmentation_path.glob("*.nii")])
    if DEV:
        old_train_segmentation_ids = rng.sample(
            old_train_segmentation_ids, round(DEV_RATIO * len(old_train_segmentation_ids))
        )
    old_num_train_segmentation_ids = len(old_train_segmentation_ids)  # 87
    new_num_train_segmentation_ids = round(old_num_train_segmentation_ids * TRAIN_RATIO)

    # overlap: list of StudyInstanceUIDs that have both bounding boxes and segmentations
    # NOTE: sets are used only for the membership tests; the ordered lists remain the
    # sampling populations, so the seeded `rng.sample` draws are unchanged.
    segmentation_id_set = set(old_train_segmentation_ids)
    old_overlap_ids = [uid for uid in old_train_bbox_ids if uid in segmentation_id_set]
    old_overlap_id_set = set(old_overlap_ids)
    old_num_overlap = len(old_overlap_ids)  # 40
    new_num_overlap = round(old_num_overlap * TRAIN_RATIO)

    # start populating new train by picking the overlap instances
    # sample new_num_overlap instances from the overlap randomly
    new_overlap_ids = rng.sample(old_overlap_ids, new_num_overlap)
    new_bboxes_ids = new_overlap_ids.copy()
    new_segmentations_ids = new_overlap_ids.copy()
    new_train_ids = new_overlap_ids.copy()

    # add the (`new_num_train_segmentation_ids - new_num_overlap`) segmentation
    # instances that are not in the overlap
    additional_segmentation_ids = rng.sample(
        [uid for uid in old_train_segmentation_ids if uid not in old_overlap_id_set],
        new_num_train_segmentation_ids - new_num_overlap,
    )
    new_segmentations_ids += additional_segmentation_ids
    new_train_ids += additional_segmentation_ids

    # add the (`new_num_train_bbox_ids - new_num_overlap`) bounding-box
    # instances that are not in the overlap
    additional_bbox_ids = rng.sample(
        [uid for uid in old_train_bbox_ids if uid not in old_overlap_id_set],
        new_num_train_bbox_ids - new_num_overlap,
    )
    new_bboxes_ids += additional_bbox_ids
    new_train_ids += additional_bbox_ids

    # `new_train_ids` is not modified again until the final fill below, so one
    # snapshot serves both membership filters.
    preselected_train_ids = set(new_train_ids)

    if DEV:
        # old train has whatever is currently in new_train_ids
        # + a random sample of the rest, s.t. it is DEV_RATIO of the original train
        dev_old_train_ids = new_train_ids + rng.sample(
            [uid for uid in old_train["StudyInstanceUID"] if uid not in preselected_train_ids],
            num_old_train - len(new_train_ids),
        )
        old_train = old_train[old_train["StudyInstanceUID"].isin(dev_old_train_ids)].copy()

    # then, fill the rest of the new train.
    new_train_ids += rng.sample(
        [uid for uid in old_train["StudyInstanceUID"] if uid not in preselected_train_ids],
        num_train_samples - len(new_train_ids),
    )

    train = old_train[old_train["StudyInstanceUID"].isin(new_train_ids)].copy()
    train.to_csv(public / "train.csv", index=False)

    train_bboxes = old_train_bboxes[
        old_train_bboxes["StudyInstanceUID"].isin(new_bboxes_ids)
    ].copy()
    train_bboxes.to_csv(public / "train_bounding_boxes.csv", index=False)

    # every study not selected for the new train becomes part of the new test set
    answers = old_train[~old_train["StudyInstanceUID"].isin(new_train_ids)].copy()
    # columns become rows for the test and sample submission, so also for answers
    answers = answers.melt(
        id_vars="StudyInstanceUID", var_name="prediction_type", value_name="fractured"
    )
    answers["row_id"] = answers["StudyInstanceUID"] + "_" + answers["prediction_type"]
    answers.to_csv(private / "answers.csv", index=False)

    # the sample submission predicts a constant 0.5 for every row
    sample_submission = answers[["row_id", "fractured"]].copy()
    sample_submission["fractured"] = 0.5
    sample_submission.to_csv(public / "sample_submission.csv", index=False)

    # the public test file is the answers without the label column
    public_test = answers.drop(columns=["fractured"]).copy()
    public_test.to_csv(public / "test.csv", index=False)

    # assert that the melting worked
    # (8 rows per study: `patient_overall` plus C1..C7, see `train_cols` below)
    assert answers["StudyInstanceUID"].nunique() * 8 == len(
        answers
    ), "Melting failed, incorrect length"
    assert answers.columns.tolist() == [
        "StudyInstanceUID",
        "prediction_type",
        "fractured",
        "row_id",
    ], "Melting went wrong, columns are wrong"

    # column checks
    train_cols = ["StudyInstanceUID", "patient_overall", "C1", "C2", "C3", "C4", "C5", "C6", "C7"]
    assert train.columns.tolist() == train_cols, "Train columns are wrong"
    bbox_cols = ["StudyInstanceUID", "x", "y", "width", "height", "slice_number"]
    assert train_bboxes.columns.tolist() == bbox_cols, "Bounding box columns are wrong"
    test_cols = ["StudyInstanceUID", "prediction_type", "row_id"]
    assert public_test.columns.tolist() == test_cols, "Test columns are wrong"
    submission_cols = ["row_id", "fractured"]
    assert sample_submission.columns.tolist() == submission_cols, "Submission columns are wrong"

    # Check that the correct number of training samples is selected
    assert len(new_train_ids) == round(len(old_train) * TRAIN_RATIO), (
        "Incorrect number of training samples."
        " The number of `new_train_ids` doesn't match the expected number given the `TRAIN_RATIO`."
    )
    assert len(train) + answers["StudyInstanceUID"].nunique() == len(old_train), (
        "Incorrect number of training samples."
        " New train and test splits don't sum to the length of the original train set."
    )

    # Check that the correct number of bounding box samples is selected
    assert len(new_bboxes_ids) == round(
        len(old_train_bbox_ids) * TRAIN_RATIO
    ), "Incorrect number of bounding box samples"

    # Check that the correct number of segmentation samples is selected
    assert len(new_segmentations_ids) == round(
        len(old_train_segmentation_ids) * TRAIN_RATIO
    ), "Incorrect number of segmentation samples"

    # Check that the overlap is preserved
    assert len(new_overlap_ids) == round(
        len(old_overlap_ids) * TRAIN_RATIO
    ), "Incorrect overlap preservation"

    # check that test and train dont share study instance ids
    assert set(train["StudyInstanceUID"]).isdisjoint(
        set(public_test["StudyInstanceUID"].unique())
    ), "Train and test share study instance ids"

    # Now that splitting is done, copy over images accordingly
    (public / "segmentations").mkdir(exist_ok=True)

    for file_id in tqdm(
        new_segmentations_ids, desc="Copying segmentations", total=len(new_segmentations_ids)
    ):
        shutil.copyfile(
            src=old_train_segmentation_path / f"{file_id}.nii",
            dst=public / "segmentations" / f"{file_id}.nii",
        )

    (public / "train_images").mkdir(exist_ok=True)
    for study_id in tqdm(
        train["StudyInstanceUID"],
        desc="Copying train images",
        total=len(train),
        unit="StudyInstance",
    ):
        shutil.copytree(
            src=raw / "train_images" / study_id,
            dst=public / "train_images" / study_id,
            dirs_exist_ok=True,
        )
    # test images are also sourced from the raw *train* images, because the new
    # test set is carved out of the original train set
    (public / "test_images").mkdir(exist_ok=True)
    for study_id in tqdm(
        public_test["StudyInstanceUID"].unique(),
        desc="Copying test images",
        total=public_test["StudyInstanceUID"].nunique(),
        unit="StudyInstance",
    ):
        shutil.copytree(
            src=raw / "train_images" / study_id,
            dst=public / "test_images" / study_id,
            dirs_exist_ok=True,
        )

    assert len(list((public / "segmentations").glob("*.nii"))) == len(
        new_segmentations_ids
    ), "Incorrect number of segmentations copied"
    # check that all the right image directories are copied
    assert len(list((public / "train_images").glob("*"))) == len(
        train
    ), "Incorrect number of train images copied"

    assert (
        len(list((public / "test_images").glob("*"))) == public_test["StudyInstanceUID"].nunique()
    ), "Incorrect number of test images copied"