in mlebench/competitions/rsna-breast-cancer-detection/prepare.py [0:0]
def _copy_patient_images(rows, source_root: Path, destination_root: Path) -> None:
    """Copy the DICOM file of every row in ``rows`` into per-patient folders.

    Args:
        rows: DataFrame with ``patient_id`` and ``image_id`` columns.
        source_root: Directory holding ``<patient_id>/<image_id>.dcm`` files.
        destination_root: Directory that receives one ``<patient_id>/``
            subfolder per patient. Created if missing; its parent must exist.
    """
    destination_root.mkdir(exist_ok=True)
    # Group once instead of re-filtering the whole frame for every patient.
    images_by_patient = rows.groupby("patient_id")["image_id"]
    for patient_id, image_ids in tqdm(images_by_patient, total=images_by_patient.ngroups):
        patient_id_str = str(patient_id)
        patient_dir = destination_root / patient_id_str
        patient_dir.mkdir(exist_ok=True)
        for image_id in image_ids.to_list():
            shutil.copy(source_root / patient_id_str / f"{image_id}.dcm", patient_dir)


def prepare(raw: Path, public: Path, private: Path) -> None:
    """Split the raw training data into new public train/test sets and private answers.

    Reads ``raw/train.csv``, splits it by patient (10% of patients go to the
    test set), and writes:

    * ``public/train.csv`` and ``public/train_images/`` -- the new train set,
    * ``public/test.csv`` and ``public/test_images/`` -- the new test set
      without the ``cancer`` label,
    * ``public/sample_submission.csv`` -- one row per ``prediction_id``, filled
      with the mean cancer rate of the new train set,
    * ``private/answers.csv`` -- one labelled row per ``prediction_id``.

    Args:
        raw: Directory containing ``train.csv`` and ``train_images/``.
        public: Existing output directory for the files visible to participants.
        private: Existing output directory for the held-out answers.

    Raises:
        AssertionError: If any of the consistency checks on the produced
            frames or copied images fails.
    """
    old_train = read_csv(raw / "train.csv")

    # work on 5k samples for now, instead of 54k
    if DEV:
        old_train = old_train.sample(5000, random_state=42)

    # "You can expect roughly 8,000 patients" in the test set
    # so, split on patients. There are 11913 patients in train set
    patient_ids = old_train["patient_id"].unique()
    # Original ratio is 8000 / (8000 + 11913) ~ 0.4
    # We use 0.1 to avoid taking too many samples out of train
    train_patients, test_patients = train_test_split(patient_ids, test_size=0.1, random_state=42)
    train_patients = set(train_patients)
    test_patients = set(test_patients)

    new_train = old_train[old_train["patient_id"].isin(train_patients)].copy()
    # Don't select `prediction_id` (last test column): it is not in train and is built below.
    # Also select the `cancer` column, which is dropped later for the without-labels version.
    answers = old_train[old_train["patient_id"].isin(test_patients)][
        test_columns[:-1] + ["cancer"]
    ].copy()

    new_train.to_csv(public / "train.csv", index=False)

    answers["prediction_id"] = answers["patient_id"].astype(str) + "_" + answers["laterality"]
    new_test_without_labels = answers.drop(columns=["cancer"])
    new_test_without_labels.to_csv(public / "test.csv", index=False)

    # merge multiple prediction_ids for the same patient into one for sample_submission and private test
    answers = answers[submission_columns].copy()
    # just take the first label for each prediction id -- the rest will be identical duplicates
    answers = answers.groupby("prediction_id").first().reset_index()
    answers.to_csv(private / "answers.csv", index=False)

    sample_submission = answers.copy()
    sample_submission["cancer"] = new_train.cancer.mean()  # mean cancer rate in train set
    sample_submission.to_csv(public / "sample_submission.csv", index=False)

    assert len(sample_submission) == len(
        answers
    ), "sample_submission and answers should have the same number of rows"
    assert len(new_test_without_labels) + len(new_train) == len(
        old_train
    ), "The sum of the rows in new_test_without_labels and new_train should be equal to the number of rows in old_train"
    # because of the merging
    assert len(answers) != len(
        new_test_without_labels
    ), "new_test and new_test_without_labels should have different number of rows"
    assert (
        answers.columns.tolist() == submission_columns
    ), f"answers should have columns {submission_columns}"
    assert (
        sample_submission.columns.tolist() == submission_columns
    ), f"sample_submission should have columns {submission_columns}"
    assert (
        new_train.columns.tolist() == old_train.columns.tolist()
    ), f"new_train should have columns {old_train.columns.tolist()}, got {new_train.columns.tolist()}"
    assert (
        new_test_without_labels.columns.tolist() == test_columns
    ), f"new_test_without_labels should have columns {test_columns}, got {new_test_without_labels.columns.tolist()}"
    assert set(new_test_without_labels["patient_id"]).isdisjoint(
        set(new_train["patient_id"])
    ), "new_test_without_labels and new_train should have disjoint patient_ids"

    # finally, split the images; both splits are sourced from the raw train_images folder
    _copy_patient_images(new_train, raw / "train_images", public / "train_images")
    _copy_patient_images(new_test_without_labels, raw / "train_images", public / "test_images")

    # final checks
    assert len(list((public / "train_images").rglob("*.dcm"))) == len(
        new_train
    ), "Number of images in train_images should be equal to the number of rows in new_train"
    assert len(list((public / "test_images").rglob("*.dcm"))) == len(
        new_test_without_labels
    ), "Number of images in test_images should be equal to the number of rows in new_test_without_labels"