# mlebench/competitions/vinbigdata-chest-xray-abnormalities-detection/prepare.py
import shutil
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm

from mlebench.utils import read_csv


def prepare(raw: Path, public: Path, private: Path):
    """
    Splits the raw data into a public dataset (new train/test DICOMs plus
    train.csv and sample_submission.csv) and a private dataset (answers.csv
    and gold_submission.csv).
    """
    dev = False  # set True to copy only a few DICOMs for a quick smoke test

    # Create train, test from train split
    old_train = read_csv(raw / "train.csv")
    unique_image_ids = old_train["image_id"].unique()

    # Original train has 15k images, original test has 3k images.
    # Our new train will have 13.5k images, our new test will have 1.5k images.
    expected_train_size = 13500
    expected_test_size = 1500
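
    # Splitting on unique image IDs (rather than rows) keeps all annotations
    # for a given image in the same split; test_size=0.1 of 15k images yields
    # the 13.5k/1.5k split asserted below.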
    train_image_ids, test_image_ids = train_test_split(
        unique_image_ids, test_size=0.1, random_state=0
    )
    new_train = old_train[old_train["image_id"].isin(train_image_ids)]
    answers = old_train[old_train["image_id"].isin(test_image_ids)]

    # Create sample submission, as per the original sample submission format
    sample_submission = pd.DataFrame(
        {
            "image_id": test_image_ids,
            "PredictionString": "14 1 0 0 1 1",
        }
    )
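    # pandas broadcasts the scalar PredictionString to every image_id:
    # class 14 ("No finding") with confidence 1 and a dummy 1x1 box.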

    # Checks
    assert (
        len(set(new_train["image_id"])) == expected_train_size
    ), f"Expected {expected_train_size} train image_ids, got {len(set(new_train['image_id']))}"
    assert (
        len(set(answers["image_id"])) == expected_test_size
    ), f"Expected {expected_test_size} test image_ids, got {len(set(answers['image_id']))}"
    assert set(new_train["image_id"]).isdisjoint(
        set(answers["image_id"])
    ), "image_id is not disjoint between train and test sets"
    assert (
        new_train.columns.tolist() == old_train.columns.tolist()
    ), f"Columns of new train and old train are not the same: {new_train.columns.tolist()} vs {old_train.columns.tolist()}"
    assert len(new_train) + len(answers) == len(
        old_train
    ), f"Length of new train and answers should add up to the length of old train, got {len(new_train) + len(answers)} vs {len(old_train)}"
    assert len(sample_submission) == len(
        set(answers["image_id"])
    ), f"Length of sample submission should be equal to the number of unique image_ids in answers, got {len(sample_submission)} vs {len(set(answers['image_id']))}"

    # Reformat answers
    def _get_consensus_annotation(answers, inspect_duplicates=False):
        """
        In the original train set there can be multiple annotations for the
        same (image_id, class_id) pair, because different radiologists draw
        the bounding boxes differently for the same finding. In the original
        test set there is only one annotation per (image_id, class_id) pair,
        labeled by consensus of 5 radiologists. (Source:
        https://www.kaggle.com/competitions/vinbigdata-chest-xray-abnormalities-detection/discussion/207969#1134645)
        We simulate consensus by taking the first annotation for each
        (image_id, class_id) pair.
        """
        if inspect_duplicates:
            duplicates = answers[
                answers.duplicated(subset=["image_id", "class_id"], keep=False)
            ]
            duplicates = duplicates.sort_values(by=["image_id", "class_id"])
            duplicates.to_csv("duplicates.csv", index=False)
        answers = answers.groupby(by=["image_id", "class_id"]).first().reset_index()
        return answers

    answers = _get_consensus_annotation(answers)

    # Fill in missing bbox values for rows with no finding (class_id = 14)
    answers = answers.fillna(0)
    answers.loc[answers["class_id"] == 14, ["x_max", "y_max"]] = 1.0
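    # "No finding" rows now read x_min=0, y_min=0, x_max=1, y_max=1, matching
    # the "14 1 0 0 1 1" convention used for the sample submission above.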

    # Create gold submission
    gold = answers[["image_id", "class_id", "x_min", "y_min", "x_max", "y_max"]].copy()

    # Create individual prediction strings; 1.0 is the confidence score
    gold.loc[:, "PredictionString"] = gold.apply(
        lambda row: f"{row['class_id']} 1.0 {row['x_min']} {row['y_min']} {row['x_max']} {row['y_max']}",
        axis=1,
    )

    # Group by image_id and concatenate prediction strings, one row per image
    gold = gold.groupby("image_id")["PredictionString"].agg(" ".join).reset_index()

    assert len(gold) == len(
        set(answers["image_id"])
    ), f"Length of gold should be equal to the number of unique image_ids in answers, got {len(gold)} vs {len(set(answers['image_id']))}"

    # Write CSVs
    new_train.to_csv(public / "train.csv", index=False)
    sample_submission.to_csv(public / "sample_submission.csv", index=False)
    answers.to_csv(private / "answers.csv", index=False)
    gold.to_csv(private / "gold_submission.csv", index=False)

    # Copy over files
    (public / "test").mkdir(exist_ok=True)
    (public / "train").mkdir(exist_ok=True)
    if dev:
        train_image_ids = train_image_ids[:10]
        test_image_ids = test_image_ids[:10]
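    # Note: the file-count asserts below compare against these (possibly
    # truncated) lists, so they still pass in dev mode.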
    for file_id in tqdm(train_image_ids, desc="Copying train files"):
        shutil.copyfile(
            src=raw / "train" / f"{file_id}.dicom",
            dst=public / "train" / f"{file_id}.dicom",
        )
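
    # The new test DICOMs are also sourced from raw / "train", since our test
    # split was carved out of the original train set.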
    for file_id in tqdm(test_image_ids, desc="Copying test files"):
        shutil.copyfile(
            src=raw / "train" / f"{file_id}.dicom",
            dst=public / "test" / f"{file_id}.dicom",
        )

    # Check files
    assert len(list(public.glob("train/*.dicom"))) == len(
        train_image_ids
    ), f"Expected {len(train_image_ids)} train files, got {len(list(public.glob('train/*.dicom')))}"
    assert len(list(public.glob("test/*.dicom"))) == len(
        test_image_ids
    ), f"Expected {len(test_image_ids)} test files, got {len(list(public.glob('test/*.dicom')))}"