in mlebench/competitions/hubmap-kidney-segmentation/prepare.py [0:0]
import shutil
from pathlib import Path

from pandas import read_csv
from sklearn.model_selection import train_test_split
from tqdm import tqdm


def prepare(raw: Path, public: Path, private: Path):
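    """Split the original HuBMAP train set into new public and private splits.

    Writes train.csv, sample_submission.csv and the filtered dataset-information CSV
    to `public`, writes test.csv and gold_submission.csv to `private`, and copies the
    corresponding TIFF images and annotation JSONs into `public/train` and `public/test`.
    """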
    old_train = read_csv(raw / "train.csv")
    old_dataset_info = read_csv(raw / "HuBMAP-20-dataset_information.csv")
    # Hold out 3 of the original train images as the new test split; the other 12 form the new train split
    new_train, new_test = train_test_split(old_train, train_size=12, test_size=3, random_state=0)
    # The dataset info has no id column, so derive one from the filename so we can drop the old test rows
    old_dataset_info["id"] = old_dataset_info["image_file"].str.replace(".tiff", "", regex=False)
    dataset_info = old_dataset_info[old_dataset_info["id"].isin(old_train["id"])]
    # Put height and width in new_test, for grading
    new_test = new_test.merge(dataset_info[["id", "width_pixels", "height_pixels"]], on="id")
    dataset_info = dataset_info.drop(columns=["id"])
    dataset_info.to_csv(public / "HuBMAP-20-dataset_information.csv", index=False)
    new_train.to_csv(public / "train.csv", index=False)
    new_test.to_csv(private / "test.csv", index=False)
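    # Build the sample submission competitors see and the gold submission used for grading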
    sample_submission = new_test[["id"]].copy()
    sample_submission["predicted"] = ""
    sample_submission.to_csv(public / "sample_submission.csv", index=False)
    # Same ids as the sample submission, with the true encodings as the predictions
    gold_submission = sample_submission.copy()
    gold_submission["predicted"] = new_test["encoding"]
    gold_submission.to_csv(private / "gold_submission.csv", index=False)
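    # Copy the image TIFFs and annotation JSONs: train-split files go to public/train, held-out files to public/test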
(public / "train").mkdir(parents=True, exist_ok=True)
for image_id in tqdm(new_train["id"], desc="Copying train images"):
shutil.copy(raw / "train" / f"{image_id}.tiff", public / "train" / f"{image_id}.tiff")
shutil.copy(raw / "train" / f"{image_id}.json", public / "train" / f"{image_id}.json")
shutil.copy(
raw / "train" / f"{image_id}-anatomical-structure.json",
public / "train" / f"{image_id}-anatomical-structure.json",
)
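    # The new test images also come from the original train/ directory: they are the held-out members of the old train split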
(public / "test").mkdir(parents=True, exist_ok=True)
for image_id in tqdm(new_test["id"], desc="Copying test images"):
shutil.copy(raw / "train" / f"{image_id}.tiff", public / "test" / f"{image_id}.tiff")
shutil.copy(raw / "train" / f"{image_id}.json", public / "test" / f"{image_id}.json")
shutil.copy(
raw / "train" / f"{image_id}-anatomical-structure.json",
public / "test" / f"{image_id}-anatomical-structure.json",
)
# for some reason sample_submission.csv is also in test/
shutil.copy(public / "sample_submission.csv", public / "test" / "sample_submission.csv")
    # Checks
    assert len(new_train) + len(new_test) == len(
        old_train
    ), "Length of new_train and new_test should equal length of old_train"
    assert new_train.columns.to_list() == [
        "id",
        "encoding",
    ], "Public train set should have 2 columns, called 'id' and 'encoding'"
    assert new_test.columns.to_list() == [
        "id",
        "encoding",
        "width_pixels",
        "height_pixels",
    ], "Private test set should have 4 columns: 'id', 'encoding', 'width_pixels' and 'height_pixels'"
    assert len(sample_submission) == len(new_test), "Sample submission length should match test set"
    assert sample_submission.columns.to_list() == [
        "id",
        "predicted",
    ], "Sample submission should have two columns, 'id' and 'predicted'"
    assert len(gold_submission) == len(new_test), "Gold submission length should match test set"
    assert gold_submission.columns.to_list() == [
        "id",
        "predicted",
    ], "Gold submission should have two columns, 'id' and 'predicted'"
    assert gold_submission["predicted"].equals(
        new_test["encoding"]
    ), "Gold submission should match private test set"
    # Assert no overlap in ids between train and test
    assert set(new_train["id"]).isdisjoint(
        set(new_test["id"])
    ), "Train and test ids should not overlap"
    # Check that the images were copied correctly
    assert len(list((public / "train").glob("*.tiff"))) == len(
        new_train
    ), "Missing train tiff files"
    assert len(list((public / "train").glob("*-anatomical-structure.json"))) == len(
        new_train
    ), "Missing train structure json files"
    assert (
        len(list((public / "train").glob("*.json"))) == len(new_train) * 2
    ), "Missing train json files"
    assert len(list((public / "test").glob("*.tiff"))) == len(new_test), "Missing test tiff files"
    assert len(list((public / "test").glob("*-anatomical-structure.json"))) == len(
        new_test
    ), "Missing test structure json files"