mlebench/competitions/vesuvius-challenge-ink-detection/prepare.py (89 lines of code) (raw):
import shutil
from pathlib import Path
import pandas as pd
from PIL import Image
from tqdm import tqdm
from mlebench.utils import read_csv
def prepare(raw: Path, public: Path, private: Path) -> None:
# Copy train images to `public/train/{1,2}/`
shutil.copytree(
src=raw / "train" / "1",
dst=public / "train" / "1",
)
shutil.copytree(
src=raw / "train" / "2",
dst=public / "train" / "2",
)
# Create test `inklabels_rle.csv`
inklabels_rle = read_csv(raw / "train" / "3" / "inklabels_rle.csv")
assert (
len(inklabels_rle) == 1
), f"Expected a single row in `inklabels_rle.csv`, got {len(inklabels_rle)} rows."
img_path = raw / "train" / "3" / "ir.png"
assert img_path.is_file(), f"Expected image file at {img_path}, but it does not exist."
with Image.open(img_path) as img:
width, height = img.size
inklabels_rle["width"] = width
inklabels_rle["height"] = height
inklabels_rle["Id"] = "a"
inklabels_rle.to_csv(private / "inklabels_rle.csv", index=False)
# Write `gold_submission.csv`
inklabels_rle.drop(columns=["width", "height"]).to_csv(
private / "gold_submission.csv",
index=False,
)
# Copy test images to `{public,private}/test/a/`
test_imgs = list((raw / "train" / "3").rglob("*"))
for fpath in tqdm(test_imgs, desc="Creating test images"):
if not fpath.is_file():
continue
assert fpath.suffix in [
".png",
".csv",
".tif",
], f"Expected file with extension png, csv, or tif, got `{fpath.suffix}` for file `{fpath}`"
relative_path = fpath.relative_to(raw / "train" / "3")
if fpath.name in ["inklabels.png", "inklabels_rle.csv", "ir.png"]:
continue # skip test images and labels
dst = public / "test" / "a" / relative_path
dst.parent.mkdir(parents=True, exist_ok=True)
shutil.copy(fpath, dst) # everything else to `public`
sample_submission = pd.DataFrame({"Id": ["a"], "Predicted": ["1 1 5 1"]})
sample_submission.to_csv(public / "sample_submission.csv", index=False)
# Sanity checks
assert len(sample_submission) == len(inklabels_rle), (
f"Expected {len(inklabels_rle)} rows in `sample_submission.csv`, got "
f"{len(sample_submission)} rows."
)
actual_sample_submission = read_csv(public / "sample_submission.csv")
actual_inklabels_rle = read_csv(private / "inklabels_rle.csv")
assert (
"Id" in actual_sample_submission.columns
), f"Expected column `Id` in `sample_submission.csv`."
assert (
"Predicted" in actual_sample_submission.columns
), f"Expected column `Predicted` in `sample_submission.csv`."
assert "Id" in actual_inklabels_rle.columns, f"Expected column `Id` in `inklabels_rle.csv`."
assert (
"Predicted" in actual_inklabels_rle.columns
), f"Expected column `Predicted` in `inklabels_rle.csv`."
assert (
"width" in actual_inklabels_rle.columns
), f"Expected column `width` in `inklabels_rle.csv`."
assert (
"height" in actual_inklabels_rle.columns
), f"Expected column `height` in `inklabels_rle.csv`."
assert len(list((public / "train" / "1").rglob("*"))) == len(
list((raw / "train" / "1").rglob("*"))
), (
f"Expected {len(list(raw / 'train' / '1').rglob('*'))} files in `public/train/1`, got "
f"{len(list(public / 'train' / '1').rglob('*'))} files."
)
assert len(list((public / "train" / "2").rglob("*"))) == len(
list((raw / "train" / "2").rglob("*"))
), (
f"Expected {len(list(raw / 'train' / '2').rglob('*'))} files in `public/train/2`, got "
f"{len(list(public / 'train' / '2').rglob('*'))} files."
)
n_test_actual = len(list((public / "test" / "a").rglob("*")))
n_test_expected = len(list((raw / "train" / "3").rglob("*"))) - len(
["inklabels.png", "inklabels_rle.csv", "ir.png"]
)
assert n_test_actual == n_test_expected, (
f"Expected " f"{n_test_expected} " f"files in `public/test/a`, got {n_test_actual} files."
)