in mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare.py [0:0]
def _copy_sample_files(src_dir: Path, dst_dir: Path, file_names) -> None:
    """Copy each file named in `file_names` from `src_dir` into `dst_dir`, creating `dst_dir` first."""
    dst_dir.mkdir(exist_ok=True, parents=True)
    for file_name in file_names:
        shutil.copyfile(src=src_dir / file_name, dst=dst_dir / file_name)


def _count_files(root: Path, pattern: str) -> int:
    """Return how many paths under `root` (searched recursively) match the glob `pattern`."""
    return sum(1 for _ in root.rglob(pattern))


def prepare(raw: Path, public: Path, private: Path) -> None:
    """
    Build the public/private competition files from the raw download.

    We make train/test split from old train set, using same train/test proportion as the original
    competition. Concretely, the new split has 18673 train samples and 1856 test samples. We also
    copy over the validation set as-is.

    `sample_submission` is created with random predictions, either "1 3 10 5" or "-" (empty)

    Args:
        raw: Directory holding `train_metadata.json`, `validation_metadata.json`, and the
            `train/` and `validation/` sample directories.
        public: Output directory. Receives `train/`, `test/`, `validation/`,
            `train_metadata.json`, `validation_metadata.json` and `sample_submission.csv`.
        private: Output directory. Receives `answers.csv` with the held-out ground truth.

    Raises:
        AssertionError: If any of the sanity checks on the written files fails.
    """
    # Flip to True locally to run on a 100-sample subset with a fixed 90/10 split.
    DEV = False

    with open(raw / "train_metadata.json", "r") as f:
        train_metadata = json.load(f)
    train_metadata = train_metadata[:100] if DEV else train_metadata

    with open(raw / "validation_metadata.json", "r") as f:
        validation_metadata = json.load(f)

    if DEV:
        new_train, new_test = train_metadata[:90], train_metadata[90:]
    else:
        # The new test set is sized to match the existing validation set.
        new_train, new_test = train_test_split(
            train_metadata, test_size=len(validation_metadata), random_state=0
        )

    logger.info(
        f"Created new split with {len(new_train)} train samples and {len(new_test)} test samples"
    )

    # Single source of truth for the per-sample files, so the copy loops and the
    # sanity checks below cannot drift apart.
    band_files = [f"band_{band_idx:02}.npy" for band_idx in range(8, 17)]
    mask_files = ["human_individual_masks.npy", "human_pixel_masks.npy"]

    # Create answers csv containing ground truth masks, heights, widths
    test_answers = []
    for sample in tqdm(new_test):
        record_id = sample["record_id"]
        sample_dir = raw / "train" / record_id

        mask = np.load(sample_dir / "human_pixel_masks.npy")
        rle = rle_encode(mask)
        # An empty encoding is written as "-".
        rle = " ".join([str(i) for i in rle]) if rle else "-"

        # The band array is unpacked as three axes; only the first two are recorded.
        band = np.load(sample_dir / "band_08.npy")
        height, width, _ = band.shape

        test_answers.append(
            {
                "record_id": record_id,
                "encoded_pixels": rle,
                "height": height,
                "width": width,
            }
        )
    test_answers = pd.DataFrame(test_answers)

    # Train samples keep their bands and both kinds of human masks.
    for sample in tqdm(new_train):
        record_id = sample["record_id"]
        _copy_sample_files(
            raw / "train" / record_id,
            public / "train" / record_id,
            band_files + mask_files,
        )

    # Test samples expose the bands only; the masks stay private.
    for sample in tqdm(new_test):
        record_id = sample["record_id"]
        _copy_sample_files(
            raw / "train" / record_id,
            public / "test" / record_id,
            band_files,
        )

    # Copy over existing validation data
    # NOTE(review): this mkdir creates an empty `validation` directory inside `raw` when it is
    # missing, so that `copytree` does not fail. Kept as-is to preserve existing behaviour.
    (raw / "validation").mkdir(exist_ok=True, parents=True)
    shutil.copytree(raw / "validation", public / "validation", dirs_exist_ok=True)
    shutil.copyfile(raw / "validation_metadata.json", public / "validation_metadata.json")

    # Write other files
    with open(public / "train_metadata.json", "w") as f:
        f.write(json.dumps(new_train))

    test_answers.to_csv(private / "answers.csv", index=False)

    # The sample submission reuses the answers' rows/columns with random placeholder predictions.
    submission_df = test_answers.copy()
    random.seed(0)
    submission_df["encoded_pixels"] = [
        random.choice(["1 3 10 5", "-"]) for _ in range(len(submission_df))
    ]
    submission_df.to_csv(public / "sample_submission.csv", index=False)

    # Sanity checks
    assert (public / "train_metadata.json").exists(), "`train_metadata.json` doesn't exist!"
    assert (public / "sample_submission.csv").exists(), "`sample_submission.csv` doesn't exist!"
    assert (
        public / "validation_metadata.json"
    ).exists(), "`validation_metadata.json` doesn't exist!"
    assert (public / "train").exists(), "`train` directory doesn't exist!"
    assert (public / "test").exists(), "`test` directory doesn't exist!"
    assert (public / "validation").exists(), "`validation` directory doesn't exist!"
    assert (private / "answers.csv").exists(), "`answers.csv` doesn't exist!"

    bands_per_sample = len(band_files)

    n_train_bands = _count_files(public / "train", "band*.npy")
    assert (
        n_train_bands == len(new_train) * bands_per_sample
    ), f"Expected {len(new_train) * bands_per_sample} bands in the train set, but got {n_train_bands}!"

    n_test_bands = _count_files(public / "test", "band*.npy")
    assert (
        n_test_bands == len(new_test) * bands_per_sample
    ), f"Expected {len(new_test) * bands_per_sample} bands in the test set, but got {n_test_bands}!"

    n_train_individual_masks = _count_files(public / "train", "human_individual_masks.npy")
    assert n_train_individual_masks == len(
        new_train
    ), f"Expected 1 human individual mask per sample in the train set, but got {n_train_individual_masks}!"

    # Ground-truth masks must never leak into the public test directory.
    n_test_individual_masks = _count_files(public / "test", "human_individual_masks.npy")
    assert (
        n_test_individual_masks == 0
    ), f"Expected 0 human individual masks per sample in the test set, but got {n_test_individual_masks}!"

    n_train_pixel_masks = _count_files(public / "train", "human_pixel_masks.npy")
    assert n_train_pixel_masks == len(
        new_train
    ), f"Expected 1 human pixel mask per sample in the train set, but got {n_train_pixel_masks}!"

    n_test_pixel_masks = _count_files(public / "test", "human_pixel_masks.npy")
    assert (
        n_test_pixel_masks == 0
    ), f"Expected 0 human pixel masks per sample in the test set, but got {n_test_pixel_masks}!"