def prepare()

in mlebench/competitions/google-research-identify-contrails-reduce-global-warming/prepare.py [0:0]
112 lines of code
17 McCabe index (conditional complexity)

def prepare(raw: Path, public: Path, private: Path):
    """
    We make train/test split from old train set, using same train/test proportion as the original
    competition. Concretely, the new split has 18673 train samples and 1856 test samples. We also
    copy over the validation set as-is.

    `sample_submission` is created with random predictions, either "1 3 10 5" or "-" (empty)
    """
    DEV = False

    with open(raw / "train_metadata.json", "r") as f:
        train_metadata = json.load(f)
    train_metadata = train_metadata[:100] if DEV else train_metadata
    with open(raw / "validation_metadata.json", "r") as f:
        validation_metadata = json.load(f)

    if DEV:
        new_train, new_test = train_metadata[:90], train_metadata[90:]
    else:
        new_train, new_test = train_test_split(
            train_metadata, test_size=len(validation_metadata), random_state=0
        )
    logger.info(
        f"Created new split with {len(new_train)} train samples and {len(new_test)} test samples"
    )

    # Create answers csv containing ground truth masks, heights, widths
    test_answers = []
    for sample in tqdm(new_test):
        record_id = sample["record_id"]
        mask = np.load(raw / "train" / record_id / "human_pixel_masks.npy")
        rle = rle_encode(mask)
        rle = " ".join([str(i) for i in rle]) if rle else "-"

        band = np.load(raw / "train" / record_id / "band_08.npy")
        height, width, _ = band.shape
        test_answers.append(
            {
                "record_id": record_id,
                "encoded_pixels": rle,
                "height": height,
                "width": width,
            }
        )
    test_answers = pd.DataFrame(test_answers)

    for sample in tqdm(new_train):
        record_id = sample["record_id"]
        (public / "train" / record_id).mkdir(exist_ok=True, parents=True)
        # Copy bands
        for band_idx in range(8, 17):
            file_name = f"band_{band_idx:02}.npy"
            shutil.copyfile(
                src=raw / "train" / record_id / file_name,
                dst=public / "train" / record_id / file_name,
            )
        # Copy human individual masks
        shutil.copyfile(
            src=raw / "train" / record_id / "human_individual_masks.npy",
            dst=public / "train" / record_id / "human_individual_masks.npy",
        )
        # Copy human pixel masks
        shutil.copyfile(
            src=raw / "train" / record_id / "human_pixel_masks.npy",
            dst=public / "train" / record_id / "human_pixel_masks.npy",
        )

    for sample in tqdm(new_test):
        record_id = sample["record_id"]
        (public / "test" / record_id).mkdir(exist_ok=True, parents=True)
        # Copy bands
        for band_idx in range(8, 17):
            file_name = f"band_{band_idx:02}.npy"
            shutil.copyfile(
                src=raw / "train" / record_id / file_name,
                dst=public / "test" / record_id / file_name,
            )

    # Copy over existing validation data
    (raw / "validation").mkdir(exist_ok=True, parents=True)
    shutil.copytree(raw / "validation", public / "validation", dirs_exist_ok=True)
    shutil.copyfile(raw / "validation_metadata.json", public / "validation_metadata.json")

    # Write other files
    with open(public / "train_metadata.json", "w") as f:
        f.write(json.dumps(new_train))
    test_answers.to_csv(private / "answers.csv", index=False)

    submission_df = test_answers.copy()
    random.seed(0)
    submission_df["encoded_pixels"] = [
        random.choice(["1 3 10 5", "-"]) for _ in range(len(submission_df))
    ]
    submission_df.to_csv(public / "sample_submission.csv", index=False)

    # Sanity checks
    assert (public / "train_metadata.json").exists(), "`train_metadata.json` doesn't exist!"
    assert (public / "sample_submission.csv").exists(), "`sample_submission.csv` doesn't exist!"
    assert (
        public / "validation_metadata.json"
    ).exists(), "`validation_metadata.json` doesn't exist!"
    assert (public / "train").exists(), "`train` directory doesn't exist!"
    assert (public / "test").exists(), "`test` directory doesn't exist!"
    assert (public / "validation").exists(), "`public` directory doesn't exist!"
    assert (private / "answers.csv").exists(), "`answers.csv` doesn't exist!"

    new_train_bands = list(img.stem for img in (public / "train").rglob("band*.npy"))
    assert (
        len(new_train_bands) == len(new_train) * 9
    ), f"Expected {len(new_train) * 9} bands in the train set, but got {len(new_train_bands)}!"
    new_test_bands = list(img.stem for img in (public / "test").rglob("band*.npy"))
    assert (
        len(new_test_bands) == len(new_test) * 9
    ), f"Expected {len(new_test) * 9} in the test set, but got {len(new_test_bands)}!"

    new_train_individual_masks = list(
        img.stem for img in (public / "train").rglob("human_individual_masks.npy")
    )
    assert len(new_train_individual_masks) == len(
        new_train
    ), f"Expected 1 human individual mask per sample in the train set, but got {len(new_train_individual_masks)}!"
    new_test_individual_masks = list(
        img.stem for img in (public / "test").rglob("human_individual_masks.npy")
    )
    assert (
        len(new_test_individual_masks) == 0
    ), f"Expected 0 human individual masks per sample in the test set, but got {len(new_test_individual_masks)}!"

    new_train_pixel_masks = list(
        img.stem for img in (public / "train").rglob("human_pixel_masks.npy")
    )
    assert len(new_train_pixel_masks) == len(
        new_train
    ), f"Expected 1 human pixel mask per sample in the train set, but got {len(new_train_pixel_masks)}!"
    new_test_pixel_masks = list(
        img.stem for img in (public / "test").rglob("human_pixel_masks.npy")
    )
    assert (
        len(new_test_pixel_masks) == 0
    ), f"Expected 0 human pixel masks per sample in the test set, but got {len(new_test_pixel_masks)}!"