def prepare()

in mlebench/competitions/mlsp-2013-birds/prepare.py [0:0]


def prepare(raw: Path, public: Path, private: Path):
    """
    Splits the data in raw into public and private datasets with appropriate test/train splits.

    Only the recordings whose fold is 0 in the raw ``CVfolds_2.txt`` are kept. A
    20% sample of them (``random_state=0``) is re-labelled as fold 1 and becomes the
    new test set; the kept recordings are renumbered 0..n-1 in their original order.

    Args:
        raw: Directory containing ``mlsp_contest_dataset.zip``; the archive is
            extracted into this same directory.
        public: Directory that receives the re-split ``essential_data`` and
            ``supplemental_data`` trees plus ``sample_submission.csv``. The
            directory itself must already exist.
        private: Directory that receives ``answers.csv``, one row per
            (test rec_id, species_id) pair.

    Raises:
        AssertionError: If the new test split does not contain 64 recordings, or
            if any generated table has an unexpected number of rows.
    """
    # Species ids run from 0 to 18; a submission Id is rec_id * 100 + species_id.
    num_species = 19

    # extract only what we need
    extract(raw / "mlsp_contest_dataset.zip", raw)

    raw_essential = raw / "mlsp_contest_dataset/essential_data"
    raw_supplemental = raw / "mlsp_contest_dataset/supplemental_data"
    public_essential = public / "essential_data"
    public_supplemental = public / "supplemental_data"

    public_essential.mkdir(exist_ok=True)
    public_supplemental.mkdir(exist_ok=True)

    # Create train, test from train split
    cv_folds = read_csv(raw_essential / "CVfolds_2.txt")
    cv_folds = cv_folds[cv_folds["fold"] == 0].reset_index(drop=True)
    cv_folds.loc[cv_folds.sample(frac=0.2, random_state=0).index, "fold"] = 1

    # The index was reset above, so each row's position is its new rec_id.
    old_id_to_new = {old_id: new_id for new_id, old_id in enumerate(cv_folds["rec_id"].values)}
    cv_folds["rec_id"] = cv_folds.index
    cv_folds.to_csv(public_essential / "CVfolds_2.txt", index=False)

    test_rec_ids = cv_folds[cv_folds["fold"] == 1]["rec_id"].values
    assert len(test_rec_ids) == 64, f"Expected 64 test rec_ids, got {len(test_rec_ids)}"
    # Plain-int set: constant-time membership checks instead of scanning the array.
    test_rec_id_set = set(test_rec_ids.tolist())
    expected_rows = len(test_rec_ids) * num_species

    # Update id2filename with new split
    rec_id2filename = read_csv(raw_essential / "rec_id2filename.txt")
    # Take an explicit copy so the assignment below writes to an independent frame
    # rather than to a slice of the frame that was read from disk.
    rec_id2filename = rec_id2filename[rec_id2filename["rec_id"].isin(list(old_id_to_new))].copy()
    rec_id2filename["rec_id"] = rec_id2filename["rec_id"].map(old_id_to_new)
    rec_id2filename.to_csv(public_essential / "rec_id2filename.txt", index=False)
    assert len(rec_id2filename) == len(
        cv_folds
    ), f"Expected {len(cv_folds)} entires in rec_id2filename, got {len(rec_id2filename)}"

    # Update labels with new split
    with open(raw_essential / "rec_labels_test_hidden.txt") as f:
        label_lines = f.read().splitlines()[1:]  # Ignore header line

    rec_labels_split = []
    for line in label_lines:
        # Everything after the first comma is the (possibly empty) label list.
        old_id_text, _, labels = line.partition(",")
        old_id = int(old_id_text)
        if old_id in old_id_to_new:
            rec_labels_split.append((old_id_to_new[old_id], labels))

    # Public labels
    with open(public_essential / "rec_labels_test_hidden.txt", "w") as f:
        f.write("rec_id,[labels]\n")
        for rec_id, labels in rec_labels_split:
            if rec_id in test_rec_id_set:
                labels = "?"  # Hide the labels of test recordings
            if labels == "":  # Write without comma
                f.write(f"{rec_id}\n")
            else:
                f.write(f"{rec_id},{labels}\n")

    # Private labels. Create csv, with each row containing the label for a (rec_id, species_id) pair
    data = {"Id": [], "Probability": []}
    for rec_id, labels in rec_labels_split:
        if rec_id not in test_rec_id_set:
            continue
        species_ids = {int(i) for i in labels.split(",") if i != ""}
        for species_id in range(num_species):
            data["Id"].append(rec_id * 100 + species_id)
            data["Probability"].append(int(species_id in species_ids))

    answers = pd.DataFrame(data)
    answers.to_csv(private / "answers.csv", index=False)
    assert (
        len(answers) == expected_rows
    ), f"Expected {expected_rows} entires in answers.csv, got {len(answers)}"

    # Create new sample submission, following new submission format
    # http://www.kaggle.com/c/mlsp-2013-birds/forums/t/4961/new-submission-parser
    sample_submission = pd.DataFrame(
        {
            "Id": [
                rec_id * 100 + species_id
                for rec_id in test_rec_ids
                for species_id in range(num_species)
            ],
            "Probability": 0,
        }
    )
    sample_submission.to_csv(public / "sample_submission.csv", index=False)
    assert (
        len(sample_submission) == expected_rows
    ), f"Expected {expected_rows} entires in sample_submission.csv, got {len(sample_submission)}"

    # Copy over species list
    shutil.copyfile(
        src=raw_essential / "species_list.txt",
        dst=public_essential / "species_list.txt",
    )

    # Copy over all src waves from train+test set
    (public_essential / "src_wavs").mkdir(exist_ok=True)
    for filename in rec_id2filename["filename"]:
        shutil.copyfile(
            src=raw_essential / "src_wavs" / f"{filename}.wav",
            dst=public_essential / "src_wavs" / f"{filename}.wav",
        )

    # Copy over train+test filtered spectrograms, segmentation examples, spectrograms, and supervised segmentation
    # Each entry is (sub-directory, optional). An optional image is skipped when the
    # raw file is absent; a missing required image makes copyfile raise.
    image_dirs = (
        ("filtered_spectrograms", False),
        ("segmentation_examples", True),
        ("spectrograms", False),
        ("supervised_segmentation", False),
    )
    for subdir, _ in image_dirs:
        (public_supplemental / subdir).mkdir(exist_ok=True)
    for filename in rec_id2filename["filename"]:
        for subdir, optional in image_dirs:
            src = raw_supplemental / subdir / f"{filename}.bmp"
            if optional and not src.exists():
                continue
            shutil.copyfile(src=src, dst=public_supplemental / subdir / f"{filename}.bmp")

    # Copy over remaining files
    for name in ("segment_clusters.bmp", "segment_mosaic.bmp"):
        shutil.copyfile(src=raw_supplemental / name, dst=public_supplemental / name)

    # These tables are passed through filter_and_write_file together with the
    # old-to-new rec_id mapping so they follow the new split.
    for name in (
        "histogram_of_segments.txt",
        "segment_features.txt",
        "segment_rectangles.txt",
    ):
        filter_and_write_file(
            src=raw_supplemental / name,
            dst=public_supplemental / name,
            old_id_to_new=old_id_to_new,
        )