# mlebench/competitions/stanford-covid-vaccine/prepare.py
from pathlib import Path

import pandas as pd


def prepare(raw: Path, public: Path, private: Path) -> None:
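    """Split the raw Kaggle data into public train/test files and private answers.

    Reads the original `train.json`, `test.json`, and `sample_submission.csv`
    from `raw`, carves a held-out test split out of the original train set,
    then writes the participant-facing files to `public` and the grader's
    answers to `private`.
    """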
    old_train = pd.read_json(raw / "train.json", lines=True)
    old_test = pd.read_json(raw / "test.json", lines=True)
    old_sample_submission = pd.read_csv(raw / "sample_submission.csv")
    to_predict = ["reactivity", "deg_Mg_pH10", "deg_pH10", "deg_Mg_50C", "deg_50C"]
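    # These are the five per-position target series that submissions must predict.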
    test_size = 0.1
    n_test_samples = int(len(old_train) * test_size)
    # Only put samples that pass the signal-to-noise (SN) filter in the test
    # set, as per the competition's data description.
old_train["test"] = False
test_indices = (
old_train[old_train["SN_filter"] > 0].sample(n=n_test_samples, random_state=0).index
)
old_train.loc[test_indices, "test"] = True
    new_train = old_train[~old_train["test"]].drop(columns=["test"])
    new_test = old_train[old_train["test"]].drop(columns=["test"])
    old_train = old_train.drop(columns=["test"])
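    # Dropping the helper column restores old_train to its original schema,
    # which the column-equality sanity checks at the bottom rely on.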
    # Create `test.csv` by exploding each list in the `reactivity` and `deg_*` columns,
    # analogous to `pd.DataFrame.explode`. Only the first `seq_scored` items are scored
    # out of a possible `seq_length` items. For each row, we keep track of whether a
    # position is scored or not with the `keep` column.
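    # For example, a (hypothetical) row with id "id_abc123", seq_scored=68, and
    # seq_length=107 yields id_seqpos "id_abc123_0" .. "id_abc123_67" with
    # keep=True, then "id_abc123_68" .. "id_abc123_106" zero-filled with keep=False.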
    records = []
    for _, row in new_test.iterrows():
        n = row["seq_scored"]
        for col in to_predict:
            assert len(row[col]) == n
        for j in range(n):
            records.append(
                {
                    "id_seqpos": f"{row['id']}_{j}",
                    "reactivity": row["reactivity"][j],
                    "deg_Mg_pH10": row["deg_Mg_pH10"][j],
                    "deg_pH10": row["deg_pH10"][j],
                    "deg_Mg_50C": row["deg_Mg_50C"][j],
                    "deg_50C": row["deg_50C"][j],
                    "keep": True,
                }
            )
        k = row["seq_length"]
        assert n < k
        for j in range(n, k):
            records.append(
                {
                    "id_seqpos": f"{row['id']}_{j}",
                    "reactivity": 0.0,
                    "deg_Mg_pH10": 0.0,
                    "deg_pH10": 0.0,
                    "deg_Mg_50C": 0.0,
                    "deg_50C": 0.0,
                    "keep": False,
                }
            )
    # Write the answers to `test.csv` in the private directory
    answers = pd.DataFrame(records)
    answers.to_csv(private / "test.csv", index=False, float_format="%.10f")
    # Write `train.json`
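    # A fresh 0-based `index` column re-numbers rows contiguously after the split.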
new_train["index"] = range(len(new_train))
new_train.to_json(public / "train.json", orient="records", lines=True)
# Write `test.json`
new_test_without_labels = new_test[old_test.columns].copy()
new_test_without_labels["index"] = range(len(new_test_without_labels))
new_test_without_labels.to_json(public / "test.json", orient="records", lines=True)
    # Write `sample_submission.csv`
    new_sample_submission = answers[["id_seqpos"] + to_predict].copy()
    new_sample_submission.loc[:, to_predict] = 0.0
    new_sample_submission.to_csv(
        public / "sample_submission.csv", index=False, float_format="%.10f"
    )
    # Sanity checks
    assert set(new_train.columns) == set(old_train.columns), (
        f"Expected the columns of the new train to be the same as the old train, but got "
        f"{set(new_train.columns)} instead of {set(old_train.columns)}."
    )
    assert set(new_test_without_labels.columns) == set(old_test.columns), (
        f"Expected the columns of the new test to be the same as the old test, but got "
        f"{set(new_test_without_labels.columns)} instead of {set(old_test.columns)}."
    )
    assert set(to_predict).intersection(set(new_test_without_labels.columns)) == set(), (
        f"Expected the columns to predict to be excluded from the new test, but got "
        f"{set(to_predict) & set(new_test_without_labels.columns)} instead of the empty set."
    )
    assert set(new_sample_submission.columns) == set(old_sample_submission.columns), (
        f"Expected the columns of the new sample submission to be the same as the old sample "
        f"submission, but got {set(new_sample_submission.columns)} instead of "
        f"{set(old_sample_submission.columns)}."
    )
    assert len(answers) == len(new_sample_submission), (
        f"Expected the answers to have the same length as the new sample submission, but got "
        f"{len(answers)} instead of {len(new_sample_submission)}."
    )
    # We can use .iloc[0] because all sequences have the same length
    assert len(new_sample_submission) == (
        len(new_test_without_labels) * new_test_without_labels["seq_length"].iloc[0]
    ), (
        "Expected new_sample_submission length to be equal to seq_length * len(new_test). "
        f"Got {len(new_sample_submission)} instead of "
        f"{len(new_test_without_labels) * new_test_without_labels['seq_length'].iloc[0]}."
    )
    assert len(new_train) + len(new_test) == len(old_train), (
        f"Expected the length of the new train set plus the length of the new test set to be "
        f"equal to the length of the old train set, but got {len(new_train) + len(new_test)} "
        f"instead of {len(old_train)}."
    )
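

# A minimal usage sketch. The paths below are hypothetical; the surrounding
# mlebench framework normally supplies the raw/public/private directories
# when it invokes prepare().
if __name__ == "__main__":
    base = Path("data/stanford-covid-vaccine")
    prepare(raw=base / "raw", public=base / "public", private=base / "private")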