def prepare()

in mlebench/competitions/osic-pulmonary-fibrosis-progression/prepare.py [0:0]


from multiprocessing import Pool
from pathlib import Path

import pandas as pd
from pandas import read_csv  # the repo may provide its own read_csv wrapper instead
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# copy_dir is assumed to be a helper defined elsewhere in the repo; its usage below implies
# it copies one directory tree given a (src, dst) tuple.


def prepare(raw: Path, public: Path, private: Path):
    # Split the original train set into new train and test sets, keeping each Patient in only one split
    old_train = read_csv(raw / "train.csv")
    grouped_by_patient = list(old_train.groupby("Patient"))
    train_groups, test_groups = train_test_split(grouped_by_patient, test_size=0.1, random_state=0)
    new_train = pd.concat([group for _, group in train_groups])
    new_test = pd.concat([group for _, group in test_groups])
    assert set(new_train["Patient"]).isdisjoint(
        set(new_test["Patient"])
    ), "There are Patients who are in both train and test sets."

    # For the public test set, keep only each patient's first FVC measurement. The task is to predict FVC measurements for all possible weeks
    new_test_public = new_test.sort_values(by="Weeks").groupby("Patient").first().reset_index()

    # Create the private answers CSV. We need to fill in dummy FVC values for all weeks without data so that it matches sample_submission.csv
    # Create a DataFrame with all possible Patient-Week combinations
    all_weeks = pd.DataFrame(
        [
            (patient, week)
            for patient in new_test["Patient"].unique()
            for week in range(new_test["Weeks"].min(), new_test["Weeks"].max() + 1)
        ],
        columns=["Patient", "Weeks"],
    )
    # Merge with the new_test DataFrame to fill in missing weeks with NaN values
    new_test = all_weeks.merge(new_test, on=["Patient", "Weeks"], how="left")
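    # Build the Patient_Week key used as the submission row identifier; Confidence is a constant
    # placeholder so the columns line up with sample_submission.csv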
    new_test["Patient_Week"] = new_test["Patient"] + "_" + new_test["Weeks"].astype(str)
    new_test["Confidence"] = 100
    assert (
        new_test.groupby("Patient").size().nunique() == 1
    ), "Not all patients have the same number of rows."

    # Create a sample submission file
    submission_df = new_test.copy()
    submission_df = submission_df[["Patient_Week"]]
    submission_df["FVC"] = 2000  # Dummy predictions
    submission_df["Confidence"] = 100  # Dummy confidence

    # Write CSVs
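    # public/ holds the files competitors see; private/test.csv is the full answers grid used for grading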
    new_train.to_csv(public / "train.csv", index=False)
    new_test_public.to_csv(public / "test.csv", index=False)
    new_test.to_csv(private / "test.csv", index=False)
    submission_df.to_csv(public / "sample_submission.csv", index=False)

    # Copy over data files
    (public / "train").mkdir(exist_ok=True)
    train_args = [
        (raw / "train" / patient, public / "train" / patient)
        for patient in new_train["Patient"].unique()
    ]
    with Pool() as pool:
        list(
            tqdm(pool.imap(copy_dir, train_args), total=len(train_args), desc="Copying train data")
        )

    (public / "test").mkdir(exist_ok=True)
    test_args = [
        (raw / "train" / patient, public / "test" / patient)
        for patient in new_test["Patient"].unique()
    ]
    with Pool() as pool:
        list(tqdm(pool.imap(copy_dir, test_args), total=len(test_args), desc="Copying test data"))

    # Final checks
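    # 7 columns = the original train.csv schema (Patient, Weeks, FVC, Percent, Age, Sex, SmokingStatus);
    # new_test adds Patient_Week and Confidence (9); the submission keeps Patient_Week, FVC, Confidence (3)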
    assert new_train.shape[1] == 7, f"Expected 7 columns in new_train, but got {new_train.shape[1]}"
    assert new_test.shape[1] == 9, f"Expected 9 columns in new_test, but got {new_test.shape[1]}"
    assert (
        new_test_public.shape[1] == 7
    ), f"Expected 7 columns in new_test_public, but got {new_test_public.shape[1]}"
    assert (
        submission_df.shape[1] == 3
    ), f"Expected 3 columns in submission_df, but got {submission_df.shape[1]}"

    # Compare directory names rather than full Path objects; paths under different parents would never intersect
    public_train_dirs = {path.name for path in (public / "train").iterdir()}
    public_test_dirs = {path.name for path in (public / "test").iterdir()}
    common_dirs = public_train_dirs.intersection(public_test_dirs)
    assert (
        not common_dirs
    ), f"There are directories with the same name in public train and test: {common_dirs}"