def prepare_for_metric()

in mlebench/competitions/osic-pulmonary-fibrosis-progression/grade.py [0:0]


def prepare_for_metric(submission: DataFrame, answers: DataFrame) -> dict:
    """Validate a submission and extract aligned arrays for the metric.

    Only the three highest-numbered weeks of each patient (among answer rows
    that have a non-null ``FVC``) are graded. The week number is the integer
    after the last ``_`` in ``Patient_Week``. Submission rows for other weeks
    are ignored, and missing ``Confidence`` values are treated as 0.

    Neither input DataFrame is modified.

    Args:
        submission: Frame with ``Patient_Week``, ``FVC`` and ``Confidence``
            columns; ``FVC`` and ``Confidence`` must be numeric.
        answers: Frame with ``Patient_Week``, ``FVC`` and ``Patient`` columns.

    Returns:
        A dict with keys ``"fvc_true"``, ``"fvc_pred"`` and ``"confidence"``,
        each an array ordered by ``Patient_Week`` so that position ``i``
        refers to the same ``Patient_Week`` in all three.

    Raises:
        InvalidSubmissionError: If the submission lacks a required column,
            references a ``Patient_Week`` absent from ``answers``, has a
            non-numeric ``FVC``/``Confidence`` column, or its graded rows do
            not line up one-to-one with the graded answers.
        AssertionError: If ``answers`` lacks a required column.
    """
    # Schema problems in the answers are checked with asserts, unlike
    # submission problems, which raise InvalidSubmissionError.
    assert "Patient_Week" in answers.columns, "Answers DataFrame must have a 'Patient_Week' column."
    assert "FVC" in answers.columns, "Answers DataFrame must have a 'FVC' column."
    assert "Patient" in answers.columns, "Answers DataFrame must have a 'Patient' column."

    for column in ("Patient_Week", "FVC", "Confidence"):
        if column not in submission.columns:
            raise InvalidSubmissionError(f"Submission DataFrame must have a '{column}' column.")

    # One vectorized membership test instead of scanning the answers once per
    # submission row; the first offending key is still the one reported.
    is_known = submission["Patient_Week"].isin(answers["Patient_Week"])
    if not is_known.all():
        pw = submission.loc[~is_known, "Patient_Week"].iloc[0]
        raise InvalidSubmissionError(
            f"Patient_Week {pw} in submission does not exist in answers"
        )

    if not pd.api.types.is_numeric_dtype(submission["FVC"]):
        raise InvalidSubmissionError("FVC column in submission must be numeric.")
    if not pd.api.types.is_numeric_dtype(submission["Confidence"]):
        raise InvalidSubmissionError("Confidence column in submission must be numeric.")

    # Fill missing confidences with 0 on a new frame so the caller's
    # DataFrame is left untouched.
    predictions = submission.assign(Confidence=submission["Confidence"].fillna(0))

    # We should only take the 3 latest Patient_Week records for each patient.
    scored = answers.dropna(subset=["FVC"])  # Drop the dummy data
    week_number = scored["Patient_Week"].apply(lambda key: int(key.split("_")[-1]))
    graded_answers = (
        scored.assign(Week_Number=week_number)
        .sort_values("Week_Number")
        .groupby("Patient")
        .tail(3)
        .drop(columns=["Week_Number"])
    )

    # Make submission match; we only grade the prediction for the 3 latest weeks.
    graded_predictions = predictions[
        predictions["Patient_Week"].isin(graded_answers["Patient_Week"])
    ].sort_values(by="Patient_Week")
    graded_answers = graded_answers[
        graded_answers["Patient_Week"].isin(graded_predictions["Patient_Week"])
    ].sort_values(by="Patient_Week")

    # Truth and prediction are paired purely by position, so the two key
    # sequences must be identical; otherwise (e.g. a duplicated Patient_Week)
    # the arrays would differ in length or pair up different weeks.
    if graded_predictions["Patient_Week"].tolist() != graded_answers["Patient_Week"].tolist():
        raise InvalidSubmissionError(
            "Submission rows do not match the answers one-to-one on 'Patient_Week' "
            "(check for duplicate Patient_Week entries)."
        )

    return {
        "fvc_true": graded_answers["FVC"].values,
        "fvc_pred": graded_predictions["FVC"].values,
        "confidence": graded_predictions["Confidence"].values,
    }