def filter_optimal_hps()

in src/analyze.py [0:0]


def filter_optimal_hps(df: pd.DataFrame, opt_metric: str, output_metrics: list) -> pd.DataFrame:
    """
    For each {dataset, seed, learner, oversampler} keep only the row of the HP configuration that
    yields the best (highest) score according to opt_metric. Then, calculate average and rank for the
    scores in output_metrics.

    Ties in opt_metric are broken in favour of the larger "param_set" value.

    Parameters
    ----------
    df : pandas.DataFrame
        dataframe of the experiment results. Must contain the columns "dataset", "seed", "learner",
        "oversampler", "param_set", opt_metric and every entry of output_metrics.
    opt_metric : str
        metric used for optimization (higher is better)
    output_metrics : list
        metrics to include in the output (higher is ranked better)

    Returns
    -------
    pandas.DataFrame
        Indexed by (learner, oversampler). Columns are a two-level index holding, for every metric
        ``om`` in output_metrics, (om, "mean"), (om, "std"), (f"{om}.rank", "mean"),
        (f"{om}.rank", "std") computed over the seeds, plus ("dataset", "sum") with the number of
        {dataset, seed} results that were aggregated.

    Raises
    ------
    AssertionError
        If some {learner, oversampler} pair does not have a result for every dataset and seed.
    """
    group_keys = ["dataset", "seed", "learner", "oversampler"]
    # De-duplicate while preserving order, mirroring what a dict keyed by metric would do
    metrics = list(dict.fromkeys(output_metrics))

    num_datasets = len(np.unique(df["dataset"]))
    num_seeds = len(np.unique(df["seed"]))

    # Filter best models according to opt_metric.
    # The whole winning row is kept: GroupBy "first" would skip NaN per column and could therefore
    # combine metrics coming from different HP configurations.
    best = (
        df.sort_values("param_set", ascending=False)
        .sort_values([opt_metric], ascending=False, kind="stable")
        .dropna(subset=group_keys)  # groupby would have discarded rows with a missing key
        .drop_duplicates(subset=group_keys, keep="first")
        .sort_values(group_keys)  # same row order a sorted groupby would produce
        .loc[:, group_keys + metrics]
        .reset_index(drop=True)
    )

    # Rank models per dataset and seed
    for om in metrics:
        best[f"{om}.rank"] = best.groupby(["dataset", "seed"])[om].rank(ascending=False)

    # Aggregate mean and rank over the datasets
    per_seed = best.groupby(["learner", "seed", "oversampler"]).agg(
        {
            **{om: "mean" for om in metrics},
            **{f"{om}.rank": "mean" for om in metrics},
            "dataset": "count",
        }
    )
    # Aggregate mean and std over the seeds
    summary = per_seed.groupby(["learner", "oversampler"]).agg(
        {
            **{om: ["mean", "std"] for om in metrics},
            **{f"{om}.rank": ["mean", "std"] for om in metrics},
            "dataset": "sum",
        }
    )

    # Verify that all models have values for all datasets and seeds
    expected = num_datasets * num_seeds
    counts = summary["dataset"].to_numpy().ravel()
    assert np.max(counts) == expected, "some model has more results than datasets * seeds"
    assert np.min(counts) == expected, "some model is missing results for a dataset/seed"
    return summary