in src/analyze.py [0:0]
import numpy as np
import pandas as pd


def filter_optimal_hps(df: pd.DataFrame, opt_metric: str, output_metrics: list) -> pd.DataFrame:
"""
For each {dataset, seed, oversampler, classifier} keep only the results of the HP configuration that
yield the best score according to opt_metric. Then, calculate average and rank for the scores in
output_metrics
Parameters
----------
df : pandas.DataFrame
dataframe of the experiment results
opt_metric : str
metric used for optimization
output_metrics : list
metrics to include in the output
Returns
-------
dict: pandas.DataFrame
Filtered and summarized
"""
    num_datasets = len(np.unique(df["dataset"]))
    num_seeds = len(np.unique(df["seed"]))
    # Keep, for each (dataset, seed, learner, oversampler), the row of the HP configuration that
    # scores best on opt_metric; the stable sort breaks ties in favor of the highest param_set.
    df = (
        df.sort_values("param_set", ascending=False)
        .sort_values([opt_metric], ascending=False, kind="stable")
        .groupby(["dataset", "seed", "learner", "oversampler"])
        .agg({om: "first" for om in output_metrics})
        .reset_index()
    )
    # Rank models within each (dataset, seed): rank 1 is the best (highest) score
    for om in output_metrics:
        df[f"{om}.rank"] = df.groupby(["dataset", "seed"])[om].rank(ascending=False)
    # Average scores and ranks over the datasets, per (learner, seed, oversampler)
    df = df.groupby(["learner", "seed", "oversampler"]).agg(
        {
            **{om: "mean" for om in output_metrics},
            **{f"{om}.rank": "mean" for om in output_metrics},
            "dataset": "count",
        }
    )
    # Aggregate mean and std over the seeds
    df = df.groupby(["learner", "oversampler"]).agg(
        {
            **{om: ["mean", "std"] for om in output_metrics},
            **{f"{om}.rank": ["mean", "std"] for om in output_metrics},
            "dataset": "sum",
        }
    )
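    # At this point the columns form a MultiIndex: (metric, "mean"/"std") for each score and rank
    # column, plus ("dataset", "sum") holding the number of aggregated (dataset, seed) rows.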
    # Verify that all models have values for all datasets and seeds
    assert np.max(df["dataset"].to_numpy().ravel()) == num_datasets * num_seeds
    assert np.min(df["dataset"].to_numpy().ravel()) == num_datasets * num_seeds
    return df
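

# Illustrative usage sketch (an assumption, not part of the original module): the dataset, learner,
# oversampler and metric names below ("toy_a", "knn", "smote", "f1", "roc_auc") are made up purely
# to show the column layout filter_optimal_hps expects -- one row per evaluated HP configuration.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    rows = []
    for dataset in ["toy_a", "toy_b"]:
        for seed in [0, 1]:
            for oversampler in ["none", "smote"]:
                for param_set in [0, 1]:
                    rows.append(
                        {
                            "dataset": dataset,
                            "seed": seed,
                            "learner": "knn",
                            "oversampler": oversampler,
                            "param_set": param_set,
                            "f1": rng.uniform(0.6, 0.9),
                            "roc_auc": rng.uniform(0.7, 0.95),
                        }
                    )
    results = pd.DataFrame(rows)
    summary = filter_optimal_hps(results, opt_metric="f1", output_metrics=["f1", "roc_auc"])
    print(summary)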