def load_generalization_gap()

in domainbed_measures/experiment/experiment.py


from typing import List, Tuple

import pandas as pd


def load_generalization_gap(out_results: pd.DataFrame,
                            test_envs: List[int],
                            test_env_idx: int,
                            dirty_ood_split: str,
                            model_selection: str = "latest"
                            ) -> Tuple[float, float, float, float, float]:
    """Compute generalization gaps from a DomainBed-style results table.

    Returns (ood_gap, wd_gap, in_domain_perf, ood_out_domain_perf,
    wd_out_domain_perf), all taken from the latest recorded training step.
    """
    # Only the 'in' and 'out' splits exist; out-of-domain accuracy is later
    # measured on whichever test-environment split is *not* the dirty one.
    if dirty_ood_split not in ["in", "out"]:
        raise ValueError(
            f"Invalid value for dirty_ood_split: {dirty_ood_split}")

    if model_selection != "latest":
        raise ValueError(
            f"Unsupported model_selection: {model_selection}; only 'latest' "
            "is implemented.")

    # Columns with results are like 'env1_out_acc' or 'env2_in_acc' and so on.
    all_envs_acc = [
        x for x in out_results.columns if 'env' in x and 'acc' in x
    ]

    ood_out_domains = []
    wd_out_domains = []
    in_domains = []

    # The "clean" OOD split is whichever test-environment split was not used
    # as the dirty split.
    clean_ood_split = "out" if dirty_ood_split == "in" else "in"
    del dirty_ood_split

    for e in all_envs_acc:
        if not ('in' in e or 'out' in e):
            raise ValueError("Unexpected env accuracy specifier %s" % (e))
        env_idx = int(e.split('_')[0].strip('env'))
        if env_idx == test_env_idx and clean_ood_split in e:
            # Clean split of the held-out test environment: OOD accuracy.
            ood_out_domains.append(e)
        elif env_idx not in test_envs and 'out' in e:
            # 'out' splits of the non-test (training) environments.
            wd_out_domains.append(e)
        elif env_idx not in test_envs and 'in' in e:
            # 'in' splits of the non-test (training) environments.
            in_domains.append(e)

    # Per training step, average the accuracy over the columns in each group.
    in_domain_perf = out_results[in_domains].mean(axis=1)
    ood_out_domain_perf = out_results[ood_out_domains].mean(axis=1)
    wd_out_domain_perf = out_results[wd_out_domains].mean(axis=1)

    ood_gap = in_domain_perf - ood_out_domain_perf
    wd_gap = in_domain_perf - wd_out_domain_perf

    # "latest" model selection: report values from the last recorded step.
    return (ood_gap.iloc[-1], wd_gap.iloc[-1], in_domain_perf.iloc[-1],
            ood_out_domain_perf.iloc[-1], wd_out_domain_perf.iloc[-1])
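

A minimal usage sketch, not taken from the repository: the toy DataFrame below
just follows the 'envN_in_acc' / 'envN_out_acc' column convention noted in the
code, with environment 2 as the held-out test environment; the accuracy values
are illustrative only.

import pandas as pd

from domainbed_measures.experiment.experiment import load_generalization_gap

# Two training steps; environment 2 is the held-out test environment.
toy_results = pd.DataFrame({
    "env0_in_acc": [0.90, 0.95], "env0_out_acc": [0.85, 0.90],
    "env1_in_acc": [0.88, 0.93], "env1_out_acc": [0.83, 0.88],
    "env2_in_acc": [0.70, 0.75], "env2_out_acc": [0.65, 0.72],
})

ood_gap, wd_gap, in_acc, ood_acc, wd_acc = load_generalization_gap(
    toy_results,
    test_envs=[2],
    test_env_idx=2,
    dirty_ood_split="in")  # OOD accuracy then comes from env2's 'out' split.

# Latest row: in_acc = (0.95 + 0.93) / 2 = 0.94, ood_acc = 0.72,
# wd_acc = (0.90 + 0.88) / 2 = 0.89, so ood_gap ~ 0.22 and wd_gap ~ 0.05.
print(ood_gap, wd_gap, in_acc, ood_acc, wd_acc)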