def extract_features_for_regression()

in domainbed_measures/extract_generalization_features.py [0:0]


def extract_features_for_regression(results,
                                    feature_name_list,
                                    wd_or_ood='ood',):
    """Build one regression row per (test-env set, test env, path).

    For every unique ``all_test_envs`` value, every ``test_env`` seen with
    it, and every unique ``path`` within that pair, a row is assembled as
    ``[dataset, all_test_envs, test_env, <features...>, target_err]``.

    Feature values are the mean of ``gen_measure_val_<wd_or_ood>`` over the
    rows of that path whose ``measure`` equals the feature name, multiplied
    by ``canonicalizer[feature name]``. Features whose name contains
    ``"c2st"`` or ``"hdh"`` contribute three extra columns: the mean of
    ``lambda_<wd_or_ood>``, the raw (un-canonicalized) feature plus the
    within-domain error of the path, and that sum plus the lambda term.
    When a path has no row for a feature, the mean over all paths of the
    current (all_test_envs, test_env) group is used instead; a null main
    value is then replaced by ``0.0`` and null extra columns by ``-1.0``.
    The special feature name ``'wd_out_domain_err'`` is computed from the
    ``wd_out_domain_perf`` column rather than looked up as a measure.

    NOTE: ``results['all_test_envs']`` is rewritten in place; list values
    become a sorted, ``'_'``-prefixed, ``'_'``-joined string.

    NOTE(review): this function reads the module-level names ``args``
    (``args.canonicalize``) and ``CANONICALIZATION``, which are not visible
    in this part of the file -- presumably parsed CLI arguments and a
    mapping from feature name to a multiplier; confirm against the module.

    Args:
        results: pandas DataFrame with at least the columns
            ``all_test_envs``, ``test_env``, ``path``, ``dataset`` and
            ``<wd_or_ood>_out_domain_perf``; ``measure``,
            ``gen_measure_val_<wd_or_ood>``, ``lambda_<wd_or_ood>`` and
            ``wd_out_domain_perf`` are read when the requested features
            need them.
        feature_name_list: Names of the features to extract, in output
            column order.
        wd_or_ood: ``'ood'`` or ``'wd'``; selects the column suffix for the
            feature/lambda columns and the performance column used for the
            target.

    Returns:
        A ``(header, all_rows)`` tuple: ``header`` is the list of column
        names and ``all_rows`` is a list of rows (lists) aligned with it.

    Raises:
        ValueError: If ``wd_or_ood`` is neither ``'ood'`` nor ``'wd'``.
    """
    # Without this check an unexpected value silently produced rows that
    # lacked the trailing target column and no longer matched the header.
    if wd_or_ood not in ('ood', 'wd'):
        raise ValueError(
            "wd_or_ood must be 'ood' or 'wd', got %r" % (wd_or_ood,))

    def numeric_list_to_str(x):
        # Lists are not hashable/comparable column values; turn them into
        # a canonical string so equal env sets compare equal.
        if isinstance(x, list):
            return '_' + '_'.join(str(ex) for ex in sorted(x))
        return x

    def has_extra_columns(name):
        # c2st / hdh features carry lambda and "plus error" companions.
        return "c2st" in name or "hdh" in name

    def fill_null(value, default):
        return default if pd.isnull(value) else value

    feature_col = 'gen_measure_val_%s' % (wd_or_ood)
    lambda_col = 'lambda_%s' % (wd_or_ood)
    target_perf_col = '%s_out_domain_perf' % (wd_or_ood)
    wd_perf_col = 'wd_out_domain_perf'

    results['all_test_envs'] = results['all_test_envs'].apply(
        numeric_list_to_str)
    all_test_envs = results['all_test_envs'].unique()

    header = ["dataset", "all_test_envs", "test_env"]
    for f in feature_name_list:
        header.append(f)
        if has_extra_columns(f):
            header += ["%s_lambda" % (f)]
            header += ["%s_perr" % (f)]
            header += ["%s_perr_plambda" % (f)]
    header.append("target_err")

    if args.canonicalize:
        canonicalizer = CANONICALIZATION
    else:
        # Identity multiplier for every feature.
        canonicalizer = defaultdict(lambda: 1)

    # We solve one regression problem per test environment
    all_rows = []
    for env in all_test_envs:
        print(f'Extracting features for env {env}')
        results_env = results[results['all_test_envs'] == env]
        test_envs_with_results = list(results_env['test_env'].unique())

        for this_test_env in test_envs_with_results:
            results_env_test = results_env[results_env['test_env'] ==
                                           this_test_env]

            for path in tqdm(results_env_test['path'].unique()):
                # Filter once per path; every lookup below reuses it.
                path_rows = results_env_test[results_env_test['path'] ==
                                             path]
                measures_for_path = set(path_rows['measure'])

                this_row = [path_rows['dataset'].iloc[0], env, this_test_env]

                feature = []
                for feat_name in feature_name_list:
                    if feat_name in measures_for_path:
                        feat_rows = path_rows[path_rows['measure'] ==
                                              feat_name]
                        raw_value = feat_rows[feature_col].mean()
                        feature.append(raw_value * canonicalizer[feat_name])
                        if has_extra_columns(feat_name):
                            lambda_closeness = feat_rows[lambda_col].mean()
                            wd_err_path = (1.0 -
                                           path_rows[wd_perf_col].mean())
                            feature.append(lambda_closeness)
                            feature.append(raw_value + wd_err_path)
                            feature.append(raw_value + wd_err_path +
                                           lambda_closeness)
                    elif feat_name == 'wd_out_domain_err':
                        # Parenthesized so the canonicalizer scales the
                        # error (1 - perf) like every other feature; the
                        # unparenthesized form scaled only the perf term.
                        feature.append(
                            (1.0 - path_rows[wd_perf_col].mean()) *
                            canonicalizer['wd_out_domain_err'])
                    else:
                        # Perform imputation based on the mean over all
                        # paths of this (all_test_envs, test_env) group.
                        measure_rows = results_env_test[
                            results_env_test['measure'] == feat_name]
                        raw_value = measure_rows[feature_col].mean()
                        feature.append(
                            fill_null(raw_value * canonicalizer[feat_name],
                                      0.0))

                        if has_extra_columns(feat_name):
                            lambda_closeness = measure_rows[lambda_col].mean()
                            wd_err_path = (1.0 -
                                           path_rows[wd_perf_col].mean())
                            feature.append(fill_null(lambda_closeness, -1.0))
                            feature.append(
                                fill_null(raw_value + wd_err_path, -1.0))
                            feature.append(
                                fill_null(
                                    raw_value + wd_err_path +
                                    lambda_closeness, -1.0))

                this_row.extend(feature)
                # Target is the error (1 - performance) on the split
                # selected by wd_or_ood.
                this_row.append(1.0 - path_rows[target_perf_col].mean())

                all_rows.append(this_row)

    return header, all_rows