in domainbed_measures/extract_generalization_features.py [0:0]
def extract_features_for_regression(results,
                                    feature_name_list,
                                    wd_or_ood='ood',):
    """Flatten per-measure results into one regression row per model path.

    For every (all_test_envs, test_env, path) combination found in
    ``results`` one row is produced containing the dataset name, the
    environment identifiers, one value per requested feature (plus three
    derived values for every feature whose name contains ``"c2st"`` or
    ``"hdh"``) and finally the target error ``1 - <wd_or_ood>_out_domain_perf``.

    Relies on the module-level names ``args`` (for ``args.canonicalize``),
    ``CANONICALIZATION``, ``defaultdict``, ``tqdm`` and ``pd``.

    Args:
        results: DataFrame with the columns read below: ``all_test_envs``,
            ``test_env``, ``path``, ``dataset``, ``measure``,
            ``gen_measure_val_<wd_or_ood>``, ``lambda_<wd_or_ood>``,
            ``wd_out_domain_perf`` and ``<wd_or_ood>_out_domain_perf``.
            NOTE: its ``all_test_envs`` column is overwritten in place with
            the string form of each list entry.
        feature_name_list: Names of the measures to extract, in column order.
        wd_or_ood: ``'ood'`` or ``'wd'``; selects which measure-value, lambda
            and target-performance columns are read.

    Returns:
        ``(header, all_rows)`` where ``header`` is the list of column names
        and ``all_rows`` is a list of rows, each aligned with ``header``.

    Raises:
        ValueError: If ``wd_or_ood`` is neither ``'ood'`` nor ``'wd'``.
            Previously such a value could silently yield rows without the
            trailing target column.
    """
    if wd_or_ood not in ('ood', 'wd'):
        raise ValueError(
            "wd_or_ood must be 'ood' or 'wd', got %r" % (wd_or_ood,))

    def numeric_list_to_str(x):
        # Lists of environment ids become a canonical, hashable string such
        # as '_0_2'; anything that is not a list is passed through untouched.
        if isinstance(x, list):
            return '_' + '_'.join(str(ex) for ex in sorted(x))
        return x

    def has_lambda_columns(name):
        # c2st / hdh measures carry three extra derived columns.
        return "c2st" in name or "hdh" in name

    results['all_test_envs'] = results['all_test_envs'].apply(
        numeric_list_to_str)
    all_test_envs = results['all_test_envs'].unique()

    header = ["dataset", "all_test_envs", "test_env"]
    for f in feature_name_list:
        header.append(f)
        if has_lambda_columns(f):
            header += ["%s_lambda" % (f)]
            header += ["%s_perr" % (f)]
            header += ["%s_perr_plambda" % (f)]
    header.append("target_err")

    # The exact comparison is kept on purpose: the type of the flag is not
    # visible here, and plain truthiness would treat e.g. a non-empty string
    # differently.
    if args.canonicalize == True:  # noqa: E712
        canonicalizer = CANONICALIZATION
    else:
        canonicalizer = defaultdict(lambda: 1)

    # Column names depend only on wd_or_ood, so build them once.
    feature_access_str = 'gen_measure_val_%s' % (wd_or_ood)
    lambda_access_str = "lambda_%s" % (wd_or_ood)
    target_perf_col = '%s_out_domain_perf' % (wd_or_ood)

    # We solve one regression problem per test environment
    all_rows = []
    for env in all_test_envs:
        print(f'Extracting features for env {env}')
        env_rows = results[results['all_test_envs'] == env]

        for this_test_env in env_rows['test_env'].unique():
            results_env_test = env_rows[env_rows['test_env'] == this_test_env]

            # Iterating the array directly lets tqdm see its length.
            for path in tqdm(results_env_test['path'].unique()):
                # All rows for this model path; filtered once and reused.
                path_rows = results_env_test[results_env_test['path'] == path]
                measures_on_path = set(path_rows['measure'])

                this_row = [path_rows['dataset'].iloc[0], env, this_test_env]

                feature = []
                for feat_name in feature_name_list:
                    if feat_name in measures_on_path:
                        # The measure was computed for this path.
                        measured = path_rows[path_rows['measure'] == feat_name]
                        raw_value = measured[feature_access_str].mean()
                        feature.append(raw_value * canonicalizer[feat_name])

                        if has_lambda_columns(feat_name):
                            lambda_closeness = measured[
                                lambda_access_str].mean()
                            wd_err_path = (
                                1.0 - path_rows['wd_out_domain_perf'].mean())
                            # Derived columns use the un-canonicalized value.
                            feature.append(lambda_closeness)
                            feature.append(raw_value + wd_err_path)
                            feature.append(raw_value + wd_err_path +
                                           lambda_closeness)
                    elif feat_name == 'wd_out_domain_err':
                        # NOTE(review): the canonicalizer scales the
                        # performance, not the error (1 - perf * c).
                        # Precedence is kept as it was; confirm the intent.
                        feature.append(
                            1.0 -
                            path_rows['wd_out_domain_perf'].mean() *
                            canonicalizer['wd_out_domain_err'])
                    else:
                        # Perform imputation based on the mean over every
                        # path of this (all_test_envs, test_env) group.
                        measure_rows = results_env_test[
                            results_env_test['measure'] == feat_name]
                        imputed_raw = measure_rows[feature_access_str].mean()
                        imputed = imputed_raw * canonicalizer[feat_name]
                        # No data at all for this measure: fall back to 0.0.
                        feature.append(0.0 if pd.isnull(imputed) else imputed)

                        if has_lambda_columns(feat_name):
                            lambda_closeness = measure_rows[
                                lambda_access_str].mean()
                            wd_err_path = (
                                1.0 - path_rows['wd_out_domain_perf'].mean())
                            derived_values = (
                                lambda_closeness,
                                imputed_raw + wd_err_path,
                                imputed_raw + wd_err_path + lambda_closeness,
                            )
                            # Missing derived values are flagged with -1.0.
                            for derived in derived_values:
                                feature.append(
                                    -1.0 if pd.isnull(derived) else derived)

                this_row.extend(feature)
                # Regression target: error of the selected evaluation split.
                this_row.append(1.0 - path_rows[target_perf_col].mean())
                all_rows.append(this_row)

    return header, all_rows