def perform_subset_analysis()

in domainbed_measures/experiment/regression.py [0:0]


def perform_subset_analysis(data,
                            feature_lambda=None,
                            subset_features=3,
                            single_test_env_only=False,
                            shared_weights_across_envs=False,
                            normalize=False,
                            canon=None,
                            fix_one_feat_to=None,
                            target_name='target_err'):
    """Fit and score one regression per feature subset, per dataset/environment.

    For every dataset returned by ``get_datasets`` and every environment
    returned by ``get_dataset_environments``, the features and target for
    that condition are fetched with ``get_data_condition`` and each subset
    yielded by ``get_feature_subsets`` is regressed against the target.

    The caller's ``data`` is never modified: when ``canon`` is supplied the
    rescaling is applied to a private copy.

    Args:
        data: Table of results. Must have an ``'all_test_envs'`` column when
            ``single_test_env_only`` is set, plus one column per entry of
            ``canon.index`` when ``canon`` is given.
            (presumably a pandas DataFrame -- confirm against callers)
        feature_lambda: Forwarded unchanged to ``get_data_condition``.
        subset_features: Forwarded to ``get_feature_subsets`` (presumably
            the number of features per subset). A Spearman correlation is
            recorded only when this equals 1.
        single_test_env_only: If true, keep only rows whose
            ``'all_test_envs'`` entry has ``len(...) == 2``.
            NOTE(review): why a length of 2 means "single test env" is not
            visible here -- confirm against the data format.
        shared_weights_across_envs: If true, one ``Regression`` per feature
            subset is built from ``get_data_condition(data, None, None, ...)``
            and a shallow copy of it is scored on each condition, instead of
            building a fresh ``Regression`` per condition.
        normalize: Forwarded unchanged to ``get_data_condition``.
        canon: Optional mapping-like object with an ``index``; for each
            ``m`` in ``canon.index`` the column ``data[m]`` is multiplied by
            ``canon[m]`` before any regression is run.
        fix_one_feat_to: Optional column name; when given, feature subsets
            whose columns do not include it are skipped.
        target_name: Forwarded unchanged to ``get_data_condition``.

    Returns:
        ``SubsetAnalysis(regressions, datasets, environments, correlations)``
        where the four lists are aligned entry-for-entry.
    """
    print("Performing analysis on subsets of {} features".format(
        subset_features))
    print("--------------------------------------------\n")
    if single_test_env_only:
        data = data[data['all_test_envs'].apply(len) == 2]

    if canon is not None:
        # Work on a private copy: writing the rescaled columns straight into
        # `data` would alter the caller's table (and rescale it again on
        # every subsequent call), or write into a filtered slice.
        data = data.copy()
        for m in canon.index:
            data[m] = canon[m] * data[m]

    regressions = []
    datasets = []
    environments = []
    correlations = []

    # Regressors built once from all data, keyed by the subset's column names.
    common_regressors = {}
    if shared_weights_across_envs:
        print("Running regression across all envs.")
        all_features, all_targets = get_data_condition(
            data,
            None,
            None,
            feature_lambda=feature_lambda,
            target_name=target_name,
            normalize=normalize)
        for f, t in get_feature_subsets(all_features, all_targets,
                                        subset_features):
            common_regressors[tuple(f.columns)] = Regression(f, t)

    for dataset in get_datasets(data):
        for environment in get_dataset_environments(data, dataset):
            features, target = get_data_condition(
                data,
                dataset,
                environment,
                feature_lambda=feature_lambda,
                target_name=target_name,
                normalize=normalize)
            for f, t in get_feature_subsets(features, target, subset_features):
                columns = tuple(f.columns)
                if fix_one_feat_to is not None and fix_one_feat_to not in columns:
                    continue

                if shared_weights_across_envs:
                    # Shallow copy so the per-condition score is stored on a
                    # separate object rather than on the shared regressor.
                    regressor = copy.copy(common_regressors[columns])
                else:
                    regressor = Regression(f, t)
                regressor.score_and_store(f, t)

                # Rank correlation is only recorded for single-feature
                # subsets; None keeps the lists aligned otherwise.
                if subset_features == 1:
                    correlations.append(scipy.stats.spearmanr(f, t))
                else:
                    correlations.append(None)
                regressions.append(regressor)
                datasets.append(dataset)
                environments.append(environment)

    return SubsetAnalysis(regressions, datasets, environments, correlations)