def report_top_subsets()

in domainbed_measures/experiment/regression.py [0:0]


def report_top_subsets(analysis,
                       filter_dataset,
                       sort_by,
                       top_k=100,
                       canonicalize=False):
    """Score every feature-name subset on one dataset and report the best.

    For each subset yielded by ``analysis.all_feature_name_subsets()`` the
    analysis is narrowed to that subset and to ``filter_dataset`` via
    ``analysis.select``, and five summary statistics are collected. The
    entries chosen by ``dict_top_k`` are printed to stdout, and the
    statistics for *all* subsets are returned as a table.

    Args:
        analysis: Object providing ``all_feature_name_subsets()`` and
            ``select(feature_name=..., dataset_name=...)``. The selection
            must expose ``score()``, ``weight_variance()``,
            ``weight_sign_changes()``, ``corr_without_fit()`` and
            ``corr_score_with_fit()``.
        filter_dataset: Passed to ``analysis.select`` as ``dataset_name``
            and echoed in the printed header.
        sort_by: Key forwarded to ``dict_top_k`` to rank the subsets.
        top_k: Number of entries requested from ``dict_top_k``.
        canonicalize: When truthy, flip the sign of both correlation
            columns for every row whose ``correlation_no_fit`` is not
            strictly positive.

    Returns:
        A ``(results, canon)`` tuple. ``results`` is a DataFrame with one
        row per subset: the former index moved into columns by
        ``reset_index()``, the five statistics, and a ``'measure'`` column
        holding ``str()`` of the first element of each subset. ``canon``
        is ``None`` unless ``canonicalize`` is truthy, in which case it is
        a Series of ``+1.0`` / ``-1.0`` sign factors indexed by
        ``'measure'``.
    """
    results = {}

    for subset in analysis.all_feature_name_subsets():
        analysis_subset = analysis.select(feature_name=subset,
                                          dataset_name=filter_dataset)
        results[subset] = {
            'score': analysis_subset.score(),
            'weight_variance': analysis_subset.weight_variance(),
            # NOTE(review): the key says "times same sign" but the value
            # comes from weight_sign_changes() -- confirm which is meant.
            'times_same_sign': analysis_subset.weight_sign_changes(),
            'correlation_no_fit': analysis_subset.corr_without_fit(),
            'correlation_with_fit': analysis_subset.corr_score_with_fit(),
        }

    print("Best subsets on %s according to %s score:" %
          (filter_dataset, sort_by))

    # Build the table before calling dict_top_k so the returned data does
    # not depend on whatever that helper may do to the dict it receives.
    results_df = pd.DataFrame(results)

    top_entries = dict_top_k(results, sort_by, top_k=top_k, reverse=True)
    for name, stats in top_entries.items():
        print(
            "{0}] R2:{1:.3f}| corr: {2:.3f} | corr/w/fit {3:.3f} times_same_sign: {4}"
            .format(
                name,
                stats['score'],
                stats['correlation_no_fit'],
                stats['correlation_with_fit'],
                stats['times_same_sign'],
            ))

    # Columns of results_df are subsets; transpose to get one row per subset.
    results = results_df.transpose()
    results['measure'] = [str(x[0]) for x in results.index]
    results = results.reset_index()

    canon = None
    if canonicalize:
        # +1.0 where the raw correlation is strictly positive, -1.0
        # otherwise (zero and NaN therefore map to -1.0).
        is_positive = results['correlation_no_fit'] > 0
        canon = is_positive.astype(float)
        canon[~is_positive] = -1.0

        # Apply the same sign factor to both correlation columns.
        results['correlation_no_fit'] = canon * results['correlation_no_fit']
        results[
            'correlation_with_fit'] = canon * results['correlation_with_fit']

        # Re-label only after the multiplications: they rely on canon still
        # sharing the positional index of `results`.
        canon.index = results['measure']

    return results, canon