def build_sets()

in experiments/sample_datasets.py [0:0]


def _closest_index(candidates, errs, center_err, target):
    '''
    Return the index in `candidates` whose relative error best matches `target`.

    Inputs:
    candidates: iterable of integer indices into `errs`
    errs: list of test errors, ordered by severity
    center_err: test error of the center severity (the denominator)
    target: desired value of errs[j] / center_err - 1

    Output:
    the candidate index with the smallest |target - relative error|, or
      None if `candidates` is empty. Ties go to the last candidate seen.
    '''
    best_gap = float('inf')
    best_idx = None
    for j in candidates:
        rel = errs[j] / center_err - 1
        gap = abs(target - rel)
        # "<=" (not "<") so that, on a tie, the later candidate wins.
        if gap <= best_gap:
            best_gap = gap
            best_idx = j
    return best_idx


def build_sets(corr_errs, avg_spread):
    '''
    For each corruption, and for each severity that has at least two
    lower and two higher severities available, build a set of 5
    severities centered on it whose test errors best match the average
    spread.

    With the error of the center severity as reference, the relative
    error of another severity is err / err_center - 1. The outer two
    members of the set are chosen to have relative errors closest to
    -avg_spread and +avg_spread; the inner two are then chosen, strictly
    between the outer member and the center, closest to -avg_spread/2
    and +avg_spread/2. The lowest member is never the severity directly
    below the center and the highest is never the one directly above it,
    so that an inner member always exists.

    Inputs:
    corr_errs: dictionary where each key is a string "{corr}-{severity}"
      and each value is the test error. The severity text only needs to
      be parseable by float() ("3" and "3.0" are both accepted).
    avg_spread: float specifying the average spread to try to match

    Output:
    dictionary where each key is a string giving the corruption name,
      and each value is a list of 5-tuples giving all sets of 5 severities
      (as floats, in increasing order) associated to that corruption.
      Corruptions with fewer than 5 severities map to an empty list.
    '''
    # Pair every error with its parsed severity up front. Rebuilding the
    # dictionary key from the float severity would turn "corr-3" into
    # "corr-3.0" and fail the lookup.
    by_corr = {}
    for key, err in corr_errs.items():
        parts = key.split("-")
        by_corr.setdefault(parts[0], []).append((float(parts[1]), err))

    corr_sets = {}
    for c in sorted(by_corr):
        pairs = sorted(by_corr[c], key=lambda pair: pair[0])
        sevs = [sev for sev, _ in pairs]
        errs = [err for _, err in pairs]
        n_sevs = len(sevs)

        sets_for_corr = []
        for mid in range(2, n_sevs - 2):
            center_err = errs[mid]
            # Outer members first; each inner member is then restricted to
            # the indices strictly between its outer member and the center.
            lo_outer = _closest_index(range(0, mid - 1), errs, center_err, -avg_spread)
            lo_inner = _closest_index(range(lo_outer + 1, mid), errs, center_err, -avg_spread/2)
            hi_outer = _closest_index(range(mid + 2, n_sevs), errs, center_err, avg_spread)
            hi_inner = _closest_index(range(mid + 1, hi_outer), errs, center_err, avg_spread/2)
            sets_for_corr.append(
                (sevs[lo_outer], sevs[lo_inner], sevs[mid], sevs[hi_inner], sevs[hi_outer])
            )
        corr_sets[c] = sets_for_corr
    return corr_sets