def reference_difference()

in prediction_generation/original-project/analysis/scripts/significance.py [0:0]
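
Post-hoc comparison of k methods against a reference method (the one with the lowest, i.e. best, average rank over N datasets). For every other method j the code performs a one-sided z-test on the average ranks, z_j = (R_ref - R_j) * sqrt(6 * N / (k * (k + 1))), and corrects the resulting p-values with Holm's step-down procedure, using thresholds alpha / (k - 1), alpha / (k - 2), ..., alpha. The critical difference reported at the end is the rank gap corresponding to the first Holm threshold that fails to reject.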


# Module-level imports used by this excerpt.
import math

from scipy import stats
from tabulate import tabulate


def reference_difference(avg_ranks, n_datasets, significance_level=0.05):
    """Compare all methods against the best-ranked (reference) method.

    ``avg_ranks`` maps method names to their average rank over the
    ``n_datasets`` datasets; lower ranks are better.
    """
    N = n_datasets
    k = len(avg_ranks)

    methods = sorted(avg_ranks.keys())
    ranks = [avg_ranks[m] for m in methods]
    # The reference method is the one with the lowest (best) average rank.
    ref_method = min(methods, key=lambda m: avg_ranks[m])
    ref_idx = methods.index(ref_method)
    others = [m for m in methods if m != ref_method]

    Z_scores = [0.0] * (k - 1)
    P_values = [0.0] * (k - 1)

    # z-statistic for comparing two methods on their average ranks:
    # z = (R_ref - R_j) * sqrt(6 * N / (k * (k + 1))).
    constant = math.sqrt(6 * N / (k * (k + 1)))
    for j, method in enumerate(others):
        i = methods.index(method)
        Z_scores[j] = (ranks[ref_idx] - ranks[i]) * constant
        # One-sided p-value; the reference has the lowest rank, so Z <= 0.
        P_values[j] = stats.norm.cdf(Z_scores[j])

    # sort the p-values in ascending order
    sorted_pvals = sorted((p, i) for i, p in enumerate(P_values))

    # Calculate significance differences following Holm's procedure
    significant_differences = [False] * (k - 1)
    thresholds = [0] * (k - 1)
    CD_threshold = None
    for i in range(k - 1):
        threshold = significance_level / float(k - (i + 1))
        pval, idx = sorted_pvals[i]
        significant_differences[idx] = pval < threshold
        thresholds[idx] = threshold
        if pval > threshold and CD_threshold is None:
            CD_threshold = threshold

    # Guard: if every comparison was rejected, no threshold "failed", so fall
    # back to the loosest Holm threshold (significance_level / 1).
    if CD_threshold is None:
        CD_threshold = significance_level

    # Calculate the critical difference from the first threshold that failed
    # to reject. This works because a p-value below its threshold is
    # considered significantly different and one above it is not.
    CD = -1 * stats.norm.ppf(CD_threshold) / constant

    txt = [
        "Number of datasets: %i" % N,
        "Number of methods: %i" % k,
        "Reference method: %s" % ref_method,
        "Significance level: %g" % significance_level,
        "",
        "Reference method rank: %.6f" % avg_ranks[ref_method],
        "Holm's procedure:",
    ]

    table = []
    for o, p, t, s in zip(
        others, P_values, thresholds, significant_differences
    ):
        table.append([o, avg_ranks[o], p, t, s])

    txt.append(
        tabulate(
            table,
            headers=["Method", "Rank", "p-Value", "Threshold", "Significant"],
        )
    )

    txt.append("")
    txt.append(
        "Critical difference: %.6f (at threshold = %.6f)" % (CD, CD_threshold)
    )
    txt.append("")

    return ref_method, CD, txt
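
A minimal usage sketch follows. The method names and average ranks are made-up values purely for illustration; in practice ``avg_ranks`` would hold the ranks averaged over the benchmark datasets.

if __name__ == "__main__":
    # Hypothetical average ranks of four methods over 40 datasets
    # (illustrative values only; lower is better).
    avg_ranks = {
        "method_a": 1.8,
        "method_b": 2.1,
        "method_c": 2.7,
        "method_d": 3.2,
    }
    ref_method, CD, txt = reference_difference(avg_ranks, n_datasets=40)
    print("\n".join(txt))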