def display_stats()

in cc_net/jsonql.py [0:0]


def display_stats(stats, key, weights=None, bins="auto", cumulative=False):
    """Build a text report for one field from aggregated statistics.

    Args:
        stats: mapping of aggregated statistics. ``stats[ALL_DOCUMENTS]`` is
            required; ``key``, ``key + ".length"``, ``key + ".val"`` and
            ``key + ".cnt"`` are read when present.
        key: name of the field to report on.
        weights: optional name of another field whose ``".val"`` entry is
            passed to ``histogram`` as weights.
        bins: bin specification, run through ``_parse_bins`` before being
            handed to ``histogram``.
        cumulative: when true, the histogram is replaced by its cumulative sum.

    Returns:
        A list starting with a summary line, extended with the output of
        ``bar_chart`` (when values exist) and with one line per label seen at
        least 5 times (when label counts exist).
    """
    out = []
    documents = stats[ALL_DOCUMENTS]
    count = stats.get(key, 0)
    r = count / documents if documents else 0
    out.append(f"Field {key} saw {count} times ({r:5.1%})")

    length = stats.get(key + ".length", None)
    # Guard on `count` as well: a length entry with a zero count would
    # otherwise raise ZeroDivisionError.
    avg_length = length // count if length and count else 0
    if length is not None:
        out[-1] += f", average length is {avg_length}"

    values = stats.get(key + ".val", None)
    if values:
        out[-1] += f", histogram is: (bins={bins})"
        if weights:
            # Report only the most specific problem: a missing column is not
            # also described as "not numeric".
            if weights not in stats:
                logging.warning(f"Warning: weights column {weights} not found.")
            elif weights + ".val" not in stats:
                logging.warning(
                    f"Warning: weights column {weights} is not a numeric column."
                )
            # Falls back to None (unweighted) when the column is unusable.
            weights = stats.get(weights + ".val")
        hist, bins = histogram(values, _parse_bins(bins), weights)
        if cumulative:
            hist = np.cumsum(hist)
        out += bar_chart(hist, bins)

    cnt = stats.get(key + ".cnt", None)
    # Labels are only listed when the field is short on average and at least
    # one label occurs more than once.
    if avg_length < MAX_LABEL_LEN and cnt and max(cnt.values()) > 1:
        cnt = sorted(cnt.items(), key=lambda kv: kv[1], reverse=True)
        out[-1] += ", top 100 labels:"
        for label, n in cnt[:100]:
            if n < 5:
                continue
            # Same zero guard as the document ratio above.
            share = n / count if count else 0
            out.append(f"{label:25}: {n:6} ({share:5.1%})")

    return out