in cc_net/jsonql.py [0:0]
def display_stats(stats, key, weights=None, bins="auto", cumulative=False):
    """Build a textual report about the field `key` from collected `stats`.

    The report always starts with one summary line (occurrence count and its
    ratio to the number of documents). Depending on which entries exist in
    `stats`, it is extended with the average length, a histogram rendered by
    `bar_chart`, and a list of the most frequent labels.

    Args:
        stats: mapping read with the keys `ALL_DOCUMENTS` (required), `key`,
            `key + ".length"`, `key + ".val"` and `key + ".cnt"` (all
            optional). `key + ".cnt"` must expose `.values()` / `.items()`.
        key: name of the field to report on.
        weights: optional name of another field; when truthy, the entry
            `weights + ".val"` of `stats` is passed to `histogram` as weights.
        bins: bin specification, shown verbatim in the report and passed
            through `_parse_bins` before being handed to `histogram`.
        cumulative: when true, the histogram counts are replaced by their
            cumulative sum before being drawn.

    Returns:
        A list whose first item is the summary line, followed by whatever
        `bar_chart` returns and one formatted line per reported label.
    """
    out = []
    documents = stats[ALL_DOCUMENTS]
    count = stats.get(key, 0)
    ratio = count / documents if documents else 0
    out.append(f"Field {key} saw {count} times ({ratio:5.1%})")

    length = stats.get(key + ".length", None)
    # Guard on `count` as well: a length entry can exist while the field
    # itself was never counted, which used to raise ZeroDivisionError.
    avg_length = length // count if length and count else 0
    if length is not None:
        out[-1] += f", average length is {avg_length}"

    values = stats.get(key + ".val", None)
    if values:
        out[-1] += f", histogram is: (bins={bins})"
        # Keep the caller-supplied name and the resolved values in separate
        # variables; a falsy `weights` is forwarded to `histogram` unchanged.
        weight_values = weights
        if weights:
            if weights not in stats:
                logging.warning(f"Warning: weights column {weights} not found.")
            if weights + ".val" not in stats:
                logging.warning(
                    f"Warning: weights column {weights} is not a numeric column."
                )
            weight_values = stats.get(weights + ".val")
        hist, bin_edges = histogram(values, _parse_bins(bins), weight_values)
        if cumulative:
            hist = np.cumsum(hist)
        out += bar_chart(hist, bin_edges)

    cnt = stats.get(key + ".cnt", None)
    # Labels are only listed when the average length is below MAX_LABEL_LEN
    # and at least one label was seen more than once.
    if avg_length < MAX_LABEL_LEN and cnt and max(cnt.values()) > 1:
        top_labels = sorted(cnt.items(), key=lambda kv: kv[1], reverse=True)
        out[-1] += ", top 100 labels:"
        for label, n in top_labels[:100]:
            if n < 5:
                # Labels seen fewer than 5 times are not reported.
                continue
            share = n / count if count else 0
            out.append(f"{label:25}: {n:6} ({share:5.1%})")

    return out