in cc_net/jsonql.py [0:0]
def describe(source, columns=None, weights=None, **kwargs):
    """Compute some statistics about a dataset.

    Stats can be restricted to a subset of columns.

    Every document produced by ``read_jsons(source)`` is scanned once and,
    for each retained field ``k``, the following entries are accumulated in
    a single ``stats`` dict:

    * ``stats[k]``: number of documents containing the field.
    * ``stats[k + ".length"]``: summed length of the string values.
    * ``stats[k + ".cnt"]``: occurrence count per label, where labels are
      string values no longer than ``MAX_LABEL_LEN`` or the keys of dict
      values. At most ``MAX_CNT_SIZE`` distinct labels are tracked.
    * ``stats[k + ".val"]``: numeric values (``int``/``float`` scalars, or
      lists whose first item is one). The list is no longer extended once it
      holds ``MAX_HIST_SIZE`` values.

    Args:
        source: anything accepted by ``read_jsons``.
        columns: optional field name, or iterable of field names, to report
            on. When empty or ``None`` every field is reported.
        weights: optional field name; it is collected even when it isn't
            listed in ``columns`` and is forwarded to ``display_stats``.
        **kwargs: forwarded untouched to ``display_stats``.

    Yields:
        A header line with the document count, followed by whatever
        ``display_stats`` yields for each reported field.
    """
    MAX_HIST_SIZE = 100_000_000
    MAX_CNT_SIZE = 1000
    stats = {ALL_DOCUMENTS: 0}

    if isinstance(columns, str):
        # A lone column name: don't let set() split it into characters.
        columns = [columns]
    # Sets accept any iterable of names (list, tuple, ...) and make the
    # membership tests of the per-field loop O(1).
    selected = set(columns) if columns else None
    needed = None
    if selected:
        # The weights column must be collected too, even if it isn't displayed.
        needed = selected if weights is None else selected | {weights}

    for doc in read_jsons(source):
        stats[ALL_DOCUMENTS] += 1
        for k, v in doc.items():
            if needed and k not in needed:
                continue
            stats[k] = get_or_set(stats, k, 0) + 1
            if isinstance(v, str):
                length_key = k + ".length"
                stats[length_key] = get_or_set(stats, length_key, 0) + len(v)
                if len(v) > MAX_LABEL_LEN:  # Don't treat too long string as labels
                    continue
                cnt = get_or_set(stats, k + ".cnt", collections.defaultdict(int))
                # Once MAX_CNT_SIZE labels are known, only those keep counting.
                if v in cnt or len(cnt) < MAX_CNT_SIZE:
                    cnt[v] += 1
            # Exact type match: bool, a subclass of int, isn't a number here.
            elif type(v) in (int, float):
                values = get_or_set(stats, k + ".val", [])
                if len(values) < MAX_HIST_SIZE:
                    values.append(v)
            # Only the first item is inspected to decide the list is numeric.
            elif type(v) is list and len(v) and type(v[0]) in (int, float):
                values = get_or_set(stats, k + ".val", [])
                # The whole list is added, so the cap can be exceeded by up to
                # len(v) - 1 values.
                if len(values) < MAX_HIST_SIZE:
                    values += v
            elif type(v) is dict:
                # For dict values, the keys are the labels being counted.
                cnt = get_or_set(stats, k + ".cnt", collections.defaultdict(int))
                for label in v:
                    if label in cnt or len(cnt) < MAX_CNT_SIZE:
                        cnt[label] += 1

    documents = stats[ALL_DOCUMENTS]
    yield f"Stats computed on {documents} documents:"
    for k in stats:
        if selected and k not in selected:
            continue
        # Dotted keys are the derived ".length"/".cnt"/".val" entries.
        if "." in k or k == ALL_DOCUMENTS:
            continue
        yield from display_stats(stats, k, weights=weights, **kwargs)