def describe()

in cc_net/jsonql.py [0:0]


def describe(source, columns=None, weights=None, **kwargs):
    """Yield a textual summary of the documents read from ``source``.

    Every document returned by ``read_jsons(source)`` is scanned once and
    per-field aggregates are stored in a single dict:

    * ``<field>``: number of documents in which the field appears.
    * ``<field>.length``: summed length of the string values of the field.
    * ``<field>.cnt``: occurrence count per label, where labels are the
      string values no longer than ``MAX_LABEL_LEN`` or the keys of dict
      values. At most 1000 distinct labels are tracked per field.
    * ``<field>.val``: numeric samples, taken from int/float values or from
      lists whose first element is an int/float. No more samples are added
      once the list already holds 100_000_000 entries.

    When ``columns`` is non-empty only those fields (plus ``weights``) are
    aggregated, and only the fields in ``columns`` are reported.

    The generator first yields a header line with the document count, then
    every line produced by ``display_stats`` for each reported field;
    ``weights`` and ``**kwargs`` are forwarded to ``display_stats``.
    """
    sample_cap = 100_000_000
    label_cap = 1000
    summary = {ALL_DOCUMENTS: 0}
    if columns:
        wanted = columns + [weights]
    else:
        wanted = None

    def label_counter(field):
        # Fetch (creating if needed) the per-label counter of a field.
        return get_or_set(summary, field + ".cnt", collections.defaultdict(int))

    def bump(counter, label):
        # Known labels are always counted; new ones only while under the cap.
        if label in counter or len(counter) < label_cap:
            counter[label] += 1

    def record_numbers(field, numbers):
        # The cap is checked before extending, so one batch may overshoot it.
        samples = get_or_set(summary, field + ".val", [])
        if len(samples) < sample_cap:
            samples.extend(numbers)

    for document in read_jsons(source):
        summary[ALL_DOCUMENTS] += 1
        for field, value in document.items():
            if wanted and field not in wanted:
                continue
            summary[field] = get_or_set(summary, field, 0) + 1

            if isinstance(value, str):
                length_key = field + ".length"
                summary[length_key] = get_or_set(summary, length_key, 0) + len(value)
                # Overly long strings are free text, not labels.
                if len(value) <= MAX_LABEL_LEN:
                    bump(label_counter(field), value)
                continue

            # Exact type checks on purpose: bool and other subclasses are
            # not treated as numbers, lists or dicts.
            kind = type(value)
            if kind in (int, float):
                record_numbers(field, [value])
            elif kind is list and value and type(value[0]) in (int, float):
                record_numbers(field, value)
            elif kind is dict:
                counter = label_counter(field)
                for label in value:
                    bump(counter, label)

    total = summary[ALL_DOCUMENTS]
    yield f"Stats computed on {total} documents:"
    for field in summary:
        # Skip unrequested fields, derived "<field>.<suffix>" entries and the
        # global document counter.
        if (columns and field not in columns) or "." in field or field == ALL_DOCUMENTS:
            continue
        for line in display_stats(summary, field, weights=weights, **kwargs):
            yield line