def describe_categorical()

in next_steps/data_science/diagnose/diagnose.py [0:0]


def describe_categorical(sr, name=''):
    print("\n=== {} top {} categories ===".format(name, CATS_FREQ_HEAD))
    parts = sr.astype(str).apply(lambda x: x.split('|'))
    cats = pd.Series(np.hstack(parts.values))
    cats_freq = cats.groupby(cats).size().sort_values(ascending=False)
    print(cats_freq.head(CATS_FREQ_HEAD))

    if len(cats_freq) <= LOGLOG_MIN_CATS:
        return None

    (slope, intercept, rmse) = plot_loglog(cats_freq, name)

    if len(cats_freq) > LOGLOG_MIN_CATS and rmse < LOGLOG_RMSE_THRESHOLD:
        if slope > LOGLOG_HEAVY_TAIL:
            warnings.warn("""