in next_steps/data_science/diagnose/diagnose.py [0:0]
def describe_categorical(sr, name=''):
print("\n=== {} top {} categories ===".format(name, CATS_FREQ_HEAD))
parts = sr.astype(str).apply(lambda x: x.split('|'))
cats = pd.Series(np.hstack(parts.values))
cats_freq = cats.groupby(cats).size().sort_values(ascending=False)
print(cats_freq.head(CATS_FREQ_HEAD))
if len(cats_freq) <= LOGLOG_MIN_CATS:
return None
(slope, intercept, rmse) = plot_loglog(cats_freq, name)
if len(cats_freq) > LOGLOG_MIN_CATS and rmse < LOGLOG_RMSE_THRESHOLD:
if slope > LOGLOG_HEAVY_TAIL:
warnings.warn("""