def diagnose_items()

in next_steps/data_science/diagnose/diagnose.py [0:0]


def diagnose_items(df, items):
    """Print sanity diagnostics for the items table and plot item creation.

    The function reports, in order:

    1. The shape of ``items`` as passed in.
    2. The "missing rate": the fraction of rows in ``df`` whose ``ITEM_ID``
       (compared as a string) has no entry in ``items``.
    3. The "coldstart rate": the fraction of rows in ``items`` whose
       ``ITEM_ID`` never appears in ``df``.
    4. The output of ``describe_dataframe`` on the ``ITEM_ID``-indexed table.
    5. A plot of the number of items created per calendar day, when a
       ``CREATION_TIMESTAMP`` column is present; otherwise a short notice.

    A warning is issued for each rate that exceeds ``NA_RATE_THRESHOLD``.

    Args:
        df: Table with an ``ITEM_ID`` column (read via attribute access).
        items: Table with an ``ITEM_ID`` column and, optionally, a
            ``CREATION_TIMESTAMP`` column. It is not modified; all work is
            done on a derived copy.

    Returns:
        None. All results are printed, warned about, or plotted.
    """
    print("\n=== Items table, original shape={} ===\n".format(items.shape))

    # Work on a new frame keyed by the string form of ITEM_ID so ids compare
    # consistently with the string-cast ids taken from ``df`` below.
    items = items.assign(ITEM_ID=items['ITEM_ID'].astype(str)).set_index('ITEM_ID')

    interaction_ids = df.ITEM_ID.astype(str)

    # Share of rows in ``df`` that reference an item without meta-data.
    known_ids = set(items.index.values)
    has_metadata = interaction_ids.isin(known_ids)
    missing_rate = 1 - has_metadata.mean()
    print("Missing rate of all item meta-data", missing_rate)
    if missing_rate > NA_RATE_THRESHOLD:
        message = "High missing rate of all item meta-data ({:%})!"
        warnings.warn(message.format(missing_rate))

    # Share of items in the meta-data table that never occur in ``df``.
    seen_ids = set(interaction_ids.values)
    was_seen = items.index.isin(seen_ids)
    coldstart_rate = 1 - was_seen.mean()
    print("Item coldstart rate", coldstart_rate)
    if coldstart_rate > NA_RATE_THRESHOLD:
        message = "High item coldstart rate ({:%})!"
        warnings.warn(message.format(coldstart_rate))

    describe_dataframe(items)

    if 'CREATION_TIMESTAMP' not in items:
        print("CREATION_TIMESTAMP not found in items table")
        return

    # Re-key the table by creation time at second resolution.
    # NOTE(review): the cast presumably expects epoch seconds - confirm.
    created_at = items['CREATION_TIMESTAMP'].values.astype("datetime64[s]")
    items.index = created_at
    items = items.sort_index()

    items_per_day = items.groupby(items.index.date).size()
    pl.plot(items_per_day)
    pl.gcf().autofmt_xdate()
    pl.title("daily item creation pattern")
    pl.grid()
    pl.show()