in next_steps/data_science/diagnose/diagnose.py [0:0]
def diagnose_items(df, items):
    """Print diagnostics for the items table relative to the interactions.

    Reports two rates, warns when either exceeds ``NA_RATE_THRESHOLD``,
    summarizes the items table via ``describe_dataframe`` and, when a
    ``CREATION_TIMESTAMP`` column is present, plots how many items were
    created per calendar day.

    Args:
        df: Interactions table exposing an ``ITEM_ID`` column
            (presumably a pandas DataFrame -- confirm against callers).
        items: Items (meta-data) table with an ``ITEM_ID`` column and,
            optionally, a ``CREATION_TIMESTAMP`` column. It is copied
            before any modification.

    Returns:
        None. All results are printed, emitted as warnings, or plotted.
    """
    banner = "\n=== Items table, original shape={} ===\n"
    print(banner.format(items.shape))

    # Operate on a copy so the column cast and re-indexing below are applied
    # to a local object rather than to the frame passed in by the caller.
    catalog = items.copy()
    catalog['ITEM_ID'] = catalog['ITEM_ID'].astype(str)
    catalog = catalog.set_index('ITEM_ID')

    # Both sides are compared as strings so that e.g. integer and string ids
    # referring to the same item match.
    interaction_ids = df.ITEM_ID.astype(str)

    # Share of interaction rows whose item has no row in the items table.
    has_metadata = interaction_ids.isin(set(catalog.index.values))
    missing_rate = 1 - has_metadata.mean()
    print("Missing rate of all item meta-data", missing_rate)
    if missing_rate > NA_RATE_THRESHOLD:
        missing_msg = "High missing rate of all item meta-data ({:%})!"
        warnings.warn(missing_msg.format(missing_rate))

    # Share of items-table rows that never appear in the interactions.
    was_interacted = catalog.index.isin(set(interaction_ids.values))
    coldstart_rate = 1 - was_interacted.mean()
    print("Item coldstart rate", coldstart_rate)
    if coldstart_rate > NA_RATE_THRESHOLD:
        coldstart_msg = "High item coldstart rate ({:%})!"
        warnings.warn(coldstart_msg.format(coldstart_rate))

    describe_dataframe(catalog)

    if 'CREATION_TIMESTAMP' not in catalog:
        print("CREATION_TIMESTAMP not found in items table")
        return

    # Re-index by creation time so rows can be grouped per calendar day.
    # NOTE(review): the cast to datetime64[s] presumably expects epoch
    # seconds in CREATION_TIMESTAMP -- confirm against the data schema.
    created_at = catalog['CREATION_TIMESTAMP'].values.astype("datetime64[s]")
    catalog.index = created_at
    catalog = catalog.sort_index()

    daily_counts = catalog.groupby(catalog.index.date).size()
    pl.plot(daily_counts)
    pl.gcf().autofmt_xdate()
    pl.title("daily item creation pattern")
    pl.grid()
    pl.show()