in src/rime/dataset/__init__.py [0:0]
def prepare_minimal_dataset():
    """Build a tiny hand-written dataset that drives the main workflow in unit tests.

    The fixture has four events over three users (``u1``..``u3``) and four
    items (``i1``..``i4``), with a prediction horizon of 100.  It mirrors
    what ``create_dataset`` does, except that the test users/items are
    picked by hand here instead of being selected automatically via
    min_user/item_len and ``TEST_START_TIME < inf``.

    Side effect: calls ``print_stats()`` on the dataset before returning.

    Returns:
        A 2-tuple ``(D, None)`` where ``D`` is the assembled ``Dataset``;
        the second element is always ``None``.
    """
    horizon = 100

    # ---- raw inputs -------------------------------------------------------
    event_rows = [
        ("u1", "i1", 3),
        ("u2", "i2", 5),
        ("u3", "i3", 7),
        ("u3", "i4", 9),
    ]
    event_df = pd.DataFrame(event_rows, columns=["USER_ID", "ITEM_ID", "TIMESTAMP"])

    # +inf marks a training-only user, unless the user is added back after
    # create_dataset.
    test_start_time = pd.Series(
        {"u1": 4, "u2": float("inf"), "u3": 9},
        name="TEST_START_TIME",
    )
    user_df = test_start_time.to_frame()
    item_df = pd.DataFrame(index=["i1", "i2", "i3", "i4"])

    # ---- training-side tables ---------------------------------------------
    # _holdout is marked (and events trimmed) by the window
    # [TEST_START_TIME, TEST_START_TIME + horizon); it may also be set by
    # hand, with 0 = training and 1 = testing.
    event_df = _mark_and_trim_holdout(event_df, user_df, horizon)
    # adds _hist_items, _hist_ts, _hist_len
    user_df = _augment_user_hist(user_df, event_df)
    # adds _hist_len
    item_df = _augment_item_hist(item_df, event_df)

    training_data = argparse.Namespace(
        user_df=user_df, item_df=item_df, event_df=event_df
    )

    # ---- test-side tables -------------------------------------------------
    # Unseen users/items end up with all-zero prediction scores; it is better
    # to include them in the training data, even with empty event lists.
    test_user_ids = ['u1', 'u3', 'oov_users_get_all_zero_scores']
    test_item_ids = ['i1', 'i4', 'oov_items_get_all_zero_scores']
    user_hist_columns = ['_hist_items', '_hist_len', '_hist_ts', 'TEST_START_TIME']

    user_in_test = _reindex_user_hist(user_df[user_hist_columns], test_user_ids)
    item_in_test = item_df[['_hist_len']].reindex(test_item_ids, fill_value=0)

    in_holdout = event_df['_holdout'] == 1
    target_csr = create_matrix(
        event_df[in_holdout], user_in_test.index, item_in_test.index, 'csr'
    )

    # A huge negative prior on already-seen (training) user-item pairs
    # excludes them from recommendation; per the original note this helps
    # performance with matrix factorization methods.
    in_training = event_df['_holdout'] == 0
    prior_score = create_matrix(
        event_df[in_training], user_in_test.index, item_in_test.index, 'csr'
    ) * -1e10

    # Keep only predictable targets: entries that are present in the target
    # matrix and do NOT carry a negative prior (True > False is the only
    # combination that survives the comparison).
    is_target = target_csr.astype(bool)
    is_excluded = prior_score < 0
    target_csr = target_csr.multiply(is_target > is_excluded)

    D = Dataset(
        user_in_test=user_in_test,
        item_in_test=item_in_test,
        target_csr=target_csr,
        horizon=horizon,
        prior_score=prior_score,
        training_data=training_data,
    )
    D.print_stats()
    return (D, None)