in src/rime/dataset/base.py [0:0]
def create_dataset(
    event_df,
    user_df,
    item_df,
    horizon=float("inf"),
    min_user_len=1,
    min_item_len=1,
    prior_score=None,
    exclude_train=False,
    test_incl_users_with_posinf_test_time=False,
    test_incl_users_with_neginf_test_time=True,
):
    """ Assemble a labeled Dataset out of the event, user and item tables.

    :parameter event_df: [USER_ID, ITEM_ID, TIMESTAMP]
    :parameter user_df: [USER_ID (index), TEST_START_TIME]
    :parameter item_df: [ITEM_ID (index)]
    :parameter horizon: length of the test window; together with TIMESTAMP and
        the per-user TEST_START_TIME it decides which events are held out
    :parameter min_user_len: minimum _hist_len a user needs to be a test user
    :parameter min_item_len: minimum _hist_len an item needs to be a test item
    :parameter prior_score: optional prior score forwarded to the Dataset to
        boost/penalize certain user-item pairs in prediction; must be left as
        None when exclude_train is set
    :parameter exclude_train: when true, derive prior_score from the training
        events (a large negative score on seen pairs) and remove those pairs
        from the targets
    :parameter test_incl_users_with_posinf_test_time: when false, users whose
        TEST_START_TIME is not strictly below +inf are dropped from the test
        users
    :parameter test_incl_users_with_neginf_test_time: when false, users whose
        TEST_START_TIME is not strictly above -inf are dropped from the test
        users
    :returns: Dataset built from the target matrix, the test users, the test
        items, the training data, horizon and prior_score

    Events are split on the _holdout column (1 = target, 0 = training), which
    is expected to be present after _mark_and_trim_holdout; test users/items
    are selected on the _hist_len column expected from the _augment_* helpers.
    """
    _check_index(event_df, user_df, item_df)
    _check_more_inputs(event_df, user_df, item_df)

    print("augmenting and data tables")
    event_df = _mark_and_trim_holdout(event_df, user_df, horizon)
    user_df = _augment_user_hist(user_df, event_df)
    item_df = _augment_item_hist(item_df, event_df)

    print("marking and cleaning test data")
    # Each condition is a boolean Series; OR-ing with a plain bool flag turns
    # the corresponding restriction on or off for every user at once.
    long_enough = user_df['_hist_len'] >= min_user_len
    test_start = user_df['TEST_START_TIME']
    posinf_ok = (test_start < np.inf) | test_incl_users_with_posinf_test_time
    neginf_ok = (test_start > -np.inf) | test_incl_users_with_neginf_test_time
    test_users = user_df[long_enough & posinf_ok & neginf_ok].copy()
    test_items = item_df[item_df['_hist_len'] >= min_item_len].copy()

    holdout_flag = event_df['_holdout']
    targets = create_matrix(
        event_df[holdout_flag == 1].copy(), test_users.index, test_items.index
    )
    training_data = argparse.Namespace(
        event_df=event_df[holdout_flag == 0].copy(),
        user_df=user_df,
        item_df=item_df,
    )

    if exclude_train:
        print("optionally excluding training events in predictions and targets")
        assert prior_score is None, "double configuration for prior score"

        # A fresh copy of the training events is handed to create_matrix so the
        # frame stored in training_data is never shared with it.
        seen_csr = create_matrix(
            event_df[holdout_flag == 0].copy(), test_users.index, test_items.index
        ).astype(bool)
        prior_score = seen_csr * -1e10  # clip -inf to avoid nan

        # Keep only the target entries whose user-item pair is absent from the
        # training events (True > False), then drop the zeroed-out entries.
        keep_mask = targets.astype(bool) > seen_csr.astype(bool)
        targets = targets.multiply(keep_mask)
        targets.eliminate_zeros()

    dataset = Dataset(
        targets, test_users, test_items, training_data, horizon, prior_score
    )
    print("Dataset created!")
    return dataset