def create_dataset()

in src/rime/dataset/base.py [0:0]


def create_dataset(event_df, user_df, item_df, horizon=float("inf"),
                   min_user_len=1, min_item_len=1, prior_score=None, exclude_train=False,
                   test_incl_users_with_posinf_test_time=False,
                   test_incl_users_with_neginf_test_time=True,
                   ):
    """ Assemble a labeled ``Dataset`` from event, user, and item tables.

    :parameter event_df: [USER_ID, ITEM_ID, TIMESTAMP]
    :parameter user_df: [USER_ID (index), TEST_START_TIME]
    :parameter item_df: [ITEM_ID (index)]
    :parameter horizon: forwarded to ``_mark_and_trim_holdout`` and to the
        resulting ``Dataset``; together with TIMESTAMP and TEST_START_TIME it
        determines the test window
    :parameter min_user_len: keep a user in the test set only if its
        ``_hist_len`` is at least this value
    :parameter min_item_len: keep an item in the test set only if its
        ``_hist_len`` is at least this value
    :parameter prior_score: passed through to ``Dataset``; must be None when
        ``exclude_train`` is set, because it is then derived from training events
    :parameter exclude_train: when truthy, derive ``prior_score`` as a large
        negative score on training user-item pairs and drop those pairs from
        the targets
    :parameter test_incl_users_with_posinf_test_time: when falsy, users whose
        TEST_START_TIME is not strictly below +inf are left out of the test set
    :parameter test_incl_users_with_neginf_test_time: when falsy, users whose
        TEST_START_TIME is not strictly above -inf are left out of the test set

    :returns: ``Dataset(target_csr, user_in_test, item_in_test, training_data,
        horizon, prior_score)``, where ``training_data`` is a namespace holding
        the non-holdout events plus the augmented user and item tables

    Events with ``_holdout == 1`` become targets and events with
    ``_holdout == 0`` become training data; both ``_holdout`` and ``_hist_len``
    are read after the mark/augment helpers run (presumably added by them).
    """
    _check_index(event_df, user_df, item_df)
    _check_more_inputs(event_df, user_df, item_df)

    print("augmenting and data tables")
    event_df = _mark_and_trim_holdout(event_df, user_df, horizon)
    user_df = _augment_user_hist(user_df, event_df)
    item_df = _augment_item_hist(item_df, event_df)

    print("marking and cleaning test data")
    # Row-wise eligibility masks for test users; the flags broadcast over the Series.
    has_enough_history = user_df['_hist_len'] >= min_user_len
    test_start = user_df['TEST_START_TIME']
    passes_posinf_rule = (test_start < np.inf) | test_incl_users_with_posinf_test_time
    passes_neginf_rule = (test_start > -np.inf) | test_incl_users_with_neginf_test_time
    user_in_test = user_df[
        has_enough_history & passes_posinf_rule & passes_neginf_rule
    ].copy()

    item_is_eligible = item_df['_hist_len'] >= min_item_len
    item_in_test = item_df[item_is_eligible].copy()

    def _test_matrix(events):
        # Matrix of the given events over the selected test users and items.
        return create_matrix(events, user_in_test.index, item_in_test.index)

    target_csr = _test_matrix(event_df[event_df['_holdout'] == 1].copy())

    train_events = event_df[event_df['_holdout'] == 0].copy()
    training_data = argparse.Namespace(
        event_df=train_events, user_df=user_df, item_df=item_df,
    )

    if exclude_train:
        print("optionally excluding training events in predictions and targets")
        assert prior_score is None, "double configuration for prior score"

        # A separate copy of the training events, so the frame stored in
        # training_data is never handed to create_matrix.
        seen_in_train = _test_matrix(
            event_df[event_df['_holdout'] == 0].copy()
        ).astype(bool)
        prior_score = seen_in_train * -1e10    # clip -inf to avoid nan

        # True > False: keep only target entries absent from the training events.
        keep_mask = target_csr.astype(bool) > seen_in_train
        target_csr = target_csr.multiply(keep_mask)
        target_csr.eliminate_zeros()

    dataset = Dataset(target_csr, user_in_test, item_in_test, training_data,
                      horizon, prior_score)
    print("Dataset created!")
    return dataset