in recommenders/evaluation/python_evaluation.py [0:0]
def _check_column_dtypes_diversity_serendipity(func):
    """Checks columns of DataFrame inputs

    This includes the checks on:

    * whether the input columns exist in the input DataFrames
    * whether the data types of col_user as well as col_item are matched in the two input DataFrames.
    * whether reco_df contains any user_item pairs that are already shown in train_df
    * check relevance column in reco_df
    * check column names in item_feature_df

    Args:
        func (function): function that will be wrapped

    Returns:
        function: Wrapper function for checking dtypes.
    """
    # Function-scope import: only this decorator needs it.
    import inspect

    # Decide once, at decoration time, whether ``func`` can receive
    # ``col_item_features``. The wrapper validates ``item_feature_df`` against
    # that column name, so the same name must reach ``func``; it is forwarded
    # only when the signature accepts it, so a wrapped function without the
    # parameter keeps working as before.
    try:
        func_params = inspect.signature(func).parameters
    except (TypeError, ValueError):
        # Signature not introspectable: keep the conservative behavior.
        forward_item_features = False
    else:
        forward_item_features = "col_item_features" in func_params or any(
            param.kind is inspect.Parameter.VAR_KEYWORD
            for param in func_params.values()
        )

    @wraps(func)
    def check_column_dtypes_diversity_serendipity_wrapper(
        train_df,
        reco_df,
        item_feature_df=None,
        item_sim_measure=DEFAULT_ITEM_SIM_MEASURE,
        col_item_features=DEFAULT_ITEM_FEATURES_COL,
        col_user=DEFAULT_USER_COL,
        col_item=DEFAULT_ITEM_COL,
        col_sim=DEFAULT_SIMILARITY_COL,
        col_relevance=None,
        *args,
        **kwargs
    ):
        """Check columns of DataFrame inputs

        Args:
            train_df (pandas.DataFrame): Data set with historical data for users and items they
                have interacted with; contains col_user, col_item. Assumed to not contain any duplicate rows.
            reco_df (pandas.DataFrame): Recommender's prediction output, containing col_user, col_item,
                col_relevance (optional). Assumed to not contain any duplicate user-item pairs.
            item_feature_df (pandas.DataFrame): (Optional) It is required only when item_sim_measure='item_feature_vector'.
                It contains two columns: col_item and features (a feature vector).
            item_sim_measure (str): (Optional) This column indicates which item similarity measure to be used.
                Available measures include item_cooccurrence_count (default choice) and item_feature_vector.
            col_item_features (str): item feature column name.
            col_user (str): User id column name.
            col_item (str): Item id column name.
            col_sim (str): This column indicates the column name for item similarity.
            col_relevance (str): This column indicates whether the recommended item is actually
                relevant to the user or not. When None, DEFAULT_RELEVANCE_COL is used and
                every recommendation is given relevance 1.0.

        Returns:
            The return value of the wrapped function. The wrapped function receives
            a reco_df reduced to col_user, col_item and col_relevance.

        Raises:
            ValueError: If required columns are missing, if col_user/col_item dtypes differ
                between train_df and reco_df, if item_feature_df is required but not given,
                or if reco_df contains user-item pairs already present in train_df.
        """
        if not has_columns(train_df, [col_user, col_item]):
            raise ValueError("Missing columns in train_df DataFrame")
        if not has_columns(reco_df, [col_user, col_item]):
            raise ValueError("Missing columns in reco_df DataFrame")
        if not has_same_base_dtype(train_df, reco_df, columns=[col_user, col_item]):
            raise ValueError("Columns in provided DataFrames are not the same datatype")

        if col_relevance is None:
            col_relevance = DEFAULT_RELEVANCE_COL
            # relevance term, default is 1 (relevant) for all.
            # Explicit copy so the column assignment below never targets a
            # slice of the caller's DataFrame (avoids SettingWithCopyWarning).
            reco_df = reco_df[[col_user, col_item]].copy()
            reco_df[col_relevance] = 1.0
        else:
            reco_df = reco_df[[col_user, col_item, col_relevance]].astype(
                {col_relevance: np.float16}
            )

        if item_sim_measure == "item_feature_vector":
            required_columns = [col_item, col_item_features]
            if item_feature_df is None:
                raise ValueError(
                    "item_feature_df not specified! item_feature_df must be provided "
                    "if choosing to use item_feature_vector to calculate item similarity. "
                    "item_feature_df should have columns: " + str(required_columns)
                )
            if not has_columns(item_feature_df, required_columns):
                raise ValueError("Missing columns in item_feature_df DataFrame")

        # check if reco_df contains any user_item pairs that are already shown in train_df
        count_intersection = pd.merge(
            train_df, reco_df, how="inner", on=[col_user, col_item]
        ).shape[0]
        if count_intersection != 0:
            raise ValueError(
                "reco_df should not contain any user_item pairs that are already shown in train_df"
            )

        if forward_item_features:
            # ``col_item_features`` is a named parameter of this wrapper, so it
            # can never already be present in ``kwargs``.
            kwargs["col_item_features"] = col_item_features

        return func(
            train_df=train_df,
            reco_df=reco_df,
            item_feature_df=item_feature_df,
            item_sim_measure=item_sim_measure,
            col_user=col_user,
            col_item=col_item,
            col_sim=col_sim,
            col_relevance=col_relevance,
            *args,
            **kwargs
        )

    return check_column_dtypes_diversity_serendipity_wrapper