in recommenders/models/sar/sar_singlenode.py [0:0]
    def fit(self, df):
        """Main fit method for SAR.

        Args:
            df (pandas.DataFrame): User-item rating dataframe.
        """
        # generate continuous indices if this hasn't been done
        if self.index2item is None:
            self.set_index(df)
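        # set_index is expected to build the user2index/item2index (and the
        # inverse index2item) lookups that map raw IDs to the contiguous integer
        # indices used to construct the sparse matrices below.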
logger.info("Collecting user affinity matrix")
if not np.issubdtype(df[self.col_rating].dtype, np.number):
raise TypeError("Rating column data type must be numeric")
# copy the DataFrame to avoid modification of the input
select_columns = [self.col_user, self.col_item, self.col_rating]
if self.time_decay_flag:
select_columns += [self.col_timestamp]
temp_df = df[select_columns].copy()
        if self.time_decay_flag:
            logger.info("Calculating time-decayed affinities")
            temp_df = self.compute_time_decay(df=temp_df, decay_column=self.col_rating)
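            # Assumed decay form (SAR half-life decay): an event with rating r at
            # time t contributes roughly r * 2 ** (-(t_now - t) / T), where T is
            # the configured decay half-life, so older interactions count
            # exponentially less towards the affinity.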
        else:
            # without time decay use the latest user-item rating in the dataset as the affinity score
            logger.info("De-duplicating the user-item counts")
            temp_df = temp_df.drop_duplicates(
                [self.col_user, self.col_item], keep="last"
            )
logger.info("Creating index columns")
# add mapping of user and item ids to indices
temp_df.loc[:, self.col_item_id] = temp_df[self.col_item].apply(
lambda item: self.item2index.get(item, np.NaN)
)
temp_df.loc[:, self.col_user_id] = temp_df[self.col_user].apply(
lambda user: self.user2index.get(user, np.NaN)
)
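        # Users or items that were not seen when the index was built map to NaN here.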
        if self.normalize:
            self.rating_min = temp_df[self.col_rating].min()
            self.rating_max = temp_df[self.col_rating].max()
            logger.info("Calculating normalization factors")
            temp_df[self.col_unity_rating] = 1.0
            if self.time_decay_flag:
                temp_df = self.compute_time_decay(
                    df=temp_df, decay_column=self.col_unity_rating
                )
            self.unity_user_affinity = self.compute_affinity_matrix(
                df=temp_df, rating_col=self.col_unity_rating
            )
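            # unity_user_affinity is the same affinity computation run on an
            # all-ones rating column; it is presumably used at scoring time to
            # rescale normalized scores back to the [rating_min, rating_max] range.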
        # affinity matrix
        logger.info("Building user affinity sparse matrix")
        self.user_affinity = self.compute_affinity_matrix(
            df=temp_df, rating_col=self.col_rating
        )
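        # user_affinity is a sparse users-by-items matrix whose entries are the
        # (optionally time-decayed) ratings computed above.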
        # calculate item co-occurrence
        logger.info("Calculating item co-occurrence")
        item_cooccurrence = self.compute_coocurrence_matrix(df=temp_df)

        # free up some space
        del temp_df

        self.item_frequencies = item_cooccurrence.diagonal()
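        # With a binary co-occurrence count, entry (i, j) is the number of users
        # who interacted with both items; the diagonal entry (i, i) is therefore
        # the number of users who interacted with item i, i.e. the item frequency.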
logger.info("Calculating item similarity")
if self.similarity_type is COOCCUR:
logger.info("Using co-occurrence based similarity")
self.item_similarity = item_cooccurrence
elif self.similarity_type is JACCARD:
logger.info("Using jaccard based similarity")
self.item_similarity = jaccard(item_cooccurrence).astype(
df[self.col_rating].dtype
)
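            # Jaccard similarity (assumed definition used by the jaccard helper):
            #   sim(i, j) = c(i, j) / (c(i, i) + c(j, j) - c(i, j))
            # where c is the co-occurrence count matrix.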
        elif self.similarity_type == LIFT:
            logger.info("Using lift based similarity")
            self.item_similarity = lift(item_cooccurrence).astype(
                df[self.col_rating].dtype
            )
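            # Lift similarity (assumed definition used by the lift helper):
            #   sim(i, j) = c(i, j) / (c(i, i) * c(j, j))
            # which discounts globally popular items.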
        else:
            raise ValueError("Unknown similarity type: {}".format(self.similarity_type))
        # free up some space
        del item_cooccurrence

        logger.info("Done training")