in recommenders/datasets/pandas_df_utils.py [0:0]
def transform(self, df):
    """Transform an input dataset to libffm format by using the fitted converter.

    The input must contain the label column ``self.col_rating`` and every
    column listed in ``self.field_names``. Each field value is rewritten as
    the string ``"<field_index>:<feature_index>:<value>"``, where string
    values are replaced by ``1``.

    The field-feature index mapping is rebuilt from ``df`` on every call and
    stored in ``self.field_feature_dict``; ``self.field_count`` and
    ``self.feature_count`` are updated accordingly. The input dataframe
    itself is left unmodified.

    Args:
        df (pandas.DataFrame): input Pandas dataframe.

    Return:
        pandas.DataFrame: Output libffm format dataframe, with the label
        column first followed by the field columns.

    Raises:
        ValueError: if the label column or any of the field columns is
            missing from ``df``.
    """
    if self.col_rating not in df.columns:
        raise ValueError(
            "Input dataset does not contain the label column {} in the fitting dataset".format(
                self.col_rating
            )
        )
    if not all(x in df.columns for x in self.field_names):
        raise ValueError(
            "Not all columns in the input dataset appear in the fitting dataset"
        )

    # Encode field-feature.
    idx = 1
    self.field_feature_dict = {}
    for field in self.field_names:
        column = df[field]
        # The dtype is constant for the whole column, so check it only once.
        is_object_column = column.dtype == object
        for feature in column.values:
            # Check whether (field, feature) tuple exists in the dict or not.
            # If not, put them into the key-values of the dict and count the index.
            key = (field, feature)
            if key not in self.field_feature_dict:
                self.field_feature_dict[key] = idx
                # Object columns get one index per distinct value.
                if is_object_column:
                    idx += 1
        # Non-object columns share a single index across all their values.
        if not is_object_column:
            idx += 1

    self.field_count = len(self.field_names)
    self.feature_count = idx - 1

    def _convert(field, feature, field_index, field_feature_index_dict):
        field_feature_index = field_feature_index_dict[(field, feature)]
        if isinstance(feature, str):
            feature = 1
        return "{}:{}:{}".format(field_index, field_feature_index, feature)

    # Move rating column to the first.
    column_names = self.field_names[:]
    column_names.insert(0, self.col_rating)

    # Work on a copy so that the caller's dataframe is not overwritten with
    # the encoded strings.
    df_out = df[column_names].copy()
    for col_index, col in enumerate(self.field_names):
        df_out[col] = df[col].apply(
            lambda x: _convert(col, x, col_index + 1, self.field_feature_dict)
        )

    if self.filepath is not None:
        np.savetxt(self.filepath, df_out.values, delimiter=" ", fmt="%s")

    return df_out