in modules/python/src/datapreprocessing/datacleaner.py [0:0]
def prep_cat(self, df: pd.DataFrame) -> pd.DataFrame:
    """Split the 'product_category_tree' column into one column per level.

    Each value is first passed through ``self.reformat`` and then split on
    the ``">>"`` separator. Level ``i`` of the resulting path is stored in a
    new column named ``c{i}_name`` (``c0_name``, ``c1_name``, ...), with
    surrounding whitespace stripped. The number of level columns equals the
    number of columns produced by the split. Non-string cells (the missing
    values left in rows with fewer levels or with no category tree) are
    passed through unchanged.

    Args:
        df: Input DataFrame containing a 'product_category_tree' column.
            The input frame itself is not modified.

    Returns:
        A new DataFrame with the category level columns appended and the
        'product_category_tree' column removed.
    """
    # Reformat into a separate Series so the caller's DataFrame is untouched.
    category_paths = df["product_category_tree"].apply(self.reformat)

    # One column per category level; shorter paths are padded with missing
    # values by ``expand=True``.
    levels = category_paths.str.split(">>", expand=True)
    levels.columns = [f"c{i}_name" for i in range(levels.shape[1])]

    # Strip only real strings: padding/missing cells may be None or NaN, and
    # NaN is truthy, so a plain truthiness check would call .strip() on a
    # float and raise AttributeError.
    for col in levels.columns:
        levels[col] = levels[col].apply(
            lambda value: value.strip() if isinstance(value, str) else value
        )

    # Append the level columns side by side (column-wise, aligned on index),
    # then drop the raw category tree column.
    df_with_cat = pd.concat([df, levels], axis=1)
    return df_with_cat.drop("product_category_tree", axis=1)