in modules/python/src/datapreprocessing/datacleaner.py [0:0]
def process_rag_input(self, df: pd.DataFrame) -> pd.DataFrame:
    """
    Prepare the product DataFrame for use with a RAG system.

    Steps, in order:
      1. Rename the raw columns to their RAG names (``uniq_id`` -> ``Id``,
         ``product_name`` -> ``Name``, ``description`` -> ``Description``,
         ``brand`` -> ``Brand``, ``attributes`` -> ``Specifications``).
      2. Keep only rows whose ``c0_name`` is ``"Clothing"`` and whose
         ``c1_name`` is women's, men's or kids' clothing.
      3. Apply ``self.filter_low_value_count_rows`` with a threshold of 10
         on ``c2_name`` and then, on that result, on ``c3_name``.
      4. Select the RAG columns, drop duplicate rows and fill missing
         values in ``image``, ``image_uri`` and ``Description``.

    Note:
        The column rename is applied to ``df`` in place, so the caller's
        DataFrame carries the new column names after this call. All later
        steps work on new DataFrames and do not modify ``df`` further.

    Args:
        df (pd.DataFrame): The input DataFrame. Must provide the columns
            named above plus ``c0_name``..``c3_name``, ``image`` and
            ``image_uri``.

    Returns:
        pd.DataFrame: The processed DataFrame ready for RAG, with columns
        ``Id``, ``Name``, ``Description``, ``Brand``, ``image``,
        ``image_uri``, ``c1_name`` and ``Specifications``.
    """
    rag_columns = [
        "Id",
        "Name",
        "Description",
        "Brand",
        "image",
        "image_uri",
        "c1_name",
        "Specifications",
    ]

    # Rename the raw columns (in place: existing callers may rely on the
    # input frame carrying the new names afterwards).
    df.rename(
        columns={
            "uniq_id": "Id",
            "product_name": "Name",
            "description": "Description",
            "brand": "Brand",
            "attributes": "Specifications",
        },
        inplace=True,
    )

    # Keep clothing for men, women and kids only.
    values_to_filter = ["Women's Clothing", "Men's Clothing", "Kids' Clothing"]
    clothing_filtered_df = df[
        (df["c0_name"] == "Clothing") & df["c1_name"].isin(values_to_filter)
    ]

    # Filter to keep rows where 'c2_name' has count >= 10.
    c2_filtered_df = self.filter_low_value_count_rows(
        clothing_filtered_df, "c2_name", 10
    )
    # Filter to keep rows where 'c3_name' has count >= 10. This must run on
    # the c2-filtered frame; running it on the unfiltered clothing frame
    # would silently discard the c2 filter above.
    c3_filtered_df = self.filter_low_value_count_rows(
        c2_filtered_df, "c3_name", 10
    )

    # Subset the columns and drop duplicates without mutating a slice, which
    # avoids pandas' chained-assignment warnings.
    rag_df = c3_filtered_df[rag_columns].drop_duplicates()

    # Replace missing values: empty string for the image fields and the
    # literal string "None" for a missing description.
    return rag_df.fillna({"image_uri": "", "image": "", "Description": "None"})