def process_rag_input()

in modules/python/src/datapreprocessing/datacleaner.py [0:0]


    def process_rag_input(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Prepare a product catalog DataFrame for use with a RAG system.

        The steps are, in order: rename the raw columns, keep only men's,
        women's and kids' clothing, apply the low-value-count filter on
        ``c2_name`` and then on ``c3_name`` (threshold 10, via
        ``self.filter_low_value_count_rows``), select the RAG columns, remove
        duplicate rows, and fill missing image/description values with
        placeholder strings.

        The input DataFrame is not modified; a new DataFrame is returned.

        Args:
            df (pd.DataFrame): The input DataFrame. It is expected to contain
                the columns ``uniq_id``, ``product_name``, ``description``,
                ``brand``, ``attributes``, ``c0_name``, ``c1_name``,
                ``c2_name``, ``c3_name``, ``image`` and ``image_uri``.

        Returns:
            pd.DataFrame: The processed DataFrame ready for RAG, with the
            columns ``Id``, ``Name``, ``Description``, ``Brand``, ``image``,
            ``image_uri``, ``c1_name`` and ``Specifications``. Row labels of
            the surviving rows are the ones they had in ``df``.
        """
        # Rename into a new frame so the caller's DataFrame keeps its columns.
        renamed_df = df.rename(
            columns={
                "uniq_id": "Id",
                "product_name": "Name",
                "description": "Description",
                "brand": "Brand",
                "attributes": "Specifications",
            },
        )
        # Keep only clothing for men, women and kids.
        values_to_filter = ["Women's Clothing", "Men's Clothing", "Kids' Clothing"]
        clothing_filtered_df = renamed_df[
            (renamed_df["c0_name"] == "Clothing")
            & renamed_df["c1_name"].isin(values_to_filter)
        ]
        # Filter to keep rows where 'c2_name' has count >= 10.
        c2_filtered_df = self.filter_low_value_count_rows(
            clothing_filtered_df, "c2_name", 10
        )
        # Filter to keep rows where 'c3_name' has count >= 10. This runs on the
        # c2-filtered frame so that both filters actually apply; filtering the
        # unfiltered clothing frame again would discard the c2 result.
        c3_filtered_df = self.filter_low_value_count_rows(
            c2_filtered_df, "c3_name", 10
        )
        # Prepare the RAG frame from a subset of the columns.
        rag_columns = [
            "Id",
            "Name",
            "Description",
            "Brand",
            "image",
            "image_uri",
            "c1_name",
            "Specifications",
        ]
        # drop_duplicates() and fillna() each return a new frame, so nothing is
        # written into a slice of another DataFrame (no SettingWithCopyWarning).
        # Missing image fields become empty strings; a missing description
        # becomes the literal string "None".
        rag_df = (
            c3_filtered_df[rag_columns]
            .drop_duplicates()
            .fillna({"image_uri": "", "image": "", "Description": "None"})
        )
        return rag_df