def abFP_to_table_group()

in utility.py [0:0]


    def abFP_to_table_group(self, input_path, output_path=None, persist=False):
        """Regroup fingerprints by the set of tables they touch and aggregate.

        Reads a CSV, assigns every ``abstractFingerPrint`` a key built from the
        sorted, de-duplicated ``db_name.table_name`` values seen for it, gives
        fingerprints with the same key one shared label (``newFP1``,
        ``newFP2``, ... in order of first appearance), and sums the metrics
        per ``(new_abstractFingerPrint, db_name, table_name)``.

        The input CSV must provide these columns: ``abstractFingerPrint``,
        ``db_name``, ``table_name``, ``inputDataSize``, ``cputime`` and
        ``unique_queryid_count``.

        The aggregated frame has the columns ``new_abstractFingerPrint``,
        ``db_name``, ``table_name``, ``inputDataSize``, ``cputime``, ``count``
        (sum of ``unique_queryid_count``) and ``group_key``.

        Args:
            input_path: Path of the CSV file to read.
            output_path: If truthy, the aggregated frame is written there as
                CSV (without index and without the ``group_key`` column).
            persist: If true, the aggregated frame (including ``group_key``)
                is appended to ``self.df_list``.

        Returns:
            None. Results are delivered via ``self.df_list`` and/or the CSV.
        """
        input_df = pd.read_csv(input_path)

        # Fully qualified table name, e.g. "db1.table1".
        input_df['db_table'] = input_df['db_name'] + '.' + input_df['table_name']

        # Every row of a fingerprint receives the same key: the sorted unique
        # tables of that fingerprint, so fingerprints touching the same set of
        # tables end up with identical keys.
        input_df['group_key'] = input_df.groupby('abstractFingerPrint')['db_table'].transform(
            lambda x: '.'.join(sorted(set(x))))

        # Number the distinct keys in order of first appearance.
        mapping = {k: f"newFP{i + 1}" for i, k in enumerate(input_df['group_key'].unique())}
        input_df['new_abstractFingerPrint'] = input_df['group_key'].map(mapping)

        # Sum the metrics per regrouped fingerprint and table; group_key is
        # constant within a new fingerprint, so 'first' just carries it along.
        agg_df = input_df.groupby(['new_abstractFingerPrint', 'db_name', 'table_name']).agg(
            inputDataSize=pd.NamedAgg(column='inputDataSize', aggfunc='sum'),
            cputime=pd.NamedAgg(column='cputime', aggfunc='sum'),
            count=pd.NamedAgg(column='unique_queryid_count', aggfunc='sum'),
            group_key=pd.NamedAgg(column='group_key', aggfunc='first')
        ).reset_index()

        if persist:
            self.df_list.append(agg_df)
        if output_path:
            # Drop group_key on a copy: an in-place drop would also strip the
            # column from the very frame that was just appended to
            # self.df_list when persist is set.
            # NOTE(review): this may be what the earlier "cannot work with
            # compare()" TODO referred to - confirm against compare().
            export_df = agg_df.drop(columns=['group_key'], errors='ignore')
            export_df.to_csv(output_path, index=False)