in utility.py [0:0]
def abFP_to_table_group(self, input_path, output_path=None, persist=False):
    """Regroup per-fingerprint table statistics by the set of tables touched.

    Reads the CSV at ``input_path``. Every ``abstractFingerPrint`` that
    touches the same set of ``db_name.table_name`` pairs is mapped to one
    synthetic fingerprint (``newFP1``, ``newFP2``, ... numbered in order of
    first appearance in the input). Metrics are then summed per
    (new fingerprint, db_name, table_name).

    Required input columns: ``abstractFingerPrint``, ``db_name``,
    ``table_name``, ``inputDataSize``, ``cputime`` and
    ``unique_queryid_count``.

    Resulting columns: ``new_abstractFingerPrint``, ``db_name``,
    ``table_name``, ``inputDataSize``, ``cputime``, ``count`` (sum of
    ``unique_queryid_count``) and ``group_key`` (the sorted, '.'-joined
    ``db.table`` names that define the group).

    Args:
        input_path: Path (or anything ``pd.read_csv`` accepts) of the input.
        output_path: If truthy, the aggregated frame is written there as CSV
            without the ``group_key`` column.
        persist: If true, the aggregated frame (including ``group_key``) is
            appended to ``self.df_list``.

    Returns:
        None.
    """
    input_df = pd.read_csv(input_path)

    # Fully-qualified table name, used to describe what a fingerprint touches.
    input_df['db_table'] = input_df['db_name'] + '.' + input_df['table_name']

    # One key per fingerprint: the sorted set of tables it touches. The
    # scalar returned by the lambda is broadcast to every row of the group.
    input_df['group_key'] = input_df.groupby('abstractFingerPrint')['db_table'].transform(
        lambda tables: '.'.join(sorted(set(tables))))

    # Fingerprints sharing a group_key collapse into the same new fingerprint.
    mapping = {
        key: f"newFP{idx + 1}"
        for idx, key in enumerate(input_df['group_key'].unique())
    }
    input_df['new_abstractFingerPrint'] = input_df['group_key'].map(mapping)

    agg_df = input_df.groupby(['new_abstractFingerPrint', 'db_name', 'table_name']).agg(
        inputDataSize=pd.NamedAgg(column='inputDataSize', aggfunc='sum'),
        cputime=pd.NamedAgg(column='cputime', aggfunc='sum'),
        count=pd.NamedAgg(column='unique_queryid_count', aggfunc='sum'),
        group_key=pd.NamedAgg(column='group_key', aggfunc='first'),
    ).reset_index()

    if persist:
        self.df_list.append(agg_df)

    if output_path:
        # Drop 'group_key' on a copy: agg_df may have just been appended to
        # self.df_list, and an in-place drop would strip the column from the
        # persisted frame as well.
        # TODO (carried over from the original): the written CSV reportedly
        # "cannot work with compare()" - presumably because group_key is
        # absent from it; confirm against compare().
        agg_df.drop(columns=['group_key'], errors='ignore').to_csv(output_path, index=False)