in utility.py [0:0]
def compare_table_groups(self) -> None:
    """Print a report on how ``group_key`` values change across ``self.df_list``.

    The first entry is the baseline (reported as ``df1``). For every later
    entry (``df2``, ``df3``, ...) the report shows:

    * the number of distinct ``group_key`` values in that entry;
    * how many of them are absent from the immediately preceding entry;
    * how many keys are shared by every entry seen so far;
    * how many keys have not appeared in any earlier entry, and the share of
      the entry's summed ``cputime`` that those keys account for.

    A final line gives the number of distinct keys over all entries. When
    ``self.df_list`` is empty, "No data to compare" is printed instead.

    Each entry is indexed by its ``'group_key'`` and ``'cputime'`` columns and
    must support ``.unique()``, ``.isin()``, ``.sum()`` and boolean-mask
    selection (presumably pandas DataFrames -- confirm against the caller).
    """
    if not self.df_list:
        print("No data to compare")
        return

    key_sets = [set(frame['group_key'].unique()) for frame in self.df_list]
    baseline_keys = key_sets[0]
    previous_keys = baseline_keys  # key set of the entry just before the current one
    seen_keys = baseline_keys      # union of every earlier entry's keys
    shared_keys = baseline_keys    # intersection of every entry's keys so far
    print(f"# table groups in df1 {len(baseline_keys)}")

    later_entries = list(zip(self.df_list, key_sets))[1:]
    for number, (frame, keys) in enumerate(later_entries, start=2):
        print(f"# table groups in df{number} {len(keys)}")
        added_keys = keys - previous_keys
        print(f"# table groups in df{number} not in df{number - 1} {len(added_keys)}")
        shared_keys = shared_keys & keys
        print(f"# table groups in df1 to df{number} {len(shared_keys)} in common")
        unseen_keys = keys - seen_keys
        print(f"# table groups never seen before {len(unseen_keys)}")
        # how these tables contribute to the total cputime
        cputime = frame['cputime'].sum()
        cputime_new = frame[frame['group_key'].isin(unseen_keys)]['cputime'].sum()
        # A zero total would divide by zero; report a 0% share instead.
        ratio = cputime_new / cputime * 100 if cputime else 0.0
        print(f"Total cputime in df{number} {cputime}, cputime of new table groups {cputime_new}, "
              f"ratio {ratio:.2f}%")
        seen_keys = seen_keys | keys
        # Advance the window so the next "not in df{n}" diff uses this entry,
        # not the baseline.
        previous_keys = keys

    print(f"Total # table groups: {len(seen_keys)}")