in optimizer.py [0:0]
def load_workload(self):
    """Load the workload dataframe and derive per-fingerprint aggregates.

    Reads ``self.job_data_access_df`` with columns: abstractFingerPrint,
    db_name, table_name, inputDataSize, outputDataSize (bytes),
    cputime (seconds), and ``self.k``, the fraction (0, 1] of
    fingerprints to keep, ranked by total bytes accessed.

    Side effects (attributes set on self):
        df            -- the (possibly filtered) workload dataframe
        X_scale       -- fraction of total access volume retained
        abFP_num      -- number of distinct fingerprints kept
        db_table_num  -- number of distinct (db_name, table_name) pairs
        c             -- Series: total cputime per fingerprint

    Raises:
        ValueError: if ``self.k`` is not in (0, 1].
    """
    k = self.k
    # Validate with an explicit raise *before* touching any state:
    # `assert` is stripped under `python -O`, so it must not guard inputs.
    if not (0 < k <= 1):
        raise ValueError(f"Top {k} jobs do not satisfy 0 < k <= 1")
    # NOTE(review): this aliases the source dataframe, so the derived
    # column below is also written into job_data_access_df — confirm
    # that mutation is intended (use .copy() otherwise).
    self.df = self.job_data_access_df
    self.df['totalDataSize'] = self.df['inputDataSize'] + self.df['outputDataSize']
    if k < 1:
        # Rank fingerprints by total bytes touched (read + write),
        # largest first.
        abFP_sizes = (
            self.df.groupby('abstractFingerPrint')['totalDataSize']
            .sum()
            .sort_values(ascending=False)
        )
        # Keep the top k fraction of fingerprints (floored; a very
        # small k with few fingerprints can keep zero of them).
        top_k_count = int(len(abFP_sizes) * k)
        top_k_abFPs = abFP_sizes.head(top_k_count).index
        self.df = self.df[self.df['abstractFingerPrint'].isin(top_k_abFPs)]
        # Report what share of the total access volume survives the cut.
        total_access_size = abFP_sizes.sum()
        top_k_access_size = abFP_sizes.loc[top_k_abFPs].sum()
        percent_access_size = (top_k_access_size / total_access_size) * 100
        print(f"Top {k * 100:.2f}% of abFPs (# {top_k_count}) "
              f"contribute {percent_access_size:.2f}% of total read/write accesses")
        self.X_scale = percent_access_size / 100
    else:
        self.X_scale = 1
    assert self.df is not None  # internal invariant, not input validation
    row_num = len(self.df)
    self.abFP_num = self.df['abstractFingerPrint'].nunique()
    self.db_table_num = self.df.groupby(['db_name', 'table_name']).ngroups
    print("# row", row_num)
    print("# db_table in workload", self.db_table_num)
    print("# abFP", self.abFP_num)
    # Per-fingerprint CPU cost: total seconds across all its jobs.
    self.c = self.df.groupby('abstractFingerPrint')['cputime'].sum()
    print("c created")