def load_workload()

in optimizer.py [0:0]


    def load_workload(self):
        """Prepare the job workload frame and its per-abFP aggregates.

        Reads:
            self.job_data_access_df: table with (at least) the columns
                abstractFingerPrint, db_name, table_name, inputDataSize,
                outputDataSize and cputime. The existing format note says
                sizes are in bytes and cputime is in seconds.
            self.k: fraction of abstract fingerprints (abFPs) to keep,
                with 0 < k <= 1.

        Sets:
            self.df: the workload rows; restricted to the top-k abFPs
                (ranked by summed totalDataSize) when k < 1.
            self.X_scale: share of the summed totalDataSize covered by the
                retained abFPs (1 when k == 1).
            self.abFP_num: number of distinct abFPs in self.df.
            self.db_table_num: number of distinct (db_name, table_name)
                pairs in self.df.
            self.c: cputime summed per abFP.

        Raises:
            AssertionError: if k is outside (0, 1].

        Note:
            self.df initially refers to the same object as
            self.job_data_access_df, so the totalDataSize column is added
            to that source frame as well.
        """
        # format: abstractFingerPrint,db_name,table_name,
        # inputDataSize, outputDataSize (in bytes),
        # cputime (in seconds)
        self.df = self.job_data_access_df
        self.df['totalDataSize'] = self.df['inputDataSize'] + self.df['outputDataSize']

        k = self.k
        assert 0 < k <= 1, f"Top {k} jobs do not satisfy 0 < k <= 1"
        if k < 1:
            # Total (input + output) data size per abFP, largest first.
            abFP_sizes = (
                self.df.groupby('abstractFingerPrint')['totalDataSize']
                .sum()
                .sort_values(ascending=False)
            )

            # Number of abFPs to keep. The product is rounded before
            # truncation so binary floating-point noise does not drop one
            # (e.g. 100 * 0.29 == 28.999999999999996), and at least one abFP
            # is kept so a small k cannot silently empty the workload.
            top_k_count = max(1, int(round(len(abFP_sizes) * k, 6)))
            top_k_abFPs = abFP_sizes.head(top_k_count).index

            # Keep only the rows that belong to the selected abFPs.
            self.df = self.df[self.df['abstractFingerPrint'].isin(top_k_abFPs)]

            # Share of the total data size that the retained abFPs cover.
            total_access_size = abFP_sizes.sum()
            top_k_access_size = abFP_sizes.loc[top_k_abFPs].sum()
            percent_access_size = (top_k_access_size / total_access_size) * 100

            print(f"Top {k * 100:.2f}% of abFPs (# {top_k_count}) "
                  f"contribute {percent_access_size:.2f}% of total read/write accesses")
            self.X_scale = percent_access_size / 100
        else:
            self.X_scale = 1

        assert self.df is not None
        row_num = len(self.df)
        self.abFP_num = self.df['abstractFingerPrint'].nunique()
        self.db_table_num = self.df.groupby(['db_name', 'table_name']).ngroups
        print("# row", row_num)
        print("# db_table in workload", self.db_table_num)
        print("# abFP", self.abFP_num)

        # Per-abFP CPU time totals.
        self.c = self.df.groupby('abstractFingerPrint')['cputime'].sum()
        print("c created")