in baselines.py [0:0]
def get_random_project_distribution(
    self,
    trace_path="yugongTraces/report-uown-volume-table-20241022-20241028.csv",
):
    """Randomly assign fingerprints to cloud or on-prem under a cputime quota.

    The trace CSV is loaded, its distinct ``abstractFingerPrint`` values are
    visited in a random order, and each one is placed in the cloud
    (``Status == 1``) if its total ``cputime`` still fits in the remaining
    cloud quota; otherwise it is marked on-prem (``Status == 0``). The cloud
    quota is ``self.cloud_target / 100`` of the total ``cputime`` in the
    trace, so the assignment targets a share of cputime rather than a share
    of fingerprints. All rows sharing a fingerprint receive the same status.

    Args:
        trace_path: Path of the trace CSV to read. Defaults to the
            yugong volume-table report used by this baseline.

    Returns:
        The loaded DataFrame with an added ``Status`` column.

    Raises:
        FileNotFoundError: If ``trace_path`` does not exist.
    """
    # Raise explicitly instead of using ``assert``, which is stripped when
    # Python runs with -O.
    if not os.path.exists(trace_path):
        raise FileNotFoundError(f"Trace file not found: {trace_path}")

    # header: abstractFingerPrint, db_name, table_name, inputDataSize, outputDataSize, cputime
    df = pd.read_csv(trace_path)
    prob_in_cloud = self.cloud_target / 100

    unique_abFPs = df['abstractFingerPrint'].unique()
    cloud_quota = df['cputime'].sum() * prob_in_cloud

    # Aggregate cputime per fingerprint once, instead of re-filtering the
    # whole frame for every fingerprint inside the loop.
    cputime_by_abFP = (
        df.groupby('abstractFingerPrint', sort=False)['cputime'].sum().to_dict()
    )

    abFP_status_map = {}
    for abFP in np.random.permutation(unique_abFPs):
        # Fingerprints absent from the grouped totals (e.g. missing values,
        # which groupby drops) contribute no cputime.
        cputime = cputime_by_abFP.get(abFP, 0)
        if cputime <= cloud_quota:
            abFP_status_map[abFP] = 1
            cloud_quota -= cputime
        else:
            abFP_status_map[abFP] = 0

    # Map the assigned Status to the dataframe
    df['Status'] = df['abstractFingerPrint'].map(abFP_status_map)
    return df