in baselines.py [0:0]
def data_placement_by_compute_distribution(self, df):
    """Assign each table to on-prem or cloud based on where its traffic comes from.

    Traffic for a table is the sum of ``inputDataSize + outputDataSize`` over
    the rows of ``df``, split by the ``Status`` column (``0`` counts as
    on-prem, any other value as cloud).

    First pass, over the tables that appear in ``df``: a table that has no
    entry yet in ``self.placement`` and a non-zero size in
    ``self.table_size_lookup`` goes on-prem when its on-prem traffic is
    strictly greater than its cloud traffic and it still fits within
    ``self.on_prem_capacity``; otherwise it goes to the cloud.

    Second pass, over every table in ``self.table_size_lookup`` that is
    still undecided: on-prem while capacity allows, cloud otherwise.

    Side effects: updates ``self.placement`` and ``self.on_prem_data_size``,
    writes an ``access_size`` column into ``df``, and prints a size summary.
    The cloud figure in that summary only counts tables sent to the cloud by
    this call.

    Args:
        df: DataFrame with ``db_name``, ``table_name``, ``Status``,
            ``inputDataSize`` and ``outputDataSize`` columns.

    Returns:
        None.
    """
    # NOTE: this adds/overwrites an `access_size` column on the caller's
    # DataFrame. Kept as-is because callers outside this method may read it.
    df['access_size'] = df['inputDataSize'] + df['outputDataSize']
    grouped = df.groupby(['db_name', 'table_name', 'Status'], as_index=False)['access_size'].sum()

    # "<db>.<table>" -> traffic per side. Iterating the columns in lockstep
    # avoids building a Series per row the way iterrows() does.
    traffic_by_table = {}
    for db_name, table_name, status, access_size in zip(
            grouped['db_name'], grouped['table_name'],
            grouped['Status'], grouped['access_size']):
        traffic = traffic_by_table.setdefault(
            f"{db_name}.{table_name}", {Status.ONPREM: 0, Status.CLOUD: 0})
        # NOTE(review): Status.ONPREM is presumably the value 0 -- confirm
        # against the Status definition. Any non-zero status counts as cloud.
        side = Status.ONPREM if status == 0 else Status.CLOUD
        traffic[side] = access_size

    onprem_size = 0  # bytes newly placed on-prem by the first pass
    cloud_size = 0   # bytes placed in the cloud by this call (both passes)
    for table_key, traffic in traffic_by_table.items():
        table_size = self.table_size_lookup.get(table_key, 0)
        # Skip tables that already have a decision, and tables whose size is
        # unknown or zero.
        if table_key in self.placement or table_size == 0:
            continue
        fits_on_prem = (self.on_prem_data_size + onprem_size + table_size
                        <= self.on_prem_capacity)
        if traffic[Status.ONPREM] > traffic[Status.CLOUD] and fits_on_prem:
            self.placement[table_key] = Status.ONPREM
            onprem_size += table_size
        else:
            self.placement[table_key] = Status.CLOUD
            cloud_size += table_size

    print(f"on-prem new data size: {human_readable_size(onprem_size)}, "
          f"cloud new data size: {human_readable_size(cloud_size)}")
    # Commit the first-pass on-prem bytes only once, after the loop; the loop
    # already accounted for them through `onprem_size` in its capacity check.
    self.on_prem_data_size += onprem_size
    print(f"on-prem data size: {human_readable_size(self.on_prem_data_size)}")

    # Tables with no decision yet fill the remaining on-prem capacity in
    # lookup order, then spill to the cloud.
    for table, table_size in self.table_size_lookup.items():
        if table in self.placement:
            continue
        if table_size + self.on_prem_data_size <= self.on_prem_capacity:
            self.placement[table] = Status.ONPREM
            self.on_prem_data_size += table_size
        else:
            self.placement[table] = Status.CLOUD
            cloud_size += table_size

    print(f"on-prem data size: {human_readable_size(self.on_prem_data_size)}")
    print(f"cloud data size: {human_readable_size(cloud_size)}")
    print(f"total data size: {human_readable_size(self.on_prem_data_size + cloud_size)}")
    print("=====================")