in baselines.py [0:0]
def __init__(self, tag: str, cloud_target: int, rep_rate: float):
    """Build the data placement for one baseline and persist it.

    Validates the arguments, creates a per-run output directory under
    ``baselines/``, loads the table sizes and the workload, computes the
    placement with the strategy selected by ``tag`` and finally calls
    ``persist_placement()``.

    Args:
        tag: Baseline strategy. One of ``"rep_x_month"``, ``"rep_rtd"``,
            ``"MoiJob"`` or ``"volley_new"``.
        cloud_target: Percentage in ``[0, 100]``. The on-prem capacity is
            ``(100 - cloud_target)`` percent of the total data size.
        rep_rate: Replication rate. It is part of the output directory
            name; for ``"rep_x_month"`` it must be ``0.21`` and is the
            fraction of the total data size counted as on-prem.

    Raises:
        AssertionError: If ``cloud_target`` is outside ``[0, 100]``, if
            ``rep_rate`` is unsupported for ``"rep_x_month"``, or if the
            replicated data does not fit into the on-prem capacity.
        ValueError: If ``tag`` is not a known baseline.
    """
    known_tags = ("rep_x_month", "rep_rtd", "MoiJob", "volley_new")

    # Validate all arguments before touching the filesystem or loading any
    # data, so that a bad call does not leave an empty run directory behind.
    # AssertionError is kept as the exception type for compatibility, but it
    # is raised explicitly so the checks are not stripped by `python -O`.
    if not 0 <= cloud_target <= 100:
        raise AssertionError("cloud_target must be in [0, 100]")
    if tag not in known_tags:
        raise ValueError(f"Unknown baseline: {tag}")
    # The original note next to this check reads "3M" -- presumably a
    # three-month replication window; TODO confirm.
    if tag == "rep_x_month" and rep_rate not in [0.21]:
        raise AssertionError(
            f"rep_x_month only supports rep_rate=0.21, got {rep_rate}"
        )

    self.tag = tag
    self.cloud_target = cloud_target
    self.rep_rate = rep_rate

    # Per-run output directory: baselines/<tag>_c<target>_rep<rate>_<time>.
    # NOTE: the timestamp has one-second resolution and exist_ok=True is
    # used, so two runs with equal arguments started within the same second
    # share a directory.
    self.parent_dir = "baselines"
    os.makedirs(self.parent_dir, exist_ok=True)
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    run_dir_name = f"{tag}_c{cloud_target}_rep{rep_rate:.3f}_{timestamp}"
    self.my_dir = os.path.join(self.parent_dir, run_dir_name)
    os.makedirs(self.my_dir, exist_ok=True)

    self.table_size_lookup = self._load_table_size()
    self.total_data_size = sum(self.table_size_lookup.values())
    self.on_prem_data_size = 0
    self.on_prem_capacity = self.total_data_size * (100 - cloud_target) / 100
    # header: abstractFingerPrint,db_name,table_name,inputDataSize,outputDataSize
    self.workload = self._load_workload()
    self.placement = {}  # db_table name -> Status

    if tag == "rep_x_month":
        self.on_prem_data_size = self.total_data_size * rep_rate
        # Written as `not (a <= b)` to keep the exact semantics of the
        # original assertion, including for NaN operands.
        if not (self.on_prem_data_size <= self.on_prem_capacity):
            raise AssertionError("Not enough capacity for replication")
        self.rep_x_month_placement()
    else:
        # Every remaining baseline starts from the same preselection step.
        self.preselect_replication()
        if tag == "rep_rtd":
            self.data_placement_random()
        elif tag == "MoiJob":
            df = self.get_moirai_job_distribution()
            self.data_placement_by_compute_distribution(df)
        else:  # tag == "volley_new" (guaranteed by the validation above)
            df = self.get_random_project_distribution()
            self.data_placement_by_compute_distribution(df)

    self.persist_placement()