in optimizer.py [0:0]
def update_previous_placement(self, file_path):
    """Load a previous placement CSV and seed ``self.prev_z`` / ``self.prev_w``.

    The CSV at ``file_path`` is read with ``pd.read_csv``; the columns
    ``table``, ``z`` and ``w`` are used. The frame is passed through
    ``merge_similar_rows(df, self.yugong)`` before being consumed.

    After the call, ``self.prev_z`` and ``self.prev_w`` are lists with one
    slot per entry of ``self.unique_db_tables`` (a mapping of table key to
    list index):

    * slots of tables listed in the CSV get that row's ``z`` / ``w`` as int;
    * every slot at index ``>= len(prev_placement_df)`` copies the values of
      its ``"<group>.group"`` entry when that key exists in
      ``self.unique_db_tables`` -- ``<group>`` is the owning project when
      ``self.yugong`` is truthy, otherwise the database part of the key --
      and falls back to ``0`` when no such entry exists;
    * a slot nobody wrote keeps the sentinel ``-1`` and is reported on stdout.

    Args:
        file_path: Path of the placement CSV; also stored on
            ``self.previous_placement_path``.

    Raises:
        AssertionError: If a table from the CSV maps to an index that is
            ``None`` or not below the number of CSV rows.
        ValueError: If a table key without a CSV row does not split into
            exactly two parts on ``'.'``, or a ``z`` / ``w`` value cannot be
            converted to ``int``.
    """
    self.previous_placement_path = file_path
    # header: table,z,w,size
    prev_placement_df = pd.read_csv(file_path)
    prev_placement_df['table'] = prev_placement_df['table'].astype(str)
    prev_placement_df, _ = merge_similar_rows(prev_placement_df, self.yugong)

    num_tables = len(self.unique_db_tables)
    num_prev = len(prev_placement_df)

    # -1 marks "not assigned yet"; the debug pass at the end looks for it.
    self.prev_z = [-1] * num_tables
    self.prev_w = [-1] * num_tables

    # Walk only the columns that are read instead of building a Series per
    # row with iterrows(); idx is kept purely for the assertion message.
    rows = zip(
        prev_placement_df.index,
        prev_placement_df['table'],
        prev_placement_df['z'],
        prev_placement_df['w'],
    )
    for idx, db_table, z, w in rows:
        j = self.unique_db_tables[db_table]
        # The fallback loop below treats indices >= num_prev as "not in the
        # CSV", so every CSV table must sit below that boundary.
        assert j is not None and j < num_prev, \
            f"idx {idx}, {db_table}: j={j} >= {num_prev}"
        self.prev_z[j] = int(z)
        self.prev_w[j] = int(w)

    db_tables_list = list(self.unique_db_tables.keys())
    count = 0
    # Tables beyond the CSV rows are the new ones. (Subtracting
    # len(self.prev_z) here always gave 0, since prev_z has one slot per
    # table.)
    print("New tables", num_tables - num_prev)

    for j in range(num_prev, num_tables):
        key = db_tables_list[j]
        # Two-way unpacking on purpose: a key that is not exactly
        # "<db>.<table>" raises ValueError here.
        db_name, _ = key.split('.')
        if self.yugong:
            project = self.ownership.get_table_ownership(key)
            db_group = f"{project}.group"
        else:
            db_group = f"{db_name}.group"

        if db_group in self.unique_db_tables:
            # Inherit the placement recorded for the table's group entry.
            group_idx = self.unique_db_tables[db_group]
            self.prev_z[j] = self.prev_z[group_idx]
            self.prev_w[j] = self.prev_w[group_idx]
            count += 1
        else:
            self.prev_z[j] = 0
            self.prev_w[j] = 0

    # debug: report every slot that still holds the -1 sentinel together
    # with the table key(s) mapped to it.
    unset = [i for i, z in enumerate(self.prev_z) if z == -1]
    if unset:
        # Build the index -> keys map once rather than rescanning the whole
        # mapping for each unset slot; mapping order is preserved.
        keys_by_index = {}
        for key in self.unique_db_tables:
            keys_by_index.setdefault(self.unique_db_tables[key], []).append(key)
        for i in unset:
            print("Warning: prev_z not updated", i, end=' ')
            for key in keys_by_index.get(i, ()):
                print(key)

    print("Updated previous placement", count, "times from grouped dbs")