in scripts/anonymize.py [0:0]
def anonymize_table(self, input_dir, file_name):
    """Anonymize one table-listing CSV and write the result to ``self.dir_path``.

    Reads ``input_dir/file_name``, drops rows whose ``dir_size`` is not
    positive, replaces every database name and table name with an integer
    id, and writes the frame to ``self.dir_path/anonymized_<file_name>``.

    The ids come from ``self.db`` and ``self.table`` (name -> id mappings).
    Both are extended in place with any name not seen before, so calling
    this method for several files gives the same name the same id in each.
    New ids continue from the current size of the mapping.

    Table names are keyed by name only, so an identical table name in two
    different databases receives the same id.

    If a ``uown_names`` column is present it is reduced to a boolean:
    ``True`` when the cell holds non-blank text, ``False`` when it is
    missing or whitespace-only.

    Args:
        input_dir: Directory containing the input CSV.
        file_name: Name of the CSV file inside ``input_dir``; also used to
            build the output file name.

    Returns:
        None. The result is written to disk and the mappings on ``self``
        are updated.
    """
    src_path = os.path.join(input_dir, file_name)
    print("processing", src_path)
    # header
    # hive_database_name,hive_table_name,dir_size,datestr,uown_names
    df = pd.read_csv(src_path)
    # Explicit copy: the columns are reassigned below, and writing into a
    # filtered slice can trigger pandas' SettingWithCopyWarning.
    df = df[df['dir_size'] > 0].copy()  # drop dir_size=0
    print("# of tables", len(df))
    print("existing", len(self.db), len(self.table))

    # anonymization
    # Only distinct names matter for building the mappings. unique() keeps
    # first-appearance order, so ids are handed out in the same order as a
    # row-by-row scan would, without the per-row overhead of iterrows().
    for column, mapping in (
        ('hive_database_name', self.db),
        ('hive_table_name', self.table),
    ):
        for name in df[column].unique():
            if name not in mapping:
                mapping[name] = len(mapping)
    print("mapping created")
    print("now", len(self.db), len(self.table))

    # Apply mappings to anonymize the data
    df['hive_database_name'] = df['hive_database_name'].map(self.db)
    df['hive_table_name'] = df['hive_table_name'].map(self.table)

    # if 'uown_names' column exists, anonymize it
    if 'uown_names' in df.columns:
        # str(x) guards against non-string cells (e.g. a column that pandas
        # parsed as numeric), which have no .strip().
        df['uown_names'] = df['uown_names'].apply(
            lambda x: not (pd.isna(x) or str(x).strip() == "")
        )

    df.to_csv(os.path.join(self.dir_path, 'anonymized_' + file_name), index=False)