in scripts/anonymize.py [0:0]
def anonymize_workload(self, input_dir, file_name):
    """Anonymize one workload CSV and write the result next to the others.

    Reads ``input_dir/file_name``, replaces every value in the
    ``abstractFingerPrint``, ``db_name`` and ``table_name`` columns with an
    integer id, reduces the optional ``uown_names`` column to a boolean
    "has an owner" flag, and writes the frame to
    ``self.dir_path/anonymized_<file_name>``.

    The id mappings live in ``self.abFP``, ``self.db`` and ``self.table`` and
    are extended in place, so a value already present in a mapping keeps the
    id it was given earlier.

    Args:
        input_dir: Directory containing the input CSV.
        file_name: Name of the CSV file inside ``input_dir``; also used to
            derive the output file name.

    Returns:
        None. The anonymized CSV is written to disk and the mappings on
        ``self`` are updated.
    """
    input_path = os.path.join(input_dir, file_name)
    print("processing", input_path)
    # header
    # abstractFingerPrint, db_name, table_name, inputDataSize, cputime
    df = pd.read_csv(input_path)
    # adjust abstractfingerprint into abstractFingerPrint
    if 'abstractfingerprint' in df.columns:
        df.rename(
            columns={'abstractfingerprint': 'abstractFingerPrint'},
            inplace=True,
        )
    # NOTE(review): presumably the next unused id for each mapping -- confirm
    # against get_counters().
    counter_a, counter_d, counter_t = self.get_counters()

    def extend_mapping(mapping, column, next_id):
        # Only distinct values can add entries, so walk unique() instead of
        # every row. unique() keeps first-appearance order, which keeps the
        # id assignment order the same as a row-by-row scan; missing values
        # collapse into a single distinct value.
        for value in df[column].unique():
            if value not in mapping:
                mapping[value] = next_id
                next_id += 1
        return next_id

    # anonymization
    counter_a = extend_mapping(self.abFP, 'abstractFingerPrint', counter_a)
    counter_d = extend_mapping(self.db, 'db_name', counter_d)
    counter_t = extend_mapping(self.table, 'table_name', counter_t)
    print("mapping created")
    # NOTE(review): the advanced counters are only reported here, not stored
    # back on self -- verify get_counters() derives them from the mappings.
    print("now", counter_a, counter_d, counter_t)

    # Apply mappings to anonymize the data
    df['abstractFingerPrint'] = df['abstractFingerPrint'].map(self.abFP)
    df['db_name'] = df['db_name'].map(self.db)
    df['table_name'] = df['table_name'].map(self.table)

    # if 'uown_names' column exists, anonymize it
    if 'uown_names' in df.columns:
        # str(x) keeps this working when the column was parsed as numeric
        # (or holds any other non-string value), where x.strip() would raise.
        df['uown_names'] = df['uown_names'].apply(
            lambda x: not (pd.isna(x) or str(x).strip() == "")
        )

    # Generate a new CSV file with anonymized data
    anonymized_file_name = "anonymized_" + file_name
    df.to_csv(os.path.join(self.dir_path, anonymized_file_name), index=False)