def anonymize

def anonymize_table()

in scripts/anonymize.py [0:0]
24 lines of code
8 McCabe index (conditional complexity)

    def anonymize_table(self, input_dir, file_name):
        """Anonymize one table-listing CSV and write the result to disk.

        Reads ``file_name`` from ``input_dir``, drops rows whose ``dir_size``
        is not positive, replaces database and table names with integer ids,
        and writes the result as ``anonymized_<file_name>`` under
        ``self.dir_path``.

        The id mappings live in ``self.db`` and ``self.table`` and are
        extended in place, so a name seen in an earlier call keeps its id.
        A new name gets the current size of its mapping as its id, in order
        of first appearance in the file.

        Args:
            input_dir: Directory containing the input CSV.
            file_name: Name of the CSV file inside ``input_dir``; also used
                to build the output file name.
        """
        input_path = os.path.join(input_dir, file_name)
        print("processing", input_path)
        # header
        # hive_database_name,hive_table_name,dir_size,datestr,uown_names
        df = pd.read_csv(input_path)
        # drop dir_size=0; .copy() makes the filtered frame independent so
        # the column assignments below are not chained assignments on a
        # slice of the original frame.
        df = df[df['dir_size'] > 0].copy()
        print("# of tables", len(df))

        print("existing", len(self.db), len(self.table))

        # anonymization: extend each mapping with names not seen before.
        # unique() yields values in order of first appearance, so ids match
        # what a row-by-row scan would assign, without per-row overhead.
        for column, mapping in (
            ('hive_database_name', self.db),
            ('hive_table_name', self.table),
        ):
            for name in df[column].unique():
                # len(mapping) is evaluated before insertion -> next free id
                mapping.setdefault(name, len(mapping))
        print("mapping created")

        print("now", len(self.db), len(self.table))

        # Apply mappings to anonymize the data
        df['hive_database_name'] = df['hive_database_name'].map(self.db)
        df['hive_table_name'] = df['hive_table_name'].map(self.table)

        # if 'uown_names' column exists, anonymize it: keep only whether an
        # owner is present (True) or the cell is missing/blank (False).
        if 'uown_names' in df.columns:
            owners = df['uown_names']
            # astype(str) keeps non-string cells (e.g. a numeric column)
            # from raising on .strip(); notna() masks out missing values.
            non_blank = owners.astype(str).str.strip() != ""
            df['uown_names'] = owners.notna() & non_blank

        output_path = os.path.join(self.dir_path, 'anonymized_' + file_name)
        df.to_csv(output_path, index=False)