def anonymize_job_Spark()

in scripts/anonymize.py [0:0]


    def anonymize_job_Spark(self, input_dir, file_name):
        """Anonymize one Spark job CSV and write the result into self.dir_path.

        Reads ``input_dir/file_name``, drops derived/unneeded columns, extends
        the shared mapping dicts (``self.abFP`` for template ids, ``self.db``
        for database names, ``self.table`` for table names) with any values
        not seen before, replaces those columns with their integer ids,
        collapses ``uown_names`` to a has-owner boolean, and writes
        ``anonymized_<file_name>`` to ``self.dir_path``.

        Parameters
        ----------
        input_dir : str
            Directory containing the input CSV.
        file_name : str
            Name of the CSV file to anonymize.
        """
        src_path = os.path.join(input_dir, file_name)
        print("processing", src_path)
        # Input header: job_id,start_time,end_time,cputime,db_name,table_name,
        # inputDataSize,outputDataSize,uown_names,MB_days_per_day,template_id,
        # date,read_op_count,write_op_count,db_table,total_io,duration
        df = pd.read_csv(src_path)
        df.drop(columns=['end_time', 'MB_days_per_day', 'read_op_count', 'write_op_count',
                         'total_io', 'db_table'], inplace=True)

        counter_a, counter_d, counter_t = self.get_counters()
        # Vectorized replacement for the former per-row iterrows() loop:
        # Series.unique() preserves first-appearance order, so ids are
        # assigned identically but without Python-level iteration over
        # every row. NaN template ids are skipped (as before).
        for value in df['template_id'].dropna().unique():
            if value not in self.abFP:
                self.abFP[value] = counter_a
                counter_a += 1
        for value in df['db_name'].unique():
            if value not in self.db:
                self.db[value] = counter_d
                counter_d += 1
        for value in df['table_name'].unique():
            if value not in self.table:
                self.table[value] = counter_t
                counter_t += 1
        # NOTE(review): the incremented counters are local and never written
        # back through self — presumably get_counters() re-derives them from
        # the dict sizes on the next call; confirm, otherwise ids could
        # collide across files.
        print("mapping updated into", counter_a, counter_d, counter_t)

        # Apply mappings to anonymize the data; NaN template ids map to -1.
        df['template_id'] = df['template_id'].apply(lambda x: self.abFP.get(x) if pd.notna(x) else -1)
        df['db_name'] = df['db_name'].map(self.db)
        df['table_name'] = df['table_name'].map(self.table)
        if 'uown_names' in df.columns:
            # Keep only whether an owner was recorded, not who it was.
            df['uown_names'] = df['uown_names'].apply(lambda x: False if pd.isna(x) or x.strip() == "" else True)

        # Generate a new CSV file with anonymized data
        anonymized_file_name = "anonymized_" + file_name
        df.to_csv(os.path.join(self.dir_path, anonymized_file_name), index=False)