# data_extraction_transformation/scripts/transform-data.py (88 lines of code) (raw):

import os
import shutil  # kept: referenced by the commented-out folder-swap cleanup in main()
import argparse

import pandas as pd


def parse_args():
    """Parse the command-line arguments of the cross-referencing script.

    Returns:
        argparse.Namespace with `input_folder`, `alerts_file` and
        `output_folder` attributes (all required).
    """
    parser = argparse.ArgumentParser(description="Script to cross-reference the data of the time series with the alerts data")
    parser.add_argument('-i', '--input-folder', required=True, help="Path to the input dataset folder")
    parser.add_argument('-a', '--alerts-file', required=True, help="Path to the alerts CSV file")
    parser.add_argument('-o', '--output-folder', required=True, help="Path of the folder outputting CSV timeseries files")
    return parser.parse_args()


def process_folder(input_folder, output_folder, folder):
    """Annotate every time-series CSV under ``input_folder/folder`` with alert labels.

    Each CSV is left-merged against the global ``df_alerts`` on
    (revision, signature_id); unmatched datapoints are labelled 'TN' and
    manually-created alerts are relabelled 'FN'.  Annotated files are written
    to ``output_folder/folder`` only when at least one row matched an alert.

    Relies on module globals prepared by main():
        df_alerts               -- alerts table with `alert_summary_revision`,
                                   `signature_id`, `alert_summary_id`,
                                   `alert_summary_status_general`,
                                   `single_alert_manually_created` columns
        cutoff_date_time        -- newest alert timestamp; later datapoints are dropped
        problematic_signatures  -- list collecting paths of files that failed to process
    """
    global problematic_signatures
    global cutoff_date_time
    global df_alerts

    for signature_file in os.listdir(os.path.join(input_folder, folder)):
        try:
            df = pd.read_csv(os.path.join(input_folder, folder, signature_file), index_col=False)
            # Coerce unparsable timestamps to NaT instead of raising, so one bad
            # row does not send the whole file to problematic_signatures.
            df["push_timestamp"] = pd.to_datetime(df["push_timestamp"], format='%Y-%m-%dT%H:%M:%S', errors='coerce')
            # Datapoints newer than the latest alert cannot be labelled yet.
            df = df[df['push_timestamp'] <= cutoff_date_time]
            df_merged = pd.merge(
                df,
                df_alerts,
                left_on=['revision', 'signature_id'],
                right_on=['alert_summary_revision', 'signature_id'],
                how='left',
            )
            # BUGFIX: direct assignment instead of `col.fillna(..., inplace=True)`;
            # chained inplace fillna is deprecated and a no-op under pandas copy-on-write.
            df_merged['alert_summary_status_general'] = df_merged['alert_summary_status_general'].fillna('TN')
            df_final = df_merged.drop_duplicates()
            # Manually-created alerts mean the detector missed them: false negatives.
            df_final.loc[df_final['single_alert_manually_created'] == True, 'alert_summary_status_general'] = "FN"
            # BUGFIX: sort_values returns a new frame; the original discarded the
            # result, so the output was never actually sorted.
            df_final = df_final.sort_values(by="push_timestamp", ascending=True)
            # Skip series that matched no alert at all.
            if not df_final['alert_summary_id'].isna().all():
                df_final.to_csv(os.path.join(output_folder, folder, signature_file), index=False)
        except Exception:
            # BUGFIX: narrowed from a bare `except:` that also swallowed
            # KeyboardInterrupt/SystemExit.  Best-effort behaviour is kept:
            # the failing file is recorded and processing continues.
            problematic_signatures.append(folder + '/' + signature_file)


# input_folder = '../datasets-original'
# output_folder = '../datasets-original-annotated-test'
# alerts_df_path = '../datasets/2_rectified_alerts_data.csv'


def main():
    """Cross-reference time-series CSVs with alert data and write annotated copies."""
    global problematic_signatures
    global cutoff_date_time
    global df_alerts
    global category_mapping
    global alert_summary_status_mapping
    global alert_status_mapping

    args = parse_args()
    input_folder = args.input_folder
    output_folder = args.output_folder
    alerts_file = args.alerts_file

    # Numeric status codes -> human-readable labels (alert-summary level).
    alert_summary_status_mapping = {
        0: "untriaged",
        1: "downstream",
        2: "reassigned",
        3: "invalid",
        4: "improvement",
        5: "investigating",
        6: "wontfix",
        7: "fixed",
        8: "backedout",
    }
    # Numeric status codes -> human-readable labels (single-alert level).
    alert_status_mapping = {
        0: "untriaged",
        1: "downstream",
        2: "reassigned",
        3: "invalid",
        4: "acknowledged",
    }
    # Human-readable label -> confusion-matrix category (TP/FP/SP).
    category_mapping = {
        'investigating': 'SP',
        'reassigned': 'TP',
        'invalid': 'FP',
        'improvement': 'TP',
        'fixed': 'TP',
        'wontfix': 'TP',
        'untriaged': 'SP',
        'backedout': 'TP',
        'downstream': 'TP',
        'acknowledged': 'TP',
    }
    problematic_signatures = []

    # The following usage projects_folders_mapping in case the names of the
    # subfolders does not reflect the names of the projects. The code is
    # designed to handle this change.
    # projects_folders_mapping = {"autoland": ["autoland1", "autoland2", "autoland3", "autoland4"], "firefox-android": ["firefox-android"], "mozilla-beta": ["mozilla-beta"], "mozilla-release": ["mozilla-release"], "mozilla-central": ["mozilla-central"]}
    projects_folders_mapping = {
        name: [name]
        for name in os.listdir(input_folder)
        if os.path.isdir(os.path.join(input_folder, name))
    }

    df_alerts = pd.read_csv(alerts_file, index_col=False)
    # Alert timestamps are stored as Unix epoch seconds.
    df_alerts['push_timestamp'] = pd.to_datetime(df_alerts['push_timestamp'], unit="s")
    # Time-series datapoints newer than the latest alert are unlabellable.
    cutoff_date_time = df_alerts['push_timestamp'].max()
    df_alerts = df_alerts.drop(columns=['push_timestamp'])
    df_alerts['alert_summary_status_general'] = df_alerts['alert_summary_status'].map(alert_summary_status_mapping)
    df_alerts["alert_summary_status_general"] = df_alerts["alert_summary_status_general"].replace(category_mapping)

    os.makedirs(output_folder, exist_ok=True)
    for project in projects_folders_mapping:
        for folder in projects_folders_mapping[project]:
            os.makedirs(os.path.join(output_folder, folder), exist_ok=True)
            process_folder(input_folder, output_folder, folder)
            # shutil.rmtree('../datasets/' + folder)
            # os.rename('../datasets/' + folder + "-processed", '../datasets/' + folder)

    print('####### Problematic signatures #######')
    for sig in problematic_signatures:
        print('Signature path:')
        print(sig)


if __name__ == "__main__":
    main()