def main()

in data_extraction_transformation/scripts/transform-data.py [0:0]


def main():
    """Set up the shared alert lookup state and process every project folder.

    Reads the command-line arguments via ``parse_args()``, loads the alerts
    CSV and publishes the following module-level globals (presumably read by
    ``process_folder`` and its helpers -- confirm against those functions):

    * ``alert_summary_status_mapping`` / ``alert_status_mapping``: numeric
      status code -> status name.
    * ``category_mapping``: status name -> category label ('TP', 'FP', 'SP').
    * ``df_alerts``: the alerts table with ``push_timestamp`` dropped and an
      added ``alert_summary_status_general`` column holding the category.
    * ``cutoff_date_time``: the latest ``push_timestamp`` in the alerts file.
    * ``problematic_signatures``: reset to an empty list here; its contents
      are printed after all folders have been processed.

    Each sub-directory of the input folder is handed to ``process_folder``
    together with a same-named sub-directory created under the output folder.
    """
    global problematic_signatures
    global cutoff_date_time
    global df_alerts
    global category_mapping
    global alert_summary_status_mapping
    global alert_status_mapping

    args = parse_args()
    input_folder = args.input_folder
    output_folder = args.output_folder
    alerts_file = args.alerts_file

    alert_summary_status_mapping = {
        0: "untriaged",
        1: "downstream",
        2: "reassigned",
        3: "invalid",
        4: "improvement",
        5: "investigating",
        6: "wontfix",
        7: "fixed",
        8: "backedout",
    }

    alert_status_mapping = {
        0: "untriaged",
        1: "downstream",
        2: "reassigned",
        3: "invalid",
        4: "acknowledged",
    }

    category_mapping = {
        'investigating': 'SP',
        'reassigned': 'TP',
        'invalid': 'FP',
        'improvement': 'TP',
        'fixed': 'TP',
        'wontfix': 'TP',
        'untriaged': 'SP',
        'backedout': 'TP',
        'downstream': 'TP',
        'acknowledged': 'TP',
    }
    problematic_signatures = []

    # projects_folders_mapping maps a project name to the sub-folders that
    # hold its data. By default every sub-folder is treated as its own
    # project. If the sub-folder names do not match the project names, a
    # hand-written mapping can be used instead, for example:
    #   {"autoland": ["autoland1", "autoland2", "autoland3", "autoland4"],
    #    "firefox-android": ["firefox-android"],
    #    "mozilla-beta": ["mozilla-beta"],
    #    "mozilla-release": ["mozilla-release"],
    #    "mozilla-central": ["mozilla-central"]}
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # processing order (and the final report) reproducible between runs.
    projects_folders_mapping = {
        name: [name]
        for name in sorted(os.listdir(input_folder))
        if os.path.isdir(os.path.join(input_folder, name))
    }

    df_alerts = pd.read_csv(alerts_file, index_col=False)
    # push_timestamp values are interpreted as Unix epoch seconds.
    df_alerts['push_timestamp'] = pd.to_datetime(df_alerts['push_timestamp'], unit="s")
    # Record the most recent push before the column is discarded.
    cutoff_date_time = df_alerts['push_timestamp'].max()
    df_alerts = df_alerts.drop(columns=['push_timestamp'])
    # Numeric summary status -> status name -> category label.
    df_alerts['alert_summary_status_general'] = (
        df_alerts['alert_summary_status']
        .map(alert_summary_status_mapping)
        .replace(category_mapping)
    )

    os.makedirs(output_folder, exist_ok=True)
    for folders in projects_folders_mapping.values():
        for folder in folders:
            os.makedirs(os.path.join(output_folder, folder), exist_ok=True)
            process_folder(input_folder, output_folder, folder)

    print('####### Problematic signatures #######')
    for sig in problematic_signatures:
        print('Signature path:')
        print(sig)