data_extraction_transformation/scripts/mozilla_to_TCPDBench.py (116 lines of code) (raw):

"""Convert Mozilla alert annotations into TCPDBench-style result files.

Every CSV in the input folder is expected to be named ``<signature>_*.csv``
and to carry an ``alert_summary_status_general`` column. The row positions
whose status is ``TP``, ``SP`` or ``FP`` are stored as the change-point
locations of a pseudo-method called "Mozilla", in
``<output>/<signature>/<config>/<config>.json`` for the two configurations
``best_mozilla`` and ``default_mozilla``.

Example of a TCPDBench result file (the layout being imitated)::

    {
        "command": "/TCPDBench/execs/python/cpdbench_zero.py -i /TCPDBench/datasets/4405556.json",
        "dataset": "4405556",
        "dataset_md5": "311477a9dccc391e2ca6e04e4bf74187",
        "error": null,
        "hostname": "0ecb21cdbe86",
        "parameters": {},
        "result": {
            "cplocations": [],
            "runtime": 4.76837158203125e-07
        },
        "script": "/TCPDBench/execs/python/cpdbench_zero.py",
        "script_md5": "95b65ddd5669b41385966a4aad387118",
        "status": "SUCCESS"
    }

Example invocation::

    python mozilla_to_TCPDBench.py \
        -i ../datasets-original-annotated-2-aggregated \
        -o ../filtered-datasets-original-annotated-2-aggregated-tcpdbench-2 \
        -f ../datasets/more_than_10_alert_summaries_speedometer3_tp6.csv
"""

import argparse
import json
import os
import shutil  # unused in this script; kept because it was part of the original imports

import pandas as pd

# Column holding the alert verdict of each row, and the verdicts that count
# as change points.
ALERT_STATUS_COLUMN = 'alert_summary_status_general'
CHANGE_POINT_STATUSES = ['TP', 'SP', 'FP']

# Every result is written once per configuration name.
RESULT_CONFIGS = ('best_mozilla', 'default_mozilla')

# Sub-folders of the input folder that are scanned when a signature filter
# is supplied, grouped by project.
PROJECTS_FOLDERS_MAPPING = {
    "autoland": ["autoland1", "autoland2", "autoland3", "autoland4"],
    "firefox-android": ["firefox-android"],
    "mozilla-beta": ["mozilla-beta"],
    "mozilla-release": ["mozilla-release"],
    "mozilla-central": ["mozilla-central"],
}

# Module-level state shared between main() and process_folder(). Defined here
# so process_folder() also works when it is called without going through
# main().
signatures = set()            # signature ids (as str) selected by the filter file
problematic_signatures = []   # signature ids whose CSV could not be converted


def write_json(dict_sig, sig_path, conf):
    """Write ``dict_sig`` as JSON to ``<sig_path>/<conf>/<conf>.json``.

    Missing directories (including ``sig_path`` itself) are created.
    """
    conf_dir = os.path.join(sig_path, conf)
    os.makedirs(conf_dir, exist_ok=True)
    with open(os.path.join(conf_dir, conf + '.json'), 'w', encoding='utf-8') as file:
        json.dump(dict_sig, file, indent=4)


def _find_change_points(csv_path):
    """Return the sorted row positions of ``csv_path`` flagged as change points."""
    df = pd.read_csv(csv_path, index_col=False)
    flagged = df[ALERT_STATUS_COLUMN].isin(CHANGE_POINT_STATUSES)
    return sorted(df[flagged].index.tolist())


def _build_result(sig, cplocations):
    """Build the TCPDBench-shaped result dictionary for signature ``sig``."""
    return {
        'error': None,
        'command': 'no_command',
        'script': 'no_script',
        'script_md5': 'no_script',
        'hostname': "no_host",
        'dataset': sig,
        'dataset_md5': sig + '_md5',
        'status': 'SUCCESS',
        'parameters': {'method': 'Mozilla'},
        'result': {'cplocations': cplocations, 'runtime': 0},
        'args': {'method': 'Mozilla'},
    }


def process_folder(input_folder, output_folder, folder=None):
    """Convert every signature CSV of one folder into TCPDBench result files.

    Args:
        input_folder: Root folder holding the signature CSV files.
        output_folder: Root folder receiving one sub-folder per signature.
        folder: Optional sub-folder of ``input_folder`` to scan. When given,
            only signatures present in the module-level ``signatures`` set
            are converted; when omitted, every CSV in ``input_folder`` is.

    CSV files that are empty, unparsable or lack the alert status column are
    not converted; their signature id is appended to the module-level
    ``problematic_signatures`` list so that main() can report them.
    """
    root_input = os.path.join(input_folder, folder) if folder else input_folder

    # Sorted so that output and the problem report are deterministic.
    for signature_file in sorted(os.listdir(root_input)):
        sig = signature_file.split('_')[0]

        # Apply the filter before parsing: discarded files need not be read.
        if folder and sig not in signatures:
            continue

        try:
            cplocations = _find_change_points(os.path.join(root_input, signature_file))
        except (KeyError, pd.errors.EmptyDataError, pd.errors.ParserError):
            problematic_signatures.append(sig)
            continue

        dict_sig = _build_result(sig, cplocations)
        output_path = os.path.join(output_folder, sig)
        for conf in RESULT_CONFIGS:
            write_json(dict_sig, output_path, conf)


def parse_args():
    """Parse the command-line arguments of the script.

    Returns:
        An ``argparse.Namespace`` with ``input_folder``, ``output_folder`` and
        ``filtered_singatures_file`` (``None`` when no filter is given).
    """
    parser = argparse.ArgumentParser(
        description="Handpick specific timeseries JSON files and format them into the "
                    "TCPDBench output to compare them to TCPDBench predictions.")
    parser.add_argument('-o', '--output-folder', required=True,
                        help="Path to the output folder of time series JSON files.")
    parser.add_argument('-i', '--input-folder', required=True,
                        help="Path to the input folder of time series CSV files.")
    # The historical, misspelled option and attribute name are kept so that
    # existing command lines and callers continue to work.
    parser.add_argument('-f', '--filtered-signatures-file', '--filtered-singatures-file',
                        dest='filtered_singatures_file',
                        help="Path to the CSV file with the signatures to handpick "
                             "(it has to have a column signature_id).",
                        required=False, default=None)
    return parser.parse_args()


def main():
    """Run the conversion described by the command-line arguments.

    With a filter file, the known project sub-folders are scanned and only
    the listed signatures are converted; without one, every CSV directly
    inside the input folder is converted. A report of the signatures that
    could not be converted is printed at the end.
    """
    global problematic_signatures
    global signatures

    args = parse_args()
    input_folder = args.input_folder
    output_folder = args.output_folder
    filtered_signatures_file = args.filtered_singatures_file

    problematic_signatures = []
    signatures = set()
    os.makedirs(output_folder, exist_ok=True)

    if filtered_signatures_file:
        df = pd.read_csv(filtered_signatures_file)
        signatures = set(map(str, df['signature_id'].unique().tolist()))
        for folders in PROJECTS_FOLDERS_MAPPING.values():
            for folder in folders:
                # NOTE(review): results are written to <output_folder>/<sig>,
                # not below this per-folder directory - confirm whether it is
                # still needed by downstream tooling.
                os.makedirs(os.path.join(output_folder, folder), exist_ok=True)
                process_folder(input_folder, output_folder, folder)
    else:
        process_folder(input_folder, output_folder)

    print('####### Problematic signatures #######')
    for sig in problematic_signatures:
        print('Signature path:')
        print(sig)


if __name__ == "__main__":
    main()