# data_extraction_transformation/scripts/mozilla_to_TCPDBench.py
import pandas as pd
import json
import os
import shutil
import argparse
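'''
Example of a TCPDBench result record (here for the cpdbench_zero method). The records written by this script use the same top-level keys, plus "args":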
{
"command": "/TCPDBench/execs/python/cpdbench_zero.py -i /TCPDBench/datasets/4405556.json",
"dataset": "4405556",
"dataset_md5": "311477a9dccc391e2ca6e04e4bf74187",
"error": null,
"hostname": "0ecb21cdbe86",
"parameters": {},
"result": {
"cplocations": [],
"runtime": 4.76837158203125e-07
},
"script": "/TCPDBench/execs/python/cpdbench_zero.py",
"script_md5": "95b65ddd5669b41385966a4aad387118",
"status": "SUCCESS"
}
'''
def write_json(dict_sig, sig_path, conf):
    """Dump ``dict_sig`` as JSON to ``<sig_path>/<conf>/<conf>.json``.

    The ``<sig_path>/<conf>`` directory is created when missing, and the
    JSON is pretty-printed with a four-space indent. An existing file at the
    target location is overwritten.
    """
    target_dir = '/'.join((sig_path, conf))
    os.makedirs(target_dir, exist_ok=True)
    target_file = '/'.join((target_dir, conf + '.json'))
    with open(target_file, 'w') as handle:
        json.dump(dict_sig, handle, indent=4)
def _mozilla_record(sig, cplocations):
    """Build the TCPDBench-style result record for one signature.

    Key order is kept stable because it is the order written to the JSON file.
    """
    return {
        'error': None,
        'command': 'no_command',
        'script': 'no_script',
        'script_md5': 'no_script',
        'hostname': "no_host",
        'dataset': sig,
        'dataset_md5': sig + '_md5',
        'status': 'SUCCESS',
        'parameters': {'method': 'Mozilla'},
        'result': {'cplocations': cplocations, 'runtime': 0},
        'args': {'method': 'Mozilla'},
    }


def process_folder(input_folder, output_folder, folder=None):
    """Convert every signature CSV in a folder into TCPDBench-style JSON.

    For each file in the input directory, the signature id is taken as the
    part of the file name before the first ``_``. Rows whose
    ``alert_summary_status_general`` value is ``TP``, ``SP`` or ``FP`` are
    treated as change points, and their row index labels (the default
    positional index, since the CSV is read with ``index_col=False``) are
    written as ``cplocations``. The same record is written twice, as
    ``best_mozilla`` and ``default_mozilla``, under ``<output_folder>/<sig>``.

    Args:
        input_folder: Directory holding the CSV files, or the parent of
            ``folder`` when ``folder`` is given.
        output_folder: Directory under which one sub-directory per signature
            is created.
        folder: Optional sub-directory of ``input_folder`` to read from. When
            given, only signatures listed in the module-level ``signatures``
            list are converted; when omitted, every file is converted.

    Side effects:
        Files that cannot be read as CSV, or that lack the
        ``alert_summary_status_general`` column, are skipped and their path
        is appended to the module-level ``problematic_signatures`` list so
        that one bad file no longer aborts the whole run. Both module-level
        lists are expected to have been initialised by the caller (``main``
        does this).
    """
    root_input = input_folder + '/' + folder if folder else input_folder
    # Filtering only applies in sub-folder mode. A set keeps the per-file
    # membership test O(1) instead of scanning the list for every file.
    wanted = set(signatures) if folder else None

    for signature_file in os.listdir(root_input):
        sig = signature_file.split('_')[0]
        # Decide before reading: parsing a CSV we will discard is wasted work.
        if wanted is not None and sig not in wanted:
            continue

        csv_path = root_input + '/' + signature_file
        try:
            df = pd.read_csv(csv_path, index_col=False)
            # NOTE(review): an earlier draft of this loop read the column
            # 'alert_status_general' instead; confirm which name the input
            # CSVs actually carry.
            flagged = df['alert_summary_status_general'].isin(['TP', 'SP', 'FP'])
        except (OSError, ValueError, KeyError):
            # OSError: unreadable entry (e.g. a directory); ValueError: empty
            # or malformed CSV (pandas parser errors derive from it);
            # KeyError: the status column is missing.
            problematic_signatures.append(csv_path)
            continue

        cplocations = sorted(df[flagged].index.tolist())
        dict_sig = _mozilla_record(sig, cplocations)

        output_path = output_folder + '/' + sig
        os.makedirs(output_path, exist_ok=True)
        write_json(dict_sig, output_path, 'best_mozilla')
        write_json(dict_sig, output_path, 'default_mozilla')
def parse_args():
    """Parse the command-line arguments of the conversion script.

    Returns:
        argparse.Namespace with the attributes ``output_folder``,
        ``input_folder`` and ``filtered_singatures_file``. The last attribute
        keeps its historical (misspelled) name so existing callers keep
        working; it is ``None`` when no filter file is given.

    The input and output folders are mandatory: without them the script
    cannot do anything useful, so argparse reports the omission up front.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Handpick specific timeseries CSV files and format them into the "
            "TCPDBench output to compare them to TCPDBench predictions."
        )
    )
    parser.add_argument(
        '-o', '--output-folder',
        required=True,
        help="Path to the output folder of time series JSON files.",
    )
    parser.add_argument(
        '-i', '--input-folder',
        required=True,
        help="Path to the input folder of time series CSV files.",
    )
    # The correctly spelled flag is the primary one; the original misspelled
    # flag is kept as an alias, and `dest` is pinned to the original attribute
    # name so code reading `args.filtered_singatures_file` is unaffected.
    parser.add_argument(
        '-f', '--filtered-signatures-file', '--filtered-singatures-file',
        dest='filtered_singatures_file',
        metavar='CSV',
        required=False,
        default=None,
        help="Path to the CSV file with the signatures to handpick (it has to have a column signature_id).",
    )
    return parser.parse_args()
def main():
    """Run the conversion for the folders selected on the command line.

    Without a filter file, every file directly inside the input folder is
    converted. With a filter file, the ``signature_id`` column of that CSV is
    read, and each known project sub-folder of the input folder is processed
    with that list as the filter. Afterwards the contents of the
    module-level ``problematic_signatures`` list are printed.
    """
    global problematic_signatures
    global signatures

    cli = parse_args()
    input_folder, output_folder = cli.input_folder, cli.output_folder
    # Example values for the three arguments:
    #   input_folder  = '../datasets-original-annotated-2-aggregated'
    #   output_folder = '../filtered-datasets-original-annotated-2-aggregated-tcpdbench-2'
    #   filtered_signatures_file = "../datasets/more_than_10_alert_summaries_speedometer3_tp6.csv"
    filtered_signatures_file = cli.filtered_singatures_file

    problematic_signatures = []
    # Project name -> sub-folders of the input folder that hold its files.
    projects_folders_mapping = {
        "autoland": ["autoland1", "autoland2", "autoland3", "autoland4"],
        "firefox-android": ["firefox-android"],
        "mozilla-beta": ["mozilla-beta"],
        "mozilla-release": ["mozilla-release"],
        "mozilla-central": ["mozilla-central"],
    }
    os.makedirs(output_folder, exist_ok=True)

    signatures = []
    if not filtered_signatures_file:
        process_folder(input_folder, output_folder)
    else:
        selection = pd.read_csv(filtered_signatures_file)
        # Ids are compared against file-name prefixes, hence the str cast.
        signatures = [str(value) for value in selection['signature_id'].unique().tolist()]
        for project_folders in projects_folders_mapping.values():
            for folder in project_folders:
                os.makedirs(output_folder + '/' + folder, exist_ok=True)
                process_folder(input_folder, output_folder, folder)

    print('####### Problematic signatures #######')
    for entry in problematic_signatures:
        print('Signature path:', entry, sep='\n')
if __name__ == "__main__":
    # Run the conversion only when executed as a script, not on import.
    main()