data_extraction_transformation/scripts/extract-timeseries.py (210 lines of code) (raw):

import os import requests from datetime import datetime, timedelta import pandas as pd import json import random from helper import append_strings, get_json, txt_to_list import argparse ''' The following function extracts a list of strings associated with a given attribute and it appends them in a pipe-separated string ''' def extract_from_list(lst, attr): the_list = "" try: if (len(lst[attr]) > 0): the_list = "|" for item in lst[attr]: the_list = the_list + str(item) + '|' return the_list except: return "" ''' this function extracts signature attributes and returns them given a signature JSON ''' def extract_sig_attr(signature_json): summary_entry = { "signature_id": signature_json["id"], "framework_id": signature_json["framework_id"], "signature_hash": signature_json["signature_hash"], "option_collection_hash": signature_json["option_collection_hash"], "machine_platform": signature_json["machine_platform"], "suite": signature_json["suite"], "should_alert": signature_json["should_alert"] } return summary_entry ''' this function extracts signature summary attributes and returns them given a summary JSON ''' def extract_summary(json_summary): summary_entry = { "repository_name": json_summary["repository_name"], "test": json_summary["test"], "lower_is_better": json_summary["lower_is_better"], "name": json_summary["name"], "parent_signature": json_summary["parent_signature"], "repository_id": json_summary["repository_id"], "measurement_unit": json_summary["measurement_unit"], "application": json_summary["application"], "has_subtests": json_summary["has_subtests"], "tags": extract_from_list(json_summary, "tags"), "extra_options": extract_from_list(json_summary, "extra_options") } return summary_entry def extract_data(json_data): data_entry = { "job_id": json_data["job_id"], "entry_id": json_data["id"], "push_timestamp": json_data["push_timestamp"], "value": json_data["value"], "revision": json_data["revision"], "push_id": json_data["push_id"] } return data_entry def extract_timeseries(output_folder, project): global filtered_sig_ids global columns signature_url = "https://treeherder.mozilla.org/api/project/" + project + "/performance/signatures/" signatures_json = get_json(signature_url) cond = lambda x: x[1]["id"] in filtered_sig_ids signatures_json = dict(filter(cond, signatures_json.items())) # The purpose of this code is to avoid re-extracting time series that have already been extracted. # This is necessary because running the script in one go is not feasible due to the long time # required to extract all the time series at once. ''' with open(project + '/buffer.txt', 'a+') as file: lines = file.readlines() ids = [line.strip("| \n") for line in lines] available_keys = [key for key in signatures_json.keys() if key not in ids] min_nb = min(n, len(available_keys)) random_keys = random.sample(available_keys, min_nb) random_dict = {key: signatures_json[key] for key in random_keys} keys_string = "".join([f"|{key}|\n" for key in random_dict.keys()]) file.write(keys_string + "\n") with open('../datasets/' + project + '/buffer.txt', 'a+') as file: file.seek(0) lines = file.readlines() ids = [line.strip("| \n") for line in lines] ids = [id for id in ids if id] file.seek(0, 2) available_keys = [key for key in signatures_json.keys() if key not in ids] available_elems_dict = {key: signatures_json[key] for key in available_keys} keys_string = "".join([f"|{key}|\n" for key in available_elems_dict.keys()]) file.write(keys_string + "\n") file.flush() for signature_id in available_elems_dict: signature_attributes = extract_sig_attr(available_elems_dict[signature_id]) framework_id = available_elems_dict[signature_id]["framework_id"] ''' for signature_id in signatures_json: signature_attributes = extract_sig_attr(signatures_json[signature_id]) framework_id = signatures_json[signature_id]["framework_id"] summary_url = "https://treeherder.mozilla.org/api/performance/summary/?repository=" + project + "&signature=" + str(signature_id) + "&framework=" + str(framework_id) + "&interval=126144000&all_data=true&replicates=false" summaries_json = get_json(summary_url) if (len(summaries_json) > 0): summary_json = summaries_json[0] summary_attributes = extract_summary(summary_json) df = pd.DataFrame(columns=columns) for timeseries_entry in summary_json["data"]: new_row = signature_attributes new_row.update(summary_attributes) data_attributes = extract_data(timeseries_entry) new_row.update(data_attributes) new_row_df = pd.DataFrame(new_row, index=[0]) df = pd.concat([df, new_row_df], ignore_index=True) df.to_csv(output_folder + '/' + project + '/' + signature_id + '_timeseries_data.csv', header=True, mode='w', index=False) def parse_args(): parser = argparse.ArgumentParser(description="Fetch timeseires details from an API and save to a folder of CSV files organized into separate folders per project.") parser.add_argument('-o', '--output-folder', help="Path to the output folder of time series CSV files.") parser.add_argument('-a', '--alerts-file', help="Path to the alerts CSV file.") return parser.parse_args() def main(): global filtered_sig_ids global columns args = parse_args() output_folder = args.output_folder alerts_file = args.alerts_file alerts_df = pd.read_csv(alerts_file) mentionned_projects = alerts_df['alert_summary_repository'].unique().tolist() filtered_sig_ids = alerts_df['signature_id'].unique().tolist() # For reference, these are all of the projects ''' all_porjects = [ "try", "android-components", "application-services", "ash", "birch", "cedar", "ci-admin", "ci-admin-try", "ci-configuration", "ci-configuration-try", "comm-beta", "comm-central", "comm-esr115", "comm-release", "elm", "fenix", "firefox-ios", "firefox-translations-training", "focus-android", "holly", "jamun", "kaios", "kaios-try", "larch", "maple", "mozilla-esr115", "mozilla-release", "mozilla-vpn-client", "mozilla-vpn-client-release", "nss", "nss-try", "oak", "pine", "reference-browser", "servo-auto", "servo-master", "servo-try", "staging-android-components", "staging-fenix", "staging-firefox-translations-training", "staging-focus-android", "taskgraph", "toolchains", "try-comm-central", "webrender" ] ''' ''' The following list contains the columns names of the CSV to be generated through this script ''' columns = [ "repository_name", "signature_id", "framework_id", "signature_hash", "machine_platform", "should_alert", "has_subtests", "extra_options", "tags", "option_collection_hash", "test", "suite", "lower_is_better", "name", "parent_signature", "repository_id", "measurement_unit", "application", "job_id", "entry_id", "push_timestamp", "value", "revision", "push_id" ] for project in mentionned_projects: if not os.path.exists(output_folder + '/' + project): os.makedirs(output_folder + '/' + project, exist_ok=True) extract_timeseries(output_folder, project) if __name__ == "__main__": main()