# data_extraction_transformation/scripts/extract-timeseries.py
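# Assumed module-level dependencies, not shown in this excerpt: the helpers
# get_json, extract_sig_attr, extract_summary and extract_data, plus the
# globals filtered_sig_ids, columns and n, are defined elsewhere in the script.
# Only the third-party imports are sketched here.
import random  # only needed by the disabled buffer block below
import pandas as pd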
def extract_timeseries(output_folder, project):
    global filtered_sig_ids
    global columns
    signature_url = f"https://treeherder.mozilla.org/api/project/{project}/performance/signatures/"
    signatures_json = get_json(signature_url)
    # Keep only the signatures whose ids were selected in filtered_sig_ids.
    signatures_json = {key: value for key, value in signatures_json.items()
                       if value["id"] in filtered_sig_ids}
    # The commented-out block below avoids re-extracting time series that have
    # already been extracted; this is needed because extracting all the time
    # series in a single run of the script takes too long.
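    # The disabled code keeps a buffer.txt file of already-processed signature
    # ids: ids found in the buffer are skipped and newly selected ids are
    # appended. Two variants are kept; the second one adds the file.seek(0)
    # needed to read an 'a+' file from the start and drops blank buffer lines.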
    '''
    with open(project + '/buffer.txt', 'a+') as file:
        lines = file.readlines()
        ids = [line.strip("| \n") for line in lines]
        available_keys = [key for key in signatures_json.keys() if key not in ids]
        min_nb = min(n, len(available_keys))
        random_keys = random.sample(available_keys, min_nb)
        random_dict = {key: signatures_json[key] for key in random_keys}
        keys_string = "".join([f"|{key}|\n" for key in random_dict.keys()])
        file.write(keys_string + "\n")

    with open('../datasets/' + project + '/buffer.txt', 'a+') as file:
        file.seek(0)
        lines = file.readlines()
        ids = [line.strip("| \n") for line in lines]
        ids = [id for id in ids if id]
        file.seek(0, 2)
        available_keys = [key for key in signatures_json.keys() if key not in ids]
        available_elems_dict = {key: signatures_json[key] for key in available_keys}
        keys_string = "".join([f"|{key}|\n" for key in available_elems_dict.keys()])
        file.write(keys_string + "\n")
        file.flush()

    for signature_id in available_elems_dict:
        signature_attributes = extract_sig_attr(available_elems_dict[signature_id])
        framework_id = available_elems_dict[signature_id]["framework_id"]
    '''
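    # For every selected signature, fetch its performance summary from
    # Treeherder and write the resulting time series to one CSV file per
    # signature under <output_folder>/<project>/.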
    for signature_id in signatures_json:
        signature_attributes = extract_sig_attr(signatures_json[signature_id])
        framework_id = signatures_json[signature_id]["framework_id"]
        # interval=126144000 seconds covers roughly the last four years of data.
        summary_url = (
            "https://treeherder.mozilla.org/api/performance/summary/"
            f"?repository={project}&signature={signature_id}&framework={framework_id}"
            "&interval=126144000&all_data=true&replicates=false"
        )
        summaries_json = get_json(summary_url)
        if len(summaries_json) > 0:
            summary_json = summaries_json[0]
            summary_attributes = extract_summary(summary_json)
            df = pd.DataFrame(columns=columns)
            for timeseries_entry in summary_json["data"]:
                # Copy the signature attributes so the updates below do not
                # mutate signature_attributes across iterations.
                new_row = dict(signature_attributes)
                new_row.update(summary_attributes)
                data_attributes = extract_data(timeseries_entry)
                new_row.update(data_attributes)
                new_row_df = pd.DataFrame(new_row, index=[0])
                df = pd.concat([df, new_row_df], ignore_index=True)
            df.to_csv(output_folder + '/' + project + '/' + signature_id + '_timeseries_data.csv', header=True, mode='w', index=False)
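
# A minimal usage sketch, assuming filtered_sig_ids and columns have been
# populated earlier in the script. The output folder and project name below
# ('../datasets' and 'autoland') are illustrative values, not taken from the
# original; the <output_folder>/<project> directory must already exist, since
# to_csv does not create it.
if __name__ == "__main__":
    extract_timeseries('../datasets', 'autoland')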