pageload-summary/summarize

#!/usr/bin/python3 import argparse import csv import numpy as np import os import pathlib from matplotlib import pyplot as plt from scipy.stats.mstats import gmean def summary_parser(): parser = argparse.ArgumentParser( "This tool can be used to generate a summary of the pageload numbers for a single " "given subtest, i.e. ContenfulSpeedIndex. We provide the summary through a geomean " "and you can also perform a comparison with competing browsers using " "`--compare-browsers`. You must provide data in the CSV format that is returned from " "this query: https://sql.telemetry.mozilla.org/queries/79289" ) parser.add_argument( "data", metavar="CSV_DATA", type=str, help="The data to summarize." ) parser.add_argument( "--compare-browsers", action="store_true", default=False, help="Provide a comparison between the browsers found.", ) parser.add_argument( "--timespan", type=int, default=24, help="Minimum time between each data point in hours.", ) parser.add_argument( "--platforms", nargs="*", default=[ # "linux64-shippable-qr", # "windows10-64-shippable-qr", # "macosx1015-64-shippable-qr" ], help="Platforms to summarize.", ) parser.add_argument( "--output", type=str, default=os.getcwd(), help="This is where the data will be saved in JSON format. If the " "path has a `.json` suffix then we'll use the part immediately " "before it as the file name.", ) return parser def open_csv_data(path): """Opens a CSV data file from a given path.""" rows = [] with path.open() as f: reader = csv.reader(f) for row in reader: rows.append(row) return rows def get_data_ind(data, fieldname): """Returns an index for the requested field.""" for i, entry in enumerate(data[0]): if fieldname in entry: return i return None def organize_data(data, platforms): """Organizes the data into a format that is easier to handle. Ex: data = { "platform1": { "test1": { "extra_options": set(), "tags": set(), "values": { "time": val, ... } }, ... }, ... } """ platform_ind = get_data_ind(data, "platform") test_ind = get_data_ind(data, "suite") extra_ind = get_data_ind(data, "extra_options") tag_ind = get_data_ind(data, "tags") val_ind = get_data_ind(data, "value") time_ind = get_data_ind(data, "push_timestamp") app_ind = get_data_ind(data, "application") org_data = {} for entry in data[1:]: platform = entry[platform_ind] if platforms and platform not in platforms: continue test = entry[test_ind] app = entry[app_ind] extras = entry[extra_ind].split() tags = entry[tag_ind].split() variants = "None" pl_type = "cold" if "warm" not in extras and "cold" not in extras: continue if "live" in extras: continue if "warm" in extras: pl_type = "warm" if "fission" in extras: variants += "fission-" if "webrender" in extras: variants += "webrender" if "nocondprof" in extras: extras.remove("nocondprof") # if "nocondprof" in tags: # tags.remove("nocondprof") if "visual" not in extras: extras.append("visual") # if "visual" not in tags: # tags.append("visual") # if test not in ("amazon", "google-mail", "google-slides", "imgur", "tumblr", "twitch", "twitter"): # continue if variants != "None": print("here") variants = variants.replace("None", "") mod_test_name = f"{test}-{app}" + "-".join(sorted(extras)) test_data = ( org_data.setdefault(platform, {}) .setdefault(app, {}) .setdefault(variants, {}) .setdefault(pl_type, {}) .setdefault(mod_test_name, {}) ) # Make sure we're never mixing data if "extra_options" in test_data: assert test_data["extra_options"] == set(list(extras)) else: test_data["extra_options"] = set(list(extras)) # if "tags" in test_data: # print("awlkhwalkhd") # print(test_data["tags"]) # print(tags) # assert test_data["tags"] == set(list(tags)) # else: # test_data["tags"] = set(list(tags)) test_data.setdefault("values", {}).setdefault(entry[time_ind], []).append( float(entry[val_ind]) ) if not org_data: possible_platforms = set([entry[platform_ind] for entry in data]) raise Exception( "Could not find any requested platforms in the data. Possible choices are: " f"{possible_platforms}" ) return org_data def summarize(data, platforms): org_data = organize_data(data, platforms) summary = {} for platform, apps in org_data.items(): for app, variants in apps.items(): for variant, pl_types in variants.items(): for pl_type, tests in pl_types.items(): platform_summary = {"tests": list(tests.keys()), "values": []} # Get all the push times all_push_times = [] for _, info in tests.items(): print(info) all_push_times.extend(list(info["values"].keys())) all_push_times = list(set(all_push_times)) all_push_times = temporal_aggregation(all_push_times, 24) print(all_push_times) # Get a summary value for each push time summarized_vals = [] tests_per_val = {} prev_time = None prev_test_times = {} for c, times in enumerate(sorted(all_push_times)): vals = {} for time in times: if not prev_time: prev_time = time good = True testsc = [] testsg = [] for test, info in tests.items(): if time not in info["values"]: good = False testsc.append(test) continue if test not in prev_test_times: prev_test_times[test] = time vals.setdefault(test, []).extend(info["values"][time]) testsg.append( ( test, time, prev_test_times[test], np.mean(info["values"][time]), np.mean(info["values"][prev_test_times[test]]), ) ) prev_test_times[test] = time if not good: print( f"Tests which failed and prevent a summary at time {time}:", testsc, ) vals = [np.mean(v) for _, v in vals.items()] summarized_vals.append((times[-1], gmean(np.asarray(vals)))) tests_per_val[str(c)] = { "good": testsg, "bad": testsc, "vals": vals, } prev_time = time """ "wikia-firefox-cold-webrender", "espn-firefox-cold-webrender", "cnn-firefox-cold-webrender", "nytimes-firefox-cold-webrender", "buzzfeed-firefox-cold-webrender", "expedia-firefox-cold-webrender" """ """ "wikia-firefox-cold-webrender", "espn-firefox-cold-webrender", "cnn-firefox-cold-webrender", "nytimes-firefox-cold-webrender", "buzzfeed-firefox-cold-webrender", "expedia-firefox-cold-webrender" """ import json print("hereeee") print(json.dumps(tests_per_val, indent=4)) # # Get the ratios over time # prev_test_times = {} # all_ratios = [] # prev_ratio = np.nan # for time in sorted(all_push_times): # ratios = [] # for test, info in tests.items(): # if info["values"].get(time, None): # if prev_test_times.get(test, None): # ratios.append( # np.mean( # info["values"][time] # ) / np.mean( # info["values"][prev_test_times[test]] # ) # ) # else: # prev_test_times[test] = time # else: # continue # gmean_ratios = gmean(ratios) # if np.isnan(gmean_ratios) and not np.isnan(prev_ratio): # all_ratios.append(prev_ratio) # continue # prev_ratio = gmean_ratios # all_ratios.append(gmean_ratios) # new_ratios = [] # first_good = None # for y in all_ratios: # if not np.isnan(y): # first_good = y # break # for y in all_ratios: # if np.isnan(y) and first_good: # new_ratios.append(first_good) # else: # new_ratios.append(y) # first_good = None # all_ratios = new_ratios # all_ratios = np.asarray(all_ratios) # plt.figure() variant = variant if variant != "None" else "e10s" plt.title(platform + f"\n{app}-{pl_type}-{variant}") # plt.plot(list(((all_ratios-min(all_ratios))/(max(all_ratios)-min(all_ratios)))), label="Ratios geomean") # plt.show() # plt.figure() x = np.asarray( [y for x, y in sorted(summarized_vals, key=lambda x: x[0])] ) times = np.asarray( [x for x, y in sorted(summarized_vals, key=lambda x: x[0])] ) sorted_summary = (x - min(x)) / (max(x) - min(x)) print(sorted_summary) print(platform) print(variant) # break # plt.plot([i for i in range(len(sorted_summary))], [y for x, y in sorted_summary]) plt.plot(x, label="Geomean") plt.legend() plt.show() summary.setdefault(platform, {}).setdefault(variant, {}).setdefault( app, {} )[pl_type] = { "tests": list(tests.keys()), "values-gmean": x, "times": times, } def temporal_aggregation(times, timespan=24): import datetime aggr_times = [] diff = datetime.timedelta(hours=timespan) curr = [] for t in sorted(times)[::-1]: dt = datetime.datetime.strptime(t, "%Y-%m-%d %H:%M") if len(curr) == 0: curr.append(dt) elif curr[0] - dt < diff: curr.append(dt) else: aggr_times.append([c.strftime("%Y-%m-%d %H:%M") for c in curr]) curr = [dt] return aggr_times def main(): args = summary_parser().parse_args() # Check data path and setup output data_path = pathlib.Path(args.data) if not data_path.exists(): raise Exception(f"The given data file doesn't exist: {args.data}") output_folder = pathlib.Path(args.output) output_file = "summary.json" if output_folder.exists() and output_folder.is_file(): print(f"Deleting existing JSON file at: {output_folder}") output_folder.unlink() if not output_folder.exists(): # possible_folder, possible_file = output_folder.parts() if pathlib.Path(output_folder.parts[-1]).suffixes: # A CSV file name was given output_file = output_folder.parts[-1] output_folder = pathlib.Path(*output_folder.parts[:-1]) output_folder.mkdir(parents=True, exist_ok=True) # Open data data = open_csv_data(data_path) results = summarize(data, args.platforms) agr = temporal_aggregation(results, args.timespan) if __name__ == "__main__": main()

pageload-summary/summarize_testing.py (234 lines of code) (raw):