pageload-summary/summarize_old.py (249 lines of code) (raw):

#!/usr/bin/python3
"""Summarize pageload numbers for a single subtest (e.g. ContentfulSpeedIndex).

Input is CSV data in the format returned by this query:
https://sql.telemetry.mozilla.org/queries/79289
"""
import argparse
import csv
import json
import os
import pathlib

import numpy as np
from scipy.stats.mstats import gmean


def summary_parser():
    """Build and return the argument parser for this tool."""
    parser = argparse.ArgumentParser(
        "This tool can be used to generate a summary of the pageload numbers for a single "
        "given subtest, i.e. ContenfulSpeedIndex. We provide the summary through a geomean "
        "and you can also perform a comparison with competing browsers using "
        "`--compare-browsers`. You must provide data in the CSV format that is returned from "
        "this query: https://sql.telemetry.mozilla.org/queries/79289"
    )
    parser.add_argument(
        "data", metavar="CSV_DATA", type=str, help="The data to summarize."
    )
    parser.add_argument(
        "--compare-browsers",
        action="store_true",
        default=False,
        help="Provide a comparison between the browsers found.",
    )
    parser.add_argument(
        "--platforms",
        nargs="*",
        default=[],
        help="Platforms to summarize.",
    )
    parser.add_argument(
        "--output",
        type=str,
        default=os.getcwd(),
        help="This is where the data will be saved in JSON format. If the "
        "path has a `.json` suffix then we'll use the part immediately "
        "before it as the file name.",
    )
    return parser


def open_csv_data(path):
    """Open a CSV data file from a given ``pathlib.Path`` and return its rows."""
    with path.open() as f:
        return [row for row in csv.reader(f)]


def get_data_ind(data, fieldname):
    """Return the header-row index whose column name contains ``fieldname``.

    Returns None when no column matches. Note this is a substring match, so
    field names that are prefixes of other columns should be queried first.
    """
    for i, entry in enumerate(data[0]):
        if fieldname in entry:
            return i
    return None


def organize_data(data, platforms):
    """Organize raw CSV rows into a nested dict that is easier to handle.

    Only cold, recorded (non-live) pageloads are kept. ``platforms`` is an
    optional whitelist; an empty list means "keep every platform".

    Ex:
        data = {
            "platform1": {
                "app1": {
                    "variant1": {
                        "test1": {
                            "extra_options": set(),
                            "values": {"time": [val, ...], ...},
                        },
                    },
                },
            },
        }

    Raises an Exception when nothing matched the requested platforms.
    """
    platform_ind = get_data_ind(data, "platform")
    test_ind = get_data_ind(data, "suite")
    extra_ind = get_data_ind(data, "extra_options")
    val_ind = get_data_ind(data, "value")
    time_ind = get_data_ind(data, "push_timestamp")
    app_ind = get_data_ind(data, "application")

    org_data = {}
    for entry in data[1:]:
        platform = entry[platform_ind]
        if platforms and platform not in platforms:
            continue

        test = entry[test_ind]
        app = entry[app_ind]
        extras = entry[extra_ind].split()

        # Keep only cold, recorded pageloads: rows must be tagged warm/cold,
        # and warm or live-site runs are excluded.
        if "warm" not in extras and "cold" not in extras:
            continue
        if "warm" in extras:
            continue
        if "live" in extras:
            continue

        # Build a variant key from the fission/webrender options; "None"
        # means the plain (no-variant) configuration.
        variants = "None"
        if "fission" in extras:
            variants += "fission-"
        if "webrender" in extras:
            variants += "webrender"
        if variants != "None":
            # BUGFIX: the original discarded lstrip's return value, leaving
            # keys like "Nonewebrender" instead of "webrender".
            variants = variants.lstrip("None")

        # Normalize extra options so equivalent runs share one test name:
        # nocondprof is the default profile, and visual metrics are implied.
        if "nocondprof" in extras:
            extras.remove("nocondprof")
        if "visual" not in extras:
            extras.append("visual")

        mod_test_name = test + f"-{app}-" + "-".join(sorted(extras))
        test_data = (
            org_data.setdefault(platform, {})
            .setdefault(app, {})
            .setdefault(variants, {})
            .setdefault(mod_test_name, {})
        )

        # Make sure we're never mixing data from runs with different options.
        if "extra_options" in test_data:
            assert test_data["extra_options"] == set(extras)
        else:
            test_data["extra_options"] = set(extras)

        test_data.setdefault("values", {}).setdefault(entry[time_ind], []).append(
            float(entry[val_ind])
        )

    if not org_data:
        possible_platforms = {entry[platform_ind] for entry in data}
        raise Exception(
            "Could not find any requested platforms in the data. "
            f"Possible choices are: {possible_platforms}"
        )

    return org_data


def _normalize(arr):
    """Min-max normalize ``arr`` to [0, 1]; a constant array maps to zeros."""
    arr = np.asarray(arr, dtype=float)
    if arr.size == 0:
        return arr
    span = arr.max() - arr.min()
    if span == 0:
        # Avoid division by zero when there is a single push time (or all
        # values are equal).
        return np.zeros_like(arr)
    return (arr - arr.min()) / span


def summarize(data, platforms):
    """Summarize organized pageload data into geomean and ratio trends.

    For each (platform, application, variant) group this computes, per push
    time, the geomean of the per-test means, plus the geomean of each test's
    ratio against its first recorded value. Both series are plotted and
    collected into the returned summary dict:

        platform -> application -> variant -> {
            "tests", "values-gmean", "values-ratio", "times"
        }
    """
    # Imported lazily so that the data-organizing helpers can be used without
    # a plotting backend available.
    from matplotlib import pyplot as plt

    org_data = organize_data(data, platforms)

    summary = {}
    for platform, apps in org_data.items():
        for app, app_variants in apps.items():
            for variant, tests in app_variants.items():
                # Every push time seen across all tests in this group.
                all_push_times = sorted(
                    {t for info in tests.values() for t in info["values"]}
                )

                # Geomean of per-test means at each push time. Times where
                # some tests are missing still produce a (partial) geomean,
                # but the missing tests are reported.
                summarized_vals = []
                for time in all_push_times:
                    vals = []
                    missing = []
                    for test, info in tests.items():
                        if time not in info["values"]:
                            missing.append(test)
                            continue
                        vals.append(np.mean(info["values"][time]))
                    if missing:
                        print(
                            f"Tests which failed and prevent a summary at time {time}:",
                            missing,
                        )
                    summarized_vals.append((time, gmean(np.asarray(vals))))

                # Geomean across tests of each test's ratio vs. its first
                # recorded mean. NOTE(review): the baseline is never advanced
                # after it is set (matches the original behavior) — every
                # ratio is relative to the test's first push time.
                baseline_times = {}
                all_ratios = []
                prev_ratio = np.nan
                for time in all_push_times:
                    ratios = []
                    for test, info in tests.items():
                        if not info["values"].get(time):
                            continue
                        if baseline_times.get(test):
                            ratios.append(
                                np.mean(info["values"][time])
                                / np.mean(info["values"][baseline_times[test]])
                            )
                        else:
                            baseline_times[test] = time
                    gmean_ratios = gmean(ratios)
                    # Forward-fill NaN ratios with the previous good value.
                    if np.isnan(gmean_ratios) and not np.isnan(prev_ratio):
                        all_ratios.append(prev_ratio)
                        continue
                    prev_ratio = gmean_ratios
                    all_ratios.append(gmean_ratios)

                # Back-fill any leading NaNs (before the first good ratio)
                # with the first non-NaN value; interior NaNs were already
                # handled by the forward-fill above.
                first_good = next(
                    (y for y in all_ratios if not np.isnan(y)), None
                )
                all_ratios = np.asarray(
                    [
                        first_good if (np.isnan(y) and first_good) else y
                        for y in all_ratios
                    ]
                )

                plt.figure()
                plt.title(platform)
                plt.plot(list(_normalize(all_ratios)), label="Ratios geomean")

                ordered = sorted(summarized_vals, key=lambda x: x[0])
                times = np.asarray([t for t, _ in ordered])
                sorted_summary = _normalize([v for _, v in ordered])

                plt.plot(sorted_summary, label="Geomean")
                plt.legend()
                plt.show()

                summary.setdefault(platform, {}).setdefault(app, {})[variant] = {
                    "tests": list(tests.keys()),
                    "values-gmean": sorted_summary,
                    "values-ratio": all_ratios,
                    "times": times,
                }

    # BUGFIX: the original never returned the summary it built.
    return summary


def main():
    args = summary_parser().parse_args()

    # Check data path and set up the output location.
    data_path = pathlib.Path(args.data)
    if not data_path.exists():
        raise Exception(f"The given data file doesn't exist: {args.data}")

    output_folder = pathlib.Path(args.output)
    output_file = "summary.json"
    if output_folder.exists() and output_folder.is_file():
        print(f"Deleting existing JSON file at: {output_folder}")
        output_folder.unlink()
    if not output_folder.exists():
        if pathlib.Path(output_folder.parts[-1]).suffixes:
            # A file name was given; split it off the folder portion.
            output_file = output_folder.parts[-1]
            output_folder = pathlib.Path(*output_folder.parts[:-1])
        output_folder.mkdir(parents=True, exist_ok=True)

    # Open the data and summarize it.
    data = open_csv_data(data_path)
    results = summarize(data, args.platforms)

    # BUGFIX: the original computed the summary but never saved it, despite
    # the --output help text promising a JSON file. Numpy arrays/scalars are
    # serialized via their tolist() method.
    output_path = output_folder / output_file
    with output_path.open("w") as f:
        json.dump(results, f, indent=4, default=lambda o: o.tolist())
    print(f"Saved summary to: {output_path}")


if __name__ == "__main__":
    main()