pageload-summary/summarize.py
#!/usr/bin/python3
import argparse
import csv
import datetime
import json
import os
import pathlib

import matplotlib.dates as md
import numpy as np
from matplotlib import pyplot as plt

def summary_parser():
    parser = argparse.ArgumentParser(
        description="This tool can be used to generate a summary of the pageload "
        "numbers for a single given subtest, e.g. ContentfulSpeedIndex. We provide "
        "the summary through a geomean and you can also perform a comparison with "
        "competing browsers using `--compare-browsers`. You must provide data in the "
        "CSV format that is returned from this query: "
        "https://sql.telemetry.mozilla.org/queries/79289"
    )
    parser.add_argument(
        "data", metavar="CSV_DATA", type=str, help="The data to summarize."
    )
    parser.add_argument(
        "--timespan",
        type=int,
        default=24,
        help="Minimum time between each data point in hours.",
    )
    parser.add_argument(
        "--moving-average-window",
        type=int,
        default=7,
        help="Number of days to use for the moving average.",
    )
    parser.add_argument(
        "--by-site",
        action="store_true",
        default=False,
        help="Output summary by site.",
    )
    parser.add_argument(
        "--visualize",
        action="store_true",
        default=False,
        help="Show visualizations.",
    )
    parser.add_argument(
        "--save-plots",
        action="store_true",
        default=False,
        help="Save visualizations.",
    )
    parser.add_argument(
        "--save-directory",
        help="Directory to save visualizations in.",
    )
    parser.add_argument(
        "--platforms",
        nargs="*",
        default=[],
        help="Platforms to summarize. Default is all platforms.",
    )
    parser.add_argument(
        "--platform-pattern",
        help="Pattern (substring match) for platforms to summarize. "
        "Default is all platforms.",
    )
    parser.add_argument(
        "--start-date",
        type=datetime.datetime.fromisoformat,
        help="Date to start analysis (inclusive).",
    )
    parser.add_argument(
        "--end-date",
        type=datetime.datetime.fromisoformat,
        help="Date to end analysis (inclusive).",
    )
    parser.add_argument(
        "--apps",
        nargs="*",
        default=[],
        help="Apps to summarize (default is all). Examples: firefox, chromium, chrome",
    )
    parser.add_argument(
        "--output",
        type=str,
        default=os.getcwd(),
        help="This is where the data will be saved in JSON format. If the "
        "path has a `.json` suffix then we'll use the part immediately "
        "before it as the file name.",
    )
    return parser

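# A usage sketch of the parser above (the CSV file name is hypothetical; every
# flag shown is defined in summary_parser()):
#
#   python3 summarize.py telemetry-data.csv \
#       --platform-pattern windows10 \
#       --apps firefox chrome \
#       --start-date 2021-04-01 --end-date 2021-04-13 \
#       --visualize
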
def open_csv_data(path):
    """Opens a CSV data file from a given path."""
    with path.open() as f:
        return list(csv.reader(f))

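# Illustrative sketch of the expected CSV layout (not the full schema from the
# query; column order may vary since fields are resolved by header name below):
#
#   platform,suite,extra_options,tags,value,push_timestamp,application
#   windows10-64-shippable,cnn,cold visual,...,1234.5,2021-04-12 10:30,firefox
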
def get_data_ind(data, fieldname):
    """Returns the index of the requested field in the header row, or None."""
    for i, entry in enumerate(data[0]):
        if fieldname in entry:
            return i
    return None

def organize_data(
    data, platforms, platform_pattern, start_date, end_date, apps, by_site=False
):
    """Organizes the data into a format that is easier to handle."""
    platform_ind = get_data_ind(data, "platform")
    test_ind = get_data_ind(data, "suite")
    extra_ind = get_data_ind(data, "extra_options")
    tag_ind = get_data_ind(data, "tags")
    val_ind = get_data_ind(data, "value")
    time_ind = get_data_ind(data, "push_timestamp")
    app_ind = get_data_ind(data, "application")

    org_data = {}
    for entry in data[1:]:
        platform = entry[platform_ind]
        if platforms and platform not in platforms:
            continue
        if platform_pattern and platform_pattern not in platform:
            continue

        date = datetime.datetime.fromisoformat(entry[time_ind])
        if start_date is not None and date < start_date:
            continue
        if end_date is not None and date > end_date:
            continue

        test = entry[test_ind]
        app = entry[app_ind]
        if apps and app not in apps:
            continue

        extras = entry[extra_ind].split()
        tags = entry[tag_ind].split()

        # Without this, we might start pulling in data
        # from mozperftest tests
        if "warm" not in extras and "cold" not in extras:
            continue
        # Make sure we always ignore live site data
        if "live" in extras:
            continue
        # Make sure we always ignore profiler runs
        if "gecko-profile" in extras:
            continue

        pl_type = "warm" if "warm" in extras else "cold"

        variants = "e10s"
        if app not in ("chrome", "chromium"):
            variant_parts = []
            if "fission" in extras:
                variant_parts.append("fission")
            if "webrender" in extras:
                variant_parts.append("webrender")
            if variant_parts:
                variants = "-".join(variant_parts)

        # Newer data no longer has the nocondprof option
        if "nocondprof" in extras:
            extras.remove("nocondprof")
        # Older data didn't have this flag
        if "visual" not in extras:
            extras.append("visual")

        if by_site:
            platform += "-" + test

        mod_test_name = f"{test}-{app}-" + "-".join(sorted(extras))
        test_data = (
            org_data.setdefault(platform, {})
            .setdefault(app, {})
            .setdefault(variants, {})
            .setdefault(pl_type, {})
            .setdefault(mod_test_name, {})
        )

        # Make sure we're never mixing data
        if "extra_options" in test_data:
            assert test_data["extra_options"] == set(extras)
        else:
            test_data["extra_options"] = set(extras)
        test_data.setdefault("values", {}).setdefault(entry[time_ind], []).append(
            float(entry[val_ind])
        )

    if not org_data:
        possible_platforms = {entry[platform_ind] for entry in data[1:]}
        raise Exception(
            "Could not find any requested platforms in the data. Possible choices are: "
            f"{possible_platforms}"
        )
    return org_data

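# Illustrative shape of the returned structure (the values are made up):
#
#   org_data["windows10-64-shippable"]["firefox"]["webrender"]["cold"][
#       "cnn-firefox-cold-visual-webrender"
#   ] == {
#       "extra_options": {"cold", "visual", "webrender"},
#       "values": {"2021-04-12 10:30": [1234.5, 1240.2]},
#   }
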
def geo_mean(iterable):
    """Returns the geometric mean of the values in `iterable`."""
    a = np.array(iterable)
    return a.prod() ** (1.0 / len(a))

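# For example, geo_mean([2, 8]) == (2 * 8) ** 0.5 == 4.0. The geometric mean
# damps multiplicative outliers, so a single unusually slow site has less pull
# on the per-push summary than it would with an arithmetic mean.
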
def temporal_aggregation(times, timespan=24):
    """Aggregates times formatted like `YYYY-mm-dd HH:MM`.

    After aggregation, the result will contain lists of all
    points that were grouped together, with the oldest group
    first. Timespan distancing starts from the newest data point.
    """
    aggr_times = []
    diff = datetime.timedelta(hours=timespan)

    curr = []
    for t in sorted(times)[::-1]:
        dt = datetime.datetime.strptime(t, "%Y-%m-%d %H:%M")
        if len(curr) == 0:
            curr.append(dt)
        elif curr[0] - dt < diff:
            # If we are within the `timespan` window, merge the point
            curr.append(dt)
        else:
            aggr_times.append([c.strftime("%Y-%m-%d %H:%M") for c in curr])
            curr = [dt]
    if curr:
        # Flush the final (oldest) group; this also covers the case
        # of a single data point, where there's nothing to aggregate
        aggr_times.append([c.strftime("%Y-%m-%d %H:%M") for c in curr])

    return aggr_times[::-1]

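# Worked example with the default 24h timespan: given
#   ["2021-04-10 09:00", "2021-04-10 21:00", "2021-04-12 08:00"]
# the two April 10th pushes fall within 24h of each other and are merged,
# while the April 12th push stands alone, so the result (oldest group first,
# each group ordered newest first) is:
#   [["2021-04-10 21:00", "2021-04-10 09:00"], ["2021-04-12 08:00"]]
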
def summarize(
    data,
    platforms,
    platform_pattern,
    timespan,
    moving_average_window,
    start_date,
    end_date,
    by_site,
    apps,
):
    org_data = organize_data(
        data, platforms, platform_pattern, start_date, end_date, apps, by_site
    )

    summary = {}
    for platform, apps in org_data.items():
        for app, variants in apps.items():
            for variant, pl_types in variants.items():
                for pl_type, tests in pl_types.items():
                    # Get all the push times and aggregate them
                    all_push_times = []
                    for info in tests.values():
                        all_push_times.extend(list(info["values"].keys()))
                    all_push_times = temporal_aggregation(
                        list(set(all_push_times)), timespan
                    )
                    if len(all_push_times) <= 1:
                        print(
                            "Skipping tests for the following combination "
                            "as there is <=1 data point: %s"
                            % "-".join([platform, app, variant, pl_type])
                        )
                        continue

                    # Get a summary value for each push time: average the
                    # replicates per test, then take the geomean across tests
                    summarized_vals = []
                    for times in sorted(all_push_times):
                        vals = {}
                        for time in times:
                            for test, info in tests.items():
                                if time not in info["values"]:
                                    continue
                                vals.setdefault(test, []).extend(info["values"][time])
                        vals = [np.mean(v) for v in vals.values()]
                        summarized_vals.append((times[-1], geo_mean(np.asarray(vals))))

                    # Compute a moving average when the data spans more days
                    # than the requested window; otherwise fall back to the
                    # raw summarized values
                    ma_vals = []
                    window = []
                    time_window = []
                    startdate = datetime.datetime.fromisoformat(summarized_vals[0][0])
                    enddate = datetime.datetime.fromisoformat(summarized_vals[-1][0])
                    if (enddate - startdate).days > moving_average_window:
                        for time, val in summarized_vals:
                            window.append(val)
                            time_window.append(time)
                            startdate = datetime.datetime.fromisoformat(time_window[0])
                            enddate = datetime.datetime.fromisoformat(time)
                            if (enddate - startdate).days > moving_average_window:
                                ma_vals.append((time, np.mean(window)))
                                window = window[1:]
                                time_window = time_window[1:]
                    else:
                        ma_vals = summarized_vals
                    if len(ma_vals) == 0:
                        continue

                    summary.setdefault(platform, {}).setdefault(app, {}).setdefault(
                        variant, {}
                    )[pl_type] = {
                        "values": summarized_vals,
                        "moving_average": ma_vals,
                    }
    return summary

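# Illustrative shape of the returned summary (the values are made up):
#
#   summary["windows10-64-shippable"]["firefox"]["webrender"]["cold"] == {
#       "values": [("2021-04-12 10:30", 812.3), ("2021-04-13 11:00", 807.9)],
#       "moving_average": [("2021-04-12 10:30", 810.0), ("2021-04-13 11:00", 809.1)],
#   }
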
def text_summary(summary, width=20, plat_width=50):
    """Outputs the two newest points of the summary as a table.

    Returns the results as a list that could be saved to a CSV file.
    Ex:

    Platform          | App      | Variant           | Type | 04/12/2021 | 04/13/2021
    ----------------------------------------------------------------------------------
    linux64-shippable | firefox  | e10s              | cold | 1900       | 1850
                      |          |                   | warm | 800        | 750
                      | -----------------------------------------------------------
                      |          | webrender         | cold |            |
                      |          |                   | warm |            |
                      | -----------------------------------------------------------
                      |          | fission           | cold |            |
                      |          |                   | warm |            |
                      | -----------------------------------------------------------
                      |          | fission-webrender | cold |            |
                      |          |                   | warm |            |
    ----------------------------------------------------------------------------------
                      | chrome   |                   |      |            |
    ----------------------------------------------------------------------------------
                      | chromium |                   |      |            |
    ----------------------------------------------------------------------------------
    """
    csv_lines = []
    lines = []

    # Get the two newest data points; for tests without data at those points
    # we'll take the two newest data points they have regardless of date
    all_times = []
    for platform, apps in summary.items():
        for app, variants in apps.items():
            for variant, pl_types in variants.items():
                for pl_type, data in pl_types.items():
                    all_times.append(data["moving_average"][-1][0])
    sorted_times = sorted(set(all_times))
    newest_point = sorted_times[-1]
    previous_point = newest_point
    if len(sorted_times) > 1:
        previous_point = sorted_times[-2]

    format_line = (
        "{:<{plat_width}}| {:<{width}}| {:<{width}}| {:<10}| {:<{width}}| {:<{width}}"
    )
    header_line = format_line.format(
        "Platform",
        "Application",
        "Variant",
        "Type",
        previous_point,
        newest_point,
        width=width,
        plat_width=plat_width,
    )
    table_len = len(header_line)
    lines.append(header_line)
    lines.append("-" * table_len)
    csv_lines.append(
        ["Platform", "Application", "Variant", "Type", previous_point, newest_point]
    )

    platform_output = False
    app_output = False
    variant_output = False
    for platform, apps in sorted(summary.items()):
        if platform_output:
            lines.append("-" * table_len)
        if len(platform) >= plat_width:
            platform = platform[: plat_width - 1]
        platform_output = False
        app_output = False
        variant_output = False
        for app, variants in sorted(apps.items(), reverse=True):
            if app_output:
                spacer = width * 2
                lines.append(" " * spacer + "-" * (table_len - spacer))
            app_output = False
            variant_output = False
            for variant, pl_types in sorted(variants.items(), reverse=True):
                if app in ("chrome", "chromium"):
                    variant = ""
                if variant_output:
                    spacer = width * 3
                    lines.append(" " * spacer + "-" * (table_len - spacer))
                variant_output = False
                for pl_type, data in pl_types.items():
                    platform_str = platform
                    app_str = app
                    variant_str = variant
                    if platform_output:
                        platform_str = ""
                    if variant_output:
                        variant_str = ""
                    if app_output:
                        app_str = ""

                    cur = np.round(data["moving_average"][-1][1], 2)
                    prev = cur
                    if len(data["moving_average"]) > 1:
                        prev = np.round(data["moving_average"][-2][1], 2)
                    if prev > 0.0:
                        delta = f" ({np.round(cur / prev, 4)})"
                    else:
                        delta = " (NaN)"

                    lines.append(
                        format_line.format(
                            platform_str,
                            app_str,
                            variant_str,
                            pl_type,
                            prev,
                            str(cur) + delta,
                            width=width,
                            plat_width=plat_width,
                        )
                    )
                    csv_lines.append([platform, app, variant, pl_type, prev, cur])
                    variant_output = True
                    app_output = True
                    platform_output = True

    for line in lines:
        print(line)
    return csv_lines

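# In the rendered table the newest column reads "value (ratio)", where the
# ratio is the newest moving-average point divided by the previous one; a
# ratio above 1.0 means the metric increased, which is typically a regression
# for pageload metrics where lower is better.
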
def visual_summary(summary, save=False, directory=None):
    for platform, apps in sorted(summary.items()):
        for app, variants in sorted(apps.items(), reverse=True):
            plt.figure(figsize=(10, 10))
            plt.suptitle(f"{platform} {app}")
            for variant, pl_types in sorted(variants.items(), reverse=True):
                # This is a simple visualization to show the metric.
                # It can be modified to anything.
                figc = 1
                for pl_type, data in pl_types.items():
                    plt.subplot(1, 2, figc)
                    figc += 1

                    variant = variant if variant != "None" else "e10s"
                    plt.title(f"{pl_type}")

                    times = [
                        datetime.datetime.strptime(x, "%Y-%m-%d %H:%M")
                        for x, y in data["values"]
                    ]
                    vals = [y for x, y in data["values"]]
                    ma_times = [
                        datetime.datetime.strptime(x, "%Y-%m-%d %H:%M")
                        for x, y in data["moving_average"]
                    ]
                    ma_vals = [y for x, y in data["moving_average"]]

                    md_times = md.date2num(times)
                    md_ma_times = md.date2num(ma_times)

                    ax = plt.gca()
                    xfmt = md.DateFormatter("%Y-%m-%d %H:%M:%S")
                    ax.xaxis.set_major_formatter(xfmt)
                    plt.xticks(rotation=25)

                    plt.plot(md_times, vals, label=variant)
                    plt.plot(md_ma_times, ma_vals, label=variant + " (avg)")
                    plt.legend()

            if save:
                dest = pathlib.Path(directory or ".", platform + ".png")
                plt.savefig(dest)
                plt.close()
            else:
                plt.show()

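# Minimal usage sketch (assumes `results` came from summarize() above, and
# that the target directory already exists since savefig won't create it):
#
#   visual_summary(results, save=True, directory="plots")
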
def main():
    args = summary_parser().parse_args()

    # Check data path and setup output
    data_path = pathlib.Path(args.data)
    if not data_path.exists():
        raise Exception(f"The given data file doesn't exist: {args.data}")

    output_folder = pathlib.Path(args.output)
    output_file = "summary.json"
    if output_folder.exists() and output_folder.is_file():
        print(f"Deleting existing JSON file at: {output_folder}")
        output_folder.unlink()
    if not output_folder.exists():
        if pathlib.Path(output_folder.parts[-1]).suffixes:
            # A JSON file name was given
            output_file = output_folder.parts[-1]
            output_folder = pathlib.Path(*output_folder.parts[:-1])
        output_folder.mkdir(parents=True, exist_ok=True)

    # Process the data and visualize the results (after saving)
    data = open_csv_data(data_path)
    results = summarize(
        data,
        args.platforms,
        args.platform_pattern,
        args.timespan,
        args.moving_average_window,
        args.start_date,
        args.end_date,
        args.by_site,
        args.apps,
    )

    with pathlib.Path(output_folder, output_file).open("w") as f:
        json.dump(results, f)

    csv_lines = text_summary(results)
    csv_file = pathlib.Path(output_folder, "newest-points.csv")
    if csv_file.exists():
        print(f"Deleting existing CSV summary file at: {csv_file}")
        csv_file.unlink()
    with csv_file.open("w") as f:
        writer = csv.writer(f, delimiter=",")
        for line in csv_lines:
            writer.writerow(line)

    if args.visualize:
        visual_summary(results, args.save_plots, args.save_directory)


if __name__ == "__main__":
    main()