fenix-retrieval/generate_applink_data.py (264 lines of code) (raw):
#!/usr/bin/env python3
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at https://mozilla.org/MPL/2.0/.
import argparse
import collections
import csv
import datetime
import json
import pathlib
import statistics
import tempfile
import urllib.request
import git
import requests
from redo import retry
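# Seconds to wait between attempts when retrying a failed download.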
RETRY_SLEEP = 10
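# ActiveData query for mozilla-central Fenix perftest tasks; build_csv() extends
# the "and" clause below with per-test and per-device regexes before running it.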
AD_QUERY = {
"from": "task",
"where": {
"and": [
{"in": {"repo.branch.name": ["mozilla-central"]}},
{"regex": {"run.name": ".*fenix.*"}},
{"regex": {"run.name": ".*perftest.*"}},
]
},
"select": ["task.artifacts", "action.start_time", "task.id"],
"limit": 100000,
}
DEFAULT_TEST_NAME = "view"
def csv_generation_parser():
"""Parser for the CSV generation script."""
    parser = argparse.ArgumentParser(
        description="Run this tool to build CSVs containing Fenix data from some tasks "
        + "running with the multi-commit paradigm in mozilla-central "
        + "(must have perfherder data)."
    )
parser.add_argument(
"-t",
"--test-name",
type=str,
default=DEFAULT_TEST_NAME,
help="The name of the test to get data from (must exist in the task name). "
+ "Defaults to `view`. To get view data before Jul. 31, 2020, use `applink`.",
)
parser.add_argument(
"-d",
"--device",
type=str,
choices=["p2", "g5"],
default="p2",
help="Device to get data from.",
)
parser.add_argument(
"-c",
"--cache-path",
type=str,
default=None,
help="Path to a cache for perfherder artifacts (so you don't re-download them). "
+ "Disabled by default.",
)
parser.add_argument(
"-r",
"--fenix-repo",
type=str,
required=True,
help="Path to a local Fenix github repo.",
)
parser.add_argument(
"-o",
"--output",
type=str,
default=None,
help="Path to the output directory. Defaults to current working directory.",
)
parser.add_argument(
"--try",
action="store_true",
dest="try_data",
default=False,
help="Include data from the try server.",
)
parser.add_argument(
"--replicates",
action="store_true",
default=False,
help="Gather the replicates instead of the medians.",
)
parser.add_argument(
"--median-per-day",
action="store_true",
default=False,
help="Returns a single result per day - the median - instead of per commit runs",
)
return parser
def query_activedata(query_json):
"""Used to run queries on active data."""
active_data_url = "http://activedata.allizom.org/query"
req = urllib.request.Request(active_data_url)
req.add_header("Content-Type", "application/json")
jsondata = json.dumps(query_json)
jsondataasbytes = jsondata.encode("utf-8")
req.add_header("Content-Length", len(jsondataasbytes))
print("Querying Active-data...")
response = urllib.request.urlopen(req, jsondataasbytes)
print("Status:" + str(response.getcode()))
data = json.loads(response.read().decode("utf8").replace("'", '"'))["data"]
return data
def download_file(url, target, retry_sleep=RETRY_SLEEP, attempts=3):
"""Downloads a file, given an URL in the target path.
The function will attempt several times on failures.
"""
    def _download_file(url, target):
        req = requests.get(url, stream=True, timeout=30)
        # Fail fast on HTTP errors so that `retry` below attempts the download again.
        req.raise_for_status()
        target_dir = target.parent.resolve()
        target_dir.mkdir(parents=True, exist_ok=True)
        with target.open("wb") as f:
            for chunk in req.iter_content(chunk_size=1024):
                if not chunk:
                    continue
                f.write(chunk)
                f.flush()
        return target
return retry(
_download_file,
args=(url, target),
attempts=attempts,
sleeptime=retry_sleep,
jitter=0,
)
def build_csv(
fenix_repo,
test_name=DEFAULT_TEST_NAME,
device_name="p2",
output=None,
cache_path=None,
try_data=False,
medians=False,
median_per_day=False,
):
"""Generates a CSV file containing per-commit fenix data
for a given test name.
"""
if not medians and median_per_day:
        raise NotImplementedError(
            "--replicates and --median-per-day cannot be combined; "
            + "specify at most one of them."
        )
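    # Keep downloaded perfherder artifacts in the given cache directory, or in a
    # throw-away temporary directory when no cache path was provided.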
if cache_path:
cache_path = pathlib.Path(cache_path)
cache_path.mkdir(parents=True, exist_ok=True)
else:
cache_path = tempfile.mkdtemp()
if output:
output = pathlib.Path(output)
output.mkdir(parents=True, exist_ok=True)
else:
output = pathlib.Path(".")
# Initialize the git directory now before the long steps below
fenix = git.Repo(fenix_repo)
print(f"Generating data for {test_name} on the {device_name} device...")
## Get the AD data
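    # Narrow the query to the requested test and device via the task's run name.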
AD_QUERY["where"]["and"].extend(
[
{"regex": {"run.name": ".*%s.*" % test_name}},
{"regex": {"run.name": ".*-%s-.*" % device_name}},
]
)
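    # Optionally include pushes from the try server in addition to mozilla-central.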
if try_data:
AD_QUERY["where"]["and"][0]["in"]["repo.branch.name"].append("try")
data = query_activedata(AD_QUERY)
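    # Collect (url, filename, revision, task id) tuples for each perfherder artifact.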
allph = []
for c, artifacts in enumerate(data["task.artifacts"]):
if not artifacts:
continue
for artifact in artifacts:
if not artifact:
continue
if not isinstance(artifact, dict):
continue
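            # Keep only artifacts whose filename has a revision before "perfherder"
            # (e.g. "<rev>-perfherder-data.json"); that prefix is used as the revision.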
ph = artifact["url"].split("/")[-1]
if "perfherder" not in ph or ph.startswith("perfherder"):
continue
allph.append((artifact["url"], ph, ph.split("-")[0], data["task.id"][c]))
## Download the perfherder data and get its commit date
nallph = []
for url, file, rev, taskid in allph:
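        # Cache each artifact under a task-specific filename so re-runs with
        # --cache-path can skip the download.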
file = f"{taskid}-{test_name}-{device_name}-{file}"
fp = pathlib.Path(cache_path, file)
if not fp.exists():
print(f"Downloading to {fp}")
download_file(url, fp)
with fp.open() as f:
phd = json.load(f)
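        # The fields read below assume a perfherder artifact shaped roughly like:
        #   {"suites": [{"value": ..., "subtests": [{"replicates": [...]}]}]}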
# sanity checks
if (
"suites" not in phd
or len(phd["suites"]) == 0
or "value" not in phd["suites"][0]
):
print("Bad data, skipping...")
continue
try:
commitdate = fenix.commit(rev).committed_date
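            # With --replicates, keep every replicate; otherwise use the single
            # suite-level value (the median).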
vals = phd["suites"][0].get("subtests", [{}])[0].get("replicates", [])
if medians:
vals = [phd["suites"][0]["value"]]
for val in vals:
nallph.append((commitdate, val, rev))
        except ValueError:
            # Some revisions cannot be found in the local Fenix repo even
            # though a build exists for them; skip those data points.
            print("Failed to find an actual commit for %s" % rev)
# Sort the data by time
allphs = sorted(nallph, key=lambda x: x[0])
if median_per_day:
allphs = transform_to_median_per_day(allphs)
## Store as a CSV
csvfile_human_readable = pathlib.Path(output, f"{test_name}-{device_name}.csv")
csvfile_raw = pathlib.Path(output, f"{test_name}-{device_name}-raw.csv")
write_csv(csvfile_human_readable, optimize_for_human_readability(allphs))
write_csv(csvfile_raw, allphs)
print(
f"Finished generation. Data contained in {str(csvfile_human_readable)} & {str(csvfile_raw)}"
)
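    # If matplotlib is available, show a quick plot of the values over time.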
    try:
        from matplotlib import pyplot as plt
        plot_allphs = optimize_for_plotting(allphs)
        plt.figure()
        plt.plot([v[0] for v in plot_allphs], [v[1] for v in plot_allphs], ".")
        plt.show()
    except ImportError:
        print("Skipping plotting stage, cannot find matplotlib")
return
def transform_to_median_per_day(data):
    """Collapse (timestamp, value, revision) rows into one median value per calendar day."""
    date_to_iterations = collections.defaultdict(list)
    for row in data:
        dt = datetime.datetime.fromtimestamp(row[0])
        ymd = datetime.datetime(dt.year, dt.month, dt.day)
        date_to_iterations[ymd].append(row[1])
    out_data = []
    for date, times in date_to_iterations.items():
        # The revision column no longer applies once a day's commits are merged.
        transformed_row = [date.timestamp(), statistics.median(times), "N/A"]
        out_data.append(transformed_row)
    return out_data
def write_csv(csvfile, data):
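    """Write the (times, data, revision) rows to the given CSV file, preceded by a header row."""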
with csvfile.open("w") as f:
writer = csv.writer(f)
writer.writerow(["times", "data", "revision"])
writer.writerows(data)
def optimize_for_human_readability(data):
    """Format each row as (human-readable date, rounded value, abbreviated revision)."""
    def transform_row(row):
        dt = datetime.datetime.fromtimestamp(row[0])
        date_str = dt.strftime("%Y-%m-%d %H:%M:%S")
        rounded_value = round(row[1])
        abbrev_commit = row[2][:9]
        return [date_str, rounded_value, abbrev_commit]
    return [transform_row(list(row)) for row in data]
def optimize_for_plotting(data):
    """Convert epoch timestamps into datetime objects that matplotlib plots directly."""
    def transform_row(row):
        # Using datetime x-values avoids matplotlib.dates.epoch2num, which newer
        # matplotlib releases have removed.
        return [datetime.datetime.fromtimestamp(row[0])] + row[1:]
    return [transform_row(list(row)) for row in data]
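# Example invocation (paths are illustrative):
#   python3 generate_applink_data.py -r ~/mozilla/fenix -t view -d p2 -c ph-cache -o output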
if __name__ == "__main__":
args = csv_generation_parser().parse_args()
build_csv(
args.fenix_repo,
test_name=args.test_name,
device_name=args.device,
output=args.output,
cache_path=args.cache_path,
try_data=args.try_data,
medians=not args.replicates,
median_per_day=args.median_per_day,
)