probe_scraper/scrapers/moz_central_scraper.py (167 lines of code) (raw):
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
import json
import os
import re
from collections import defaultdict
import requests
from ..parsers.utils import HTTP_HEADERS
from .buildhub import Buildhub
# Root of the Mercurial host; every repository URI used below is built on it.
BASE_URI = "https://hg.mozilla.org"

# Probe type -> repository-relative paths of the registry files that
# download_files() fetches for a revision.
REGISTRY_FILES = {
    "histogram": [
        "toolkit/components/telemetry/Histograms.json",
        "dom/base/UseCounters.conf",
        "dom/base/nsDeprecatedOperationList.h",
        "servo/components/style/properties/counted_unknown_properties.py",
        "devtools/shared/css/generated/properties-db.js",
    ],
    "scalar": ["toolkit/components/telemetry/Scalars.yaml"],
    "event": ["toolkit/components/telemetry/Events.yaml"],
}

# Channel name -> repository settings. "base_uri" is the repository used when
# no explicit tree is supplied to download_files().
# NOTE(review): "tag_regex" and "artificial_tags" are not read by the code
# visible in this module; presumably they serve tag-based revision discovery
# elsewhere - confirm before changing them.
CHANNELS = {
    "nightly": {
        "base_uri": f"{BASE_URI}/mozilla-central",
        "tag_regex": r"^FIREFOX_(AURORA|BETA)_[0-9]+_BASE$",
        "artificial_tags": [
            {
                "date": [1567362726.0, 0],
                "node": "fd2934cca1ae7b492f29a4d240915aa9ec5b4977",
                "tag": "FIREFOX_BETA_71_BASE",
            },
        ],
    },
    "beta": {
        "base_uri": f"{BASE_URI}/releases/mozilla-beta",
        "tag_regex": r"^FIREFOX_BETA_[0-9]+_BASE$",
    },
    "release": {
        "base_uri": f"{BASE_URI}/releases/mozilla-release",
        "tag_regex": r"^FIREFOX_[0-9]+_0_RELEASE$",
    },
}

# Revisions that scrape_channel_revisions() drops from the Buildhub results.
SKIP_REVISIONS = {
    "942c201b1ac7a46a449f1fb80da7b050ec0ea120",
    "1807a36ff99f01abca1c37442fb5b344465bfbdf",
    "30bdee9799a07b8770719aa868416174ff0c54f5",
    "9fb70b4ae59336b805a1651e7c57c6385cca0717",
    "81578db6bf8939678d490b69f0daf4b675027e3a",
    "b8567457ece9593ddb00344130597698145bdc5c",
    "c4bdea458a08b975ffd70faed4a2f6fbe1e563bc",
    "d420f9190e2f35e314aa67ee346650f86451792c",
    "a680e8cd9618f4afbbb148ad464824cd6ce558d9",
    "5cbd3d92a78c54b324b6009a25d196adaa8a669b",
    "75c1403f58f79d1abd43d33fdd1beb36db9367c6",
    "cafaf813b0a938a197a488e629883770b2d33393",
    "cbbf6a7e34a363b39107b60dddac2aa713eaa8b5",
}

# Lower version bound used when the caller does not pass min_fx_version.
MIN_FIREFOX_VERSION = 30

# Name of the JSON file, inside the working folder, that remembers which
# download URIs previously answered with an error status.
ERROR_CACHE_FILENAME = "probe_scraper_errors_cache.json"

# NOTE(review): not referenced by the code visible in this module.
ARTIFICIAL_TAG = "artificial"
def extract_major_version(version_str):
    """
    Return the major version of a version string as an int.

    The string has to start with one or more digits directly followed by a
    dot, e.g. "62.0a1" gives 62. Anything else raises an Exception whose
    message contains the offending string.
    """
    leading_number = re.match(r"(\d+)\.", version_str)
    if leading_number is None:
        raise Exception("Invalid version string " + version_str)
    return int(leading_number.group(1))
def relative_path_is_in_version(rel_path, version):
    """
    Tell whether the registry file `rel_path` should be fetched for `version`.

    Two files are only considered from version 71 onwards; every other path
    is accepted for any version.
    """
    # The devtools file exists in a bunch of versions, but we only care for it
    # since firefox 71 (bug 1578661). The servo file is gated by the same
    # threshold.
    only_since_71 = (
        "devtools/shared/css/generated/properties-db.js",
        "servo/components/style/properties/counted_unknown_properties.py",
    )
    if rel_path not in only_since_71:
        return True
    return version >= 71
def _write_response_atomically(response, disk_path):
    """Store the body of `response` at `disk_path` without exposing partial files."""
    # download_files() treats any file that exists at disk_path as a valid
    # cached copy, so the data is written to a sibling file first and only
    # moved into place once it is complete.
    partial_path = disk_path + ".part"
    try:
        with open(partial_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=64 * 1024):
                f.write(chunk)
        os.replace(partial_path, disk_path)
    except BaseException:
        # Best-effort cleanup of the incomplete file; the original error is
        # the one the caller needs to see, so it is re-raised unchanged.
        try:
            os.remove(partial_path)
        except OSError:
            pass
        raise


def download_files(channel, node, temp_dir, error_cache, version, tree=None):
    """
    Fetch the registry files of one revision, reusing copies already on disk.

    Files are stored below ``<temp_dir>/hg/<node>/`` using their
    repository-relative path. A file that already exists there is reported
    without contacting the server.

    :param channel: key into CHANNELS; only used to select the repository
                    when `tree` is None.
    :param node: revision identifier used in the raw-file URI and as the
                 name of the local directory.
    :param temp_dir: directory below which the downloaded files are kept.
    :param error_cache: dict mapping a file URI to the HTTP status of an
                        earlier failed request. URIs found in it are skipped,
                        and new failures are added to it in place.
    :param version: major version of the revision; converted with int().
    :param tree: optional repository path relative to BASE_URI. A missing
                 "releases/" prefix is added for anything other than
                 "mozilla-central".
    :return: dict mapping probe type to the list of local file paths that are
             available for it. Probe types without any file are absent.
    :raises Exception: if Histograms.json is requested and the server does
                       not answer with status 200.
    """
    if tree is None:
        repo_uri = CHANNELS[channel]["base_uri"]
    else:
        # mozilla-release and mozilla-beta need to be prefixed with
        # "releases/"; the tree reported by buildhub sometimes lacks it.
        if not tree.startswith("releases/") and tree != "mozilla-central":
            tree = f"releases/{tree}"
        repo_uri = f"{BASE_URI}/{tree}"

    base_uri = f"{repo_uri}/raw-file/{node}/"
    node_path = os.path.join(temp_dir, "hg", node)

    results = {}
    for ptype, rel_paths in REGISTRY_FILES.items():
        for rel_path in rel_paths:
            disk_path = os.path.join(node_path, rel_path)
            if os.path.exists(disk_path):
                results.setdefault(ptype, []).append(disk_path)
                continue

            file_uri = base_uri + rel_path
            # requests_cache doesn't cache on error status codes.
            # We just use our own cache for these for now.
            if file_uri in error_cache:
                continue

            if not relative_path_is_in_version(rel_path, int(version)):
                continue

            req = requests.get(file_uri, headers=HTTP_HEADERS)
            if req.status_code != requests.codes.ok:
                # Histograms.json is the one file whose absence is fatal; any
                # other missing file is remembered and skipped from then on.
                if os.path.basename(rel_path) == "Histograms.json":
                    raise Exception(
                        "Request returned status "
                        + str(req.status_code)
                        + " for "
                        + file_uri
                    )
                error_cache[file_uri] = req.status_code
                continue

            # exist_ok avoids failing when the directory appears between a
            # separate existence check and the creation.
            os.makedirs(os.path.dirname(disk_path), exist_ok=True)
            _write_response_atomically(req, disk_path)
            results.setdefault(ptype, []).append(disk_path)

    return results
def load_error_cache(folder):
    """
    Read the persisted error cache from `folder`.

    Returns the parsed JSON content of the cache file, or an empty dict when
    no such file exists yet.
    """
    cache_file = os.path.join(folder, ERROR_CACHE_FILENAME)
    if os.path.exists(cache_file):
        with open(cache_file, "r") as handle:
            return json.load(handle)
    return {}
def save_error_cache(folder, error_cache):
    """
    Write `error_cache` as JSON to the cache file inside `folder`.

    Keys are sorted and the output is indented by two spaces, so repeated
    saves of the same content produce the same text.
    """
    cache_file = os.path.join(folder, ERROR_CACHE_FILENAME)
    dump_options = {"sort_keys": True, "indent": 2, "separators": (",", ": ")}
    with open(cache_file, "w") as handle:
        json.dump(error_cache, handle, **dump_options)
def scrape_channel_revisions(
    folder=None, min_fx_version=None, max_fx_version=None, channels=None
):
    """
    Download the registry files of every Buildhub revision per channel.

    :param folder: working directory; it holds the error cache file and is
                   passed to download_files() as the download location.
                   NOTE(review): the default of None is handed straight to
                   os.path.join() by load_error_cache(), so callers
                   presumably always supply a folder - confirm.
    :param min_fx_version: lowest version requested from Buildhub; falls back
                           to MIN_FIREFOX_VERSION when None.
    :param max_fx_version: forwarded to Buildhub as `max_version`.
    :param channels: iterable of channel names; all keys of CHANNELS when
                     None.

    Returns data in the format:
    {
      <channel>: {
        <revision>: {
          "date": <date>,
          "version": <version>,
          "registries": {
            "histogram": [path, ...],
            "event": [path, ...],
            "scalar": [path, ...]
          }
        }
      },
      ...
    }

    where <version> is the major version as an int and "registries" is the
    result of download_files() for that revision.
    """
    if min_fx_version is None:
        min_fx_version = MIN_FIREFOX_VERSION
    # URIs that failed in an earlier run are loaded so they are not requested again.
    error_cache = load_error_cache(folder)
    bh = Buildhub()
    results = defaultdict(dict)
    if channels is None:
        channels = CHANNELS.keys()
    for channel in channels:
        print("\nRetreiving Buildhub results for channel " + channel)
        # Revisions listed in SKIP_REVISIONS are dropped before any download.
        revision_dates = [
            rd
            for rd in bh.get_revision_dates(
                channel, min_fx_version, max_version=max_fx_version
            )
            if rd["revision"] not in SKIP_REVISIONS
        ]
        num_revisions = len(revision_dates)
        print(" " + str(num_revisions) + " revisions found")
        for i, rd in enumerate(revision_dates):
            revision = rd["revision"]
            print(
                (
                    f" Downloading files for revision number {str(i+1)}/{str(num_revisions)}"
                    f" - revision: {revision}, tree: {rd['tree']}, version: {str(rd['version'])}"
                )
            )
            # Buildhub reports a full version string; only the major part is kept.
            version = extract_major_version(rd["version"])
            files = download_files(
                channel, revision, folder, error_cache, version, tree=rd["tree"]
            )
            results[channel][revision] = {
                "date": rd["date"],
                "version": version,
                "registries": files,
            }
            # NOTE(review): assumed to run once per revision, so that failures
            # recorded so far are persisted even if a later download aborts
            # the run - confirm the intended nesting level of this call.
            save_error_cache(folder, error_cache)
    return results