probe_scraper/scrapers/moz_central_scraper.py (167 lines of code) (raw):

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

import json
import os
import re
from collections import defaultdict

import requests

from ..parsers.utils import HTTP_HEADERS
from .buildhub import Buildhub

BASE_URI = "https://hg.mozilla.org"

# Probe registry files to fetch from each revision, keyed by probe type.
REGISTRY_FILES = {
    "histogram": [
        "toolkit/components/telemetry/Histograms.json",
        "dom/base/UseCounters.conf",
        "dom/base/nsDeprecatedOperationList.h",
        "servo/components/style/properties/counted_unknown_properties.py",
        "devtools/shared/css/generated/properties-db.js",
    ],
    "scalar": [
        "toolkit/components/telemetry/Scalars.yaml",
    ],
    "event": [
        "toolkit/components/telemetry/Events.yaml",
    ],
}

# Per-channel hg.mozilla.org locations and version-tag patterns.
CHANNELS = {
    "nightly": {
        "base_uri": f"{BASE_URI}/mozilla-central",
        "tag_regex": "^FIREFOX_(AURORA|BETA)_[0-9]+_BASE$",
        "artificial_tags": [
            {
                "date": [1567362726.0, 0],
                "node": "fd2934cca1ae7b492f29a4d240915aa9ec5b4977",
                "tag": "FIREFOX_BETA_71_BASE",
            }
        ],
    },
    "beta": {
        "base_uri": f"{BASE_URI}/releases/mozilla-beta",
        "tag_regex": "^FIREFOX_BETA_[0-9]+_BASE$",
    },
    "release": {
        "base_uri": f"{BASE_URI}/releases/mozilla-release",
        "tag_regex": "^FIREFOX_[0-9]+_0_RELEASE$",
    },
}

# Revisions excluded from scraping (presumably known-bad or duplicate
# builds reported by Buildhub — confirm against project history).
SKIP_REVISIONS = {
    "942c201b1ac7a46a449f1fb80da7b050ec0ea120",
    "1807a36ff99f01abca1c37442fb5b344465bfbdf",
    "30bdee9799a07b8770719aa868416174ff0c54f5",
    "9fb70b4ae59336b805a1651e7c57c6385cca0717",
    "81578db6bf8939678d490b69f0daf4b675027e3a",
    "b8567457ece9593ddb00344130597698145bdc5c",
    "c4bdea458a08b975ffd70faed4a2f6fbe1e563bc",
    "d420f9190e2f35e314aa67ee346650f86451792c",
    "a680e8cd9618f4afbbb148ad464824cd6ce558d9",
    "5cbd3d92a78c54b324b6009a25d196adaa8a669b",
    "75c1403f58f79d1abd43d33fdd1beb36db9367c6",
    "cafaf813b0a938a197a488e629883770b2d33393",
    "cbbf6a7e34a363b39107b60dddac2aa713eaa8b5",
}

MIN_FIREFOX_VERSION = 30
ERROR_CACHE_FILENAME = "probe_scraper_errors_cache.json"
ARTIFICIAL_TAG = "artificial"


def extract_major_version(version_str):
    """
    Given a version string, e.g. "62.0a1", extract the major version as an int.

    Raises Exception if the string does not start with "<digits>.".
    """
    search = re.search(r"^(\d+)\.", version_str)
    if search is not None:
        return int(search.group(1))
    else:
        raise Exception("Invalid version string " + version_str)


def relative_path_is_in_version(rel_path, version):
    """Return True if `rel_path` should be fetched for major `version`."""
    # The devtools file exists in a bunch of versions, but we only care for it
    # since firefox 71 (bug 1578661).
    if (
        rel_path == "devtools/shared/css/generated/properties-db.js"
        or rel_path == "servo/components/style/properties/counted_unknown_properties.py"
    ):
        return version >= 71
    return True


def download_files(channel, node, temp_dir, error_cache, version, tree=None):
    """
    Download the registry files for revision `node` into `temp_dir`.

    Files already present on disk are reused without a network request; URLs
    that previously failed (recorded in `error_cache`, keyed by URL) are
    skipped. A failure to fetch Histograms.json is fatal; other failures are
    cached and skipped.

    Returns a dict mapping probe type ("histogram"/"scalar"/"event") to the
    list of local file paths that were found or downloaded.
    """
    if tree is None:
        uri = CHANNELS[channel]["base_uri"]
    else:
        # mozilla-release and mozilla-beta need to be prefixed with "releases/"
        # sometimes they aren't from buildhub, add them if they are missing
        if not tree.startswith("releases/") and tree != "mozilla-central":
            tree = f"releases/{tree}"
        uri = f"{BASE_URI}/{tree}"

    base_uri = f"{uri}/raw-file/{node}/"
    node_path = os.path.join(temp_dir, "hg", node)

    results = {}

    def add_result(ptype, disk_path):
        # setdefault keeps probe types absent until a file is actually found.
        results.setdefault(ptype, []).append(disk_path)

    # Flatten REGISTRY_FILES into (probe_type, relative_path) pairs.
    all_files = [
        (ptype, path) for ptype, paths in REGISTRY_FILES.items() for path in paths
    ]
    for ptype, rel_path in all_files:
        disk_path = os.path.join(node_path, rel_path)
        if os.path.exists(disk_path):
            add_result(ptype, disk_path)
            continue

        uri = base_uri + rel_path
        # requests_cache doesn't cache on error status codes.
        # We just use our own cache for these for now.
        if uri in error_cache:
            continue
        if not relative_path_is_in_version(rel_path, int(version)):
            continue

        req = requests.get(uri, headers=HTTP_HEADERS)
        if req.status_code != requests.codes.ok:
            if os.path.basename(rel_path) == "Histograms.json":
                raise Exception(
                    "Request returned status " + str(req.status_code) + " for " + uri
                )
            else:
                error_cache[uri] = req.status_code
                continue

        # Renamed from `dir` (shadowed the builtin); exist_ok avoids the
        # exists()/makedirs race.
        target_dir = os.path.split(disk_path)[0]
        os.makedirs(target_dir, exist_ok=True)
        with open(disk_path, "wb") as f:
            for chunk in req.iter_content(chunk_size=128):
                f.write(chunk)

        add_result(ptype, disk_path)

    return results


def load_error_cache(folder):
    """Load the URL-error cache from `folder`, or {} if none exists."""
    path = os.path.join(folder, ERROR_CACHE_FILENAME)
    if not os.path.exists(path):
        return {}
    with open(path, "r") as f:
        return json.load(f)


def save_error_cache(folder, error_cache):
    """Persist the URL-error cache to `folder` as pretty-printed JSON."""
    path = os.path.join(folder, ERROR_CACHE_FILENAME)
    with open(path, "w") as f:
        json.dump(error_cache, f, sort_keys=True, indent=2, separators=(",", ": "))


def scrape_channel_revisions(
    folder=None, min_fx_version=None, max_fx_version=None, channels=None
):
    """
    Scrape probe registry files for every known revision of each channel.

    Revisions come from Buildhub; files are downloaded under `folder`, and
    the URL-error cache is persisted there after each revision.

    Returns data in the format:
    {
      <channel>: {
        <revision>: {
          "date": <date>,
          "version": <version>,
          "registries": {
            "histogram": [path, ...],
            "event": [path, ...],
            "scalar": [path, ...]
          }
        }
      },
      ...
    }
    """
    if min_fx_version is None:
        min_fx_version = MIN_FIREFOX_VERSION

    error_cache = load_error_cache(folder)
    bh = Buildhub()
    results = defaultdict(dict)

    if channels is None:
        channels = CHANNELS.keys()

    for channel in channels:
        # Typo fixed: was "Retreiving".
        print("\nRetrieving Buildhub results for channel " + channel)

        revision_dates = [
            rd
            for rd in bh.get_revision_dates(
                channel, min_fx_version, max_version=max_fx_version
            )
            if rd["revision"] not in SKIP_REVISIONS
        ]
        num_revisions = len(revision_dates)
        print("  " + str(num_revisions) + " revisions found")

        for i, rd in enumerate(revision_dates):
            revision = rd["revision"]
            print(
                (
                    f"  Downloading files for revision number {i + 1}/{num_revisions}"
                    f" - revision: {revision}, tree: {rd['tree']}, version: {rd['version']}"
                )
            )
            version = extract_major_version(rd["version"])
            files = download_files(
                channel, revision, folder, error_cache, version, tree=rd["tree"]
            )

            results[channel][revision] = {
                "date": rd["date"],
                "version": version,
                "registries": files,
            }
            save_error_cache(folder, error_cache)

    return results