download.py

#!/usr/bin/env python3

# Fetch data for the given date.
#
# This also derives the active release version for the date to use in the query.

import argparse
import json
import sys
import typing
from bisect import bisect_right
from contextlib import ExitStack
from datetime import date, datetime, timedelta, timezone
from pathlib import PurePath

import requests
from google.cloud import bigquery

RELEASE_ROLLOUT_PERIOD = timedelta(days=2)
USER_AGENT = "crash-ping-ingest/1.0 date_version_config"
PROJECT = 'moz-fx-data-shared-prod'
RELEASE_HISTORY_URL = "https://product-details.mozilla.org/1.0/firefox_history_major_releases.json"
# Upper bound (seconds) on the release-history request so a stalled
# connection cannot hang the script forever.
REQUEST_TIMEOUT_SECONDS = 30

session = requests.Session()
session.headers.update({'User-Agent': USER_AGENT})


def get_release_version(for_date: date) -> int:
    """Return the major release version considered active on `for_date`.

    The release history is fetched from product-details and searched for the
    latest release dated on or before `for_date - RELEASE_ROLLOUT_PERIOD`.

    Returns 0 if no release is old enough.

    Raises:
        IndexError: if the release history request does not succeed.
    """
    req = session.get(RELEASE_HISTORY_URL, timeout=REQUEST_TIMEOUT_SECONDS)
    if not req.ok:
        # IndexError is kept for compatibility with existing callers; the
        # message makes the failure diagnosable.
        raise IndexError(
            f"failed to fetch release history (HTTP {req.status_code}) from {RELEASE_HISTORY_URL}"
        )

    # Take advantage of the JSON response having keys always in ascending
    # version (and date) order by loading the pairs directly into a list.
    release_versions = req.json(object_pairs_hook=list)

    # If the release has just switched over, keep the old versions for a little
    # while. Nightly/Beta/Release dates are sometimes a day apart, but we can
    # ignore that as this is just an affordance for the release roll out.
    search_date = for_date - RELEASE_ROLLOUT_PERIOD

    # The dates are compared as strings against `str(search_date)`, which is
    # in ISO format (YYYY-MM-DD) and therefore sorts chronologically.
    release_dates = [release_date for _, release_date in release_versions]
    i = bisect_right(release_dates, str(search_date))
    if i == 0:
        return 0
    version = release_versions[i - 1][0]
    return int(version.split('.')[0])


def download(query: str, for_date: date, release_version: int) -> tuple[bigquery.table.RowIterator, bigquery.table.RowIterator]:
    """Run `query` and return `(config rows, pings rows)`.

    The query is passed `date` (DATE) and `release_version` (INT64)
    parameters, and must contain exactly two SELECT statements: the first
    producing configuration and the second producing pings.

    Raises:
        ValueError: if the query did not produce exactly two SELECT child jobs.
    """
    client = bigquery.Client(PROJECT)
    job_config = bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ScalarQueryParameter("date", bigquery.SqlParameterScalarTypes.DATE, for_date),
            bigquery.ScalarQueryParameter("release_version", bigquery.SqlParameterScalarTypes.INT64, release_version),
        ],
    )
    query_job = client.query(query, job_config=job_config)
    # Wait for job to complete (to ensure all child jobs are present)
    query_job.result()

    child_jobs = client.list_jobs(parent_job=query_job)
    select_jobs = [job for job in child_jobs if job.statement_type == "SELECT"]
    if len(select_jobs) != 2:
        raise ValueError(
            f"expected exactly 2 SELECT statements in the query, found {len(select_jobs)}"
        )
    # NOTE(review): this relies on the job listing being newest-first, so the
    # second statement (pings) comes before the first (config) -- confirm
    # against the BigQuery job listing documentation.
    pings_job, config_job = select_jobs
    return config_job.result(), pings_job.result()


def write_rows(rows: bigquery.table.RowIterator, output: typing.IO[str]) -> None:
    """Write each row to `output` as one compact JSON object per line."""
    for row in rows:
        json.dump(dict(row), output, separators=(',', ':'))
        output.write("\n")


def main() -> None:
    """Parse the command line, run the query, and write both jsonl outputs."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-d', '--date',
        help="the date for which to fetch data. Defaults to yesterday (UTC)",
    )
    parser.add_argument(
        '-q', '--query', metavar="FILE", default="download.sql",
        help=(
            "the sql query file to use to fetch the data. "
            "It will be passed `date: DATE` and `release_version: INT64` parameters. "
            "The query should have two select statements: the first producing "
            "configuration and the second producing pings. "
            "Defaults to `download.sql`"
        ),
    )
    parser.add_argument(
        '-c', '--config', metavar="FILE",
        help="the file to which to write the config file",
    )
    parser.add_argument(
        'filename',
        help=(
            "the jsonl file to write; use '-' for stdout. "
            "If `--config` is omitted, the config table will be written to "
            "`<filename-stem>-config.jsonl`, or just `config.jsonl` if writing to stdout."
        ),
    )
    args = parser.parse_args()

    if args.date:
        for_date = date.fromisoformat(args.date)
    else:
        for_date = datetime.now(timezone.utc).date() - timedelta(days=1)
    release_version = get_release_version(for_date)

    with open(args.query, 'r') as f:
        query = f.read()

    to_stdout = args.filename == '-'

    # Derive the config path from the pings path when not given explicitly.
    config_output_path = args.config
    if config_output_path is None:
        if to_stdout:
            config_output_path = 'config.jsonl'
        else:
            path = PurePath(args.filename)
            config_output_path = str(path.with_stem(path.stem + "-config"))

    # ExitStack closes whichever files were opened here, even on error,
    # while leaving sys.stdout alone.
    with ExitStack() as stack:
        if to_stdout:
            pings_output = sys.stdout
        else:
            pings_output = stack.enter_context(open(args.filename, 'w'))
        config_output = stack.enter_context(open(config_output_path, 'w'))

        config, pings = download(query, for_date, release_version)
        write_rows(config, config_output)
        write_rows(pings, pings_output)


if __name__ == "__main__":
    main()

download.py (67 lines of code) (raw):