mozetl/symbolication/modules_with_missing_symbols.py (166 lines of code) (raw):

# Migrated from Databricks to run on dataproc
# pip install:
# boto3==1.16.20
"""Email a report of modules that are missing symbols in recent crash reports.

The job reads Firefox crash reports from BigQuery, counts the modules flagged
as missing symbols, drops the ones that are expected to have none, checks
whether symbols have since become available, and sends the result as an HTML
email through SES.
"""

import argparse
import html
import os
import subprocess
import sys
from datetime import datetime, timedelta
from operator import add
from urllib.parse import urljoin

import boto3
import requests
from pyspark.sql import functions, SparkSession

MISSING_SYMBOLS_REPO = "https://github.com/marco-c/missing_symbols.git"
MISSING_SYMBOLS_DIR = "missing_symbols"

# Number of days of crash reports taken into account.
LOOKBACK_DAYS = 3
# A (module, version, debug id, debug file) combination is reported only when
# it shows up in more than this many crash reports.
MIN_CRASH_COUNT = 70
# Firefox releases older than this are treated as expired on the symbol server.
SYMBOL_RETENTION_DAYS = 730
# Upper bound, in seconds, for every HTTP request made by this script.
REQUEST_TIMEOUT = 60

CELL_STYLE = "border: 1px solid black;"


# workaround airflow not able to different schedules for tasks in a dag
def parse_args():
    """Parse the command line.

    :returns: namespace with ``run_on_days`` (list of ints, 0 is sunday) and
        ``date`` (the run date as a datetime)
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--run-on-days",
        nargs="+",
        type=int,
        required=True,
        help="Only run job on given days (0 is sunday)",
    )
    parser.add_argument(
        "--date",
        type=datetime.fromisoformat,
        default=datetime.utcnow(),
        help="Run date, defaults to current dat",
    )
    return parser.parse_args()


def read_module_list(path):
    """Return the non-blank lines of a text file as a set of lower-cased names.

    :param path: path of a file listing one module name per line
    :returns: set of lower-cased module names
    """
    with open(path, "r") as f:
        return {line.strip().lower() for line in f if line.strip()}


def is_old_firefox_module(module_info):
    """Returns whether this is considered an old firefox module

    Our symbols server expires debug information after 2 years. We don't want
    to be notified of old firefox modules because it's likely they've expired
    out of our system.

    :param module_info: some module information structure consisting of
        (name, (version, debug_id, debug_file), count)
    :returns: true if this is an old firefox module, false if either this
        isn't a firefox module or we don't have version information
    """
    name, (version, _, _), _ = module_info
    # If this isn't a firefox module or there's no version information (null or
    # empty string), then it's not considered an old firefox module
    if name.lower() not in firefox_modules or not version:
        return False
    return any(version.startswith(major + ".") for major in old_firefox_versions)


def are_symbols_available(debug_file, debug_id):
    """Return whether the symbol server answers successfully for a module.

    :param debug_file: debug file name of the module (e.g. ``xul.pdb``)
    :param debug_id: debug identifier of the module
    :returns: False when either value is missing, otherwise whether the HEAD
        request for the symbol file got a non-error status
    """
    if not debug_file or not debug_id:
        return False

    # NOTE(review): only ".pdb" names are mapped to ".sym"; other debug file
    # names are requested unchanged. Confirm this matches the server layout.
    if debug_file.endswith(".pdb"):
        sym_file = debug_file[:-3] + "sym"
    else:
        sym_file = debug_file

    url = urljoin(
        "https://symbols.mozilla.org/",
        "{}/{}/{}".format(debug_file, debug_id, sym_file),
    )
    r = requests.head(url, timeout=REQUEST_TIMEOUT)
    return r.ok


def build_report_body(rows):
    """Render the HTML body of the report email.

    Values coming from crash reports are HTML-escaped before being inserted.

    :param rows: iterable of (name, version, debug_id, count, available) tuples
    :returns: the HTML document fragment as a string
    """
    parts = [
        """
<table style="border-collapse:collapse;">
  <tr>
  <th style="border: 1px solid black;">Name</th>
  <th style="border: 1px solid black;">Version</th>
  <th style="border: 1px solid black;">Debug ID</th>
  <th style="border: 1px solid black;"># of crash reports</th>
</tr>
"""
    ]

    any_available = False
    for name, version, debug_id, count, are_available_now in rows:
        lowered = name.lower()
        if lowered in firefox_modules:
            # Red when the debug ID is known, orange when it is not.
            color = "red" if debug_id else "orange"
        elif lowered in windows_modules:
            color = "blue"
        else:
            color = None

        label = html.escape(name)
        if color is not None:
            label = '<span style="color:%s;">%s</span>' % (color, label)
        if are_available_now:
            label += " (*)"
            any_available = True

        parts.append(
            "<tr>"
            '<td style="%s">%s</td>' % (CELL_STYLE, label)
            + '<td style="%s">%s</td>' % (CELL_STYLE, html.escape(str(version)))
            + '<td style="%s">%s</td>' % (CELL_STYLE, html.escape(str(debug_id)))
            + '<td style="%s">%d</td>' % (CELL_STYLE, count)
            + "</tr>"
        )

    parts.append("</table>")
    parts.append("<pre>")

    if any_available:
        parts.append(
            """
(*) We now have symbols for the modules marked with an asterisk. We could
reprocess them to improve stack traces (and maybe signatures) of some crash reports.\n
"""
        )

    # The figures below are taken from the constants used by the query and the
    # filter so that the text cannot drift away from what the job really does.
    parts.append(
        f"""
The number of crash reports refers to the past {LOOKBACK_DAYS} days.
Only modules with more than {MIN_CRASH_COUNT} crash reports are shown in this list.

Firefox own modules, for which we should have symbols, and have the debug ID are colored in red.
For Firefox own modules, where we don't have a debug ID are colored in orange.
OS modules, for which we should have symbols, are colored in blue.

If you see modules that shouldn't be in this list as it's expected not
to have their symbols, either contact mcastelluccio@mozilla.com or open
a PR to add them to https://github.com/marco-c/missing_symbols/tree/master/known_modules.
"""
    )
    parts.append("</pre>")
    return "".join(parts)


args = parse_args()

if args.date.isoweekday() % 7 not in args.run_on_days:
    print(
        f"Skipping because run date day of week"
        f" {args.date} is not in {args.run_on_days}"
    )
    sys.exit(0)

# Fetch the lists of known modules. A failed clone aborts the job here instead
# of surfacing later as a missing directory; an existing checkout is reused.
if not os.path.isdir(MISSING_SYMBOLS_DIR):
    subprocess.run(
        ["git", "clone", MISSING_SYMBOLS_REPO, MISSING_SYMBOLS_DIR], check=True
    )

spark = SparkSession.builder.appName("modules-with-missing-symbols").getOrCreate()

# Each entry of known_modules/ is a file named after a module; the last four
# characters of the file name (presumably a ".txt" extension) are dropped.
known_modules = {
    module[:-4].lower()
    for module in os.listdir(os.path.join(MISSING_SYMBOLS_DIR, "known_modules"))
}

dataset = (
    spark.read.format("bigquery")
    .option("table", "moz-fx-data-shared-prod.telemetry_derived.socorro_crash_v2")
    .load()
    .where(
        "crash_date >= to_date('{}')".format(
            (datetime.utcnow() - timedelta(LOOKBACK_DAYS)).strftime("%Y-%m-%d")
        )
    )
)

# Result: list of (filename, [((version, debug_id, debug_file), count), ...])
# ordered by the total count per filename, largest first.
modules = (
    dataset.filter(dataset["product"] == "Firefox")
    .select(
        ["uuid"]
        + [functions.explode((dataset["json_dump"]["modules"]["list"])).alias("module")]
    )
    .dropDuplicates(["uuid", "module"])
    .select(["module"])
    .rdd.map(lambda v: v["module"]["element"])
    .filter(
        lambda m: m["missing_symbols"]
        and m["filename"].lower() not in known_modules
        and "(deleted)" not in m["filename"]
    )
    .map(lambda m: ((m["filename"], (m["version"], m["debug_id"], m["debug_file"])), 1))
    .reduceByKey(add)
    .map(lambda v: (v[0][0], [(v[0][1], v[1])]))
    .reduceByKey(add)
    .sortBy(lambda v: sum(count for _, count in v[1]), ascending=False)
    .collect()
)

print(f"len(modules): {len(modules)}")

top_missing = sorted(
    (
        (name, version, count)
        for name, versions in modules
        for version, count in versions
        if count > MIN_CRASH_COUNT
    ),
    key=lambda m: m[2],
    reverse=True,
)

print(f"len(top_missing): {len(top_missing)}")

firefox_modules = read_module_list(
    os.path.join(MISSING_SYMBOLS_DIR, "firefox_modules.txt")
)
windows_modules = read_module_list(
    os.path.join(MISSING_SYMBOLS_DIR, "windows_modules.txt")
)

r = requests.get(
    "https://product-details.mozilla.org/1.0/firefox_history_major_releases.json",
    timeout=REQUEST_TIMEOUT,
)
# Fail on an HTTP error rather than on an unparsable error page.
r.raise_for_status()
firefox_versions = r.json()

# Major version numbers (as strings) of releases outside the retention window.
# partition() keeps a key without a "." intact instead of raising.
now = datetime.utcnow()
old_firefox_versions = sorted(
    {
        version.partition(".")[0]
        for version, date in firefox_versions.items()
        if abs((now - datetime.strptime(date, "%Y-%m-%d")).days)
        > SYMBOL_RETENTION_DAYS
    }
)

top_missing = [m for m in top_missing if not is_old_firefox_module(m)]

top_missing_with_avail_info = [
    (name, version, debug_id, count, are_symbols_available(debug_file, debug_id))
    for name, (version, debug_id, debug_file), count in top_missing
]

today_date = datetime.today().strftime("%Y-%m-%d")
subject = (
    f"Weekly report of modules with missing symbols in crash reports: {today_date}"
)

body = build_report_body(top_missing_with_avail_info)

client = boto3.client("ses", region_name="us-west-2")
client.send_email(
    Source="mcastelluccio@data.mozaws.net",
    Destination={
        "ToAddresses": [
            "mcastelluccio@mozilla.com",
            "release-mgmt@mozilla.com",
            "stability@mozilla.org",
        ],
        "CcAddresses": [],
    },
    Message={
        "Subject": {"Data": subject, "Charset": "UTF-8"},
        "Body": {"Html": {"Data": body, "Charset": "UTF-8"}},
    },
)