#!/usr/bin/env python3

import json
import hashlib
import os
import pprint
import re
import sys
import html
import getopt
import threading
import itertools
import time
import requests
import math
import string
import pygal

from string import Template
from collections import Counter
from urllib.request import urlopen
from urllib import request
from datetime import datetime, timedelta, date

# python -m pip install SomePackage
# python.exe -m pip install --upgrade SomePackage
# python.exe -m pip install --upgrade fx_crash_sig
import fx_crash_sig
from fx_crash_sig.crash_processor import CrashProcessor

# process types
# https://searchfox.org/mozilla-central/source/toolkit/components/crashes/CrashManager.jsm#162

###########################################################
# Usage
###########################################################
# -u (url)   : redash rest endpoint url
# -k (str)   : redash user api key
# -q (id)    : redash api query id
# -c (value) : redash cache max_age value in seconds (0 forces a fresh query)
# -d (name)  : local json cache filename to use (excluding extension)
# -n (name)  : local html output filename to use (excluding extension)
# -p (k=v)   : k=v redash query parameters to pass to the query request
# -z         : debugging: load and dump the first few records of the local databases. requires -d.
# -s (sig)   : search for a token in reports
# -a (actor) : IPC actor name to match. Omit to leave the parameter out of the
#              query entirely; pass "none" to generate "IS NULL".
# -m         : maintenance mode
# -l (limit) : set the value for ReportLowerClientLimit, filtering out
#              single-client crashes (default 2)
#
# python crashes.py -n nightly -d nightly -u https://sql.telemetry.mozilla.org -k (userapikey) -q 79354 -p process_type=gpu -p version=89 -p channel=nightly

## TODO
## stats statistics when loaded or written
## report struct may not need os, osver, and arch info anymore since we added stats
## signatures that went away feature
## annotation signature keywords
## click handler should ignore clicks if there's a selection in the page
## popup panel layout (Fixed By and Notes) is confusing, and wide when it doesn't need to be
## remove reliance on version numbers? Need to get signature headers hooked up,
##   and choose the latest releases for main reports
## build id (nightly / beta)
## linux distro information someplace
## clean up the startup crash icons
## better annotations support
## add dates to annotations
## improve signature header information layout, particularly fx version numbers.
##   We can easily expand this down and host info similar to crash stats summary pages.
## - filter graphing and the list based on clicks on the header data (version, os, arch)

###########################################################
# Globals
###########################################################

# The default symbolication server to use.
SymbolServerUrl = "https://symbolication.services.mozilla.com/symbolicate/v5"
# Max stack depth for symbolication
MaxStackDepth = 50
# Signature list length of the resulting top crashes report
MostCommonLength = 50
# When generating a report, signatures with crash counts
# lower than this value will not be included in the report.
MinCrashCount = 1
# Maximum number of crash reports to include for each signature
# in the final report. Limits the size of the resulting html.
MaxReportCount = 100
# Default redash max_age value in seconds (12 hours)
MaxAge = 43200
# Set to True to target a local json file for testing
LoadLocally = False
LocalJsonFile = "GPU_Raw_Crash_Data_2021_03_19.json"

proc = CrashProcessor(MaxStackDepth, SymbolServerUrl)
pp = pprint.PrettyPrinter(indent=1, width=260)

def symbolicate(ping):
  try:
    return proc.symbolicate(ping)
  except:
    return None

def generateSignature(payload):
  if payload is None:
    return ""
  try:
    return proc.get_signature_from_symbolicated(payload).signature
  except:
    return ""

###########################################################
# Progress indicator
###########################################################

def progress(count, total, status=''):
  bar_len = 60
  filled_len = int(round(bar_len * count / float(total)))
  percents = round(100.0 * count / float(total), 1)
  bar = '=' * filled_len + '-' * (bar_len - filled_len)
  sys.stdout.write('[%s] %s%s ...%s\r' % (bar, percents, '%', status))
  sys.stdout.flush()

class Spinner:
  def __init__(self, message, delay=0.1):
    self.spinner = itertools.cycle(['-', '/', '|', '\\'])
    self.delay = delay
    self.busy = False
    self.spinner_visible = False
    sys.stdout.write(message)

  def write_next(self):
    with self._screen_lock:
      if not self.spinner_visible:
        sys.stdout.write(next(self.spinner))
        self.spinner_visible = True
        sys.stdout.flush()

  def remove_spinner(self, cleanup=False):
    with self._screen_lock:
      if self.spinner_visible:
        sys.stdout.write('\b')
        self.spinner_visible = False
        if cleanup:
          sys.stdout.write(' ')   # overwrite spinner with blank
          sys.stdout.write('\r')  # move back to the start of the line
        sys.stdout.flush()

  def spinner_task(self):
    while self.busy:
      self.write_next()
      time.sleep(self.delay)
      self.remove_spinner()

  def __enter__(self):
    if sys.stdout.isatty():
      self._screen_lock = threading.Lock()
      self.busy = True
      self.thread = threading.Thread(target=self.spinner_task)
      self.thread.start()

  def __exit__(self, exception, value, tb):
    if sys.stdout.isatty():
      self.busy = False
      self.remove_spinner(cleanup=True)
    else:
      sys.stdout.write('\r')

def poll_job(s, redash_url, job):
  # redash job status: 3 = finished, 4 = failed (1/2 = queued/started)
  while job['status'] not in (3, 4):
    response = s.get('{}/api/jobs/{}'.format(redash_url, job['id']))
    job = response.json()['job']
    time.sleep(1)
  if job['status'] == 3:
    return job['query_result_id']
  return None

###########################################################
# Redash queries
###########################################################

def getRedashQueryResult(redash_url, query_id, api_key, cacheValue, params):
  s = requests.Session()
  s.headers.update({'Authorization': 'Key {}'.format(api_key)})

  # max_age is a redash value that controls cached results. If there is a cached query result
  # newer than this time (in seconds) it will be returned instead of a fresh query.
  # 86400 = 24 hours, 43200 = 12 hours, 0 = refresh query
  #
  # Note sometimes the redash caching feature gets 'stuck' on an old cache. The side effect is
  # that all reports will eventually be older than 7 days and as such will be filtered out
  # by this script's age checks in processRedashDataset. Crash lists will shrink to zero
  # as a result.
  payload = dict(max_age=cacheValue, parameters=params)

  url = "%s/api/queries/%s/results" % (redash_url, query_id)
  response = s.post(url, data=json.dumps(payload))

  if response.status_code != 200:
    print("\nquery error '%s'" % response)
    pp.pprint(payload)
    raise Exception('Redash query failed.')

  # A pending query hands back a job record we have to poll:
  # { 'job': { 'error': '',
  #            'id': '21429857-5fd0-443d-ba4b-fb9cc6d49add',
  #            'query_result_id': None,
  #            'result': None,
  #            'status': 1,
  #            'updated_at': 0}}
  # ...or, we just get back the result.
  try:
    result = response.json()['job']  # raises KeyError when the result came back directly
  except KeyError:
    return response.json()

  result_id = poll_job(s, redash_url, response.json()['job'])
  response = s.get('{}/api/queries/{}/results/{}.json'.format(redash_url, query_id, result_id))
  if response.status_code != 200:
    raise Exception('Failed getting results. (Check your redash query for errors.) statuscode=%d' % response.status_code)
  return response.json()
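
# A minimal usage sketch (hypothetical query id, api key, and parameters):
# fetch results for a query while bypassing redash's cache entirely, then
# pull out the rows the way processRedashDataset does.
#
#   data = getRedashQueryResult("https://sql.telemetry.mozilla.org", "79354",
#                               api_key, 0, {"channel": "nightly"})
#   rows = data["query_result"]["data"]["rows"]
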
###########################################################
# HTML and Text Formatting Utilities
###########################################################

def escapeBugLinks(text):
  # convert bug references to links
  # https://bugzilla.mozilla.org/show_bug.cgi?id=1323439
  pattern = "bug ([0-9]*)"
  replacement = "<a href='https://bugzilla.mozilla.org/show_bug.cgi?id=\\1'>Bug \\1</a>"
  result = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
  return result

def createBugLink(id):
  # convert a bug id to a link
  return "<a href='https://bugzilla.mozilla.org/show_bug.cgi?id=" + str(id) + "'>bug " + str(id) + "</a>"

safe = string.ascii_letters + string.digits + '_-.'

def stripWhitespace(text):
  text = text.strip(' \t\n')
  return text

def stringToHtmlId(s):
  s = ''.join([letter for letter in s if letter in safe])
  return s

def generateSourceLink(frame):
  # examples:
  # https://hg.mozilla.org/mozilla-central/file/2da6d806f45732e169fd8e7ea9a9761fa7fed93d/netwerk/protocol/http/OpaqueResponseUtils.cpp#l208
  # https://crash-stats.mozilla.org/sources/highlight/?url=https://gecko-generated-sources.s3.amazonaws.com/7d3f7c890af...e97be06f948921153/ipc/ipdl/PCompositorManagerParent.cpp&line=200#L-200
  # 'file': 's3:gecko-generated-sources:8276fd848664bea270...8e363bdbc972cdb7eb661c4043de93ce27810b54/ipc/ipdl/PWebGLParent.cpp:',
  # 'file': 'hg:hg.mozilla.org/mozilla-central:dom/canvas/WebGLParent.cpp:52d2c9e672d0a0c50af4d6c93cc0239b9e751d18',
  # 'line': 59,
  srcLineNumber = str()
  srcfileData = str()
  srcUrl = str()

  try:
    srcLineNumber = frame['line']
    srcfileData = frame['file']
    tokenList = srcfileData.split(':')
    if (len(tokenList) != 4):
      print("bad token list " + str(tokenList))
      return str()
  except:
    return str()

  if tokenList[0].find('s3') == 0:
    srcUrl = 'https://crash-stats.mozilla.org/sources/highlight/?url=https://gecko-generated-sources.s3.amazonaws.com/'
    srcUrl += tokenList[2]
    srcUrl += '&line='
    srcUrl += str(srcLineNumber)
    srcUrl += '#L-'
    srcUrl += str(srcLineNumber)
  elif tokenList[0].find('hg') == 0:
    srcUrl = 'https://'
    srcUrl += tokenList[1]
    srcUrl += '/file/'
    srcUrl += tokenList[3]
    srcUrl += '/'
    srcUrl += tokenList[2]
    srcUrl += '#l' + str(srcLineNumber)
  else:
    # unknown source annotation prefix; this happens a lot
    return str()

  return srcUrl

def escape(text):
  return html.escape(text)
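
# Quick sketches of the helpers above (hypothetical inputs, truncated hash):
#
#   escapeBugLinks('fixed by bug 1323439')
#     -> "fixed by <a href='https://bugzilla.mozilla.org/show_bug.cgi?id=1323439'>Bug 1323439</a>"
#
#   stringToHtmlId('st1-2 <foo>')   # anything outside letters, digits, '_-.' drops out
#     -> 'st1-2foo'
#
#   generateSourceLink({'file': 'hg:hg.mozilla.org/mozilla-central:dom/canvas/WebGLParent.cpp:52d2c9e', 'line': 59})
#     -> 'https://hg.mozilla.org/mozilla-central/file/52d2c9e/dom/canvas/WebGLParent.cpp#l59'
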
###########################################################
# Crash Report Utilities
###########################################################

def processStack(frames):
  # Normalized function names we can consider the same in calculating
  # unique reports. We replace the regex match with the key using sub.
  coelesceFrameDict = {
    'RtlUserThreadStart': '[_]+RtlUserThreadStart'
  }

  # Functions we can replace with the normalized version, filters
  # out odd platform parameter differences.
  coelesceFunctionList = [
    'thread_start<'
  ]

  dataStack = list()  # [idx] = { 'frame': '(frame)', 'srcUrl': '(url)' }

  for frame in frames:
    frameIndex = '?'
    try:
      frameIndex = frame['frame']  # zero based frame index
    except KeyError:
      continue
    except TypeError:
      # print("TypeError while indexing frame.")
      continue

    dataStack.insert(frameIndex, { 'index': frameIndex, 'frame': '', 'srcUrl': '', 'module': '' })

    functionCall = ''
    module = 'unknown'
    offset = 'unknown'
    try:
      offset = frame['module_offset']
    except:
      pass
    try:
      module = frame['module']
    except:
      pass

    try:
      functionCall = frame['function']
    except KeyError:
      dataStack[frameIndex]['frame'] = offset
      dataStack[frameIndex]['module'] = module
      continue
    except TypeError:
      print("TypeError while indexing function.")
      dataStack[frameIndex]['frame'] = "(missing function)"
      continue

    for k, v in coelesceFrameDict.items():
      functionCall = re.sub(v, k, functionCall, 1)
      break

    for v in coelesceFunctionList:
      if re.search(v, functionCall) != None:
        normalizedFunction = functionCall
        try:
          normalizedFunction = frame['normalized']
        except KeyError:
          pass
        except TypeError:
          pass
        functionCall = normalizedFunction
        break

    srcUrl = generateSourceLink(frame)

    dataStack[frameIndex]['srcUrl'] = srcUrl
    dataStack[frameIndex]['frame'] = functionCall
    dataStack[frameIndex]['module'] = module

  return dataStack

def generateSignatureHash(signature, os, osVer, arch, fxVer):
  hashData = signature

  # Append any crash meta data to our hashData so it applies to uniqueness.
  # Any variance in this data will cause this signature to be broken out as
  # a separate signature in the final top crash list.
  #hashData += os
  #hashData += osVer
  #hashData += arch

  # The redash queries we are currently using target specific versions, so this
  # doesn't have much of an impact except on beta, where we want to see the effect
  # of beta fixes that get uplifted.
  #hashData += fxVer

  return hashlib.md5(hashData.encode('utf-8')).hexdigest()
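
# Sketch (hypothetical signature): with the meta-data lines above commented
# out, only the signature feeds the hash, so the same crash on different
# platforms lands in a single bucket.
#
#   generateSignatureHash("mozilla::gfx::Example", "Windows", "10.0", "x86-64", "89.0")
#   generateSignatureHash("mozilla::gfx::Example", "Linux", "5.4", "x86-64", "89.0")
#   -> identical md5 hex digests
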
###########################################################
# Reports data structure utilities
###########################################################

def getDatasetStats(reports):
  sigCount = len(reports)
  reportCount = 0
  for hash in reports:
    reportCount += len(reports[hash]['reportList'])
  return sigCount, reportCount
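
# Shape of the reports database the functions below build and consume, a
# sketch based on the fields populated in processRedashDataset (example
# values are hypothetical):
#
#   reports[hash] = {
#     'signature': 'OOM | small',
#     'operatingsystem': ['Windows'], 'osversion': ['10.0'],
#     'firefoxver': ['89.0'], 'arch': ['x86-64'],
#     'clientcount': 12,
#     'reportList': [ { 'clientid': ..., 'crashid': ..., 'stack': [...], ... } ]
#   }
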
def processRedashDataset(dbFilename, jsonUrl, queryId, userKey, cacheValue, parameters, crashProcessMax):
  props = list()
  reports = dict()
  totals = { 'processed': 0, 'skippedBadSig': 0, 'alreadyProcessed': 0, 'outdated': 0 }

  # load up our database of processed crash ids;
  # returns empty dicts if no data is loaded.
  reports, stats = loadReports(dbFilename)

  if LoadLocally:
    with open(LocalJsonFile) as f:
      dataset = json.load(f)
  else:
    with Spinner("loading from redash..."):
      dataset = getRedashQueryResult(jsonUrl, queryId, userKey, cacheValue, parameters)
    print(" done.")

  crashesToProcess = len(dataset["query_result"]["data"]["rows"])
  if crashesToProcess > crashProcessMax:
    crashesToProcess = crashProcessMax

  print('%04d total reports loaded.' % crashesToProcess)

  for recrow in dataset["query_result"]["data"]["rows"]:
    if totals['processed'] >= crashProcessMax:
      break

    # pull some redash props out of the recrow. You can add these
    # by modifying the sql query.
    operatingSystem = recrow['normalized_os']
    operatingSystemVer = recrow['normalized_os_version']
    firefoxVer = recrow['display_version']
    buildId = recrow['build_id']
    compositor = recrow['compositor']
    arch = recrow['arch']
    oomSize = recrow['oom_size']
    devVendor = recrow['vendor']
    devGen = recrow['gen']
    devChipset = recrow['chipset']
    devDevice = recrow['device']
    drvVer = recrow['driver_version']
    drvDate = recrow['driver_date']
    clientId = recrow['client_id']
    devDesc = recrow['device_description']

    # Load the json crash payload from recrow
    props = json.loads(recrow["payload"])

    # touch up for the crash symbolication package
    props['stackTraces'] = props['stack_traces']

    crashId = props['crash_id']
    crashDate = props['crash_date']
    minidumpHash = props['minidump_sha256_hash']
    crashReason = props['metadata']['moz_crash_reason']
    crashInfo = props['stack_traces']['crash_info']

    startupCrash = False
    if recrow['startup_crash']:
      startupCrash = int(recrow['startup_crash'])

    fissionEnabled = False
    if recrow['fission_enabled']:
      fissionEnabled = int(recrow['fission_enabled'])

    lockdownEnabled = False
    if recrow['lockdown_enabled']:
      lockdownVal = int(recrow['lockdown_enabled'])
      if lockdownVal == 1:
        lockdownEnabled = True

    if crashReason != None:
      crashReason = crashReason.strip('\n')

    # Ignore crashes older than 7 days
    if not checkCrashAge(crashDate):
      totals['processed'] += 1
      totals['outdated'] += 1
      progress(totals['processed'], crashesToProcess)
      continue

    # check if the crash id is already processed, if so continue
    ## note, this search has become quite slow. optimize me.
    found = False
    signature = ""
    for sighash in reports:  # reports is a dictionary of signature hashes
      for report in reports[sighash]['reportList']:  # reportList is a list of dictionaries
        if report['crashid'] == crashId:  # string compare, slow
          found = True
          # if you add a new value to the sql queries, you can update
          # the local json cache we have in memory here. Saves having
          # to delete the file and symbolicate everything again.
          #report['fission'] = fissionEnabled
          #report['lockdown'] = lockdownEnabled
          break
    if found:
      totals['processed'] += 1
      totals['alreadyProcessed'] += 1
      progress(totals['processed'], crashesToProcess)
      continue

    # symbolicate and return payload result
    payload = symbolicate({ "normalized_os": operatingSystem, "payload": props })
    signature = generateSignature(payload)

    if skipProcessSignature(signature):
      totals['processed'] += 1
      totals['skippedBadSig'] += 1
      progress(totals['processed'], crashesToProcess)
      continue

    # pull stack information for the crashing thread
    try:
      crashingThreadIndex = payload['crashing_thread']
    except KeyError:
      # print("KeyError on crashing_thread for report")
      continue
    threads = payload['threads']
    try:
      frames = threads[crashingThreadIndex]['frames']
    except IndexError:
      print("IndexError while indexing crashing thread")
      continue
    except TypeError:
      print("TypeError while indexing crashing thread")
      continue

    # build up a pretty stack
    stack = processStack(frames)

    # generate a tracking hash
    hash = generateSignatureHash(signature, operatingSystem, operatingSystemVer, arch, firefoxVer)

    if hash not in reports.keys():
      # Set up this signature's meta data we track in the signature header.
      reports[hash] = {
        'signature': signature,
        'operatingsystem': [operatingSystem],
        'osversion': [operatingSystemVer],
        'firefoxver': [firefoxVer],
        'arch': [arch],
        'reportList': list()
      }

    # Update meta data we track in the report header.
    if operatingSystem not in reports[hash]['operatingsystem']:
      reports[hash]['operatingsystem'].append(operatingSystem)
    if operatingSystemVer not in reports[hash]['osversion']:
      reports[hash]['osversion'].append(operatingSystemVer)
    if firefoxVer not in reports[hash]['firefoxver']:
      reports[hash]['firefoxver'].append(firefoxVer)
    if arch not in reports[hash]['arch']:
      reports[hash]['arch'].append(arch)

    # create our report with per crash meta data
    report = {
      'clientid': clientId,
      'crashid': crashId,
      'crashdate': crashDate,
      'compositor': compositor,
      'stack': stack,
      'oomsize': oomSize,
      'type': crashInfo['type'],
      'devvendor': devVendor,
      'devgen': devGen,
      'devchipset': devChipset,
      'devdevice': devDevice,
      'devdescription': devDesc,
      'driverversion': drvVer,
      'driverdate': drvDate,
      'minidumphash': minidumpHash,
      'crashreason': crashReason,
      'startup': startupCrash,
      'fission': fissionEnabled,
      'lockdown': lockdownEnabled,
      # Duplicated but useful if we decide to change the hashing algo
      # and need to reprocess reports.
      'operatingsystem': operatingSystem,
      'osversion': operatingSystemVer,
      'firefoxver': firefoxVer,
      'arch': arch
    }

    # save this crash in our report list
    reports[hash]['reportList'].append(report)

    if hash not in stats.keys():
      stats[hash] = { 'signature': signature, 'crashdata': {} }

    # check to see if stats has a date entry that matches crashDate
    if crashDate not in stats[hash]['crashdata']:
      stats[hash]['crashdata'][crashDate] = { 'crashids': [], 'clientids': [] }
    if operatingSystem not in stats[hash]['crashdata'][crashDate]:
      stats[hash]['crashdata'][crashDate][operatingSystem] = {}
    if operatingSystemVer not in stats[hash]['crashdata'][crashDate][operatingSystem]:
      stats[hash]['crashdata'][crashDate][operatingSystem][operatingSystemVer] = {}
    if arch not in stats[hash]['crashdata'][crashDate][operatingSystem][operatingSystemVer]:
      stats[hash]['crashdata'][crashDate][operatingSystem][operatingSystemVer][arch] = {}
    if firefoxVer not in stats[hash]['crashdata'][crashDate][operatingSystem][operatingSystemVer][arch]:
      stats[hash]['crashdata'][crashDate][operatingSystem][operatingSystemVer][arch][firefoxVer] = { 'clientcount': 0, 'crashcount': 0 }

    # count each crash id and client id only once per date
    if crashId not in stats[hash]['crashdata'][crashDate]['crashids']:
      stats[hash]['crashdata'][crashDate]['crashids'].append(crashId)
      stats[hash]['crashdata'][crashDate][operatingSystem][operatingSystemVer][arch][firefoxVer]['crashcount'] += 1
    if clientId not in stats[hash]['crashdata'][crashDate]['clientids']:
      stats[hash]['crashdata'][crashDate][operatingSystem][operatingSystemVer][arch][firefoxVer]['clientcount'] += 1
      stats[hash]['crashdata'][crashDate]['clientids'].append(clientId)

    totals['processed'] += 1
    progress(totals['processed'], crashesToProcess)

  print('\n')
  print('%04d - reports processed' % totals['processed'])
  print('%04d - cached results' % totals['alreadyProcessed'])
  print('%04d - reports skipped, bad signature' % totals['skippedBadSig'])
  print('%04d - reports skipped, outdated' % totals['outdated'])

  # Post processing steps

  # Purge signatures from our reports list that are outdated (based
  # on crash date and version). This keeps our crash lists current,
  # especially after a merge. Note this doesn't clear stats, just reports.
  queryFxVersion = parameters['version']
  purgeOldReports(reports, queryFxVersion)

  # purge old crash and client ids from the stats database.
  cleanupStats(reports, stats)

  # Calculate unique client id counts for each signature. These are client counts
  # associated with the current redash query, and apply only to a seven day time
  # window. They are stored in the reports database and displayed in the top crash
  # reports.
  clientCounts = dict()
  for hash in reports:
    clientCounts[hash] = list()
    for report in reports[hash]['reportList']:
      clientId = report['clientid']
      if clientId not in clientCounts[hash]:
        clientCounts[hash].append(clientId)
    reports[hash]['clientcount'] = len(clientCounts[hash])

  return reports, stats, totals['processed']
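
# The per-signature stats record built above nests date -> os -> os version
# -> arch -> firefox version (a sketch with hypothetical values):
#
#   stats[hash] = {
#     'signature': 'OOM | small',
#     'crashdata': {
#       '2021-03-22': {
#         'crashids': ['...'], 'clientids': ['...'],
#         'Windows': { '10.0': { 'x86-64': { '89.0': { 'clientcount': 1, 'crashcount': 3 } } } }
#       }
#     }
#   }
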
def checkCrashAge(dateStr):
  try:
    date = datetime.fromisoformat(dateStr)
  except:
    return False
  oldestDate = datetime.today() - timedelta(days=7)
  return (date >= oldestDate)

def getMainVer(version):
  # e.g. '90.0.1' -> '90'
  return version.split('.')[0]

def purgeOldReports(reports, fxVersion):
  # Purge obsolete reports: keep only reports whose major version matches
  # fxVersion, e.g. '89.0b7' and '89.0' match '89', '90.0.1' does not.
  totalReportsDropped = 0
  for hash in reports:
    keepRepList = list()
    origRepLen = len(reports[hash]['reportList'])
    for report in reports[hash]['reportList']:
      reportVer = ''
      try:
        reportVer = getMainVer(report['firefoxver'])
      except:
        pass
      if fxVersion == reportVer:
        keepRepList.append(report)
    totalReportsDropped += (origRepLen - len(keepRepList))
    reports[hash]['reportList'] = keepRepList

  print("Removed %d older reports." % totalReportsDropped)

  # Drop reports that are too old, noting signatures left with no reports.
  delSigList = list()
  for hash in reports:
    newRepList = list()
    for report in reports[hash]['reportList']:
      # "crash_date":"2021-03-22"
      dateStr = report['crashdate']
      if checkCrashAge(dateStr):
        newRepList.append(report)
    reports[hash]['reportList'] = newRepList
    if len(newRepList) == 0:
      # add this signature to our purge list
      delSigList.append(hash)

  for hash in reports:
    if len(reports[hash]['reportList']) == 0:
      if hash not in delSigList:
        delSigList.append(hash)

  # purge old signatures that no longer have reports
  # associated with them.
  for hash in delSigList:
    del reports[hash]

  print("Removed %d older signatures from our reports database." % len(delSigList))

def cleanupStats(reports, stats):
  # remove old crash and client ids we no longer have reports for
  clientList = list()
  crashList = list()
  for hash in reports:
    for report in reports[hash]['reportList']:
      clientid = report['clientid']
      crashid = report['crashid']
      if clientid not in clientList:
        clientList.append(clientid)
      if crashid not in crashList:
        crashList.append(crashid)

  purgeClientIdList = list()
  purgeCrashIdList = list()
  for hash in stats:
    for date in stats[hash]['crashdata'].keys():
      for crashid in stats[hash]['crashdata'][date]['crashids']:
        if crashid not in crashList:
          if crashid not in purgeCrashIdList:
            purgeCrashIdList.append(crashid)
      for clientid in stats[hash]['crashdata'][date]['clientids']:
        if clientid not in clientList:
          if clientid not in purgeClientIdList:
            purgeClientIdList.append(clientid)

  for crashid in purgeCrashIdList:
    for hash in stats:
      for date in stats[hash]['crashdata'].keys():
        if crashid in stats[hash]['crashdata'][date]['crashids']:
          stats[hash]['crashdata'][date]['crashids'].remove(crashid)

  for clientid in purgeClientIdList:
    for hash in stats:
      for date in stats[hash]['crashdata'].keys():
        if clientid in stats[hash]['crashdata'][date]['clientids']:
          stats[hash]['crashdata'][date]['clientids'].remove(clientid)

  print("Removed %d old client ids and %d old crash ids tracked in stats."
        % (len(purgeClientIdList), len(purgeCrashIdList)))
  return True
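
# Post-processing order matters: purgeOldReports() trims each signature's
# reportList first, then cleanupStats() drops any crash/client ids that no
# longer appear anywhere in reports, keeping the two databases in sync:
#
#   purgeOldReports(reports, parameters['version'])
#   cleanupStats(reports, stats)
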
# return true if we should skip processing this signature
def skipProcessSignature(signature):
  if len(signature) == 0:
    return True
  elif signature == 'EMPTY: no crashing thread identified':
    return True
  elif signature == 'EMPTY: no frame data available':
    return True
  elif signature == "<T>":
    print("sig <T>")
    return True
  return False

def isFissionRelated(reports):
  isFission = True
  for report in reports:
    try:
      if report['fission'] == 0:
        isFission = False
    except:
      pass
  return isFission

def isLockdownRelated(reports):
  isLockdown = True
  for report in reports:
    try:
      if report['lockdown'] == 0:
        isLockdown = False
    except:
      pass
  return isLockdown

def generateTopReportsList(reports):
  # For certain types of reasons like RustMozCrash, organize
  # the most common for a report list. Otherwise just dump the
  # first MaxReportCount.
  reasonCounter = Counter()
  for report in reports:
    crashReason = report['crashreason']
    reasonCounter[crashReason] += 1
  reportCol = reasonCounter.most_common(MaxReportCount)
  if len(reportCol) < MaxReportCount:
    return reports
  colCount = len(reportCol)
  maxReasonCount = int(math.ceil(MaxReportCount / colCount))
  reportList = list()
  for reason, _ in reportCol:
    added = 0
    for report in reports:
      if report['crashreason'] == reason:
        reportList.append(report)
        added += 1
        if added > maxReasonCount:
          break  # next reason
  return reportList

def dumpDatabase(reports, annoFilename):
  print("= Reports =======================================================================================")
  pp.pprint(reports)
  print("= Annotations ===================================================================================")
  annotations = loadAnnotations(annoFilename)
  pp.pprint(annotations)

def doMaintenance(dbFilename):
  # One-off data migration code; currently disabled.
  exit()

  # load up our database of processed crash ids
  reports, stats = loadReports(dbFilename)

  for hash in reports:
    signature = reports[hash]['signature']
    clientcount = reports[hash]['clientcount']

    operatingSystem = reports[hash]['operatingsystem']
    del reports[hash]['operatingsystem']
    reports[hash]['operatingsystem'] = [operatingSystem]

    operatingSystemVer = reports[hash]['osversion']
    del reports[hash]['osversion']
    reports[hash]['osversion'] = [operatingSystemVer]

    firefoxVer = reports[hash]['firefoxver']
    del reports[hash]['firefoxver']
    reports[hash]['firefoxver'] = [firefoxVer]

    arch = reports[hash]['arch']
    del reports[hash]['arch']
    reports[hash]['arch'] = [arch]

  #dumpDatabase(reports, annoFilename)

  # Caching of reports
  #cacheReports(reports, stats, dbFilename)

###########################################################
# File utilities
###########################################################

# Load the local report database
def loadReports(dbFilename):
  reportsFile = ("%s-reports.json" % dbFilename)
  statsFile = ("%s-stats.json" % dbFilename)

  reports = dict()
  stats = dict()

  try:
    with open(reportsFile) as database:
      reports = json.load(database)
  except FileNotFoundError:
    pass

  try:
    with open(statsFile) as database:
      stats = json.load(database)
  except FileNotFoundError:
    pass

  sigCount, reportCount = getDatasetStats(reports)
  print("Existing database stats: %d signatures, %d reports." % (sigCount, reportCount))
  return reports, stats
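
# File naming sketch: the -d option supplies the base name, so
# loadReports('nightly') reads nightly-reports.json and nightly-stats.json,
# and cacheReports(..., 'nightly') writes the same pair back out.
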
# Cache the reports database to a local json file. Speeds
# up symbolication runs across days by avoiding re-symbolicating
# reports.
def cacheReports(reports, stats, dbFilename):
  reportsFile = ("%s-reports.json" % dbFilename)
  statsFile = ("%s-stats.json" % dbFilename)

  with open(reportsFile, "w") as database:
    database.write(json.dumps(reports))

  with open(statsFile, "w") as database:
    database.write(json.dumps(stats))

  sigCount, reportCount = getDatasetStats(reports)
  print("Cache database stats: %d signatures, %d reports." % (sigCount, reportCount))

def loadAnnotations(filename):
  file = "%s.json" % filename
  try:
    with open(file) as database:
      annotations = json.load(database)
      print("Loading %s annotations file." % file)
  except FileNotFoundError:
    print("Could not find %s file." % file)
    return dict()
  except json.decoder.JSONDecodeError:
    print("Json error parsing %s" % file)
    return dict()
  return annotations

###########################################################
# HTML Template Utilities
###########################################################

def extractTemplate(token, srcTemplate):
  # This returns the inner template from srcTemplate, minus any
  # identifying tag data.
  # token would be something like 'signature' used
  # in identifying tags like:
  # <!-- start of signature template -->
  # <!-- end of signature template -->
  start = '<!-- start of ' + token + ' template -->'
  end = '<!-- end of ' + token + ' template -->'
  # find returns -1 on a missing token (index would raise instead)
  sIndex = srcTemplate.find(start)
  eIndex = srcTemplate.find(end)
  if sIndex == -1 or eIndex == -1:
    raise Exception("Bad HTML template tokens!")
  template = srcTemplate[sIndex + len(start) : eIndex + len(end)]
  return template

def extractAndTokenizeTemplate(token, srcTemplate, insertToken):
  # This returns the inner template from srcTemplate, minus any
  # identifying tag data, and we also return srcTemplate with
  # $insertToken replacing the block we clipped out.
  start = '<!-- start of ' + token + ' template -->'
  end = '<!-- end of ' + token + ' template -->'
  sIndex = srcTemplate.find(start)
  eIndex = srcTemplate.find(end)
  if sIndex == -1 or eIndex == -1:
    raise Exception("Bad HTML template tokens!")
  header = srcTemplate[0:sIndex]
  footer = srcTemplate[eIndex + len(end):]
  template = srcTemplate[sIndex + len(start) : eIndex]
  return template, (header + '$' + insertToken + footer)
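
# Worked example (hypothetical template text):
#
#   src = 'head <!-- start of crash template -->BODY<!-- end of crash template --> tail'
#   inner, outer = extractAndTokenizeTemplate('crash', src, 'main')
#   inner == 'BODY'
#   outer == 'head $main tail'   # ready for Template(outer).substitute(main=...)
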
def dumpTemplates():
  print('mainPage -----')
  print(mainPage)
  print('outerSigTemplate-----')
  print(outerSigTemplate)
  print('outerSigMetaTemplate-----')
  print(outerSigMetaTemplate)
  print('outerReportTemplate-----')
  print(outerReportTemplate)
  print('outerStackTemplate-----')
  print(outerStackTemplate)
  print('innerStackTemplate-----')
  print(innerStackTemplate)
  exit()

###########################################################
### Report generation
###########################################################

def generateSignatureReport(dbFilename, signature):
  # -s option: work in progress. Note the reports database is keyed by
  # signature hash, not by the signature string itself.
  reports, stats = loadReports(dbFilename)
  sigReports = reports.get(signature)
  if sigReports is None:
    print("signature not found in database.")
    exit()
  #for report in sigReports:
  exit()

def generateSparklineJS(sigStats, operatingSystems, operatingSystemVers, firefoxVers, archs, className):
  # generate stats data for crash rate over time graphs
  # data = [ {name: "Bitcoin", date: "2017-01-01", value: 967.6}, ]
  #
  # "Windows": {
  #   "6.1": {
  #     "x86": {
  #       "91.0a1": {
  #         "clientcount": 1,
  #         "crashcount": 3
  #       }
  #     }
  #   }
  # }
  rawData = dict()
  for dateStr in sigStats['crashdata']:
    for os in operatingSystems:
      for osver in operatingSystemVers:
        for arch in archs:
          for fxver in firefoxVers:
            try:
              stats = sigStats['crashdata'][dateStr][os][osver][arch][fxver]
              rawData[dateStr] = { 'os': os, 'crashcount': stats['crashcount'] }
            except:
              pass  # some dates may not apply to a particular combination

  # average data for each os to smooth out the graph
  # {name: "Windows", date: "2021-06-24", value: 84}

  # generate a list of dates
  avgData = dict()
  dates = list(rawData.keys())
  dates.sort()

  # generate an os list [not used]
  osList = list()
  for targetDate in dates:
    os = rawData[targetDate]['os']
    if os not in osList:
      osList.append(os)

  # generate plot data
  plotData = '['
  template = '{name: "$name", date: "$date", value: $value},'
  for targetDate in dates:
    pd = date.fromisoformat(targetDate)
    minDate = pd - timedelta(3)
    maxDate = pd + timedelta(4)
    crashCount = 0
    dataPoints = 0
    for tmpDateStr in dates:
      tmpDate = date.fromisoformat(tmpDateStr)
      if tmpDate >= minDate and tmpDate <= maxDate:
        # accumulate counts across the surrounding window
        crashCount += rawData[tmpDateStr]['crashcount']
        dataPoints += 1
    if dataPoints == 0:
      avgData[targetDate] = 0
    else:
      avgData[targetDate] = crashCount / dataPoints
    #print("date:%s cc=%d dp=%d avg=%f" % (targetDate, crashCount, dataPoints, avgData[targetDate]))
    plotData += Template(template).substitute(name='All', date=targetDate, value=avgData[targetDate])
  plotData += ']'
  #print(plotData)

  template = 'sparkline(document.querySelector("$cname"), $data, sloptions);'  # sloptions defined in template.html
  return Template(template).substitute(data=plotData, cname='.' + className)

# from a list of strings, return a comma separated pretty list
def getItemizedHeaderList(theList):
  result = ''
  theList.sort()
  for s in theList:
    result += s + ', '
  return result.strip(' ,')

# currently not in use
def versionListIsExclusiveTo(version, vList):
  # e.g. version '92' against ['92.0b6', '92.0.1', '92.0', '94.0a1']
  found = False
  for v in vList:
    majorVersionNumber = v.split('.')[0]
    if version == majorVersionNumber:
      found = True
  for v in vList:
    majorVersionNumber = v.split('.')[0]
    if version != majorVersionNumber:
      found = False
  return found

def getFxVersionsFromStatsRec(statsCrashData):
  result = list()
  for date in statsCrashData.values():
    for opsys in date.values():
      if (isinstance(opsys, dict)):
        for osver in opsys.values():
          for arch in osver.values():
            for ver in arch.keys():
              if ver not in result:
                result.append(ver)
  result.sort()
  return result

def getPlatformDataFromStatsRec(statsCrashData):
  osresult = list()
  verresult = list()
  archresult = list()
  for date in statsCrashData.values():
    # accumulate operating system type
    for opsys in date.keys():
      if opsys in ['Windows', 'Linux', 'Mac']:  # filter out the clientids and crashids lists
        if opsys not in osresult:
          osresult.append(opsys)
        osdict = date[opsys]
        # accumulate os version values
        for osver in osdict.keys():
          if osver not in verresult:
            verresult.append(osver)
          # accumulate arch values
          osverdict = osdict[osver]
          for arch in osverdict.keys():
            if arch not in archresult:
              archresult.append(arch)
  osresult.sort()
  verresult.sort()
  archresult.sort()
  return osresult, verresult, archresult

def getSimpVerList(verList):
  result = list()
  for ver in verList:
    simp = ver.split('.', 1)[0]
    if simp not in result:
      result.append(simp)
  return result

def testIfNewCrash(statsCrashData, version):
  verList = getFxVersionsFromStatsRec(statsCrashData)
  simpList = getSimpVerList(verList)
  if version in simpList and len(simpList) == 1:
    return True
  return False

def prettyBetaVersions(verList):
  verList.sort()
  betaDict = dict()
  for s in verList:
    mver = s.split('.')[0]
    if mver not in betaDict.keys():
      betaDict[mver] = list()
    try:
      bver = s.split('b', 1)[1]
    except:
      bver = 'rc'  # RCs look like '94.0'
    betaDict[mver].append(bver)
  result = ''
  for ver in betaDict.keys():
    betaDict[ver].sort()
    result += ver + ' ['
    for beta in betaDict[ver]:
      result += beta + ','
    result = result.strip(',')
    result += '] '
  return result
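
# Version helper sketches (hypothetical version strings):
#
#   getSimpVerList(['92.0b6', '92.0.1', '94.0a1'])   -> ['92', '94']
#   prettyBetaVersions(['92.0b6', '92.0b7', '94.0']) -> '92 [6,7] 94 [rc] '
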
def getCommaDelimitedList(theList):
  result = ''
  theList.sort()
  for s in theList:
    result += s + ', '
  return result.strip(' ,')

def getPrettyPlatformLists(statsCrashData):
  opsys, ver, arch = getPlatformDataFromStatsRec(statsCrashData)
  return getCommaDelimitedList(opsys), getCommaDelimitedList(ver), getCommaDelimitedList(arch)

def getPrettyFirefoxVersionList(statsCrashData, channel):
  verList = getFxVersionsFromStatsRec(statsCrashData)
  result = ''
  if channel == 'nightly':
    verList.sort()
    for s in verList:
      result += s.split('.')[0] + ', '
  elif channel == 'beta':
    result = prettyBetaVersions(verList)
  else:
    verList.sort()
    for s in verList:
      result += s + ', '
  return result.strip(' ,')
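
# Channel formatting sketch (hypothetical inputs): 'nightly' shows only
# major versions ('89, 90'), 'beta' delegates to prettyBetaVersions
# ('92 [6,7] 94 [rc]'), and any other channel lists the full versions
# ('92.0b6, 92.0b7, 94.0').
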
def generateTopCrashReport(reports, stats, totalCrashesProcessed, parameters,
                           ipcActorName, outputFilename, annoFilename,
                           reportLowerClientLimit):
  processType = parameters['process_type']
  channel = parameters['channel']
  queryFxVersion = parameters['version']

  templateFile = open("template.html", "r")
  template = templateFile.read()
  templateFile.close()

  # <!-- start of crash template -->
  # <!-- end of crash template -->
  innerTemplate, mainPage = extractAndTokenizeTemplate('crash', template, 'main')

  annotationTemplate, mainPage = extractAndTokenizeTemplate('annotation', mainPage, 'annotations')
  annotationReport, annotationTemplate = extractAndTokenizeTemplate('annotation report', annotationTemplate, 'annreports')

  # <!-- start of signature template -->
  # <!-- end of signature template -->
  innerSigTemplate, outerSigTemplate = extractAndTokenizeTemplate('signature', innerTemplate, 'signature')

  # Main inner block
  # <!-- start of signature meta template -->
  # <!-- end of signature meta template -->
  innerSigMetaTemplate, outerSigMetaTemplate = extractAndTokenizeTemplate('signature meta', innerSigTemplate, 'reports')

  # Report meta plus stack info
  # <!-- start of report template -->
  # <!-- end of report template -->
  innerReportTemplate, outerReportTemplate = extractAndTokenizeTemplate('report', innerSigMetaTemplate, 'report')

  # <!-- start of stackline template -->
  # <!-- end of stackline template -->
  innerStackTemplate, outerStackTemplate = extractAndTokenizeTemplate('stackline', innerReportTemplate, 'stackline')

  outerStackTemplate = stripWhitespace(outerStackTemplate)
  innerStackTemplate = stripWhitespace(innerStackTemplate)
  outerReportTemplate = stripWhitespace(outerReportTemplate)
  outerSigMetaTemplate = stripWhitespace(outerSigMetaTemplate)
  outerSigTemplate = stripWhitespace(outerSigTemplate)
  annotationTemplate = stripWhitespace(annotationTemplate)
  annotationReport = stripWhitespace(annotationReport)
  # mainPage = stripWhitespace(mainPage)  # mucks with js

  annDb = loadAnnotations(annoFilename)

  #resultFile = open(("%s.html" % outputFilename), "w", encoding="utf-8")
  resultFile = open(("%s.html" % outputFilename), "w", errors="replace")

  signatureHtml = str()
  sigMetaHtml = str()
  annotationsHtml = str()
  signatureIndex = 0

  sigCount, reportCount = getDatasetStats(reports)

  # generate a top crash list
  sigCounter = Counter()
  for hash in reports:
    if reports[hash]['clientcount'] < reportLowerClientLimit:
      continue
    sigCounter[hash] = len(reports[hash]['reportList'])

  collection = sigCounter.most_common(MostCommonLength)

  sparklineJS = ''

  for hash, crashcount in collection:
    try:
      sigRecord = reports[hash]  # reports data vs. stats
    except KeyError:
      continue

    signature = sigRecord['signature']
    statsCrashData = stats[hash]['crashdata']

    prettyOperatingSystems, prettyOperatingSystemVers, prettyArchs = getPrettyPlatformLists(statsCrashData)
    prettyFirefoxVers = getPrettyFirefoxVersionList(statsCrashData, channel)
    operatingSystemsList, operatingSystemVersList, archsList = getPlatformDataFromStatsRec(statsCrashData)
    firefoxVersList = getFxVersionsFromStatsRec(statsCrashData)

    crashcount = len(sigRecord['reportList'])
    percent = (crashcount / reportCount) * 100.0

    if crashcount < MinCrashCount:  # Skip small crash count reports
      continue

    isNewCrash = False
    newIcon = 'noicon'
    if testIfNewCrash(statsCrashData, queryFxVersion):
      isNewCrash = True
      newIcon = 'icon'

    signatureIndex += 1

    crashStatsHashQuery = 'https://crash-stats.mozilla.org/search/?'
    crashStatsQuery = 'https://crash-stats.mozilla.org/search/?signature=~%s&product=Firefox&_facets=signature&process_type=%s' % (signature, processType)

    # sort reports in this signature based on common crash reasons, so the most common
    # is at the top of the list.
    reportsToReport = generateTopReportsList(reports[hash]['reportList'])

    #fissionIcon = 'noicon'
    #if isFissionRelated(reports[hash]['reportList']):
    #  fissionIcon = 'icon'
    #if crashcount < 10 and fissionIcon == 'icon':
    #  fissionIcon = 'grayicon'

    lockdownIcon = 'noicon'
    if isLockdownRelated(reports[hash]['reportList']):
      lockdownIcon = 'icon'

    reportHtml = str()
    idx = 0
    hashTotal = 0
    oomIcon = 'noicon'

    for report in reportsToReport:
      idx = idx + 1
      if idx > MaxReportCount:
        break

      oombytes = report['oomsize']
      if report['oomsize'] is not None:
        oomIcon = 'icon'
      else:
        oombytes = ''

      crashReason = report['crashreason']
      if (crashReason == None):
        crashReason = ''

      crashType = report['type']
      crashType = crashType.replace('EXCEPTION_', '')

      appendAmp = False
      if hashTotal < 30:  # This is all crash stats can handle (414 Request-URI Too Large)
        try:
          crashStatsHashQuery += 'minidump_sha256_hash=~' + report['minidumphash']
          hashTotal += 1
          appendAmp = True
        except:
          pass

      # Redash meta data dump for a particular crash id
      infoLink = "https://sql.telemetry.mozilla.org/queries/{query_id}?p_channel={channel}&p_process_type={process_type}&p_version={version}&p_crash_id={crash_id}".format(query_id=79462, channel=channel, process_type=processType, version=queryFxVersion, crash_id=report['crashid'])

      startupStyle = 'noicon'
      if report['startup'] != 0:
        startupStyle = 'icon'

      stackHtml = str()
      for frameData in report['stack']:
        # [idx] = { 'index': n, 'frame': '(frame)', 'srcUrl': '(url)', 'module': '(module)' }
        frameIndex = frameData['index']
        frame = frameData['frame']
        srcUrl = frameData['srcUrl']
        moduleName = frameData['module']
        linkStyle = 'inline-block'
        srcLink = srcUrl
        if len(srcUrl) == 0:
          linkStyle = 'none'
          srcLink = ''
        stackHtml += Template(innerStackTemplate).substitute(frameindex=frameIndex,
                                                             frame=escape(frame),
                                                             srcurl=srcLink,
                                                             module=moduleName,
                                                             style=linkStyle)

      compositor = report['compositor']
      if compositor == 'webrender_software_d3d11':
        compositor = 'd3d11'
      elif compositor == 'webrender':
        compositor = 'webrender'
      elif compositor == 'webrender_software':
        compositor = 'swiggle'
      elif compositor == 'none':
        compositor = ''

      reportHtml += Template(outerStackTemplate).substitute(expandostack=('st' + str(signatureIndex) + '-' + str(idx)),
                                                            rindex=idx,
                                                            type=crashType,
                                                            oomsize=oombytes,
                                                            devvendor=report['devvendor'],
                                                            devgen=report['devgen'],
                                                            devchipset=report['devchipset'],
                                                            description=report['devdescription'],
                                                            drvver=report['driverversion'],
                                                            drvdate=report['driverdate'],
                                                            compositor=compositor,
                                                            reason=crashReason,
                                                            infolink=infoLink,
                                                            startupiconclass=startupStyle,
                                                            stackline=stackHtml)

      if appendAmp:
        crashStatsHashQuery += '&'

    # class="svg-$expandosig" svg element
    sparklineJS += generateSparklineJS(stats[hash], operatingSystemsList, operatingSystemVersList,
                                       firefoxVersList, archsList,
                                       'svg-' + stringToHtmlId(hash)) + '\n'

    sigHtml = Template(outerReportTemplate).substitute(expandosig=stringToHtmlId(hash),
                                                       os=prettyOperatingSystems,
                                                       fxver=prettyFirefoxVers,
                                                       osver=prettyOperatingSystemVers,
                                                       arch=prettyArchs,
                                                       report=reportHtml)

    crashStatsHashQuery = crashStatsHashQuery.rstrip('&')
    searchIconClass = 'icon'
    if hashTotal == 0:
      crashStatsHashQuery = ''
      searchIconClass = 'lticon'

    # ann$expandosig - view signature meta parameter
    annIconClass = 'lticon'
    if signature in annDb:
      record = annDb[signature]
      sigAnnotations = str()
      # record['fixedby'] is a list of { 'date': ..., 'version': 87, 'bug': 1234567 }
      for fb in record['fixedby']:
        sigAnnotations += Template(annotationReport).substitute(annotations=escape(fb['annotation']),
                                                                fixedbybug=createBugLink(str(fb['bug'])),
                                                                fixedbyversion=fb['version'])
      # record['annotations'] is a list of { 'date': ..., 'annotation': 'notes' }
      for annotation in record['annotations']:
        annotation = escape(annotation['annotation'])
        annotation = escapeBugLinks(annotation)
        sigAnnotations += Template(annotationReport).substitute(annotations=annotation,
                                                                fixedbybug='',
                                                                fixedbyversion='')
      annotationsHtml += Template(annotationTemplate).substitute(expandosig=('sig' + str(signatureIndex)),
                                                                 annreports=sigAnnotations)
      annIconClass = 'icon'

    sigMetaHtml += Template(outerSigMetaTemplate).substitute(rank=signatureIndex,
                                                             percent=("%.00f%%" % percent),
                                                             # expandosig=('sig'+str(signatureIndex)),
                                                             expandosig=stringToHtmlId(hash),
                                                             annexpandosig=('sig' + str(signatureIndex)),
                                                             signature=(html.escape(signature)),
                                                             newicon=newIcon,
                                                             fissionicon='noicon',
                                                             lockicon=lockdownIcon,
                                                             oomicon=oomIcon,
                                                             iconclass=searchIconClass,
                                                             anniconclass=annIconClass,
                                                             cslink=crashStatsHashQuery,
                                                             cssearchlink=crashStatsQuery,
                                                             clientcount=sigRecord['clientcount'],
                                                             count=crashcount,
                                                             reports=sigHtml)

  if ipcActorName:
    ipcActorHdr = '<div class="header-elements">IPC Actor - {}</div>'.format(ipcActorName)
  else:
    ipcActorHdr = ""

  signatureHtml += Template(outerSigTemplate).substitute(channel=channel,
                                                         version=queryFxVersion,
                                                         process=processType,
                                                         sigcount=sigCount,
                                                         ipcActorHdr=ipcActorHdr,
                                                         repcount=reportCount,
                                                         sparkline=sparklineJS,
                                                         signature=sigMetaHtml)

  # Add processed date to the footer
  dateTime = datetime.now().isoformat()
  processHead = "{}".format(processType.capitalize())
  if ipcActorName:
    processHead += " ({})".format(ipcActorName)
  resultFile.write(Template(mainPage).substitute(main=signatureHtml,
                                                 annotations=annotationsHtml,
                                                 process=processHead,
                                                 processeddate=dateTime))
  resultFile.close()

###########################################################
# Process crashes and stacks
###########################################################

def main():
  # Maximum number of raw crashes to process. This matches
  # the limit value of re:dash queries. Reduce for testing
  # purposes.
  CrashProcessMax = 7500

  # When generating a report, signatures with client counts
  # lower than this value will not be included in the report.
  ReportLowerClientLimit = 2  # filter out single client crashes

  queryId = ''
  userKey = ''
  targetSignature = ''
  dbFilename = "crashreports"  # base name; '-reports.json' / '-stats.json' get appended
  annoFilename = "annotations"
  cacheValue = MaxAge
  parameters = dict()
  ipcActor = None

  options, remainder = getopt.getopt(sys.argv[1:], 'u:n:d:c:k:q:p:a:s:zml:')
  for o, a in options:
    if o == '-u':
      jsonUrl = a
      print("data source url: %s" % jsonUrl)
    elif o == '-n':
      outputFilename = a
      print("output filename: %s.html" % outputFilename)
    elif o == '-c':
      cacheValue = int(a)
    elif o == '-d':
      dbFilename = a
      print("local cache files: %s-reports.json, %s-stats.json" % (dbFilename, dbFilename))
    elif o == '-q':
      queryId = a
      print("query id: %s" % queryId)
    elif o == '-k':
      userKey = a
      print("user key: ({}) [CLI]".format(len(userKey)))
    elif o == '-s':
      targetSignature = a
      print("target signature: %s" % targetSignature)
    elif o == '-m':
      print("calling maintenance function.")
      doMaintenance(dbFilename)
      exit()
    elif o == '-p':
      param = a.split('=')
      parameters[param[0]] = param[1]
    elif o == '-a':
      ipcActor = a
      print("IPC actor: %s" % ipcActor)
    elif o == '-z':
      reports, stats = loadReports(dbFilename)
      dumpDatabase(reports, annoFilename)
      exit()
    elif o == '-l':
      ReportLowerClientLimit = int(a)
      print("ReportLowerClientLimit: %d" % ReportLowerClientLimit)

  if len(userKey) == 0:
    userKey = os.getenv("REDASH_API_KEY")
    if userKey:
      print("user key: ({}) [ENV]".format(len(userKey)))
    else:
      print("No user key; use -k or REDASH_API_KEY")
      exit()

  if ipcActor is not None:
    if ipcActor == "none":
      parameters["utility_actor_name_op"] = "IS NULL"
    else:
      parameters["utility_actor_name_op"] = 'LIKE "%{}%"'.format(ipcActor)

  if len(userKey) == 0:
    print("missing user api key.")
    exit()
  elif len(queryId) == 0:
    print("missing query id.")
    exit()

  print("redash cache time: %d" % cacheValue)

  parameters['crashcount'] = str(CrashProcessMax)

  if len(targetSignature) > 0:
    print("analyzing '%s'" % targetSignature)
    generateSignatureReport(dbFilename, targetSignature)
    exit()

  # Pull fresh data from redash and process it
  reports, stats, totalCrashesProcessed = processRedashDataset(dbFilename, jsonUrl, queryId,
                                                               userKey, cacheValue, parameters,
                                                               CrashProcessMax)

  # Caching of reports
  cacheReports(reports, stats, dbFilename)

  generateTopCrashReport(reports, stats, totalCrashesProcessed, parameters, ipcActor,
                         outputFilename, annoFilename, ReportLowerClientLimit)

  exit()

if __name__ == "__main__":
  main()