# processRedashDataset()
#
# in crashes.py [0:0]


def processRedashDataset(dbFilename, jsonUrl, queryId, userKey, cacheValue, parameters, crashProcessMax):
  """Load crash rows from redash (or a local json cache), symbolicate any
  new crashes, and fold them into the local reports and stats databases.

  Parameters:
    dbFilename      -- local database file passed to loadReports()
    jsonUrl         -- redash API url used by getRedashQueryResult()
    queryId         -- redash query id
    userKey         -- redash API user key
    cacheValue      -- cache control value forwarded to the redash query
    parameters      -- redash query parameters dict; parameters['version']
                       is also used afterwards to purge outdated reports
    crashProcessMax -- maximum number of reports to process this run

  Returns:
    (reports, stats, processedCount) -- the updated reports dict (keyed by
    signature hash), the updated stats dict, and the number of rows
    processed (including skipped/cached rows, which count as processed).
  """
  totals = {
    'processed': 0,
    'skippedBadSig': 0,
    'alreadyProcessed': 0,
    'outdated': 0
  }

  # load up our database of processed crash ids
  # returns an empty dict() if no data is loaded.
  reports, stats = loadReports(dbFilename)

  if LoadLocally:
    with open(LocalJsonFile) as f:
      dataset = json.load(f)
  else:
    with Spinner("loading from redash..."):
      dataset = getRedashQueryResult(jsonUrl, queryId, userKey, cacheValue, parameters)
    print("   done.")

  # Cap the amount of work we do this run at crashProcessMax rows.
  crashesToProcess = min(len(dataset["query_result"]["data"]["rows"]), crashProcessMax)

  print('%04d total reports loaded.' % crashesToProcess)

  for recrow in dataset["query_result"]["data"]["rows"]:
    if totals['processed'] >= crashProcessMax:
      break

    # pull some redash props out of the recrow. You can add these
    # by modifying the sql query.
    operatingSystem = recrow['normalized_os']
    operatingSystemVer = recrow['normalized_os_version']
    firefoxVer = recrow['display_version']
    buildId = recrow['build_id']
    compositor = recrow['compositor']
    arch = recrow['arch']
    oomSize = recrow['oom_size']
    devVendor = recrow['vendor']
    devGen = recrow['gen']
    devChipset = recrow['chipset']
    devDevice = recrow['device']
    drvVer = recrow['driver_version']
    drvDate = recrow['driver_date']
    clientId = recrow['client_id']
    devDesc = recrow['device_description']

    # Load the json crash payload from recrow
    props = json.loads(recrow["payload"])

    # touch up for the crash symbolication package
    props['stackTraces'] = props['stack_traces']

    crashId = props['crash_id']
    crashDate = props['crash_date']
    minidumpHash = props['minidump_sha256_hash']
    crashReason = props['metadata']['moz_crash_reason']
    crashInfo = props['stack_traces']['crash_info']

    startupCrash = False
    if recrow['startup_crash']:
      startupCrash = int(recrow['startup_crash'])

    fissionEnabled = False
    if recrow['fission_enabled']:
      fissionEnabled = int(recrow['fission_enabled'])

    lockdownEnabled = False
    if recrow['lockdown_enabled']:
      lockdownVal = int(recrow['lockdown_enabled'])
      if lockdownVal == 1:
        lockdownEnabled = True

    if crashReason is not None:
      crashReason = crashReason.strip('\n')

    # Ignore crashes older than 7 days
    if not checkCrashAge(crashDate):
      totals['processed'] += 1
      totals['outdated'] += 1
      progress(totals['processed'], crashesToProcess)
      continue

    # check if the crash id is processed, if so continue
    ## note, this search has become quite slow. optimize me.
    found = False
    for sighash in reports: # reports is a dictionary of signature hashes
      for report in reports[sighash]['reportList']: # reportList is a list of dictionaries
        if report['crashid'] == crashId: # string compare, slow
          found = True
          # if you add a new value to the sql queries, you can update
          # the local json cache we have in memory here. Saves having
          # to delete the file and symbolicate everything again.
          #report['fission'] = fissionEnabled
          #report['lockdown'] = lockdownEnabled
          break
      if found:
        # Stop scanning the remaining signature buckets once a match is
        # found. Previously only the inner loop exited, so every bucket
        # was still walked after a hit.
        break

    if found:
      totals['processed'] += 1
      totals['alreadyProcessed'] += 1
      progress(totals['processed'], crashesToProcess)
      continue

    # symbolicate and return payload result
    payload = symbolicate({ "normalized_os": operatingSystem, "payload": props })
    signature = generateSignature(payload)

    if skipProcessSignature(signature):
      totals['processed'] += 1
      totals['skippedBadSig'] += 1
      progress(totals['processed'], crashesToProcess)
      continue

    # pull stack information for the crashing thread
    try:
      crashingThreadIndex = payload['crashing_thread']
    except KeyError:
      #print("KeyError on crashing_thread for report");
      continue

    threads = payload['threads']

    try:
      frames = threads[crashingThreadIndex]['frames']
    except IndexError:
      print("IndexError while indexing crashing thread");
      continue
    except TypeError:
      print("TypeError while indexing crashing thread");
      continue

    # build up a pretty stack
    stack = processStack(frames)

    # generate a tracking hash. Note: named sigHash (not 'hash') so we
    # don't shadow the builtin.
    sigHash = generateSignatureHash(signature, operatingSystem, operatingSystemVer, arch, firefoxVer)

    if sigHash not in reports:
      # Set up this signature's meta data we track in the signature header.
      reports[sigHash] = {
        'signature':          signature,
        'operatingsystem':    [operatingSystem],
        'osversion':          [operatingSystemVer],
        'firefoxver':         [firefoxVer],
        'arch':               [arch],
        'reportList':         list()
      }

    # Update meta data we track in the report header.
    if operatingSystem not in reports[sigHash]['operatingsystem']:
      reports[sigHash]['operatingsystem'].append(operatingSystem)
    if operatingSystemVer not in reports[sigHash]['osversion']:
      reports[sigHash]['osversion'].append(operatingSystemVer)
    if firefoxVer not in reports[sigHash]['firefoxver']:
      reports[sigHash]['firefoxver'].append(firefoxVer)
    if arch not in reports[sigHash]['arch']:
      reports[sigHash]['arch'].append(arch)

    # create our report with per crash meta data
    report = {
      'clientid':           clientId,
      'crashid':            crashId,
      'crashdate':          crashDate,
      'compositor':         compositor,
      'stack':              stack,
      'oomsize':            oomSize,
      'type':               crashInfo['type'],
      'devvendor':          devVendor,
      'devgen':             devGen,
      'devchipset':         devChipset,
      'devdevice':          devDevice,
      'devdescription':     devDesc,
      'driverversion' :     drvVer,
      'driverdate':         drvDate,
      'minidumphash':       minidumpHash,
      'crashreason':        crashReason,
      'startup':            startupCrash,
      'fission':            fissionEnabled,
      'lockdown':           lockdownEnabled,
      # Duplicated but useful if we decide to change the hashing algo
      # and need to reprocess reports.
      'operatingsystem':    operatingSystem,
      'osversion':          operatingSystemVer,
      'firefoxver':         firefoxVer,
      'arch':               arch
    }

    # save this crash in our report list
    reports[sigHash]['reportList'].append(report)

    if sigHash not in stats:
      stats[sigHash] = {
        'signature': signature,
        'crashdata': {}
      }

    # check to see if stats has a date entry that matches crashDate
    if crashDate not in stats[sigHash]['crashdata']:
      stats[sigHash]['crashdata'][crashDate] = { 'crashids': [], 'clientids':[] }

    # Ensure the nested os -> osver -> arch -> fxver counter buckets exist.
    if operatingSystem not in stats[sigHash]['crashdata'][crashDate]:
      stats[sigHash]['crashdata'][crashDate][operatingSystem] = {}

    if operatingSystemVer not in stats[sigHash]['crashdata'][crashDate][operatingSystem]:
      stats[sigHash]['crashdata'][crashDate][operatingSystem][operatingSystemVer] = {}

    if arch not in stats[sigHash]['crashdata'][crashDate][operatingSystem][operatingSystemVer]:
      stats[sigHash]['crashdata'][crashDate][operatingSystem][operatingSystemVer][arch] = {}

    if firefoxVer not in stats[sigHash]['crashdata'][crashDate][operatingSystem][operatingSystemVer][arch]:
      stats[sigHash]['crashdata'][crashDate][operatingSystem][operatingSystemVer][arch][firefoxVer] = { 'clientcount': 0, 'crashcount': 0 }

    # Count each crash id once per day; count each client once per day.
    if crashId not in stats[sigHash]['crashdata'][crashDate]['crashids']:
      stats[sigHash]['crashdata'][crashDate]['crashids'].append(crashId)
      stats[sigHash]['crashdata'][crashDate][operatingSystem][operatingSystemVer][arch][firefoxVer]['crashcount'] += 1
      if clientId not in stats[sigHash]['crashdata'][crashDate]['clientids']:
        stats[sigHash]['crashdata'][crashDate][operatingSystem][operatingSystemVer][arch][firefoxVer]['clientcount'] += 1
        stats[sigHash]['crashdata'][crashDate]['clientids'].append(clientId)

    totals['processed'] += 1

    progress(totals['processed'], crashesToProcess)

  print('\n')
  print('%04d - reports processed' % totals['processed'])
  print('%04d - cached results' % totals['alreadyProcessed'])
  print('%04d - reports skipped, bad signature' % totals['skippedBadSig'])
  print('%04d - reports skipped, out dated' % totals['outdated'])

  # Post processing steps

  # Purge signatures from our reports list that are outdated (based
  # on crash date and version). This keeps our crash lists current,
  # especially after a merge. Note this doesn't clear stats, just reports.
  queryFxVersion = parameters['version']
  purgeOldReports(reports, queryFxVersion)

  # purge old crash and client ids from the stats database.
  cleanupStats(reports, stats)

  # calculate unique client id counts for each signature. These are client counts
  # associated with the current redash query, and apply only to a seven day time
  # window. They are stored in the reports database and displayed in the top crash
  # reports.
  clientCounts = dict()
  for sigHash in reports:
    clientCounts[sigHash] = list()
    for report in reports[sigHash]['reportList']:
      clientId = report['clientid']
      if clientId not in clientCounts[sigHash]:
        clientCounts[sigHash].append(clientId)
    reports[sigHash]['clientcount'] = len(clientCounts[sigHash])

  return reports, stats, totals['processed']