# crashclouseau/datacollector.py
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
from collections import defaultdict
import copy
from datetime import datetime
from dateutil.relativedelta import relativedelta
import functools
from libmozdata import socorro, utils as lmdutils
from libmozdata.connection import Connection, Query
import re
from . import config, models, utils
from .logger import logger
def get_builds(product, channel, date):
    """Get the buildids for a product/channel prior to date.

    For nightly, buildids come from Socorro; for other channels they come
    from the database (fed with buildhub data).

    Returns a (bids, search_date) tuple where bids is a list of buildid
    strings and search_date is a SuperSearch date filter (">=YYYY-MM-DD"),
    or "" when no build was found.
    """
    if channel == "nightly":
        # for nightly, the strategy is pretty simple:
        # - just get builds few day before (and update the old one too)
        ndays = config.get_ndays()
        few_days_ago = date - relativedelta(days=ndays + 5)
        # truncate to midnight so the buildid lower bound covers the whole day
        few_days_ago = datetime(few_days_ago.year, few_days_ago.month, few_days_ago.day)
        search_buildid = [
            ">=" + utils.get_buildid(few_days_ago),
            "<=" + utils.get_buildid(date),
        ]
        search_date = ">=" + lmdutils.get_date_str(few_days_ago)
        bids = get_buildids_from_socorro(search_buildid, search_date, product)
    else:
        bids = []
        search_date = ""
        data = models.Build.get_last_versions(date, channel, product, n=3)
        if data:
            # data are ordered by buildid (desc), so the last entry is the
            # oldest build: use its date as the lower bound for searches.
            bids = [x["buildid"] for x in data]
            min_date = utils.get_build_date(bids[-1])
            search_date = ">=" + lmdutils.get_date_str(min_date)
    return bids, search_date
def get_buildids_from_socorro(search_buildid, search_date, product):
    """Get the builds from socorro for nightly channel.
    For other channels we use the database (fed with buildhub data)"""

    def collect(json, bids):
        # Skip erroneous or empty responses.
        if json["errors"] or not json["facets"]["build_id"]:
            return
        bids.extend(facet["term"] for facet in json["facets"]["build_id"])

    params = {
        "product": product,
        "release_channel": "nightly",
        "date": search_date,
        "build_id": search_buildid,
        "_facets": "build_id",
        "_results_number": 0,
        "_facets_size": 100,
    }
    buildids = []
    socorro.SuperSearch(params=params, handler=collect, handlerdata=buildids).wait()
    # Callers expect the buildids in ascending order.
    return sorted(buildids)
def get_new_signatures(product, channel, date):
    """Get the new signatures. In nightly that means that we collect
    only signatures with no crashes in last few days.

    Returns a dict mapping signature -> {"bids", "protos", "installs"}
    for signatures that started crashing in a recent build, enriched with
    proto-signatures (and uuids for Fennec java crashes)."""
    limit = config.get_limit_facets()
    bids, search_date = get_builds(product, channel, date)
    if not bids:
        logger.warning("No buildids for {}-{}.".format(product, channel))
        return {}
    # base: per-day skeleton of crash numbers, deep-copied for each new
    # signature found by the handler below.
    base = {}
    for bid in bids:
        bid = utils.get_build_date(bid)
        day = datetime(bid.year, bid.month, bid.day)
        if day not in base:
            base[day] = {"installs": {}, "bids": {}, "count": 0}
        base[day]["bids"][bid] = 0
    logger.info("Get crash numbers for {}-{}: started.".format(product, channel))

    def handler(base, json, data):
        # Aggregate crash/install counts per signature, day and buildid.
        if json["errors"]:
            raise Exception(
                "Error in json data from SuperSearch: {}".format(json["errors"])
            )
        if not json["facets"]["signature"]:
            return
        for facets in json["facets"]["signature"]:
            installs = facets["facets"]["cardinality_install_time"]["value"]
            sgn = facets["term"]
            # Only one buildid per query (see the loop below), so the
            # build_id facet has a single entry.
            bid_info = facets["facets"]["build_id"][0]
            count = bid_info["count"]
            bid = bid_info["term"]
            bid = utils.get_build_date(bid)
            day = datetime(bid.year, bid.month, bid.day)
            if sgn in data:
                numbers = data[sgn]
            else:
                data[sgn] = numbers = copy.deepcopy(base)
            numbers[day]["count"] += count
            numbers[day]["bids"][bid] = count
            # An install count of 0 still means at least one install.
            numbers[day]["installs"][bid] = 1 if installs == 0 else installs
        del json

    params = {
        "product": product,
        "release_channel": utils.get_search_channel(channel),
        "date": search_date,
        "build_id": "",
        "_aggs.signature": ["build_id", "_cardinality.install_time"],
        "_results_number": 0,
        "_facets": "release_channel",
        "_facets_size": limit,
    }
    data = {}
    hdler = functools.partial(handler, base)
    # One query per buildid (the same params dict is reused and mutated).
    for bid in bids:
        params["build_id"] = bid
        socorro.SuperSearch(params=params, handler=hdler, handlerdata=data).wait()
    # Number of days without crashes required for a signature to count as new.
    shift = config.get_ndays() if channel == "nightly" else 1
    threshold = config.get_threshold("installs", product, channel)
    # Split signatures by crash volume: big ones are queried one by one,
    # small ones are batched (see get_proto_big / get_proto_small).
    big_data = {}
    small_data = {}
    for sgn, numbers in data.items():
        bids, big = utils.get_new_crashing_bids(numbers, shift, threshold)
        if bids:
            d = {
                "bids": bids,
                "protos": {b: [] for b in bids},
                "installs": {b: 0 for b in bids},
            }
            if big:
                big_data[sgn] = d
            else:
                small_data[sgn] = d
        else:
            data[sgn] = None
    del data
    logger.info("Get crash numbers for {}-{}: finished.".format(product, channel))
    if big_data:
        get_proto_big(product, big_data, search_date, channel)
    if small_data:
        get_proto_small(product, small_data, search_date, channel)
    small_data.update(big_data)
    data = small_data
    if product == "Fennec":
        # Java crashes don't have any proto-signature...
        get_uuids_fennec(data, search_date, channel)
    return data
def get_proto_small(product, signatures, search_date, channel):
    """Get the proto-signatures for signature with a small number of crashes.
    Since we 'must' aggregate uuid on proto-signatures, to be faster we query
    several signatures: it's possible because we know that card(proto) <= card(crashes)
    for a given signature."""
    logger.info(
        "Get proto-signatures (small) for {}-{}: started.".format(product, channel)
    )

    def handler(bid, threshold, json, data):
        facets = json["facets"]
        if not facets["proto_signature"]:
            return
        # Keep at most `threshold` proto-signatures per (signature, buildid).
        for proto_facet in facets["proto_signature"]:
            inner = proto_facet["facets"]
            bucket = data[inner["signature"][0]["term"]]["protos"][bid]
            if len(bucket) < threshold:
                bucket.append(
                    {
                        "proto": proto_facet["term"],
                        "count": proto_facet["count"],
                        "uuid": inner["uuid"][0]["term"],
                    }
                )
        # Distinct install counts per signature (0 still means at least 1).
        for sgn_facet in facets["signature"]:
            installs = sgn_facet["facets"]["cardinality_install_time"]["value"]
            data[sgn_facet["term"]]["installs"][bid] = installs if installs != 0 else 1

    limit = config.get_limit_facets()
    threshold = config.get_threshold("protos", product, channel)
    base_params = {
        "product": product,
        "release_channel": utils.get_search_channel(channel),
        "date": search_date,
        "build_id": "",
        "signature": "",
        "_aggs.proto_signature": ["uuid", "signature"],
        "_aggs.signature": "_cardinality.install_time",
        "_results_number": 0,
        "_facets": "release_channel",
        "_facets_size": limit,
    }
    for bid, sgns_for_bid in utils.get_sgns_by_bids(signatures).items():
        bid_params = copy.deepcopy(base_params)
        bid_params["build_id"] = utils.get_buildid(bid)
        hdler = functools.partial(handler, bid, threshold)
        queries = []
        # Query signatures in batches of 5; each query gets its own params.
        for chunk in Connection.chunks(sgns_for_bid, 5):
            chunk_params = copy.deepcopy(bid_params)
            chunk_params["signature"] = ["=" + s for s in chunk]
            queries.append(
                Query(
                    socorro.SuperSearch.URL,
                    params=chunk_params,
                    handler=hdler,
                    handlerdata=signatures,
                )
            )
        socorro.SuperSearch(queries=queries).wait()
    logger.info(
        "Get proto-signatures (small) for {}-{}: finished.".format(product, channel)
    )
def get_proto_big(product, signatures, search_date, channel):
    """Get proto-signatures for signatures which have a high # of crashes (>=500)"""
    logger.info(
        "Get proto-signatures (big) for {}-{}: started.".format(product, channel)
    )

    def handler(bid, threshold, json, data):
        facets = json["facets"]
        if not facets["proto_signature"]:
            return
        # Distinct install count for this buildid (0 still means at least 1).
        installs = facets["cardinality_install_time"]["value"]
        data["installs"][bid] = installs if installs != 0 else 1
        bucket = data["protos"][bid]
        # Keep at most `threshold` proto-signatures per buildid.
        for proto_facet in facets["proto_signature"]:
            if len(bucket) >= threshold:
                break
            bucket.append(
                {
                    "proto": proto_facet["term"],
                    "count": proto_facet["count"],
                    "uuid": proto_facet["facets"]["uuid"][0]["term"],
                }
            )

    threshold = config.get_threshold("protos", product, channel)
    base_params = {
        "product": product,
        "release_channel": utils.get_search_channel(channel),
        "date": search_date,
        "build_id": "",
        "signature": "",
        "_aggs.proto_signature": "uuid",
        "_results_number": 0,
        "_facets": "_cardinality.install_time",
        "_facets_size": threshold,
    }
    for bid, sgns_for_bid in utils.get_sgns_by_bids(signatures).items():
        bid_params = copy.deepcopy(base_params)
        bid_params["build_id"] = utils.get_buildid(bid)
        hdler = functools.partial(handler, bid, threshold)
        queries = []
        # One query per signature: these have too many crashes to batch.
        for sgn in sgns_for_bid:
            sgn_params = copy.deepcopy(bid_params)
            sgn_params["signature"] = "=" + sgn
            queries.append(
                Query(
                    socorro.SuperSearch.URL,
                    params=sgn_params,
                    handler=hdler,
                    handlerdata=signatures[sgn],
                )
            )
        socorro.SuperSearch(queries=queries).wait()
    logger.info(
        "Get proto-signatures (big) for {}-{}: finished.".format(product, channel)
    )
def get_uuids_fennec(signatures, search_date, channel):
    """Get the uuids for Fennec java crashes"""
    logger.info("Get uuids for Fennec-{}: started.".format(channel))

    def handler(json, data):
        if json["errors"] or not json["facets"]["signature"]:
            return
        # Each query targets a single buildid, so the facet has one entry.
        bid = utils.get_build_date(json["facets"]["build_id"][0]["term"])
        for sgn_facet in json["facets"]["signature"]:
            protos = data[sgn_facet["term"]]["protos"][bid]
            # Java crashes carry no proto-signature: store one empty proto
            # holding the crash count and a representative uuid.
            if not protos:
                protos.append(
                    {
                        "proto": "",
                        "count": sgn_facet["count"],
                        "uuid": sgn_facet["facets"]["uuid"][0]["term"],
                    }
                )

    base_params = {
        "product": "Fennec",
        "release_channel": utils.get_search_channel(channel),
        "date": search_date,
        "build_id": "",
        "signature": "",
        "_aggs.signature": "uuid",
        "_results_number": 0,
        "_facets": "build_id",
        "_facets_size": 100,
    }
    queries = []
    for bid, sgns_for_bid in utils.get_sgns_by_bids(signatures).items():
        bid_params = copy.deepcopy(base_params)
        bid_params["build_id"] = utils.get_buildid(bid)
        # Query signatures in batches of 10; each query gets its own params.
        for chunk in Connection.chunks(sgns_for_bid, 10):
            chunk_params = copy.deepcopy(bid_params)
            chunk_params["signature"] = ["=" + s for s in chunk]
            queries.append(
                Query(
                    socorro.SuperSearch.URL,
                    params=chunk_params,
                    handler=handler,
                    handlerdata=signatures,
                )
            )
    socorro.SuperSearch(queries=queries).wait()
    logger.info("Get uuids for Fennec-{}: finished.".format(channel))
def get_changeset(buildid, channel, product):
    """Trick to get changeset for a particular buildid/channel/product.

    The revision is inferred from crash reports: topmost_filenames entries
    look like "hg:hg.mozilla.org/...:path:rev", so we extract the trailing
    rev from each entry and pick the one seen in the most crashes.

    Args:
        buildid: datetime of the build
        channel: release channel name
        product: product name

    Returns the short revision string, or None when no report matched.
    """
    search_date = ">=" + lmdutils.get_date_str(buildid)
    buildid = utils.get_buildid(buildid)
    logger.info("Get changeset for {}-{}-{}.".format(buildid, product, channel))

    # Matches the trailing hg revision; compiled once instead of per call.
    pat = re.compile(r"^.*:([0-9a-f]+)$")

    def handler(json, data):
        if not json["facets"]["build_id"]:
            return
        for facets in json["facets"]["build_id"]:
            for tf in facets["facets"]["topmost_filenames"]:
                m = pat.match(tf["term"])
                if m:
                    # Weight each revision by its crash count.
                    data[m.group(1)] += tf["count"]

    params = {
        "product": product,
        "release_channel": channel,
        "build_id": buildid,
        "date": search_date,
        "topmost_filenames": '@"hg:hg.mozilla.org/".*:[0-9a-f]+',
        "_aggs.build_id": "topmost_filenames",
        "_results_number": 0,
        "_facets": "product",
        "_facets_size": 100,
    }
    data = defaultdict(int)
    socorro.SuperSearch(params=params, handler=handler, handlerdata=data).wait()
    chgset = None
    if data:
        # The most frequent revision wins.
        chgset, _ = max(data.items(), key=lambda p: p[1])
        chgset = utils.short_rev(chgset)
    logger.info("Get changeset: finished.")
    return chgset