treeherder/intermittents_commenter/commenter.py (442 lines of code) (raw):

import logging
import re
import time
from dataclasses import dataclass, field
from datetime import date, datetime, timedelta

import requests
from django.conf import settings
from django.db.models import Count
from jinja2 import Template
from requests.exceptions import RequestException

from treeherder.intermittents_commenter.constants import (
    COMPONENTS,
    WHITEBOARD_NEEDSWORK_OWNER,
)
from treeherder.model.models import BugJobMap, Bugscache, OptionCollection

from . import fetch

logger = logging.getLogger(__name__)

# Number of bug ids sent to Bugzilla in a single metadata request.
_BUG_DETAILS_BATCH_SIZE = 600

# Matches any "[stockwell needswork...]" whiteboard label (e.g. "needswork:owner")
# together with the whitespace around it.
_NEEDSWORK_LABEL_RE = re.compile(r"\s*\[stockwell needswork[^\]]*\]\s*")

# Summary markers of test-verify style failures; compared case-insensitively.
_TEST_VERIFY_MARKERS = (
    " TV ",
    " TV-nofis ",
    "[TV]",
    " TVW ",
    "[TVW]",
    " TC ",
    "[TC]",
    " TCW ",
    "[TCW]",
)

# A summary is only parsed when "<extension> | single" appears in it.
_TEST_FILE_EXTENSIONS = (
    "html",
    "html (finished)",
    "js",
    "js (finished)",
    "py",
    "htm",
    "xht",
    "svg",
    "mp4",
)


@dataclass
class BugsDetailsPerPlatform:
    total: int = 0
    per_build_type: dict[str, int] = field(
        default_factory=dict
    )  # {build_type1: 2, build_type2: 1, ...}


@dataclass
class BugRunInfo:
    """Platform, architecture, build type and variant parsed from a job type name."""

    platform: str = ""
    arch: str = ""
    os_name: str = ""
    os_version: str = ""
    build_type: str = ""
    current_variant: str = ""
    variants: set[str] = field(default_factory=set)


@dataclass
class BugsDetails:
    """Failure counters accumulated for one bug by Commenter.build_bug_map."""

    total: int = 0
    test_variants: set = field(default_factory=set)
    per_repositories: dict[str, int] = field(default_factory=dict)  # {repo1: 1, repo2: 2, ...}
    data_table: dict[str, dict[str, int]] = field(
        default_factory=dict
    )  # {platform_and_build1: {variant1: 3, variant2: 1}, ...}


class Commenter:
    """Handles fetching, composing and submitting bug comments based on
    daily or weekly thresholds and date range, and updating whiteboard
    and priority status as need; if in dry_run, comments will be output
    to stdout rather than submitting to bugzilla."""

    # Lazily populated caches of the data returned by the ``fetch`` module.
    test_variants = None
    manifests = None
    testrun_matrix = None

    def __init__(self, weekly_mode, dry_run=False):
        self.weekly_mode = weekly_mode
        self.dry_run = dry_run
        self.session = self.new_request()

    def run(self):
        """Compute the date ranges, build all bug changes and print or submit them."""
        startday, endday = self.calculate_date_strings(self.weekly_mode, 6)
        # The alternate range always spans yesterday and the 21 days before it.
        alt_startday, alt_endday = self.calculate_date_strings(True, 21)
        all_bug_changes = self.generate_bug_changes(startday, endday, alt_startday, alt_endday)
        self.print_or_submit_changes(all_bug_changes)

    def generate_bug_changes(self, startday, endday, alt_startday, alt_endday):
        """Returns a list of dicts containing a bug id, a bug comment (only
        for bugs whose total number of daily or weekly occurrences meet
        the appropriate threshold) and potentially an updated whiteboard
        or priority status.

        Each entry has the shape
        ``{"bug_id": ..., "changes": {"comment": {"body": ...}, ...}}`` where
        ``changes`` may additionally carry ``"whiteboard"`` and ``"priority"``.
        """
        bug_ids, bugs = self.get_bugs(startday, endday)
        option_collection_map = OptionCollection.objects.get_option_collection_map()
        bug_map = self.build_bug_map(bugs, option_collection_map)

        alt_date_bug_totals = self.get_alt_date_bug_totals(alt_startday, alt_endday, bug_ids)

        # if fetch_bug_details fails, None is returned
        bugs_info = self.fetch_all_bug_details(bug_ids)

        with open("treeherder/intermittents_commenter/comment.template") as template_file:
            template = Template(template_file.read())

        # In weekly mode the 50 bugs with the most failures get a 1-based rank;
        # a dict gives constant-time rank lookups inside the loop below.
        rank_by_bug = {}
        if self.weekly_mode:
            top_bugs = sorted(bug_map, key=lambda bug_id: bug_map[bug_id].total, reverse=True)[:50]
            rank_by_bug = {bug_id: rank for rank, bug_id in enumerate(top_bugs, start=1)}

        all_bug_changes = []
        for bug_id, counts in bug_map.items():
            change_priority = None
            change_whiteboard = None
            priority = 0
            rank = rank_by_bug.get(bug_id)

            if bugs_info and bug_id in bugs_info:
                bug_info = bugs_info[bug_id]
                if self.weekly_mode:
                    priority = self.assign_priority(counts)
                    if priority == 2:
                        change_priority, change_whiteboard = self.check_needswork_owner(bug_info)

                    # change [stockwell needswork] to [stockwell unknown] when failures drop below 20 failures/week
                    # if this block is true, it implies a priority of 0 (mutually exclusive to previous block)
                    if counts.total < 20:
                        change_whiteboard = self.check_needswork(bug_info["whiteboard"])
                else:
                    change_priority, change_whiteboard = self.check_needswork_owner(bug_info)

                # recommend disabling when more than 150 failures tracked over 21 days and
                # takes precedence over any previous change_whiteboard assignments
                if bug_id in alt_date_bug_totals and not self.check_whiteboard_status(
                    bug_info["whiteboard"]
                ):
                    priority = 3
                    whiteboard = bug_info["whiteboard"].replace("[stockwell unknown]", "")
                    whiteboard = _NEEDSWORK_LABEL_RE.sub("", whiteboard).strip()
                    change_whiteboard = whiteboard + "[stockwell disable-recommended]"

            comment = template.render(
                bug_id=bug_id,
                total=counts.total,
                rank=rank,
                priority=priority,
                repositories=counts.per_repositories,
                test_variants=sorted(counts.test_variants),
                data_table=counts.data_table,
                startday=startday,
                # endday carries a time component; only the date part is rendered
                endday=endday.split()[0],
                weekly_mode=self.weekly_mode,
            )

            bug_changes = {"bug_id": bug_id, "changes": {"comment": {"body": comment}}}
            if change_whiteboard:
                bug_changes["changes"]["whiteboard"] = change_whiteboard
            if change_priority:
                bug_changes["changes"]["priority"] = change_priority
            all_bug_changes.append(bug_changes)

        return all_bug_changes

    def check_needswork_owner(self, bug_info):
        """Returns a ``(change_priority, change_whiteboard)`` tuple for a bug.

        Both values stay None unless the bug's [product, component] pair is
        listed in COMPONENTS and check_whiteboard_status() is False for its
        whiteboard. In that case the priority is reset to "--" when it is not
        one of "--", "P1", "P2", "P3", and WHITEBOARD_NEEDSWORK_OWNER is
        appended to the whiteboard unless it is already one of its labels.
        """
        change_priority = None
        change_whiteboard = None
        whiteboard = bug_info["whiteboard"]

        if [
            bug_info["product"],
            bug_info["component"],
        ] in COMPONENTS and not self.check_whiteboard_status(whiteboard):
            if bug_info["priority"] not in ["--", "P1", "P2", "P3"]:
                change_priority = "--"

            stockwell_labels = re.findall(r"(\[stockwell .+?\])", whiteboard)
            # update whiteboard text unless it already contains WHITEBOARD_NEEDSWORK_OWNER
            if WHITEBOARD_NEEDSWORK_OWNER not in stockwell_labels:
                change_whiteboard = whiteboard + WHITEBOARD_NEEDSWORK_OWNER

        return change_priority, change_whiteboard

    def check_needswork(self, whiteboard):
        """Returns the whiteboard with every [stockwell needswork...] label
        (including all 'needswork' possibilities, ie 'needswork:owner')
        replaced by a single trailing "[stockwell unknown]", or None when the
        whiteboard has no such label."""
        if _NEEDSWORK_LABEL_RE.search(whiteboard) is None:
            return None

        return _NEEDSWORK_LABEL_RE.sub("", whiteboard).strip() + "[stockwell unknown]"

    def assign_priority(self, counts):
        """Returns 1 for 75 or more total failures, 2 for 30 or more, else 0."""
        if counts.total >= 75:
            return 1
        if counts.total >= 30:
            return 2
        return 0

    def print_or_submit_changes(self, all_bug_changes):
        """Logs each comment in dry_run mode, otherwise submits each change to
        bugzilla unless settings.COMMENTER_API_KEY is None; finally logs the
        number of comments handled."""
        for bug in all_bug_changes:
            if self.dry_run:
                logger.info("\n" + bug["changes"]["comment"]["body"] + "\n")
            elif settings.COMMENTER_API_KEY is None:
                # prevent duplicate comments when on stage/dev
                pass
            else:
                self.submit_bug_changes(bug["changes"], bug["bug_id"])
                # sleep between comment submissions to avoid overwhelming servers
                time.sleep(0.5)

        logger.warning(
            "There were %s comments for this %s task.",
            len(all_bug_changes),
            "weekly" if self.weekly_mode else "daily",
        )

    def calculate_date_strings(self, mode, num_days):
        """Returns a tuple of start (in YYYY-MM-DD format) and end date
        strings (in YYYY-MM-DD HH:MM:SS.ffffff format for an inclusive day).

        The end is always yesterday; the start is ``num_days`` before
        yesterday when ``mode`` is truthy and yesterday itself otherwise.
        """
        yesterday = date.today() - timedelta(days=1)
        # The last positional argument of datetime() is microseconds, so the
        # end of the range is 23:59:59.000999.
        # NOTE(review): 999999 may have been intended - confirm before changing.
        endday = datetime(yesterday.year, yesterday.month, yesterday.day, 23, 59, 59, 999)
        if mode:
            startday = yesterday - timedelta(days=num_days)
        else:
            # daily mode
            startday = yesterday

        return startday.isoformat(), endday.strftime("%Y-%m-%d %H:%M:%S.%f")

    def check_whiteboard_status(self, whiteboard):
        """Extracts stockwell text from a bug's whiteboard status to
        determine whether it matches specified stockwell text;
        returns a boolean.

        Only the first "[stockwell ...]" label is inspected; the result is True
        when the part before any ":" is "fixed", "infra" or contains "disable".
        """
        stockwell_text = re.search(r"\[stockwell (.+?)\]", whiteboard)
        if stockwell_text is None:
            return False

        text = stockwell_text.group(1).split(":")[0]
        return text in ("fixed", "infra") or "disable" in text

    def new_request(self):
        """Returns a requests session configured with retries and bugzilla headers."""
        session = requests.Session()
        # Use a custom HTTP adapter, so we can set a non-zero max_retries value.
        session.mount("https://", requests.adapters.HTTPAdapter(max_retries=3))
        session.headers = {
            "User-Agent": f"treeherder/{settings.SITE_HOSTNAME}",
            "x-bugzilla-api-key": settings.COMMENTER_API_KEY,
            "Accept": "application/json",
        }
        return session

    def fetch_bug_details(self, bug_ids):
        """Fetches bug metadata from bugzilla and returns an encoded
        dict if successful, otherwise returns None.

        None is returned when the request fails, when the response is an HTML
        page, or when the JSON payload has no "bugs" key.
        """
        params = {
            "include_fields": "product, component, priority, whiteboard, id",
            "id": bug_ids,
        }

        try:
            response = self.session.get(
                settings.BZ_API_URL + "/rest/bug",
                headers=self.session.headers,
                params=params,
                timeout=30,
            )
            response.raise_for_status()
        except RequestException as e:
            logger.warning(f"error fetching bugzilla metadata for bugs due to {e}")
            return None

        # .get() so that a response without a Content-Type header cannot raise
        if response.headers.get("Content-Type") == "text/html; charset=UTF-8":
            return None

        data = response.json()
        if "bugs" not in data:
            return None

        return data["bugs"]

    def submit_bug_changes(self, changes, bug_id):
        """PUTs ``changes`` to bugzilla for ``bug_id``; request errors are logged, not raised."""
        url = f"{settings.BZ_API_URL}/rest/bug/{bug_id}"
        try:
            response = self.session.put(url, headers=self.session.headers, json=changes, timeout=30)
            response.raise_for_status()
        except RequestException as e:
            logger.error(f"error posting comment to bugzilla for bug {bug_id} due to {e}")

    def get_bugs(self, startday, endday):
        """Get all intermittent failures per specified date range and repository.

        Returns ``(bug_ids, bugs)``: the bugzilla ids whose failure count in the
        range reaches the threshold, and one row per failure for those bugs.
        """
        # Min required failures per bug in order to post a comment
        threshold = 1 if self.weekly_mode else 15
        bug_ids = (
            BugJobMap.failures.by_date(startday, endday)
            .filter(bug__bugzilla_id__isnull=False)
            .values("bug__bugzilla_id")
            .annotate(total=Count("bug__bugzilla_id"))
            .filter(total__gte=threshold)
            .values_list("bug__bugzilla_id", flat=True)
        )

        bugs = (
            BugJobMap.failures.by_date(startday, endday)
            .filter(bug__bugzilla_id__in=bug_ids)
            .order_by("job__machine_platform__platform")
            .values(
                "job__repository__name",
                "job__machine_platform__platform",
                "job__machine_platform__architecture",
                "job__machine_platform__os_name",
                "bug__bugzilla_id",
                "job__option_collection_hash",
                "job__signature__job_type_name",
            )
        )

        return bug_ids, bugs

    def get_test_variant(self, test_suite):
        """Returns the known variant suffixes found in ``test_suite`` joined
        with "-", or "no_variant" when none is found."""
        if self.test_variants is None:
            self.test_variants = fetch.fetch_test_variants()
        test_variants = self.test_variants

        # iterate through variants, allow for Base-[variant_list];
        # suffixes are tried longest first
        variant_symbols = sorted(
            (details["suffix"] for details in test_variants.values() if details.get("suffix")),
            key=len,
            reverse=True,
        )

        # strip known variants
        # build a list of known variants
        base_symbol = test_suite
        found_variants = []
        for variant in variant_symbols:
            if f"-{variant}-" in base_symbol or base_symbol.endswith(variant):
                found_variants.append(variant)
                base_symbol = base_symbol.replace(f"-{variant}", "")

        if not found_variants:
            return "no_variant"
        return "-".join(found_variants)

    def get_all_test_variants(self, bug_run_info, testrun_os_matrix):
        """
        Try to provide a mapping between the artifact giving the manifest and the data available in treeherder.
        TODO: not very consistent
        """
        variants = set()
        for key_version in testrun_os_matrix:
            if key_version.replace(".", "") in bug_run_info.os_version:
                for key_arch in testrun_os_matrix[key_version]:
                    # NOTE(review): this overwrites rather than accumulates, so only
                    # the last matching version/arch entry is kept - confirm intent.
                    variants = set(testrun_os_matrix[key_version][key_arch].keys())
        variants.add("no_variant")  # this is an assumption, we might not always have this
        return variants

    def get_bug_run_info(self, bug):
        """Parses ``bug["job__signature__job_type_name"]`` into a BugRunInfo."""
        all_platforms = ["linux", "mac", "windows", "android"]
        info = BugRunInfo()
        raw_data = bug["job__signature__job_type_name"]

        # platform, os, version; "linux" is the fallback when no part matches
        info.platform = "linux"
        info.os_name = "linux"
        for substr in raw_data.split("-"):
            os_name = next((name for name in all_platforms if name in substr), None)
            if os_name is not None:
                info.platform = substr
                info.os_name = os_name
                info.os_version = substr.replace(os_name, "")
                break

        # architecture
        info.arch = "x86"
        if "-64" in raw_data:
            info.arch = "x86_64"
        elif "-aarch64" in raw_data:
            info.arch = "aarch64"

        # variant
        info.current_variant = self.get_test_variant(raw_data)
        info.variants.add(info.current_variant)

        # build_type
        # build types can be asan/opt, etc.,
        # so make sure that we search for 'debug' and 'opt' after other build_types
        build_types = ["asan", "tsan", "ccov", "debug", "opt"]
        info.build_type = next(
            (b_type for b_type in build_types if b_type in raw_data), "unknown build"
        )

        return info

    def build_bug_map(self, bugs, option_collection_map):
        """Build bug_map, a dict of BugsDetails keyed by bugzilla id.

        ``option_collection_map`` is accepted for interface compatibility and
        is not used here.

         eg:
        {
            "1206327": {
                "total": 5,
                "per_repository": {
                    "fx-team": 2,
                    "autoland": 3
                },
                "test_variants": {'no-variant', 'swr', ...},
                "data_table": {
                    "windows10-64/ccov": {
                        "mochitest-browser-chrome": 0,
                        "mochitest-browser-chrome-swr": 2,
                    },
                    "windows10-64/debug": {
                        "mochitest-browser-chrome-swr": 2,
                    },
                    "osx-10-10/debug": {
                        "mochitest-browser-chrome": 2,
                        "mochitest-browser-chrome-swr": 0,
                    },
                },
            },
        }
        """
        bug_map = {}
        bug_rows = list(bugs)
        # There is one row per failure, so deduplicate the ids used in the query.
        bug_summaries = Bugscache.objects.filter(
            bugzilla_id__in={row["bug__bugzilla_id"] for row in bug_rows}
        ).values("summary", "bugzilla_id")

        # The manifest only depends on the bug, so resolve it once per bug
        # rather than once per failure row.
        manifest_by_bug = {}

        for bug in bug_rows:
            bug_id = bug["bug__bugzilla_id"]
            if bug_id not in manifest_by_bug:
                manifest_by_bug[bug_id] = self.get_test_manifest(
                    bug_summaries.filter(bugzilla_id=bug_id)
                )
            manifest = manifest_by_bug[bug_id]

            bug_testrun_matrix = {}
            if manifest:
                if self.testrun_matrix is None:
                    self.testrun_matrix = fetch.fetch_testrun_matrix()
                # a manifest missing from the matrix contributes no extra variants
                bug_testrun_matrix = self.testrun_matrix.get(manifest) or {}

            bug_run_info = self.get_bug_run_info(bug)
            all_variants = bug_run_info.variants
            if bug_testrun_matrix and bug_run_info.os_name in bug_testrun_matrix:
                testrun_os_matrix = bug_testrun_matrix[bug_run_info.os_name]
                all_variants |= self.get_all_test_variants(bug_run_info, testrun_os_matrix)

            repo = bug["job__repository__name"]
            test_variant = bug_run_info.current_variant
            if bug_run_info.arch:
                platform_and_build = (
                    f"{bug_run_info.platform}-{bug_run_info.arch}/{bug_run_info.build_type}"
                )
            else:
                platform_and_build = f"{bug_run_info.platform}/{bug_run_info.build_type}"

            if bug_id not in bug_map:
                bug_map[bug_id] = BugsDetails()
            bug_infos = bug_map[bug_id]
            bug_infos.total += 1
            bug_infos.test_variants |= all_variants
            bug_infos.per_repositories[repo] = bug_infos.per_repositories.get(repo, 0) + 1
            # data_table: failures per platform/build, then per variant
            variant_counts = bug_infos.data_table.setdefault(platform_and_build, {})
            variant_counts[test_variant] = variant_counts.get(test_variant, 0) + 1

        return bug_map

    def get_alt_date_bug_totals(self, startday, endday, bug_ids):
        """use previously fetched bug_ids to check for total failures
        exceeding 150 in 21 days

        Returns ``{bugzilla_id: total}`` for the bugs with at least 150
        failures in the range. The keys are bugzilla ids so that they can be
        compared with the keys produced by get_bugs() and build_bug_map().
        """
        bugs = (
            BugJobMap.failures.by_date(startday, endday)
            .filter(bug__bugzilla_id__in=bug_ids)
            .values("bug__bugzilla_id")
            .annotate(total=Count("id"))
            .values("bug__bugzilla_id", "total")
        )

        return {bug["bug__bugzilla_id"]: bug["total"] for bug in bugs if bug["total"] >= 150}

    def fetch_all_bug_details(self, bug_ids):
        """batch requests for bugzilla data in groups of 600 ids, which keeps
        each request url short

        Returns a dict of bug metadata keyed by bug id, or None when no
        metadata could be fetched at all.
        """
        bugs_list = []
        # range() stops before len(bug_ids), so no request is ever made with an
        # empty id list (also when the count is an exact multiple of the batch size)
        for start in range(0, len(bug_ids), _BUG_DETAILS_BATCH_SIZE):
            data = self.fetch_bug_details(bug_ids[start : start + _BUG_DETAILS_BATCH_SIZE])
            if data:
                bugs_list += data

        return {bug["id"]: bug for bug in bugs_list} if bugs_list else None

    def get_tests_from_manifests(self):
        """Returns ``{test_path: [manifest, ...]}`` built from the fetched test manifests."""
        if self.manifests is None:
            self.manifests = fetch.fetch_test_manifests()

        all_tests = {}
        for items in self.manifests["tests"].values():
            for item in items:
                # split(':') allows for parent:child where we want to keep parent
                parent_manifest = item["manifest"][0].split(":")[0]
                all_tests.setdefault(item["test"], []).append(parent_manifest)
        return all_tests

    def fix_wpt_name(self, test_name):
        """Maps a web-platform-test name from a bug summary to its source path."""
        # TODO: keep this updated with wpt changes to:
        # https://searchfox.org/mozilla-central/source/testing/web-platform/tests/tools/serve/serve.py#273
        if (
            ".https.any.shadowrealm-in-serviceworker.html" in test_name
            or ".https.any.shadowrealm-in-audioworklet.html" in test_name
        ):
            test_name = f"{test_name.split('.https.any.')[0]}.any.js"
        elif ".any." in test_name:
            test_name = f"{test_name.split('.any.')[0]}.any.js"

        test_name = test_name.replace(".window.html", ".window.js")
        test_name = test_name.replace(".worker.html", ".worker.js")

        if test_name.startswith("/mozilla/tests"):
            test_name = test_name.replace("/mozilla/", "mozilla/")

        if test_name.startswith("mozilla/tests"):
            test_name = f"testing/web-platform/{test_name}"
        else:
            test_name = "testing/web-platform/tests/" + test_name.strip("/")

        # some wpt tests have params, those are not supported
        return test_name.split("?")[0]

    def get_test_manifest(self, bug_summaries):
        """Returns the first manifest of the first summary in ``bug_summaries``
        that names a known test, or None when no summary does."""
        all_tests = self.get_tests_from_manifests()

        for bug_summary_dict in bug_summaries:
            summary = bug_summary_dict["summary"]

            # ensure format we want
            if "| single tracking bug" not in summary:
                continue

            # ignore chrome://, file://, resource://, http[s]://, etc.
            if "://" in summary:
                continue

            # ignore test-verify as these run only on demand when the specific test is modified
            summary_lower = summary.lower()
            if any(marker.lower() in summary_lower for marker in _TEST_VERIFY_MARKERS):
                continue

            # now parse and try to find file in list of tests
            if not any(f"{ext} | single" in summary for ext in _TEST_FILE_EXTENSIONS):
                continue
            summary = summary.replace(" (finished)", "")

            # get <test_name> from: "TEST-UNEXPECTED-FAIL | <test_name> | single tracking bug"
            # TODO: fix reftest
            test_name = summary.split("|")[-2].strip()
            if " == " in test_name or " != " in test_name:
                test_name = test_name.split(" ")[0]
            else:
                test_name = test_name.split(" ")[-1]

            # comm/ is thunderbird, not in mozilla-central repo
            # "-ref" is related to a reftest reference file, not what we want to target
            # if no <path>/<filename>, then we won't be able to find in repo, ignore
            if test_name.startswith("comm/") or "-ref" in test_name or "/" not in test_name:
                continue

            # handle known WPT mapping
            if test_name.startswith(("/", "mozilla/tests")):
                test_name = self.fix_wpt_name(test_name)

            if test_name not in all_tests:
                # try reftest:
                if f"layout/reftests/{test_name}" in all_tests:
                    test_name = f"layout/reftests/{test_name}"
                else:
                    # unknown test
                    # TODO: we get here for a few reasons:
                    #  1) test has moved in the source tree
                    #  2) test has typo in summary
                    #  3) test has been deleted from the source tree
                    #  4) sometimes test was deleted but is valid on beta
                    continue

            # matching test- we can access manifest
            return all_tests[test_name][0]

        return None