bugbug/bug_features.py (635 lines of code) (raw):
# -*- coding: utf-8 -*-
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
import re
import sys
from collections import defaultdict
from datetime import datetime, timezone
from functools import partial
from multiprocessing.pool import Pool
import pandas as pd
from dateutil import parser
from libmozdata import versions
from libmozdata.bugzilla import Bugzilla
from sklearn.base import BaseEstimator, TransformerMixin
from bugbug import bug_snapshot, bugzilla, repository, utils
# Import-time side effect: run bugbug's libmozdata setup helper as soon as this
# module is loaded (the module uses libmozdata's `versions` and `Bugzilla`).
utils.setup_libmozdata()
def field(bug, field):
    """Return ``bug[field]``, or ``None`` when it is absent or a placeholder.

    The values ``"--"`` and ``"---"`` are treated as "not set" and mapped to
    ``None``.
    """
    if field not in bug:
        return None

    value = bug[field]
    return None if value in ("--", "---") else value
class SingleBugFeature:
    """Base class for the per-bug feature extractors defined in this module."""
class HasSTR(SingleBugFeature):
    """Expose the bug's ``cf_has_str`` field as a feature."""

    name = "Has STR"

    def __call__(self, bug, **kwargs):
        # `field` yields None when the value is missing or a "--"/"---" placeholder.
        has_str = field(bug, "cf_has_str")
        return has_str
class HasRegressionRange(SingleBugFeature):
    """Expose the bug's ``cf_has_regression_range`` field as a feature."""

    name = "Has Regression Range"

    def __call__(self, bug, **kwargs):
        # `field` yields None when the value is missing or a "--"/"---" placeholder.
        has_range = field(bug, "cf_has_regression_range")
        return has_range
class HasCrashSignature(SingleBugFeature):
    """Tell whether the bug carries a non-empty ``cf_crash_signature``."""

    name = "Crash signature present"

    def __call__(self, bug, **kwargs):
        if "cf_crash_signature" not in bug:
            return False
        return bug["cf_crash_signature"] != ""
class Keywords(SingleBugFeature):
    """Return the bug's keywords, minus ignored ones, plus security family markers.

    Args:
        to_ignore: Collection of keywords to drop (exact match). Defaults to
            an empty set.
        prefixes_to_ignore: Collection of prefixes; any keyword starting with
            one of them is dropped. Defaults to an empty set.
    """

    def __init__(self, to_ignore=None, prefixes_to_ignore=None):
        # Use None sentinels rather than `set()` defaults: a default set is
        # created once at definition time and would be shared (and mutable)
        # across every instance constructed without arguments.
        self.to_ignore = set() if to_ignore is None else to_ignore
        self.prefixes_to_ignore = (
            set() if prefixes_to_ignore is None else prefixes_to_ignore
        )

    def __call__(self, bug, **kwargs):
        """Return kept keywords followed by a "sec-"/"csectype-" marker per match."""
        keywords = []
        subkeywords = []
        for keyword in bug["keywords"]:
            if keyword in self.to_ignore or any(
                keyword.startswith(prefix) for prefix in self.prefixes_to_ignore
            ):
                continue

            keywords.append(keyword)

            # Also emit the bare family prefix so that all "sec-*" (or
            # "csectype-*") keywords share one common feature.
            if keyword.startswith("sec-"):
                subkeywords.append("sec-")
            elif keyword.startswith("csectype-"):
                subkeywords.append("csectype-")

        return keywords + subkeywords
class Severity(SingleBugFeature):
    """Expose the bug's ``severity`` field as a feature."""

    def __call__(self, bug, **kwargs):
        # `field` yields None when the value is missing or a "--"/"---" placeholder.
        severity = field(bug, "severity")
        return severity
class NumberOfBugDependencies(SingleBugFeature):
    """Count the entries of the bug's ``depends_on`` list."""

    name = "# of bug dependencies"

    def __call__(self, bug, **kwargs):
        dependencies = bug["depends_on"]
        return len(dependencies)
class IsCoverityIssue(SingleBugFeature):
    """Tell whether the summary or whiteboard carries a ``[CID <number>]`` tag."""

    name = "Is Coverity issue"

    # The square brackets must be escaped: unescaped, "[CID ?[0-9]+]" is parsed
    # as a character class followed by "]", which matches strings such as "D]"
    # or "9]" instead of a literal "[CID 1234]" / "[CID1234]" tag.
    _CID_TAG = re.compile(r"\[CID ?[0-9]+\]")

    def __call__(self, bug, **kwargs):
        """Return True when ``bug["summary"]`` or ``bug["whiteboard"]`` has the tag."""
        return (
            self._CID_TAG.search(bug["summary"]) is not None
            or self._CID_TAG.search(bug["whiteboard"]) is not None
        )
class HasURL(SingleBugFeature):
    """Tell whether the bug's ``url`` field is non-empty."""

    name = "Has a URL"

    def __call__(self, bug, **kwargs):
        url = bug["url"]
        return url != ""
class HasW3CURL(SingleBugFeature):
    """Tell whether the bug's ``url`` field contains the substring ``w3c``."""

    name = "Has a w3c URL"

    def __call__(self, bug, **kwargs):
        url = bug["url"]
        return "w3c" in url
class HasGithubURL(SingleBugFeature):
    """Tell whether the bug's ``url`` field contains the substring ``github``."""

    name = "Has a GitHub URL"

    def __call__(self, bug, **kwargs):
        url = bug["url"]
        return "github" in url
def whiteboard_keywords(bug):
    """Tokenize the lower-cased ``bug["whiteboard"]`` into keywords.

    The whiteboard is cut at every "[". A chunk containing "]" is cut at "]"
    only; any other chunk is cut at spaces. Empty tokens are dropped and the
    rest stripped. For every token containing ":", the part before the first
    ":" is appended after all the full tokens.
    """
    tokens = []
    for chunk in bug["whiteboard"].lower().split("["):
        # Bracketed chunks keep their inner spaces; free text is split on them.
        separator = "]" if "]" in chunk else " "
        tokens.extend(chunk.split(separator))

    cleaned = [token.strip() for token in tokens if token.strip()]
    prefixes = [token.split(":", 1)[0] for token in cleaned if ":" in token]
    return cleaned + prefixes
class Whiteboard(SingleBugFeature):
    """Expose the tokens produced by ``whiteboard_keywords`` as a feature."""

    def __call__(self, bug, **kwargs):
        tokens = whiteboard_keywords(bug)
        return tokens
class Patches(SingleBugFeature):
    """Count attachments flagged as patches or typed as review requests."""

    name = "# of patches"

    def __call__(self, bug, **kwargs):
        review_request_types = (
            "text/x-review-board-request",
            "text/x-phabricator-request",
        )

        patch_count = 0
        for attachment in bug["attachments"]:
            if (
                attachment["is_patch"]
                or attachment["content_type"] in review_request_types
            ):
                patch_count += 1
        return patch_count
class Landings(SingleBugFeature):
    """Count comments whose text contains an ``hg.mozilla.org`` URL."""

    name = "# of landing comments"

    def __call__(self, bug, **kwargs):
        landing_comments = [
            comment
            for comment in bug["comments"]
            if "://hg.mozilla.org/" in comment["text"]
        ]
        return len(landing_comments)
class Product(SingleBugFeature):
    """Return the bug's ``product`` value."""

    def __call__(self, bug, **kwargs):
        product = bug["product"]
        return product
class Component(SingleBugFeature):
    """Return the bug's ``component`` value."""

    def __call__(self, bug, **kwargs):
        component = bug["component"]
        return component
class IsMozillian(SingleBugFeature):
    """Tell whether the reporter's email ends in a Mozilla domain."""

    name = "Reporter has a @mozilla email"

    def __call__(self, bug, **kwargs):
        reporter_email = bug["creator_detail"]["email"]
        # str.endswith accepts a tuple of suffixes and matches any of them.
        return reporter_email.endswith(("@mozilla.com", "@mozilla.org"))
class BugReporter(SingleBugFeature):
    """Return the email found under ``bug["creator_detail"]["email"]``."""

    name = "Bug reporter"

    def __call__(self, bug, **kwargs):
        creator = bug["creator_detail"]
        return creator["email"]
class DeltaRequestMerge(SingleBugFeature):
    """Days between the first ``approval-mozilla*`` change and a release date.

    The release date is the second element of
    ``versions.getCloserRelease(request_time)``. Returns ``None`` when the
    history holds no such change.
    """

    name = "Timespan between uplift request and following merge"

    def __call__(self, bug, **kwargs):
        for history in bug["history"]:
            for change in history["changes"]:
                if not change["added"].startswith("approval-mozilla"):
                    continue

                # History timestamps carry a literal "Z"; tag them as UTC.
                requested_at = datetime.strptime(
                    history["when"], "%Y-%m-%dT%H:%M:%SZ"
                ).replace(tzinfo=timezone.utc)
                release_date = versions.getCloserRelease(requested_at)[1]
                delta = release_date - requested_at
                # Whole days plus the fractional part of the last day.
                return delta.days + delta.seconds / (24 * 60 * 60)

        return None
class DeltaNightlyRequestMerge(SingleBugFeature):
    """Days between the latest earlier landing comment and an uplift request.

    An uplift request is a history change whose ``added`` value starts with
    ``approval-mozilla`` and ends with ``?``. Requests with no landing at or
    before them are skipped; ``None`` is returned when none qualifies.
    """

    name = "Time delta between landing of the patch in Nightly and uplift request"

    def __call__(self, bug, **kwargs):
        for history in bug["history"]:
            for change in history["changes"]:
                added = change["added"]
                is_uplift_request = added.startswith(
                    "approval-mozilla"
                ) and added.endswith("?")
                if not is_uplift_request:
                    continue

                requested_at = parser.parse(history["when"])

                landing_comments = Bugzilla.get_landing_comments(
                    bug["comments"], ["nightly"]
                )

                # Keep only landings that happened at or before the request,
                # so the most recent of them is the closest preceding landing.
                earlier_landings = [
                    landed_at
                    for landed_at in (
                        parser.parse(landing["comment"]["creation_time"])
                        for landing in landing_comments
                    )
                    if requested_at >= landed_at
                ]

                if earlier_landings:
                    delta = requested_at - max(earlier_landings)
                    return delta.days + delta.seconds / (24 * 60 * 60)

        return None
class BlockedBugsNumber(SingleBugFeature):
    """Count the entries of the bug's ``blocks`` list."""

    name = "# of blocked bugs"

    def __call__(self, bug, **kwargs):
        blocked = bug["blocks"]
        return len(blocked)
class Priority(SingleBugFeature):
    """Expose the bug's ``priority`` field as a feature."""

    def __call__(self, bug, **kwargs):
        # `field` yields None when the value is missing or a "--"/"---" placeholder.
        priority = field(bug, "priority")
        return priority
class Version(SingleBugFeature):
    """Bucket the bug's ``version`` into "Trunk", "other", None or "Has Value"."""

    def __call__(self, bug, **kwargs):
        version = bug["version"]

        if version in ("Default", "Trunk", "trunk"):
            return "Trunk"
        if version in ("other", "Other Branch"):
            return "other"
        if version == "unspecified":
            return None
        # Any other value is collapsed into a single bucket.
        return "Has Value"
class TargetMilestone(SingleBugFeature):
    """Bucket ``target_milestone`` into "Future", None ("---") or "Has Value"."""

    def __call__(self, bug, **kwargs):
        milestone = bug["target_milestone"]

        if milestone == "Future":
            return "Future"
        if milestone == "---":
            return None
        # Any other value is collapsed into a single bucket.
        return "Has Value"
class HasCVEInAlias(SingleBugFeature):
    """Tell whether the bug's ``alias`` is set and contains ``CVE``."""

    name = "CVE in alias"

    def __call__(self, bug, **kwargs):
        alias = bug["alias"]
        if alias is None:
            return False
        return "CVE" in alias
class CommentCount(SingleBugFeature):
    """Expose the bug's ``comment_count`` field as a feature."""

    name = "# of comments"

    def __call__(self, bug, **kwargs):
        # `field` yields None when the value is missing or a "--"/"---" placeholder.
        comment_count = field(bug, "comment_count")
        return comment_count
class CommentLength(SingleBugFeature):
    """Sum the character lengths of all comment texts."""

    name = "Length of comments"

    def __call__(self, bug, **kwargs):
        total_length = 0
        for comment in bug["comments"]:
            total_length += len(comment["text"])
        return total_length
class ReporterExperience(SingleBugFeature):
    """Pass through the ``reporter_experience`` value supplied by the caller."""

    name = "# of bugs previously opened by the reporter"

    def __call__(self, bug, reporter_experience, **kwargs):
        # The count is computed by the caller; the bug itself is not inspected.
        experience = reporter_experience
        return experience
class EverAffected(SingleBugFeature):
    """Tell whether any ``cf_status_firefox*`` field was ever set to ``affected``."""

    name = "status has ever been set to 'affected'"

    def __call__(self, bug, **kwargs):
        return any(
            change["field_name"].startswith("cf_status_firefox")
            and change["added"] == "affected"
            for history in bug["history"]
            for change in history["changes"]
        )
def get_versions_statuses(bug):
    """Split the bug's Firefox status flags into unaffected and affected versions.

    Keys starting with ``cf_status_firefox_esr`` or ``cf_status_firefox`` are
    inspected; the text after the prefix is taken as the version. Returns a
    ``(unaffected, affected)`` tuple of version-string lists. Statuses other
    than those listed below are ignored.
    """
    # The longer ESR prefix must be tried first since it also starts with the
    # generic prefix.
    status_prefixes = ("cf_status_firefox_esr", "cf_status_firefox")
    affected_statuses = (
        "affected",
        "fixed",
        "wontfix",
        "fix-optional",
        "verified",
        "disabled",
        "verified disabled",
    )

    unaffected_versions = []
    affected_versions = []

    for key, status in bug.items():
        for prefix in status_prefixes:
            if key.startswith(prefix):
                version = key[len(prefix) :]
                break
        else:
            # Not a Firefox status flag.
            continue

        if status == "unaffected":
            unaffected_versions.append(version)
        elif status in affected_statuses:
            affected_versions.append(version)

    return unaffected_versions, affected_versions
class AffectedThenUnaffected(SingleBugFeature):
    """Tell whether some unaffected version precedes some affected version.

    Versions come from ``get_versions_statuses`` as strings taken from the
    status flag names.
    """

    name = "status has ever been set to 'affected' and 'unaffected'"

    @staticmethod
    def _precedes(left, right):
        """Return True when version *left* is lower than version *right*.

        Purely numeric versions are compared as integers: comparing the raw
        strings would rank "99" after "100". Anything else falls back to
        plain string comparison.
        """
        if left.isdecimal() and right.isdecimal():
            return int(left) < int(right)
        return left < right

    def __call__(self, bug, **kwargs):
        """Return True if any unaffected version is lower than an affected one."""
        unaffected, affected = get_versions_statuses(bug)
        return any(
            self._precedes(unaffected_ver, affected_ver)
            for unaffected_ver in unaffected
            for affected_ver in affected
        )
class NumWordsTitle(SingleBugFeature):
    """Count the whitespace-separated words of the bug summary."""

    def __call__(self, bug, **kwargs):
        words = bug["summary"].split()
        return len(words)
class NumWordsComments(SingleBugFeature):
    """Count the whitespace-separated words across all comment texts."""

    def __call__(self, bug, **kwargs):
        word_count = 0
        for comment in bug["comments"]:
            word_count += len(comment["text"].split())
        return word_count
class HasAttachment(SingleBugFeature):
    """Tell whether the bug has at least one attachment."""

    name = "Attachment present"

    def __call__(self, bug, **kwargs):
        attachment_count = len(bug["attachments"])
        return attachment_count > 0
class HasImageAttachmentAtBugCreation(SingleBugFeature):
    """Tell whether an image attachment shares the bug's creation timestamp."""

    name = "Image attachment present at bug creation"

    def __call__(self, bug, **kwargs):
        for attachment in bug["attachments"]:
            if (
                "image" in attachment["content_type"]
                and attachment["creation_time"] == bug["creation_time"]
            ):
                return True
        return False
class HasImageAttachment(SingleBugFeature):
    """Tell whether any attachment's content type contains ``image``."""

    name = "Image attachment present"

    def __call__(self, bug, **kwargs):
        for attachment in bug["attachments"]:
            if "image" in attachment["content_type"]:
                return True
        return False
class CommitAdded(SingleBugFeature):
    """Sum the ``added`` values of the bug's commits that were not backed out."""

    def __call__(self, bug, **kwargs):
        total_added = 0
        for commit in bug["commits"]:
            if commit["backedoutby"]:
                continue
            total_added += commit["added"]
        return total_added
class CommitDeleted(SingleBugFeature):
    """Sum the ``deleted`` values of the bug's commits that were not backed out."""

    def __call__(self, bug, **kwargs):
        total_deleted = 0
        for commit in bug["commits"]:
            if commit["backedoutby"]:
                continue
            total_deleted += commit["deleted"]
        return total_deleted
class CommitTypes(SingleBugFeature):
    """Concatenate the ``types`` lists of the commits that were not backed out."""

    def __call__(self, bug, **kwargs):
        all_types = []
        for commit in bug["commits"]:
            if commit["backedoutby"]:
                continue
            # List concatenation, as `sum(..., [])` would do.
            all_types = all_types + commit["types"]
        return all_types
class CommitFilesModifiedNum(SingleBugFeature):
    """Sum ``files_modified_num`` over the commits that were not backed out."""

    def __call__(self, bug, **kwargs):
        files_modified = 0
        for commit in bug["commits"]:
            if commit["backedoutby"]:
                continue
            files_modified += commit["files_modified_num"]
        return files_modified
class CommitAuthorExperience(SingleBugFeature):
    """Average ``author_experience`` over the commits that were not backed out."""

    def __call__(self, bug, **kwargs):
        """Return the mean, or ``None`` when no commit survives the filter."""
        experiences = [
            commit["author_experience"]
            for commit in bug["commits"]
            if not commit["backedoutby"]
        ]
        # No commits, or only backed-out ones: there is nothing to average and
        # dividing by len() would raise ZeroDivisionError.
        if not experiences:
            return None
        return sum(experiences) / len(experiences)
class CommitAuthorExperience90Days(SingleBugFeature):
    """Average ``author_experience_90_days`` over commits that were not backed out."""

    def __call__(self, bug, **kwargs):
        """Return the mean, or ``None`` when no commit survives the filter."""
        experiences = [
            commit["author_experience_90_days"]
            for commit in bug["commits"]
            if not commit["backedoutby"]
        ]
        # No commits, or only backed-out ones: there is nothing to average and
        # dividing by len() would raise ZeroDivisionError.
        if not experiences:
            return None
        return sum(experiences) / len(experiences)
class CommitReviewerExperience(SingleBugFeature):
    """Average ``reviewer_experience`` over the commits that were not backed out."""

    def __call__(self, bug, **kwargs):
        """Return the mean, or ``None`` when no commit survives the filter."""
        experiences = [
            commit["reviewer_experience"]
            for commit in bug["commits"]
            if not commit["backedoutby"]
        ]
        # No commits, or only backed-out ones: there is nothing to average and
        # dividing by len() would raise ZeroDivisionError.
        if not experiences:
            return None
        return sum(experiences) / len(experiences)
class CommitReviewerExperience90Days(SingleBugFeature):
    """Average ``reviewer_experience_90_days`` over commits that were not backed out."""

    def __call__(self, bug, **kwargs):
        """Return the mean, or ``None`` when no commit survives the filter."""
        experiences = [
            commit["reviewer_experience_90_days"]
            for commit in bug["commits"]
            if not commit["backedoutby"]
        ]
        # No commits, or only backed-out ones: there is nothing to average and
        # dividing by len() would raise ZeroDivisionError.
        if not experiences:
            return None
        return sum(experiences) / len(experiences)
class CommitNoOfBackouts(SingleBugFeature):
    """Count the bug's commits that have a truthy ``backedoutby`` value."""

    def __call__(self, bug, **kwargs):
        backed_out = [commit for commit in bug["commits"] if commit["backedoutby"]]
        return len(backed_out)
class ComponentsTouched(SingleBugFeature):
    """List the distinct components touched by commits that were not backed out."""

    def __call__(self, bug, **kwargs):
        touched = set()
        for commit in bug["commits"]:
            for component in commit["components"]:
                if not commit["backedoutby"]:
                    touched.add(component)
        # Order of the resulting list follows set iteration order.
        return list(touched)
class ComponentsTouchedNum(SingleBugFeature):
    """Count the distinct components touched by commits that were not backed out."""

    def __call__(self, bug, **kwargs):
        touched = set()
        for commit in bug["commits"]:
            for component in commit["components"]:
                if not commit["backedoutby"]:
                    touched.add(component)
        return len(touched)
class Platform(SingleBugFeature):
    """Return the bug's ``platform`` value."""

    def __call__(self, bug, **kwargs):
        platform = bug["platform"]
        return platform
class OpSys(SingleBugFeature):
    """Return the bug's ``op_sys`` value."""

    def __call__(self, bug, **kwargs):
        operating_system = bug["op_sys"]
        return operating_system
class FiledVia(SingleBugFeature):
    """Return the bug's ``filed_via`` value."""

    def __call__(self, bug, **kwargs):
        filed_via = bug["filed_via"]
        return filed_via
class IsReporterADeveloper(SingleBugFeature):
    """Tell whether the stripped reporter email appears in ``author_ids``."""

    def __call__(self, bug, author_ids, **kwargs):
        reporter_email = BugReporter()(bug).strip()
        return reporter_email in author_ids
class HadSeverityEnhancement(SingleBugFeature):
    """Tell whether ``severity`` was ever changed to ``enhancement``."""

    def __call__(self, bug, **kwargs):
        return any(
            change["field_name"] == "severity" and change["added"] == "enhancement"
            for history in bug["history"]
            for change in history["changes"]
        )
def get_time_to_fix(bug):
    """Return the days from creation to last resolution for a FIXED bug.

    Returns ``None`` when the resolution is not ``FIXED`` or when
    ``cf_last_resolved`` is ``None``.
    """
    if bug["resolution"] != "FIXED":
        return None

    resolved_at = bug["cf_last_resolved"]
    if resolved_at is None:
        return None

    elapsed = parser.parse(resolved_at) - parser.parse(bug["creation_time"])
    # 86400 seconds per day.
    return elapsed.total_seconds() / 86400
class TimeToFix(SingleBugFeature):
    """Expose the result of ``get_time_to_fix`` as a feature."""

    def __call__(self, bug, **kwargs):
        days_to_fix = get_time_to_fix(bug)
        return days_to_fix
def get_time_to_assign(bug):
    """Return the days from creation to the first move into ASSIGNED.

    Only status changes from ``UNCONFIRMED`` or ``NEW`` to ``ASSIGNED`` count.
    Returns ``None`` when the history holds no such change.
    """
    for history in bug["history"]:
        for change in history["changes"]:
            became_assigned = (
                change["field_name"] == "status"
                and change["removed"] in ("UNCONFIRMED", "NEW")
                and change["added"] == "ASSIGNED"
            )
            if not became_assigned:
                continue

            elapsed = parser.parse(history["when"]) - parser.parse(
                bug["creation_time"]
            )
            # 86400 seconds per day.
            return elapsed.total_seconds() / 86400

    return None
class TimeToAssign(SingleBugFeature):
    """Expose the result of ``get_time_to_assign`` as a feature."""

    def __call__(self, bug, **kwargs):
        days_to_assign = get_time_to_assign(bug)
        return days_to_assign
def get_time_to_close(bug):
    """Return the bug's age in days, measured up to its last resolution.

    When ``cf_last_resolved`` is set, the span runs from creation to that
    timestamp; otherwise it runs from creation to the current UTC time.
    """
    created_at = bug["creation_time"]

    if bug["cf_last_resolved"]:
        elapsed = parser.parse(bug["cf_last_resolved"]) - parser.parse(created_at)
    else:
        # Not resolved: measure the age of the bug so far.
        elapsed = datetime.now(timezone.utc) - parser.parse(created_at)

    # 86400 seconds per day.
    return elapsed.total_seconds() / 86400
class TimeToClose(SingleBugFeature):
    """Expose the result of ``get_time_to_close`` as a feature."""

    def __call__(self, bug, **kwargs):
        days_to_close = get_time_to_close(bug)
        return days_to_close
class CCNumber(SingleBugFeature):
    """Count the entries of the bug's ``cc`` list."""

    def __call__(self, bug, **kwargs):
        cc_list = bug["cc"]
        return len(cc_list)
class IsUplifted(SingleBugFeature):
    """Tell whether the history holds a granted ``approval-mozilla*`` flag.

    A granted flag is an ``added`` value that starts with ``approval-mozilla``
    and ends with ``+``.
    """

    def __call__(self, bug, **kwargs):
        for history in bug["history"]:
            for change in history["changes"]:
                added = change["added"]
                if added.startswith("approval-mozilla") and added.endswith("+"):
                    return True
        return False
class Resolution(SingleBugFeature):
    """Return the bug's ``resolution`` value."""

    def __call__(self, bug, **kwargs):
        resolution = bug["resolution"]
        return resolution
class Status(SingleBugFeature):
    """Return the bug's ``status`` value."""

    def __call__(self, bug, **kwargs):
        status = bug["status"]
        return status
def get_author_ids():
    """Return the set of ``author_email`` values over ``repository.get_commits()``."""
    return {commit["author_email"] for commit in repository.get_commits()}
class BugExtractor(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer turning bugs into a DataFrame of features and text.

    Each bug becomes one row holding a dict of extracted features ("data"),
    the cleaned summary ("title"), the cleaned first comment
    ("first_comment") and all cleaned comments joined by spaces ("comments").
    """

    def __init__(
        self,
        feature_extractors,
        cleanup_functions,
        rollback=False,
        rollback_when=None,
        commit_data=False,
        merge_data=True,
    ):
        """Store the configuration.

        Args:
            feature_extractors: Callables invoked as
                ``extractor(bug, reporter_experience=..., author_ids=...)``;
                at most one instance per type is allowed.
            cleanup_functions: Callables applied, in order, to the summary and
                to every comment text; at most one instance per type is allowed.
            rollback: When true, every bug is passed through
                ``bug_snapshot.rollback`` before feature extraction.
            rollback_when: Forwarded as ``when=`` to ``bug_snapshot.rollback``.
            commit_data: When true, commit author emails are loaded and handed
                to the extractors as ``author_ids``.
            merge_data: Stored on the instance; not read anywhere in this class.
        """
        # Two extractors of the same type would write to the same feature name.
        assert len(set(type(fe) for fe in feature_extractors)) == len(
            feature_extractors
        ), "Duplicate Feature Extractors"
        self.feature_extractors = feature_extractors

        assert len(set(type(cf) for cf in cleanup_functions)) == len(
            cleanup_functions
        ), "Duplicate Cleanup Functions"
        self.cleanup_functions = cleanup_functions
        self.rollback = rollback
        self.rollback_when = rollback_when
        self.commit_data = commit_data
        self.merge_data = merge_data

    def fit(self, x, y=None):
        """Fit every extractor that exposes a ``fit`` method; return ``self``.

        ``x`` is called with no arguments and its result is passed to each
        such extractor's ``fit``.
        """
        for feature in self.feature_extractors:
            if hasattr(feature, "fit"):
                feature.fit(x())

        return self

    def transform(self, bugs):
        """Build the feature DataFrame for the bugs returned by ``bugs()``.

        ``bugs`` is called with no arguments and its result is iterated once.
        """
        bugs_iter = iter(bugs())

        # Number of bugs already seen per creator, in iteration order.
        reporter_experience_map = defaultdict(int)

        # Commit author emails are only loaded when commit data is requested.
        author_ids = get_author_ids() if self.commit_data else None

        def apply_transform(bug):
            """Return the row dict for a single bug."""
            data = {}

            for feature_extractor in self.feature_extractors:
                res = feature_extractor(
                    bug,
                    reporter_experience=reporter_experience_map[bug["creator"]],
                    author_ids=author_ids,
                )

                # Prefer the extractor's `name` attribute, else its class name.
                if hasattr(feature_extractor, "name"):
                    feature_extractor_name = feature_extractor.name
                else:
                    feature_extractor_name = feature_extractor.__class__.__name__

                # None means "no value": the feature is left out of the row.
                if res is None:
                    continue

                # Collections are expanded into one boolean feature per item;
                # the keys are interned since they repeat across many bugs.
                if isinstance(res, (list, set)):
                    for item in res:
                        data[sys.intern(f"{item} in {feature_extractor_name}")] = True
                    continue

                data[feature_extractor_name] = res

            # Incremented only after the extractors ran, so a bug does not
            # count towards its own reporter's experience.
            reporter_experience_map[bug["creator"]] += 1

            summary = bug["summary"]
            comments = [c["text"] for c in bug["comments"]]
            # Cleanup functions are chained in the order they were given.
            for cleanup_function in self.cleanup_functions:
                summary = cleanup_function(summary)
                comments = [cleanup_function(comment) for comment in comments]

            return {
                "data": data,
                "title": summary,
                "first_comment": "" if len(comments) == 0 else comments[0],
                "comments": " ".join(comments),
            }

        def apply_rollback(bugs_iter):
            """Yield each bug after ``bug_snapshot.rollback``, using a process pool."""
            with Pool() as p:
                yield from p.imap(
                    partial(bug_snapshot.rollback, when=self.rollback_when),
                    bugs_iter,
                    chunksize=1024,
                )

        if self.rollback:
            bugs_iter = apply_rollback(bugs_iter)

        return pd.DataFrame(apply_transform(bug) for bug in bugs_iter)
class IsPerformanceBug(SingleBugFeature):
    """Determine if the bug is related to performance based on given bug data."""

    name = "Is Performance Bug"
    type_name = "performance"
    keyword_prefixes = ("perf", "topperf", "main-thread-io")
    whiteboard_prefixes = (
        "[fxperf",
        "[fxperfsize",
        "[snappy",
        "[pdfjs-c-performance",
        "[pdfjs-performance",
        "[sp3",
    )

    def __call__(
        self,
        bug: bugzilla.BugDict,
        bug_map: dict[int, bugzilla.BugDict] | None = None,
    ) -> bool:
        """Check the performance impact field, then keywords, then whiteboard."""
        impact = bug.get("cf_performance_impact")
        if impact in ("low", "medium", "high"):
            return True

        for keyword in bug["keywords"]:
            for prefix in self.keyword_prefixes:
                if keyword.startswith(prefix):
                    return True

        # Whiteboard tags are matched anywhere in the lower-cased whiteboard.
        whiteboard = bug["whiteboard"].lower()
        for tag in self.whiteboard_prefixes:
            if tag in whiteboard:
                return True

        return False
class IsMemoryBug(SingleBugFeature):
    """Determine if the bug is related to memory based on given bug data."""

    name = "Is Memory Bug"
    type_name = "memory"
    keyword_prefixes = ("memory-",)
    whiteboard_prefixes = ("[overhead", "[memshrink")

    def __call__(
        self,
        bug: bugzilla.BugDict,
        bug_map: dict[int, bugzilla.BugDict] | None = None,
    ) -> bool:
        """Check blocked bugs' aliases (if a map is given), keywords, whiteboard."""
        if bug_map is not None:
            # Aliases of the blocked bugs that are present in the map.
            blocked_aliases = (
                bug_map[blocked_id]["alias"]
                for blocked_id in bug["blocks"]
                if blocked_id in bug_map
            )
            if any(
                alias and alias.startswith("memshrink") for alias in blocked_aliases
            ):
                return True

        for keyword in bug["keywords"]:
            for prefix in self.keyword_prefixes:
                if keyword.startswith(prefix):
                    return True

        # Whiteboard tags are matched anywhere in the lower-cased whiteboard.
        whiteboard = bug["whiteboard"].lower()
        for tag in self.whiteboard_prefixes:
            if tag in whiteboard:
                return True

        return False
class IsPowerBug(SingleBugFeature):
    """Determine if the bug is related to power based on given bug data."""

    name = "Is Power Bug"
    type_name = "power"
    keyword_prefixes = ("power",)
    whiteboard_prefixes = ("[power",)

    def __call__(
        self,
        bug: bugzilla.BugDict,
        bug_map: dict[int, bugzilla.BugDict] | None = None,
    ) -> bool:
        """Check the keywords first, then the whiteboard."""
        for keyword in bug["keywords"]:
            for prefix in self.keyword_prefixes:
                if keyword.startswith(prefix):
                    return True

        # Whiteboard tags are matched anywhere in the lower-cased whiteboard.
        whiteboard = bug["whiteboard"].lower()
        for tag in self.whiteboard_prefixes:
            if tag in whiteboard:
                return True

        return False
class IsSecurityBug(SingleBugFeature):
    """Determine if the bug is related to security based on given bug data."""

    name = "Is Security Bug"
    type_name = "security"
    keyword_prefixes = ("sec-", "csectype-")
    whiteboard_prefixes = ("[client-bounty-form", "[sec-survey")

    def __call__(
        self,
        bug: bugzilla.BugDict,
        bug_map: dict[int, bugzilla.BugDict] | None = None,
    ) -> bool:
        """Check the keywords first, then the whiteboard."""
        for keyword in bug["keywords"]:
            for prefix in self.keyword_prefixes:
                if keyword.startswith(prefix):
                    return True

        # Whiteboard tags are matched anywhere in the lower-cased whiteboard.
        whiteboard = bug["whiteboard"].lower()
        for tag in self.whiteboard_prefixes:
            if tag in whiteboard:
                return True

        return False
class IsCrashBug(SingleBugFeature):
    """Determine if the bug is related to crash based on given bug data."""

    name = "Is Crash Bug"
    type_name = "crash"
    keyword_prefixes = ("crash", "crashreportid")

    def __call__(
        self,
        bug: bugzilla.BugDict,
        bug_map: dict[int, bugzilla.BugDict] | None = None,
    ) -> bool:
        """Check the crash signature first, then the keywords."""
        # Checking for `[@` will exclude some bugs that do not have valid
        # signatures: https://mzl.la/46XAqRF
        signature = bug.get("cf_crash_signature")
        if signature and "[@" in signature:
            return True

        for keyword in bug["keywords"]:
            for prefix in self.keyword_prefixes:
                if keyword.startswith(prefix):
                    return True

        return False
class BugTypes(SingleBugFeature):
    """Determine bug type."""

    name = "Infer Bug Type"

    # One detector per supported type; each exposes a `type_name` attribute.
    bug_type_extractors: list = [
        IsCrashBug(),
        IsMemoryBug(),
        IsPerformanceBug(),
        IsPowerBug(),
        IsSecurityBug(),
    ]

    def __call__(
        self,
        bug: bugzilla.BugDict,
        bug_map: dict[int, bugzilla.BugDict] | None = None,
    ) -> list[str]:
        """Infer bug types based on various bug characteristics.

        Args:
            bug: A dictionary containing bug data.
            bug_map: Optional mapping of bug IDs to bug dictionaries, passed
                on to every detector. Default is None.

        Returns:
            The ``type_name`` of every detector that matched, in the order of
            ``bug_type_extractors`` (e.g. "crash", "memory", "performance",
            "power", "security").
        """
        inferred_types = []
        for detector in self.bug_type_extractors:
            if detector(bug, bug_map):
                inferred_types.append(detector.type_name)
        return inferred_types
class BugType(SingleBugFeature):
    """Extracts the type of the bug."""

    def __call__(self, bug, **kwargs):
        bug_type = bug["type"]
        return bug_type