bugbug/models/tracking.py (123 lines of code) (raw):
# -*- coding: utf-8 -*-
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
import xgboost
from imblearn.pipeline import Pipeline as ImblearnPipeline
from imblearn.under_sampling import InstanceHardnessThreshold
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from bugbug import bug_features, bugzilla, feature_cleanup, labels, utils
from bugbug.model import BugModel
class TrackingModel(BugModel):
    """Binary bug model labelled from ``cf_tracking_firefox*`` flags.

    A bug is labelled 1 when a tracking flag is set to "blocking" or "+",
    and 0 when it is set to "-" (see ``get_labels``). Classification is done
    by an imblearn pipeline made of a column transformer, an
    ``InstanceHardnessThreshold`` under-sampler and an XGBoost classifier.
    """

    def __init__(self, lemmatization=False):
        """Build the extraction pipeline and the classification pipeline.

        Args:
            lemmatization: forwarded unchanged to ``BugModel.__init__``.
        """
        BugModel.__init__(self, lemmatization)

        self.calculate_importance = False

        # Order is kept as-is; it may influence the order of produced features.
        feature_extractors = [
            bug_features.HasSTR(),
            bug_features.HasRegressionRange(),
            bug_features.Severity(),
            bug_features.Keywords(),
            bug_features.IsCoverityIssue(),
            bug_features.HasCrashSignature(),
            bug_features.HasURL(),
            bug_features.HasW3CURL(),
            bug_features.HasGithubURL(),
            bug_features.Whiteboard(),
            bug_features.Patches(),
            bug_features.Landings(),
            bug_features.Product(),
            bug_features.Component(),
            bug_features.IsMozillian(),
            bug_features.BugReporter(),
            bug_features.BlockedBugsNumber(),
            bug_features.Priority(),
            bug_features.HasCVEInAlias(),
            bug_features.CommentCount(),
            bug_features.CommentLength(),
            bug_features.ReporterExperience(),
            bug_features.NumberOfBugDependencies(),
        ]

        cleanup_functions = [
            feature_cleanup.url(),
            feature_cleanup.fileref(),
            feature_cleanup.hex(),
            feature_cleanup.dll(),
            feature_cleanup.synonyms(),
            feature_cleanup.crash(),
        ]

        # The extractor is given ``self.rollback`` as its ``rollback_when``
        # predicate, which matches changes to the tracking flag fields.
        # NOTE(review): presumably this makes the extractor undo those
        # changes so the label does not leak into the features -- confirm
        # against BugExtractor.
        bug_extractor = bug_features.BugExtractor(
            feature_extractors,
            cleanup_functions,
            rollback=True,
            rollback_when=self.rollback,
        )
        self.extraction_pipeline = Pipeline([("bug_extractor", bug_extractor)])

        # One vectorizer per input column; title and comments each get their
        # own text vectorizer instance, both created with the same min_df.
        data_vectorizer = DictVectorizer()
        title_vectorizer = self.text_vectorizer(min_df=0.0001)
        comments_vectorizer = self.text_vectorizer(min_df=0.0001)
        union = ColumnTransformer(
            [
                ("data", data_vectorizer, "data"),
                ("title", title_vectorizer, "title"),
                ("comments", comments_vectorizer, "comments"),
            ]
        )

        sampler = InstanceHardnessThreshold(random_state=0)
        estimator = xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count())

        self.clf = ImblearnPipeline(
            [
                ("union", union),
                ("sampler", sampler),
                ("estimator", estimator),
            ]
        )

    def rollback(self, change):
        """Return whether ``change`` modifies a ``cf_tracking_firefox*`` field.

        Args:
            change: mapping with a ``"field_name"`` string entry.
        """
        field_name = change["field_name"]
        return field_name.startswith("cf_tracking_firefox")

    @staticmethod
    def _flag_value_to_label(value):
        """Map a tracking flag value to 1, 0, or None when it carries no label."""
        if value in ["blocking", "+"]:
            return 1
        if value == "-":
            return 0
        return None

    def get_labels(self):
        """Compute the training label of every known bug.

        Labels from ``labels.get_labels("tracking")`` are loaded first. Each
        bug from ``bugzilla.get_bugs()`` is then processed as follows:

        * its current ``cf_tracking_firefox*`` fields set the label
          ("blocking"/"+" -> 1, "-" -> 0), and the last matching field wins;
        * when no current field set a label, the bug history is scanned for
          changes to those fields, and the last matching change wins;
        * a bug that is still unlabelled gets 0, unless its resolution is
          INVALID or DUPLICATE, in which case it is left out.

        Returns:
            A tuple ``(classes, [0, 1])`` where ``classes`` maps integer bug
            ids to 0 or 1.
        """
        label_by_bug = {}

        for raw_id, category in labels.get_labels("tracking"):
            assert category in ["True", "False"], f"unexpected category {category}"
            if category == "True":
                label_by_bug[int(raw_id)] = 1
            else:
                label_by_bug[int(raw_id)] = 0

        for bug in bugzilla.get_bugs():
            bug_id = int(bug["id"])

            # First look at the flag values currently set on the bug.
            decided_by_flag = False
            for field, value in bug.items():
                if not field.startswith("cf_tracking_firefox"):
                    continue
                label = self._flag_value_to_label(value)
                if label is not None:
                    label_by_bug[bug_id] = label
                    decided_by_flag = True

            # Otherwise fall back to values the flags held in the past.
            if not decided_by_flag:
                for entry in bug["history"]:
                    for change in entry["changes"]:
                        if not change["field_name"].startswith(
                            "cf_tracking_firefox"
                        ):
                            continue
                        label = self._flag_value_to_label(change["added"])
                        if label is not None:
                            label_by_bug[bug_id] = label

            # Unlabelled INVALID/DUPLICATE bugs are not defaulted to 0.
            if bug["resolution"] not in ["INVALID", "DUPLICATE"]:
                label_by_bug.setdefault(bug_id, 0)

        return label_by_bug, [0, 1]

    def get_feature_names(self):
        """Return the output feature names of the "union" pipeline step."""
        union_step = self.clf.named_steps["union"]
        return union_step.get_feature_names_out()

    def overwrite_classes(self, bugs, classes, probabilities):
        """Force the negative class for INVALID and DUPLICATE bugs.

        Args:
            bugs: iterable of bug mappings with a ``"resolution"`` entry,
                aligned index-wise with ``classes``.
            classes: indexable of predictions, modified in place.
            probabilities: when truthy, overridden entries are set to
                ``[1.0, 0.0]``; otherwise they are set to ``0``.

        Returns:
            The same ``classes`` object that was passed in.
        """
        for index, bug in enumerate(bugs):
            if bug["resolution"] not in ["INVALID", "DUPLICATE"]:
                continue

            if probabilities:
                classes[index] = [1.0, 0.0]
            else:
                classes[index] = 0

        return classes