bugbug/models/stepstoreproduce.py (96 lines of code) (raw):

# -*- coding: utf-8 -*-
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.

import logging

import xgboost
from imblearn.pipeline import Pipeline as ImblearnPipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline

from bugbug import bug_features, bugzilla, feature_cleanup, utils
from bugbug.model import BugModel

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class StepsToReproduceModel(BugModel):
    """Binary bug model: does a bug have steps to reproduce?

    Class ``0`` means "no steps to reproduce" and class ``1`` means "has
    steps to reproduce" (see the counts logged by :meth:`get_labels`).
    """

    def __init__(self, lemmatization=False):
        """Build the feature-extraction pipeline and the classifier.

        Args:
            lemmatization: Forwarded unchanged to ``BugModel.__init__``.
        """
        super().__init__(lemmatization)

        feature_extractors = [
            bug_features.HasRegressionRange(),
            bug_features.Severity(),
            bug_features.Keywords({"stepswanted"}),
            bug_features.IsCoverityIssue(),
            bug_features.HasCrashSignature(),
            bug_features.HasURL(),
            bug_features.HasW3CURL(),
            bug_features.HasGithubURL(),
            bug_features.Whiteboard(),
            bug_features.Patches(),
            bug_features.Landings(),
        ]

        cleanup_functions = [
            feature_cleanup.fileref(),
            feature_cleanup.url(),
            feature_cleanup.synonyms(),
        ]

        # Single-step pipeline that turns raw bugs into extracted features.
        bug_extractor = bug_features.BugExtractor(
            feature_extractors, cleanup_functions
        )
        self.extraction_pipeline = Pipeline([("bug_extractor", bug_extractor)])

        # One vectorizer per extracted column; `text_vectorizer` is not
        # defined in this class (presumably inherited from BugModel).
        column_union = ColumnTransformer(
            [
                ("data", DictVectorizer(), "data"),
                ("title", self.text_vectorizer(), "title"),
                ("comments", self.text_vectorizer(), "comments"),
            ]
        )
        # Fixed seed so the under-sampling step is reproducible.
        sampler = RandomUnderSampler(random_state=0)
        estimator = xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count())

        self.clf = ImblearnPipeline(
            [
                ("union", column_union),
                ("sampler", sampler),
                ("estimator", estimator),
            ]
        )

    def get_labels(self):
        """Collect training labels from every "defect" bug.

        Labelling rules, applied in order to each bug:

        * ``cf_has_str`` present: ``"no"`` -> 0, ``"yes"`` -> 1, any other
          value leaves the bug unlabeled.
        * otherwise, ``stepswanted`` currently in the keywords -> 0.
        * otherwise, any history change whose ``removed`` value starts with
          ``stepswanted`` -> 1.

        Returns:
            A tuple ``(classes, [0, 1])`` where ``classes`` maps integer bug
            ids to their label.
        """
        classes = {}

        for bug_data in bugzilla.get_bugs():
            if bug_data["type"] != "defect":
                continue

            label = None
            if "cf_has_str" in bug_data:
                # NOTE(review): when the field is present but is neither
                # "yes" nor "no", the keyword/history fallbacks below are
                # not consulted and the bug stays unlabeled.
                has_str = bug_data["cf_has_str"]
                if has_str == "no":
                    label = 0
                elif has_str == "yes":
                    label = 1
            elif "stepswanted" in bug_data["keywords"]:
                label = 0
            elif any(
                change["removed"].startswith("stepswanted")
                for entry in bug_data["history"]
                for change in entry["changes"]
            ):
                # The keyword was removed at some point in the bug's history.
                label = 1

            if label is not None:
                classes[int(bug_data["id"])] = label

        labels = list(classes.values())
        logger.info(
            "%d bugs have no steps to reproduce",
            labels.count(0),
        )
        logger.info(
            "%d bugs have steps to reproduce",
            labels.count(1),
        )

        return classes, [0, 1]

    def overwrite_classes(self, bugs, classes, probabilities):
        """Override predictions for bugs whose answer is already recorded.

        A bug with ``cf_has_str == "no"`` or with the ``stepswanted`` keyword
        is forced to class 0; a bug with ``cf_has_str == "yes"`` is forced to
        class 1. Other entries of ``classes`` are left untouched.

        Args:
            bugs: Iterable of bug dicts, aligned by position with ``classes``.
            classes: Indexable container of predictions, modified in place.
            probabilities: When truthy, entries are written as
                ``[p_class_0, p_class_1]`` pairs instead of class indices.

        Returns:
            The same ``classes`` object that was passed in.
        """
        for index, bug in enumerate(bugs):
            has_str = bug.get("cf_has_str")

            if has_str == "yes":
                forced = 1
            elif has_str == "no" or "stepswanted" in bug["keywords"]:
                forced = 0
            else:
                continue

            if probabilities:
                # A fresh list per entry, so rows never alias each other.
                classes[index] = [0.0, 1.0] if forced else [1.0, 0.0]
            else:
                classes[index] = forced

        return classes

    def get_feature_names(self):
        """Return the output feature names of the "union" pipeline step."""
        union_step = self.clf.named_steps["union"]
        return union_step.get_feature_names_out()