bugbug/models/devdocneeded.py (104 lines of code) (raw):

# -*- coding: utf-8 -*-
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.

import xgboost
from imblearn.pipeline import Pipeline as ImblearnPipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline

from bugbug import bug_features, bugzilla, feature_cleanup, utils
from bugbug.model import BugModel

# The two Bugzilla keywords this model is built around. The tuple order is the
# order in which the containment checks below are evaluated.
_DEV_DOC_KEYWORDS = ("dev-doc-needed", "dev-doc-complete")


def _mentions_dev_doc(value):
    """Return True if ``value`` contains at least one dev-doc keyword.

    The check is a plain ``in`` test, so it acts as membership on a
    collection and as a substring search on a string.
    """
    return any(keyword in value for keyword in _DEV_DOC_KEYWORDS)


class DevDocNeededModel(BugModel):
    """Bug model whose labels come from the dev-doc keywords.

    A bug is labelled 1 when it carries (or was last given) ``dev-doc-needed``
    or ``dev-doc-complete``, and 0 otherwise; see ``get_labels`` for the exact
    rules.
    """

    def __init__(self, lemmatization=False):
        """Set up the feature extraction pipeline and the classifier pipeline.

        Args:
            lemmatization: Forwarded unchanged to ``BugModel.__init__``.
        """
        BugModel.__init__(self, lemmatization, commit_data=True)

        self.cross_validation_enabled = False

        feature_extractors = [
            bug_features.HasSTR(),
            bug_features.HasRegressionRange(),
            bug_features.Severity(),
            # NOTE(review): presumably the label keywords are handed to
            # Keywords so they are kept out of the features - confirm against
            # bug_features.Keywords.
            bug_features.Keywords(set(_DEV_DOC_KEYWORDS)),
            bug_features.IsCoverityIssue(),
            bug_features.HasCrashSignature(),
            bug_features.HasURL(),
            bug_features.HasW3CURL(),
            bug_features.HasGithubURL(),
            bug_features.Whiteboard(),
            bug_features.Patches(),
            bug_features.Landings(),
            bug_features.Product(),
            bug_features.Component(),
            bug_features.CommitAdded(),
            bug_features.CommitDeleted(),
            bug_features.CommitTypes(),
        ]

        cleanup_functions = [
            feature_cleanup.fileref(),
            feature_cleanup.url(),
            feature_cleanup.synonyms(),
        ]

        # Rollback is enabled and driven by ``self.rollback``, i.e. by the
        # history changes that add one of the dev-doc keywords.
        bug_extractor = bug_features.BugExtractor(
            feature_extractors,
            cleanup_functions,
            rollback=True,
            rollback_when=self.rollback,
            commit_data=True,
        )
        self.extraction_pipeline = Pipeline([("bug_extractor", bug_extractor)])

        # One vectorizer per extracted column: structured features, title text
        # and comment text.
        column_union = ColumnTransformer(
            [
                ("data", DictVectorizer(), "data"),
                ("title", self.text_vectorizer(), "title"),
                ("comments", self.text_vectorizer(), "comments"),
            ]
        )
        undersampler = RandomUnderSampler(random_state=0)
        classifier = xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count())

        self.clf = ImblearnPipeline(
            [
                ("union", column_union),
                ("sampler", undersampler),
                ("estimator", classifier),
            ]
        )

    def rollback(self, change):
        """Tell whether a history change added one of the dev-doc keywords.

        Args:
            change: A single history change, read through its ``field_name``
                and ``added`` entries.

        Returns:
            True when the change touches the ``keywords`` field and its
            ``added`` value contains ``dev-doc-needed`` or
            ``dev-doc-complete``; False otherwise.
        """
        if change["field_name"] != "keywords":
            return False
        return _mentions_dev_doc(change["added"])

    def get_labels(self):
        """Build the training labels from the bugs' keywords and history.

        Returns:
            A ``(classes, class_names)`` tuple where ``classes`` maps each bug
            id (as ``int``) to 0 or 1 and ``class_names`` is ``[0, 1]``.
        """
        classes = {}

        for bug_data in bugzilla.get_bugs():
            bug_id = int(bug_data["id"])

            # A bug that currently carries a dev-doc keyword is positive; its
            # history is not consulted.
            if _mentions_dev_doc(bug_data["keywords"]):
                classes[bug_id] = 1
                continue

            # Otherwise replay the keyword changes; the last matching change
            # decides the label.
            for entry in bug_data["history"]:
                for change in entry["changes"]:
                    if change["field_name"] != "keywords":
                        continue

                    if (
                        "dev-doc-needed" in change["removed"]
                        and "dev-doc-complete" not in change["added"]
                    ):
                        # dev-doc-needed was removed without being replaced by
                        # dev-doc-complete: this suggests a false positive
                        # among human-analyzed bugs.
                        classes[bug_id] = 0
                    elif _mentions_dev_doc(change["added"]):
                        # Going from dev-doc-needed to dev-doc-complete is a
                        # guaranteed positive. Gaining dev-doc-complete without
                        # ever having dev-doc-needed means the bug was missed
                        # by earlier scans, but someone later realized it
                        # should have been flagged and either updated the docs
                        # or found them already updated.
                        classes[bug_id] = 1

            # Bugs with no relevant keyword change default to negative.
            classes.setdefault(bug_id, 0)

        return classes, [0, 1]

    def get_feature_names(self):
        """Return the output feature names of the ``union`` pipeline step."""
        union_step = self.clf.named_steps["union"]
        return union_step.get_feature_names_out()