in bugbug/models/bugtype.py [0:0]
def __init__(self, lemmatization=False, historical=False):
    """Set up the feature extraction pipeline and the classifier pipeline.

    Args:
        lemmatization: Forwarded unchanged to ``BugModel.__init__``.
        historical: Accepted but not referenced by the setup code below.
    """
    BugModel.__init__(self, lemmatization)

    self.calculate_importance = False
    self.le = LabelBinarizer()
    self.bug_type_extractors = bug_features.BugTypes.bug_type_extractors

    # Gather every keyword prefix declared by the bug type extractors.
    ignored_keyword_prefixes = set()
    for type_extractor in self.bug_type_extractors:
        ignored_keyword_prefixes.update(type_extractor.keyword_prefixes)

    feature_extractors = [
        bug_features.HasSTR(),
        bug_features.Severity(),
        # Keywords matching the label prefixes are left out of the features:
        # they would completely skew the ML, and they are going to be applied
        # as 100% rules in the evaluation phase instead.
        bug_features.Keywords(prefixes_to_ignore=ignored_keyword_prefixes),
        bug_features.IsCoverityIssue(),
        bug_features.HasCrashSignature(),
        bug_features.HasURL(),
        bug_features.HasW3CURL(),
        bug_features.HasGithubURL(),
        bug_features.Whiteboard(),
        bug_features.Patches(),
        bug_features.Landings(),
        bug_features.BlockedBugsNumber(),
        bug_features.EverAffected(),
        bug_features.AffectedThenUnaffected(),
        bug_features.Product(),
        bug_features.Component(),
    ]

    cleanup_functions = [
        feature_cleanup.url(),
        feature_cleanup.fileref(),
        feature_cleanup.synonyms(),
    ]

    # Single-step pipeline wrapping the bug extractor.
    bug_extractor = bug_features.BugExtractor(feature_extractors, cleanup_functions)
    self.extraction_pipeline = Pipeline([("bug_extractor", bug_extractor)])

    # One transformer per input column: the "data" column goes through a
    # DictVectorizer, each text column through its own text vectorizer.
    column_transformers = [("data", DictVectorizer(), "data")]
    for text_column in ("title", "first_comment", "comments"):
        column_transformers.append(
            (text_column, self.text_vectorizer(min_df=0.001), text_column)
        )
    union = ColumnTransformer(column_transformers)

    # One XGBoost classifier per label, wrapped by OneVsRestClassifier.
    estimator = OneVsRestClassifier(
        xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count())
    )

    self.clf = Pipeline([("union", union), ("estimator", estimator)])