in bugbug/models/regressor.py [0:0]
def get_labels(self):
    """Build the training labels for the regressor model.

    Every commit yielded by ``repository.get_commits()`` is either skipped or
    labelled:

    * ``1`` when its ``bug_id`` appears in some bug's ``regressed_by`` list,
      or when ``self.use_finder`` is set and its node is among the
      bug-introducing revisions read from ``BUG_INTRODUCING_COMMITS_DB``;
    * ``0`` otherwise, provided it is not excluded by ``self.exclude_finder``
      and its push date falls inside the window used for clean commits
      (between two years and three months before now).

    Commits with a truthy ``backedoutby`` field, commits for which
    ``repository.is_wptsync`` returns true, and commits pushed within the last
    ``EVALUATION_MONTHS`` months are never labelled.

    Returns:
        A ``(classes, [0, 1])`` tuple, where ``classes`` maps a commit node to
        its label and ``[0, 1]`` is the list of possible labels.
    """
    classes = {}

    # Only consulted when one of the finder flags is set (the conditions below
    # short-circuit otherwise); initialised so the name is always bound.
    regressors = set()
    if self.use_finder or self.exclude_finder:
        if self.finder_regressions_only:
            # Revisions of the bug-fixing commits whose type is "r".
            regression_fixes = {
                bug_fixing_commit["rev"]
                for bug_fixing_commit in db.read(BUG_FIXING_COMMITS_DB)
                if bug_fixing_commit["type"] == "r"
            }

        # `regression_fixes` is only evaluated when `finder_regressions_only`
        # is set, which is exactly when it has been defined above.
        regressors = {
            r["bug_introducing_rev"]
            for r in db.read(BUG_INTRODUCING_COMMITS_DB)
            if r["bug_introducing_rev"]
            and (
                not self.finder_regressions_only
                or r["bug_fixing_rev"] in regression_fixes
            )
        }

    # Flatten every bug's "regressed_by" list straight into a set. This avoids
    # `sum(lists, [])`, which re-copies the accumulated list on each addition
    # and is therefore quadratic in the total number of IDs.
    regressor_bugs = {
        regressor_bug_id
        for bug in bugzilla.get_bugs()
        for regressor_bug_id in bug["regressed_by"]
    }

    # Compute the date thresholds once, so that every commit is compared
    # against the same instant and the loop does no redundant work.
    # NOTE(review): `datetime.utcnow()` is deprecated since Python 3.12; it is
    # kept because the parsed push dates are presumably naive and comparing a
    # naive datetime with an aware one raises TypeError - confirm before
    # switching.
    now = datetime.utcnow()
    evaluation_start = now - relativedelta(months=EVALUATION_MONTHS)
    oldest_clean_date = now - relativedelta(years=2)
    newest_clean_date = now - relativedelta(months=3)

    for commit_data in repository.get_commits():
        if commit_data["backedoutby"]:
            continue

        if repository.is_wptsync(commit_data):
            continue

        push_date = dateutil.parser.parse(commit_data["pushdate"])

        # Skip commits used for the evaluation phase.
        if push_date > evaluation_start:
            continue

        node = commit_data["node"]
        if commit_data["bug_id"] in regressor_bugs or (
            self.use_finder and node in regressors
        ):
            classes[node] = 1
        elif not self.exclude_finder or node not in regressors:
            # The labels we have are only from two years ago (see https://groups.google.com/g/mozilla.dev.platform/c/SjjW6_O-FqM/m/G-CrIVT2BAAJ).
            # While we can go further back with the regressor finder script, it isn't remotely
            # as precise as the "Regressed By" data.
            # In the future, we might want to re-evaluate this limit (e.g. extend it), but we
            # have to be careful (using too old patches might cause worse results as patch
            # characteristics evolve over time).
            if push_date < oldest_clean_date:
                continue

            # We remove the last 3 months, as there could be regressions which haven't been
            # filed yet. While it is true that some regressions might not be found for a long
            # time, more than 3 months seems overly conservative.
            # There will be some patches we currently add to the clean set and will later move
            # to the regressor set, but they are a very small subset.
            if push_date > newest_clean_date:
                continue

            classes[node] = 0

    logger.info(
        "%d commits caused regressions",
        sum(label == 1 for label in classes.values()),
    )

    logger.info(
        "%d commits did not cause regressions",
        sum(label == 0 for label in classes.values()),
    )

    return classes, [0, 1]