in scripts/regressor_finder.py [0:0]
def find_bug_fixing_commits(self) -> None:
    """Classify not-yet-classified commits by the type of bug they fix.

    Commits pushed between ``now - RELATIVE_START_DATE`` and
    ``now - RELATIVE_END_DATE`` that are not already listed in
    ``BUG_FIXING_COMMITS_DB`` are grouped by their ``bug_id``. Every linked
    bug found in the bugs database gets one of three types, and each commit
    of that bug is recorded as ``{"rev": <node>, "type": <type>}``:

    * ``"r"``: the bug is a regression (known regression label ``1``, or
      predicted ``1`` by the regression model);
    * ``"d"``: the bug is a defect that was not flagged as a regression;
    * ``"e"``: the bug is not a defect.

    Known labels are used when available; the models are only applied to
    the remaining bugs. The new entries are appended to
    ``BUG_FIXING_COMMITS_DB``, which is then compressed and uploaded.

    Raises:
        AssertionError: If downloading the commits or bugs database reports
            failure, or if no commit is left to classify in the date range.
    """
    # The mandatory downloads are NOT wrapped in ``assert`` statements:
    # asserts are stripped when Python runs with -O, which would silently
    # skip the download itself. AssertionError is kept as the failure type.
    logger.info("Downloading commits database...")
    if not db.download(repository.COMMITS_DB):
        raise AssertionError("Failed to download the commits database")

    logger.info("Downloading bugs database...")
    if not db.download(bugzilla.BUGS_DB):
        raise AssertionError("Failed to download the bugs database")

    logger.info("Download previous classifications...")
    # Result intentionally not checked, as in the original code.
    # NOTE(review): presumably this DB may not exist on a first run - confirm.
    db.download(BUG_FIXING_COMMITS_DB)

    logger.info("Get previously classified commits...")
    prev_bug_fixing_commits_nodes = {
        bug_fixing_commit["rev"]
        for bug_fixing_commit in db.read(BUG_FIXING_COMMITS_DB)
    }
    logger.info(
        "Already classified %d commits...", len(prev_bug_fixing_commits_nodes)
    )

    # TODO: Switch to the pure Defect model, as it's better in this case.
    logger.info("Downloading defect/enhancement/task model...")
    defect_model = cast(
        DefectEnhancementTaskModel,
        DefectEnhancementTaskModel.load(download_model("defectenhancementtask")),
    )

    logger.info("Downloading regression model...")
    regression_model = cast(
        RegressionModel, RegressionModel.load(download_model("regression"))
    )

    # Sample the clock once so both bounds are relative to the same instant.
    now = datetime.now()
    start_date = now - RELATIVE_START_DATE
    end_date = now - RELATIVE_END_DATE
    logger.info(
        "Gathering bug IDs associated to commits (since %s and up to %s)...",
        start_date,
        end_date,
    )

    # Maps a bug ID to the nodes of the in-range, not-yet-classified commits
    # linked to it.
    commit_map = defaultdict(list)
    for commit in repository.get_commits():
        if commit["node"] in prev_bug_fixing_commits_nodes:
            continue

        commit_date = dateutil.parser.parse(commit["pushdate"])
        if not start_date <= commit_date <= end_date:
            continue

        commit_map[commit["bug_id"]].append(commit["node"])

    logger.info(
        "%d commits found, %d bugs linked to commits",
        sum(len(commit_list) for commit_list in commit_map.values()),
        len(commit_map),
    )
    if not commit_map:
        raise AssertionError("No commits left to classify in the date range")

    def get_relevant_bugs() -> Iterator[dict]:
        """Yield the bugs from the bugs database that are linked to a commit."""
        return (bug for bug in bugzilla.get_bugs() if bug["id"] in commit_map)

    # First pass over the bugs only counts them, so that the progress bar
    # below has a total without keeping every bug in memory.
    bug_count = sum(1 for _ in get_relevant_bugs())
    logger.info(
        "%d bugs in total, %d bugs linked to commits missing",
        bug_count,
        len(commit_map) - bug_count,
    )

    known_defect_labels, _ = defect_model.get_labels()
    known_regression_labels, _ = regression_model.get_labels()

    bug_fixing_commits = []
    bugs_to_classify = []

    def append_bug_fixing_commits(bug_id: int, type_: str) -> None:
        """Record every commit linked to ``bug_id`` with the given type."""
        for commit in commit_map[bug_id]:
            bug_fixing_commits.append({"rev": commit, "type": type_})

    # ``get_relevant_bugs`` already restricts the iteration to bugs present
    # in ``commit_map``, so no further membership check is needed here.
    for bug in tqdm(get_relevant_bugs(), total=bug_count):
        bug_id = bug["id"]

        # If we know the label already, we don't need to apply the model.
        if known_regression_labels.get(bug_id) == 1:
            append_bug_fixing_commits(bug_id, "r")
            continue

        if bug_id in known_defect_labels:
            if known_defect_labels[bug_id] == "defect":
                append_bug_fixing_commits(bug_id, "d")
            else:
                append_bug_fixing_commits(bug_id, "e")
            continue

        bugs_to_classify.append(bug)

    # Stage 1: defect vs. non-defect for the bugs without a known label.
    classified_bugs = []
    if bugs_to_classify:
        classified_bugs = defect_model.classify(bugs_to_classify)

    defect_bugs = []
    for bug, label in zip(bugs_to_classify, classified_bugs):
        if label == "defect":
            defect_bugs.append(bug)
        else:
            append_bug_fixing_commits(bug["id"], "e")

    # Stage 2: regression vs. plain defect, only for the predicted defects.
    classified_defect_bugs = []
    if defect_bugs:
        classified_defect_bugs = regression_model.classify(defect_bugs)

    for bug, classification in zip(defect_bugs, classified_defect_bugs):
        if classification == 1:
            append_bug_fixing_commits(bug["id"], "r")
        else:
            append_bug_fixing_commits(bug["id"], "d")

    db.append(BUG_FIXING_COMMITS_DB, bug_fixing_commits)
    zstd_compress(BUG_FIXING_COMMITS_DB)
    db.upload(BUG_FIXING_COMMITS_DB)