def find_bug_fixing_commits()

in scripts/regressor_finder.py [0:0]


    def find_bug_fixing_commits(self) -> None:
        """Classify not-yet-classified commits by the kind of bug they are linked to.

        Commits pushed between ``now - RELATIVE_START_DATE`` and
        ``now - RELATIVE_END_DATE`` that are not already present in
        ``BUG_FIXING_COMMITS_DB`` are grouped by their ``bug_id``. Each linked
        bug is then tagged, and every commit of that bug is recorded with:

        * ``"r"`` - the bug is a regression (known label ``1`` or predicted ``1``
          by the regression model);
        * ``"d"`` - the bug is a defect which is not tagged as a regression;
        * ``"e"`` - anything else (known or predicted label other than
          ``"defect"``).

        Known labels from the models' ``get_labels()`` take precedence; the
        models are only applied to bugs without a usable known label. The new
        entries are appended to ``BUG_FIXING_COMMITS_DB``, which is then
        compressed and uploaded.

        Raises:
            AssertionError: if the commits or bugs database cannot be
                downloaded, or if no commit is left to classify in the time
                window.
        """
        # NOTE: the downloads must not be wrapped in ``assert`` statements: with
        # ``python -O`` the whole statement (call included) is stripped, so the
        # databases would silently never be downloaded. AssertionError is kept
        # as the exception type for backward compatibility.
        logger.info("Downloading commits database...")
        if not db.download(repository.COMMITS_DB):
            raise AssertionError("Failed to download the commits database")

        logger.info("Downloading bugs database...")
        if not db.download(bugzilla.BUGS_DB):
            raise AssertionError("Failed to download the bugs database")

        logger.info("Download previous classifications...")
        # The result of this download is not checked, unlike the two above.
        # NOTE(review): presumably the DB may legitimately not exist yet on a
        # first run - confirm.
        db.download(BUG_FIXING_COMMITS_DB)

        logger.info("Get previously classified commits...")
        prev_bug_fixing_commits_nodes = {
            bug_fixing_commit["rev"]
            for bug_fixing_commit in db.read(BUG_FIXING_COMMITS_DB)
        }
        logger.info(
            "Already classified %d commits...", len(prev_bug_fixing_commits_nodes)
        )

        # TODO: Switch to the pure Defect model, as it's better in this case.
        logger.info("Downloading defect/enhancement/task model...")
        defect_model = cast(
            DefectEnhancementTaskModel,
            DefectEnhancementTaskModel.load(download_model("defectenhancementtask")),
        )

        logger.info("Downloading regression model...")
        regression_model = cast(
            RegressionModel, RegressionModel.load(download_model("regression"))
        )

        # Derive both bounds from a single timestamp so the window is consistent.
        now = datetime.now()
        start_date = now - RELATIVE_START_DATE
        end_date = now - RELATIVE_END_DATE
        logger.info(
            "Gathering bug IDs associated to commits (since %s and up to %s)...",
            start_date,
            end_date,
        )

        # Maps a bug ID to the nodes of the commits linked to it.
        commit_map = defaultdict(list)
        for commit in repository.get_commits():
            # Skip commits classified by a previous run.
            if commit["node"] in prev_bug_fixing_commits_nodes:
                continue

            commit_date = dateutil.parser.parse(commit["pushdate"])
            if not start_date <= commit_date <= end_date:
                continue

            commit_map[commit["bug_id"]].append(commit["node"])

        logger.info(
            "%d commits found, %d bugs linked to commits",
            sum(len(commit_list) for commit_list in commit_map.values()),
            len(commit_map),
        )
        if not commit_map:
            raise AssertionError("No commits to classify in the selected time window")

        def get_relevant_bugs() -> Iterator[dict]:
            # Only the bugs which are linked to the commits we care about.
            return (bug for bug in bugzilla.get_bugs() if bug["id"] in commit_map)

        # The bugs are iterated a first time only to count them, so that the
        # progress bar below can be given a total.
        bug_count = sum(1 for _ in get_relevant_bugs())
        logger.info(
            "%d bugs in total, %d bugs linked to commits missing",
            bug_count,
            len(commit_map) - bug_count,
        )

        known_defect_labels, _ = defect_model.get_labels()
        known_regression_labels, _ = regression_model.get_labels()

        bug_fixing_commits = []
        bugs_to_classify = []

        def append_bug_fixing_commits(bug_id: int, type_: str) -> None:
            # Record every commit linked to the bug with the given type tag.
            for commit in commit_map[bug_id]:
                bug_fixing_commits.append({"rev": commit, "type": type_})

        for bug in tqdm(get_relevant_bugs(), total=bug_count):
            bug_id = bug["id"]

            # If we know the label already, we don't need to apply the model.
            if (
                bug_id in known_regression_labels
                and known_regression_labels[bug_id] == 1
            ):
                append_bug_fixing_commits(bug_id, "r")
                continue

            if bug_id in known_defect_labels:
                if known_defect_labels[bug_id] == "defect":
                    append_bug_fixing_commits(bug_id, "d")
                else:
                    append_bug_fixing_commits(bug_id, "e")
                continue

            bugs_to_classify.append(bug)

        # First pass: separate defects from everything else. The model is only
        # invoked when there is something to classify.
        classified_bugs = []
        if bugs_to_classify:
            classified_bugs = defect_model.classify(bugs_to_classify)

        defect_bugs = []

        for bug, label in zip(bugs_to_classify, classified_bugs):
            if label == "defect":
                defect_bugs.append(bug)
            else:
                append_bug_fixing_commits(bug["id"], "e")

        # Second pass: among the predicted defects, tell regressions apart.
        classified_defect_bugs = []
        if defect_bugs:
            classified_defect_bugs = regression_model.classify(defect_bugs)

        for bug, classification in zip(defect_bugs, classified_defect_bugs):
            if classification == 1:
                append_bug_fixing_commits(bug["id"], "r")
            else:
                append_bug_fixing_commits(bug["id"], "d")

        db.append(BUG_FIXING_COMMITS_DB, bug_fixing_commits)
        zstd_compress(BUG_FIXING_COMMITS_DB)
        db.upload(BUG_FIXING_COMMITS_DB)