def find_bug_introducing_commits()

in scripts/regressor_finder.py


    def find_bug_introducing_commits(self, repo_dir, tokenized):
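        """Find the likely bug-introducing commits for the known bug-fixing
        commits, using pydriller's SZZ implementation
        (get_commits_last_modified_lines), and append the results to a database.
        """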
        from pydriller import GitRepository
        from pydriller.domain.commit import ModificationType

        logger.info("Download commits to ignore...")
        assert db.download(IGNORED_COMMITS_DB)
        commits_to_ignore = list(db.read(IGNORED_COMMITS_DB))

        logger.info("Download bug-fixing classifications...")
        assert db.download(BUG_FIXING_COMMITS_DB)
        bug_fixing_commits = [
            bug_fixing_commit
            for bug_fixing_commit in db.read(BUG_FIXING_COMMITS_DB)
            if bug_fixing_commit["type"] in ["r", "d"]
        ]

        if tokenized:
            db_path = TOKENIZED_BUG_INTRODUCING_COMMITS_DB
        else:
            db_path = BUG_INTRODUCING_COMMITS_DB

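        # Helpers mapping revisions between mercurial and git: the tokenized
        # repo uses in-memory maps, the non-tokenized one goes through vcs_map.
        # Both branches must `yield` (a bare `return <generator>` inside a
        # generator function would be discarded as a StopIteration value).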
        def git_to_mercurial(revs):
            if tokenized:
                yield from (self.tokenized_git_to_mercurial[rev] for rev in revs)
            else:
                yield from vcs_map.git_to_mercurial(repo_dir, revs)

        def mercurial_to_git(revs):
            if tokenized:
                yield from (self.mercurial_to_tokenized_git[rev] for rev in revs)
            else:
                yield from vcs_map.mercurial_to_git(repo_dir, revs)

        logger.info("Download previously found bug-introducing commits...")
        db.download(db_path)

        logger.info("Get previously found bug-introducing commits...")
        prev_bug_introducing_commits = list(db.read(db_path))
        prev_bug_introducing_commits_nodes = {
            bug_introducing_commit["bug_fixing_rev"]
            for bug_introducing_commit in prev_bug_introducing_commits
        }
        logger.info(
            "Already classified %d commits...", len(prev_bug_introducing_commits)
        )

        hashes_to_ignore = {commit["rev"] for commit in commits_to_ignore}

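        # Write the ignore list as git hashes to a file in the current
        # directory; it is consumed below via pydriller's
        # hashes_to_ignore_path option.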
        with open("git_hashes_to_ignore", "w") as f:
            git_hashes = mercurial_to_git(
                commit["rev"]
                for commit in tqdm(commits_to_ignore)
                if not tokenized or commit["rev"] in self.mercurial_to_tokenized_git
            )
            f.writelines("{}\n".format(git_hash) for git_hash in git_hashes)

        logger.info("%d commits to analyze", len(bug_fixing_commits))

        # Skip already found bug-introducing commits.
        bug_fixing_commits = [
            bug_fixing_commit
            for bug_fixing_commit in bug_fixing_commits
            if bug_fixing_commit["rev"] not in prev_bug_introducing_commits_nodes
        ]

        logger.info(
            "%d commits left to analyze after skipping already analyzed ones",
            len(bug_fixing_commits),
        )

        bug_fixing_commits = [
            bug_fixing_commit
            for bug_fixing_commit in bug_fixing_commits
            if bug_fixing_commit["rev"] not in hashes_to_ignore
        ]
        logger.info(
            "%d commits left to analyze after skipping the ones in the ignore list",
            len(bug_fixing_commits),
        )

        if tokenized:
            bug_fixing_commits = [
                bug_fixing_commit
                for bug_fixing_commit in bug_fixing_commits
                if bug_fixing_commit["rev"] in self.mercurial_to_tokenized_git
            ]
            logger.info(
                "%d commits left to analyze after skipping the ones with no git hash",
                len(bug_fixing_commits),
            )

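        # Each worker thread gets its own pydriller GitRepository handle;
        # initialization is serialized behind a lock.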
        git_init_lock = threading.Lock()

        def _init(git_repo_dir):
            with git_init_lock:
                thread_local.git = GitRepository(git_repo_dir)
                # Call get_head in order to make pydriller initialize the repository.
                thread_local.git.get_head()

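        # Run SZZ (via pydriller) on a single bug-fixing commit and map the
        # resulting git hashes back to mercurial revisions.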
        def find_bic(bug_fixing_commit):
            logger.info("Analyzing %s...", bug_fixing_commit["rev"])

            git_fix_revision = tuple(mercurial_to_git([bug_fixing_commit["rev"]]))[0]

            commit = thread_local.git.get_commit(git_fix_revision)

            # Skip huge changes, as the results for them would likely be wrong.
            if len(commit.modifications) > MAX_MODIFICATION_NUMBER:
                logger.info("Skipping %s as it is too big", bug_fixing_commit["rev"])
                return None

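            # For renamed or deleted files, blame must be run against the old
            # path.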
            def get_modification_path(mod):
                path = mod.new_path
                if mod.change_type in (
                    ModificationType.RENAME,
                    ModificationType.DELETE,
                ):
                    path = mod.old_path
                return path

            bug_introducing_modifications = {}
            for modification in commit.modifications:
                path = get_modification_path(modification)

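                # Skip the auto-generated web-platform-tests manifest.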
                if path == "testing/web-platform/meta/MANIFEST.json":
                    continue

                # Don't try to find the bug-introducing commit for modifications
                # in the bug-fixing commit to non-source code files.
                if repository.get_type(path) not in repository.SOURCE_CODE_TYPES_TO_EXT:
                    continue

                bug_introducing_modifications.update(
                    thread_local.git.get_commits_last_modified_lines(
                        commit,
                        modification=modification,
                        hashes_to_ignore_path=os.path.realpath("git_hashes_to_ignore"),
                    )
                )

            logger.info(
                "Found %s for %s",
                bug_introducing_modifications,
                bug_fixing_commit["rev"],
            )

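            # Map each bug-introducing git hash back to a mercurial revision.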
            bug_introducing_commits = []
            for bug_introducing_hashes in bug_introducing_modifications.values():
                for bug_introducing_hash in bug_introducing_hashes:
                    try:
                        bug_introducing_commits.append(
                            {
                                "bug_fixing_rev": bug_fixing_commit["rev"],
                                "bug_introducing_rev": tuple(
                                    git_to_mercurial([bug_introducing_hash])
                                )[0],
                            }
                        )
                    except Exception as e:
                        # Skip commits that are in git but not in mercurial, as they are too old (older than "Free the lizard").
                        if not str(e).startswith("Missing git commit in the VCS map"):
                            raise

            # Add an empty result so that we don't reanalyze this commit again.
            if not bug_introducing_commits:
                bug_introducing_commits.append(
                    {
                        "bug_fixing_rev": bug_fixing_commit["rev"],
                        "bug_introducing_rev": "",
                    }
                )

            return bug_introducing_commits

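        # Compress and upload the DB; also called hourly from results() below
        # so that partial progress is persisted.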
        def compress_and_upload():
            zstd_compress(db_path)
            db.upload(db_path)

        workers = os.cpu_count() + 1
        logger.info(
            "Analyzing %d commits using %d workers...",
            len(bug_fixing_commits),
            workers,
        )

        with concurrent.futures.ThreadPoolExecutor(
            initializer=_init, initargs=(repo_dir,), max_workers=workers
        ) as executor:

            def results():
                start_time = time.monotonic()

                futures = {
                    executor.submit(find_bic, bug_fixing_commit): bug_fixing_commit[
                        "rev"
                    ]
                    for bug_fixing_commit in bug_fixing_commits
                }

                for future in tqdm(
                    concurrent.futures.as_completed(futures),
                    total=len(futures),
                ):
                    exc = future.exception()
                    if exc is not None:
                        logger.info(
                            "Exception %s while analyzing %s", exc, futures[future]
                        )
                        for f in futures:
                            f.cancel()

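                    # If the future failed, result() re-raises the exception
                    # caught above, ending the run once the pending futures
                    # have been cancelled.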
                    result = future.result()
                    if result is not None:
                        yield from result

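                    # Persist partial results roughly every hour.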
                    if time.monotonic() - start_time >= 3600:
                        compress_and_upload()
                        start_time = time.monotonic()

            db.append(db_path, results())

        compress_and_upload()