in scripts/regressor_finder.py [0:0]
def find_bug_introducing_commits(self, repo_dir, tokenized):
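    """Find, for each bug-fixing commit, the commits that likely introduced
    the bug, by blaming the lines touched by the fix (a SZZ-style analysis).

    When `tokenized` is True, the analysis runs on the tokenized git mirror
    of the repository instead of the plain git conversion.
    """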
from pydriller import GitRepository
from pydriller.domain.commit import ModificationType
logger.info("Download commits to ignore...")
assert db.download(IGNORED_COMMITS_DB)
commits_to_ignore = list(db.read(IGNORED_COMMITS_DB))
logger.info("Download bug-fixing classifications...")
assert db.download(BUG_FIXING_COMMITS_DB)
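    # Keep only the commits the classifier labeled as bug-fixing ("r" or "d").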
bug_fixing_commits = [
bug_fixing_commit
for bug_fixing_commit in db.read(BUG_FIXING_COMMITS_DB)
if bug_fixing_commit["type"] in ["r", "d"]
]
if tokenized:
db_path = TOKENIZED_BUG_INTRODUCING_COMMITS_DB
else:
db_path = BUG_INTRODUCING_COMMITS_DB
    # Both helpers must yield (not return) in every branch: a "return <genexpr>"
    # inside a generator function is swallowed as StopIteration.value, so the
    # caller would see an empty iterator.
    def git_to_mercurial(revs):
        if tokenized:
            yield from (self.tokenized_git_to_mercurial[rev] for rev in revs)
        else:
            yield from vcs_map.git_to_mercurial(repo_dir, revs)

    def mercurial_to_git(revs):
        if tokenized:
            yield from (self.mercurial_to_tokenized_git[rev] for rev in revs)
        else:
            yield from vcs_map.mercurial_to_git(repo_dir, revs)
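    # The output DB doubles as a progress record: bug-fixing revisions already
    # present in it are skipped below. The download is not asserted, as the DB
    # may legitimately be absent on a first run.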
logger.info("Download previously found bug-introducing commits...")
db.download(db_path)
logger.info("Get previously found bug-introducing commits...")
prev_bug_introducing_commits = list(db.read(db_path))
prev_bug_introducing_commits_nodes = set(
bug_introducing_commit["bug_fixing_rev"]
for bug_introducing_commit in prev_bug_introducing_commits
)
logger.info(
"Already classified %d commits...", len(prev_bug_introducing_commits)
)
hashes_to_ignore = set(commit["rev"] for commit in commits_to_ignore)
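    # The blame step can skip commits listed in a file of git hashes, so
    # translate the mercurial ignore list into that form.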
with open("git_hashes_to_ignore", "w") as f:
git_hashes = mercurial_to_git(
commit["rev"]
for commit in tqdm(commits_to_ignore)
if not tokenized or commit["rev"] in self.mercurial_to_tokenized_git
)
f.writelines("{}\n".format(git_hash) for git_hash in git_hashes)
logger.info("%d commits to analyze", len(bug_fixing_commits))
# Skip already found bug-introducing commits.
bug_fixing_commits = [
bug_fixing_commit
for bug_fixing_commit in bug_fixing_commits
if bug_fixing_commit["rev"] not in prev_bug_introducing_commits_nodes
]
logger.info(
"%d commits left to analyze after skipping already analyzed ones",
len(bug_fixing_commits),
)
bug_fixing_commits = [
bug_fixing_commit
for bug_fixing_commit in bug_fixing_commits
if bug_fixing_commit["rev"] not in hashes_to_ignore
]
logger.info(
"%d commits left to analyze after skipping the ones in the ignore list",
len(bug_fixing_commits),
)
if tokenized:
bug_fixing_commits = [
bug_fixing_commit
for bug_fixing_commit in bug_fixing_commits
if bug_fixing_commit["rev"] in self.mercurial_to_tokenized_git
]
logger.info(
"%d commits left to analyze after skipping the ones with no git hash",
len(bug_fixing_commits),
)
git_init_lock = threading.Lock()
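    # Give each worker thread its own GitRepository handle via thread-local
    # storage, and serialize the first-time repository setup with a lock.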
def _init(git_repo_dir):
with git_init_lock:
thread_local.git = GitRepository(git_repo_dir)
# Call get_head in order to make pydriller initialize the repository.
thread_local.git.get_head()
def find_bic(bug_fixing_commit):
logger.info("Analyzing %s...", bug_fixing_commit["rev"])
git_fix_revision = tuple(mercurial_to_git([bug_fixing_commit["rev"]]))[0]
commit = thread_local.git.get_commit(git_fix_revision)
# Skip huge changes, we'll likely be wrong with them.
if len(commit.modifications) > MAX_MODIFICATION_NUMBER:
            logger.info("Skipping %s as it is too big", bug_fixing_commit["rev"])
return None
        def get_modification_path(mod):
            path = mod.new_path
            # A deleted file has no new path, and a renamed file's blamable
            # lines live at its old path.
            if mod.change_type in (ModificationType.RENAME, ModificationType.DELETE):
                path = mod.old_path
            return path
bug_introducing_modifications = {}
for modification in commit.modifications:
path = get_modification_path(modification)
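            # The wpt MANIFEST.json is autogenerated and constantly churned;
            # blaming it would only add noise.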
if path == "testing/web-platform/meta/MANIFEST.json":
continue
# Don't try to find the bug-introducing commit for modifications
# in the bug-fixing commit to non-source code files.
if repository.get_type(path) not in repository.SOURCE_CODE_TYPES_TO_EXT:
continue
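            # get_commits_last_modified_lines blames the lines touched by the
            # fix and returns a dict mapping each file path to the set of
            # commit hashes that last modified those lines (the SZZ heuristic).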
bug_introducing_modifications.update(
thread_local.git.get_commits_last_modified_lines(
commit,
modification=modification,
hashes_to_ignore_path=os.path.realpath("git_hashes_to_ignore"),
)
)
        logger.info(
            "Found %s for %s",
            bug_introducing_modifications,
            bug_fixing_commit["rev"],
        )
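        # Map each blamed git hash back to its mercurial revision and record
        # it against the fixing revision.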
bug_introducing_commits = []
for bug_introducing_hashes in bug_introducing_modifications.values():
for bug_introducing_hash in bug_introducing_hashes:
try:
bug_introducing_commits.append(
{
"bug_fixing_rev": bug_fixing_commit["rev"],
"bug_introducing_rev": tuple(
git_to_mercurial([bug_introducing_hash])
)[0],
}
)
except Exception as e:
# Skip commits that are in git but not in mercurial, as they are too old (older than "Free the lizard").
if not str(e).startswith("Missing git commit in the VCS map"):
raise
# Add an empty result, just so that we don't reanalyze this again.
if len(bug_introducing_commits) == 0:
bug_introducing_commits.append(
{
"bug_fixing_rev": bug_fixing_commit["rev"],
"bug_introducing_rev": "",
}
)
return bug_introducing_commits
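    # Compress and upload the results DB; called hourly from results() and
    # once more at the end, so long runs checkpoint their progress.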
def compress_and_upload():
zstd_compress(db_path)
db.upload(db_path)
workers = os.cpu_count() + 1
    logger.info(
        "Analyzing %d commits using %d workers...",
        len(bug_fixing_commits),
        workers,
    )
with concurrent.futures.ThreadPoolExecutor(
initializer=_init, initargs=(repo_dir,), max_workers=workers
) as executor:
def results():
start_time = time.monotonic()
futures = {
executor.submit(find_bic, bug_fixing_commit): bug_fixing_commit[
"rev"
]
for bug_fixing_commit in bug_fixing_commits
}
for future in tqdm(
concurrent.futures.as_completed(futures),
total=len(futures),
):
exc = future.exception()
if exc is not None:
logger.info(
"Exception %s while analyzing %s", exc, futures[future]
)
for f in futures:
f.cancel()
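                        # cancel() only stops futures that have not started
                        # yet; the result() call below then re-raises the
                        # exception and aborts the run.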
result = future.result()
if result is not None:
yield from result
if time.monotonic() - start_time >= 3600:
compress_and_upload()
start_time = time.monotonic()
db.append(db_path, results())
compress_and_upload()
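
# A minimal usage sketch (hypothetical wiring; the real script drives this
# from its command-line entry point, and the constructor arguments are
# elided here):
#
#     finder = RegressorFinder(...)
#     finder.find_bug_introducing_commits(repo_dir, tokenized=False)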