scripts/past_bugs_by_unit.py (189 lines of code) (raw):
# -*- coding: utf-8 -*-
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
import argparse
import json
import logging
from collections import defaultdict
from tqdm import tqdm
from bugbug import bugzilla, db, repository
from bugbug.models.regressor import BUG_FIXING_COMMITS_DB
from bugbug.utils import zstd_compress
# Configure the root logger so that INFO-level (and more severe) records are emitted.
logging.basicConfig(level=logging.INFO)
# Module-level logger used for the progress messages of this script.
logger = logging.getLogger(__name__)
class PastBugsCollector:
    """Index past bugs by the units of source code their commits touched.

    For every non-backed-out commit linked to a known bug, the collector
    records, per file / directory / component / function touched:

    * the regressions caused by the commit's bug ("past regressions");
    * the bug itself when the commit is classified as bug-fixing
      ("past fixed bugs");
    * for both cases, the bugs returned by ``bugzilla.find_blocked_by``.

    The resulting maps are written to ``data/past_*_by_*.json`` and each file
    is then passed to ``zstd_compress``.
    """

    # Grouping dimension -> name of the commit field that lists the units of
    # that dimension touched by the commit. The key order is also the order in
    # which the per-dimension output files are written.
    _DIMENSION_FIELDS = {
        "file": "files",
        "directory": "directories",
        "component": "components",
    }

    def __init__(self) -> None:
        """Download the commits, bugs and commit-classification databases.

        Raises:
            AssertionError: if ``db.download`` returns a falsy value for any
                of the three databases.
        """
        logger.info("Downloading commits database...")
        self._require_download(repository.COMMITS_DB)

        logger.info("Downloading bugs database...")
        self._require_download(bugzilla.BUGS_DB)

        logger.info("Download commit classifications...")
        self._require_download(BUG_FIXING_COMMITS_DB)

    @staticmethod
    def _require_download(db_path) -> None:
        """Download ``db_path`` and fail loudly if the download is unsuccessful.

        An explicit check is used instead of ``assert db.download(...)``:
        assert statements are removed under ``python -O``, which would skip
        the download itself, not just the check. ``AssertionError`` is kept
        so the failure mode is unchanged for callers.
        """
        if not db.download(db_path):
            raise AssertionError(f"Failed to download database {db_path!r}")

    @staticmethod
    def _new_index() -> dict[str, dict[str, list[int]]]:
        """Return an empty two-level map whose leaves are lists of bug IDs."""
        return defaultdict(lambda: defaultdict(list))

    @classmethod
    def _record(
        cls,
        commit: dict,
        bug_ids: list[int],
        by_unit: dict[str, dict[str, list[int]]],
        by_function: dict[str, dict[str, list[int]]],
    ) -> None:
        """Append ``bug_ids`` to every unit and function touched by ``commit``.

        Args:
            commit: commit record; its ``files``, ``directories``,
                ``components`` and ``functions`` fields are read.
            bug_ids: bug IDs to associate with the touched units. An empty
                list still creates the entry for each touched unit.
            by_unit: ``dimension -> unit -> bug IDs`` map, updated in place.
            by_function: ``path -> function name -> bug IDs`` map, updated in
                place.
        """
        for dimension, field in cls._DIMENSION_FIELDS.items():
            for path in commit[field]:
                by_unit[dimension][path].extend(bug_ids)

        for path, functions in commit["functions"].items():
            for function in functions:
                by_function[path][function["name"]].extend(bug_ids)

    @staticmethod
    def _summarize_bugs(bug_ids: list[int], bug_map: dict) -> list[dict]:
        """Turn bug IDs into summary dicts, dropping duplicate IDs.

        Args:
            bug_ids: bug IDs, possibly with repetitions.
            bug_map: bug ID -> bug data; every ID in ``bug_ids`` must be a key.

        Returns:
            One ``{"id", "summary", "component"}`` dict per distinct ID, in
            order of first appearance; ``component`` is ``product::component``.
        """
        summaries = []
        # dict.fromkeys keeps the first occurrence of each ID, in order.
        for bug_id in dict.fromkeys(bug_ids):
            bug = bug_map[bug_id]
            summaries.append(
                {
                    "id": bug_id,
                    "summary": bug["summary"],
                    "component": "{}::{}".format(bug["product"], bug["component"]),
                }
            )
        return summaries

    @staticmethod
    def _write_compressed_json(path: str, data: dict) -> None:
        """Dump ``data`` as JSON to ``path``, then pass ``path`` to ``zstd_compress``."""
        with open(path, "w") as f:
            json.dump(data, f)
        # Compress only after the file has been closed (and thus flushed).
        zstd_compress(path)

    def go(self) -> None:
        """Build all past-bug indexes and write them under ``data/``.

        Writes, for each of the ``file``, ``directory`` and ``component``
        dimensions and then for ``function``, the four files
        ``past_regressions``, ``past_fixed_bugs``,
        ``past_regression_blocked_bugs`` and ``past_fixed_bug_blocked_bugs``
        (``data/<name>_by_<dimension>.json``), each followed by a call to
        ``zstd_compress``.
        """
        logger.info(
            "Generate map of bug ID -> bug data for all bugs which were defects"
        )

        # Only commits classified as "d" or "r" count as bug-fixing.
        bug_fixing_commits_nodes = {
            bug_fixing_commit["rev"]
            for bug_fixing_commit in db.read(BUG_FIXING_COMMITS_DB)
            if bug_fixing_commit["type"] in ("d", "r")
        }

        logger.info("%d bug-fixing commits to analyze", len(bug_fixing_commits_nodes))

        # Keep only the bugs that are referenced by at least one commit.
        all_bug_ids = {commit["bug_id"] for commit in repository.get_commits()}

        bug_map = {
            bug["id"]: bug for bug in bugzilla.get_bugs() if bug["id"] in all_bug_ids
        }

        logger.info(
            "Generate a map from files/functions to the bugs which were fixed/introduced by touching them"
        )

        # TODO: Support "moving" past bugs between files when they are renamed and between functions when they are
        # moved across files.

        past_regressions_by = self._new_index()
        past_fixed_bugs_by = self._new_index()
        past_regression_blocked_bugs_by = self._new_index()
        past_fixed_bug_blocked_bugs_by = self._new_index()

        past_regressions_by_function = self._new_index()
        past_fixed_bugs_by_function = self._new_index()
        past_regression_blocked_bugs_by_function = self._new_index()
        past_fixed_bug_blocked_bugs_by_function = self._new_index()

        for commit in tqdm(repository.get_commits()):
            if commit["bug_id"] not in bug_map:
                continue

            # Backed-out commits did not stick, so they are ignored.
            if commit["backedoutby"]:
                continue

            bug = bug_map[commit["bug_id"]]

            caused_regressions = bool(bug["regressions"])
            is_bug_fix = commit["node"] in bug_fixing_commits_nodes
            if not caused_regressions and not is_bug_fix:
                continue

            # Both values depend only on the bug, so compute them once per
            # commit rather than once per touched path/function.
            # NOTE(review): assumes bugzilla.find_blocked_by is a pure function
            # of its arguments; list() guards against it returning an iterator.
            blocked_bug_ids = list(bugzilla.find_blocked_by(bug_map, bug))

            if caused_regressions:
                known_regressions = [
                    bug_id for bug_id in bug["regressions"] if bug_id in bug_map
                ]
                self._record(
                    commit,
                    known_regressions,
                    past_regressions_by,
                    past_regressions_by_function,
                )
                self._record(
                    commit,
                    blocked_bug_ids,
                    past_regression_blocked_bugs_by,
                    past_regression_blocked_bugs_by_function,
                )

            if is_bug_fix:
                self._record(
                    commit,
                    [bug["id"]],
                    past_fixed_bugs_by,
                    past_fixed_bugs_by_function,
                )
                self._record(
                    commit,
                    blocked_bug_ids,
                    past_fixed_bug_blocked_bugs_by,
                    past_fixed_bug_blocked_bugs_by_function,
                )

        # (file name prefix, per-dimension index, per-function index), in the
        # order the files are written.
        outputs = (
            ("past_regressions", past_regressions_by, past_regressions_by_function),
            ("past_fixed_bugs", past_fixed_bugs_by, past_fixed_bugs_by_function),
            (
                "past_regression_blocked_bugs",
                past_regression_blocked_bugs_by,
                past_regression_blocked_bugs_by_function,
            ),
            (
                "past_fixed_bug_blocked_bugs",
                past_fixed_bug_blocked_bugs_by,
                past_fixed_bug_blocked_bugs_by_function,
            ),
        )

        for dimension in self._DIMENSION_FIELDS:
            for prefix, by_unit, _ in outputs:
                self._write_compressed_json(
                    f"data/{prefix}_by_{dimension}.json",
                    {
                        path: self._summarize_bugs(bug_ids, bug_map)
                        for path, bug_ids in by_unit[dimension].items()
                    },
                )

        for prefix, _, by_function in outputs:
            self._write_compressed_json(
                f"data/{prefix}_by_function.json",
                {
                    path: {
                        func: self._summarize_bugs(bug_ids, bug_map)
                        for func, bug_ids in funcs_bugs.items()
                    }
                    for path, funcs_bugs in by_function.items()
                },
            )
def main() -> None:
    """Command-line entry point: parse arguments, then collect and write past bugs."""
    arg_parser = argparse.ArgumentParser(
        description="Find past bugs linked to given units of source code"
    )
    # No options are declared; the parsed result is intentionally unused.
    arg_parser.parse_args()

    PastBugsCollector().go()
# Run the collector only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()