scripts/inline_comments_data_collection.py:
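"""Collect resolved inline review comments from Phabricator and pair each one
with the diff of the follow-up patch that addressed it, producing a dataset of
comment/fix examples."""
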
import argparse
import logging
import os
import re

import orjson
from libmozdata.phabricator import PhabricatorAPI

from bugbug import db, phabricator
from bugbug.phabricator import fetch_diff_from_url
from bugbug.tools.code_review import PhabricatorReviewData
from bugbug.utils import (
    get_secret,
    setup_libmozdata,
    zstd_compress,
)

review_data = PhabricatorReviewData()

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

setup_libmozdata()
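# Phabricator API client, used as a fallback to look up diffs that are missing
# from the locally built revision maps.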
api = PhabricatorAPI(get_secret("PHABRICATOR_TOKEN"))


class NoDiffsFoundException(Exception):
    def __init__(self, patch_id):
        super().__init__(f"No diffs found for the given patch ID: {patch_id}")
        self.patch_id = patch_id


class NoTransactionsFoundException(Exception):
    def __init__(self, patch_id):
        super().__init__(f"No transactions found for the given patch ID: {patch_id}")
        self.patch_id = patch_id


class NoDiffFoundForPHIDException(Exception):
    def __init__(self, phid):
        super().__init__(f"No diff found for PHID {phid}")
        self.phid = phid


def load_revisions_maps():
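    """Build two lookup maps from the local revisions database: diff ID ->
    revision, and diff PHID -> diff ID."""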
diff_id_to_revision = {}
diff_phid_to_id = {}
for revision in phabricator.get_revisions():
        for transaction in revision["transactions"]:
            diff = transaction.get("fields", {}).get("diff")
            if diff is None:
                continue

            diff_id_to_revision[diff["id"]] = revision
            diff_phid_to_id[diff["phid"]] = diff["id"]
return diff_id_to_revision, diff_phid_to_id


def find_recent_update(transactions, comment_date_modified):
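    """Return the most recent "update" transaction made at or before the
    comment's modification time, or None if there is none."""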
updates = [
transaction
for transaction in transactions
if transaction["type"] == "update"
and transaction["dateModified"] <= comment_date_modified
]
return max(
updates, key=lambda transaction: transaction["dateModified"], default=None
)


def extract_relevant_diff(patch_diff, filename):
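    """Return the portion of `patch_diff` that touches `filename`, or None if
    the file does not appear in the diff."""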
file_diff_pattern = rf"diff --git a/{re.escape(filename)} b/{re.escape(filename)}\n.*?(?=\ndiff --git|$)"
match = re.search(file_diff_pattern, patch_diff, re.DOTALL)
if match:
return match.group(0)
else:
return None


def process_comments(limit, diff_length_limit):
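    """Yield one record per resolved inline comment, pairing the comment with
    the relevant part of the follow-up diff that addressed it."""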
patch_count = 0
    diff_id_to_revision, diff_phid_to_id = load_revisions_maps()

    for patch_id, comments in review_data.get_all_inline_comments(lambda c: True):
        revision_info = diff_id_to_revision[patch_id]
transactions = revision_info["transactions"]
        resolved_comments = [comment for comment in comments if comment.is_done]
        if not resolved_comments:
            continue

        for comment in resolved_comments:
comment_date_modified = comment.date_modified
most_recent_update = find_recent_update(transactions, comment_date_modified)
if not most_recent_update:
continue
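
            # Map the follow-up update's new diff PHID to a numeric diff ID,
            # falling back to the Phabricator API when the PHID is not in the
            # local map.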
try:
fix_patch_id = diff_phid_to_id[most_recent_update["fields"]["new"]]
except KeyError:
diffs = api.search_diffs(diff_phid=most_recent_update["fields"]["new"])
if not diffs:
raise NoDiffFoundForPHIDException(
most_recent_update["fields"]["new"]
)
fix_patch_id = diffs[0]["id"]
# If the most recent patch is the original patch itself, skip it
if fix_patch_id == patch_id:
continue
revision_phid = revision_info["phid"]
revision_id = revision_info["id"]
bug_id = revision_info["fields"]["bugzilla.bug-id"]
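
            # Resolve the pre-update ("old") diff PHID to an ID the same way.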
try:
previous_patch_id = diff_phid_to_id[most_recent_update["fields"]["old"]]
            except KeyError:
diffs = api.search_diffs(diff_phid=most_recent_update["fields"]["old"])
if not diffs:
raise NoDiffFoundForPHIDException(
most_recent_update["fields"]["old"]
)
previous_patch_id = diffs[0]["id"]
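
            # Fetch the diff between the previous patch and the follow-up
            # (fix) patch.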
try:
patch_diff = fetch_diff_from_url(
revision_id, previous_patch_id, fix_patch_id
)
except Exception as e:
logger.error(f"Failed to fetch diff: {e}")
continue
if len(patch_diff) > diff_length_limit:
continue
relevant_diff = extract_relevant_diff(patch_diff, comment.filename)
if relevant_diff:
data = {
"bug_id": bug_id,
"revision_id": revision_id,
"revision_phid": revision_phid,
"initial_patch_id": patch_id,
"fix_patch_id": fix_patch_id,
"previous_patch_id": previous_patch_id,
"comment": comment.__dict__,
"fix_patch_diff": relevant_diff,
}
yield data
patch_count += 1
                if patch_count >= limit:
                    # "break" would only exit the inner loop over comments, so
                    # --limit would never stop the outer iteration; "return"
                    # ends the generator once the limit is reached.
                    return


def main():
parser = argparse.ArgumentParser(description="Process patch reviews.")
parser.add_argument(
"--limit",
type=int,
default=None,
help="Limit the number of patches to process. No limit if not specified.",
)
parser.add_argument(
"--diff-length-limit",
type=int,
default=10000,
help="Limit the maximum allowed diff length. Default 10000 if not specified.",
)
args = parser.parse_args()
limit = args.limit or float("inf")
diff_length_limit = args.diff_length_limit or float("inf")
os.makedirs("patches", exist_ok=True)
db.download(phabricator.REVISIONS_DB)
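
    # Stream one JSON record per collected comment/fix pair, then compress the
    # dataset with zstd.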
with open(phabricator.FIXED_COMMENTS_DB, "wb") as dataset_file_handle:
for data in process_comments(
limit=limit,
diff_length_limit=diff_length_limit,
):
dataset_file_handle.write(orjson.dumps(data) + b"\n")
zstd_compress(phabricator.FIXED_COMMENTS_DB)


if __name__ == "__main__":
main()
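
# Example invocation (the limits are illustrative; a PHABRICATOR_TOKEN secret
# must be available via bugbug.utils.get_secret):
#   python scripts/inline_comments_data_collection.py --limit 100 --diff-length-limit 5000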