scripts/code_review_tool_evaluator.py
# -*- coding: utf-8 -*-
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
"""This script evaluates different variants of the code review tool.
Before running this script, you may need to set the following environment
variables:
- BUGBUG_PHABRICATOR_URL
- BUGBUG_PHABRICATOR_TOKEN
- BUGBUG_*_API_KEY (replace * with your LLM provider)
- BUGBUG_QDRANT_API_KEY
- BUGBUG_QDRANT_LOCATION
To specify different variants to evaluate, please modify the get_tool_variants
function.
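
Example invocation (the LLM selection arguments added by
generative_model_tool.create_llm_to_args are omitted here; run the script with
--help to see them; the dataset file name below is only a placeholder):
    python scripts/code_review_tool_evaluator.py --evaluation-data evaluated_comments.csv --results-dir results --evaluation-strategy random -v RAG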
"""
import os
from collections import defaultdict
from datetime import datetime, timedelta
import numpy as np
import pandas as pd
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableSequence
from tabulate import tabulate
from bugbug import db, generative_model_tool, phabricator, utils
from bugbug.code_search.mozilla import FunctionSearchMozilla
from bugbug.tools import code_review
from bugbug.vectordb import QdrantVectorDB
code_review.TARGET_SOFTWARE = "Mozilla Firefox"
VERBOSE_CODE_REVIEW = False
EVALUATION_TEMPLATE = """Your are an expert in code review at Mozilla Firefox.
**Task**:
Match two sets of code review comments to identify redundant comments.
**Instructions**:
1. **Consider the following about all comments**:
- The comments are related to the same code patch.
- The comments may be written in different styles.
2. **Understand what each comment is addressing**:
- Read the comments in both sets.
- Understand the issue that each comment is addressing.
3. **Check for matches**:
- If you find a comment in the old set that is addressing the same issue as a comment in the new set, link them as redundant.
- The comments may not be identical, but they should be addressing the same issue.
- The level of detail in the comments may vary.
4. **Output format**:
- Output a list of matched comments.
- Include the comment IDs only in the output.
- Each element in the list should be an object with two keys: `old_comment_id` and `new_comment_id`.
- No explanation is needed in the output, only the IDs of the matched comments.
- The output should be a valid json only.
**Output example**:
[
{{"old_comment_id": 1, "new_comment_id": 3}},
{{"old_comment_id": 4, "new_comment_id": 2}},
]
**First set of comments (old comments)**:
{old_comments}
**Second set of comments (new comments)**:
{new_comments}
"""
class FeedbackEvaluator:
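    """Matches newly generated review comments against a human-evaluated dataset.

    The evaluation dataset is a CSV file expected to contain at least the
    columns `revision_id`, `diff_id`, `file_path`, `comment`, and `evaluation`.
    """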
def __init__(self, evaluation_dataset: str):
self.evaluated_comments = pd.read_csv(evaluation_dataset)
llm = generative_model_tool.create_openai_llm()
evaluate_comments_prompt = PromptTemplate.from_template(EVALUATION_TEMPLATE)
self.evaluation_chain = RunnableSequence(evaluate_comments_prompt, llm)
def evaluate_diff_comments(
self,
diff_id: int,
new_comments: list[code_review.InlineComment],
) -> list[dict]:
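        """Match the tool's new comments for a diff against the evaluated dataset.

        The LLM chain pairs each new comment with previously evaluated (old)
        comments for the same diff. Returns one dict per new comment (carrying
        the matched old comment and its evaluation when a match was found),
        followed by one dict per old comment that no new comment matched.
        """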
diff_evaluated_comments = self.evaluated_comments[
self.evaluated_comments["diff_id"] == diff_id
].reset_index()
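        # Collapse the human labels into a binary signal: "CORRECT" and
        # "VALID_REDUNDANT" both count as VALID, everything else as INVALID.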
diff_evaluated_comments["evaluation"] = np.where(
diff_evaluated_comments["evaluation"].isin(["CORRECT", "VALID_REDUNDANT"]),
"VALID",
"INVALID",
)
output = self.evaluation_chain.invoke(
{
"old_comments": [
{
"id": i,
"content": raw["comment"],
"file": raw["file_path"],
}
for i, raw in diff_evaluated_comments.iterrows()
],
"new_comments": [
{
"id": i,
"content": comment.content,
"file": comment.filename,
}
for i, comment in enumerate(new_comments)
],
}
)
matches = code_review.parse_model_output(output.content)
results = [
{
"new_comment": comment.content,
"old_comments_count": 0,
"matched": False,
}
for comment in new_comments
]
seen_old_comments = set()
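        # Apply the LLM-proposed matches. A match is only kept when both
        # comments point at the same file; when several old comments map to
        # the same new comment, their texts are concatenated and conflicting
        # labels are reported as MIXED.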
for match in matches:
old_index = match["old_comment_id"]
new_index = match["new_comment_id"]
evaluated_comment = diff_evaluated_comments.iloc[old_index]
new_comment = new_comments[new_index]
if evaluated_comment["file_path"] != new_comment.filename:
print(
"File mismatch:",
evaluated_comment["file_path"],
new_comment.filename,
)
continue
current_result = results[new_index]
current_result["evaluation"] = (
"MIXED"
if (
"evaluation" in current_result
and current_result["evaluation"] != evaluated_comment["evaluation"]
)
else evaluated_comment["evaluation"]
)
if "old_comment" in current_result:
current_result["old_comment"] += (
f"\n\n-------------\n\n{evaluated_comment['comment']}"
)
else:
current_result["old_comment"] = evaluated_comment["comment"]
current_result["old_comments_count"] += 1
current_result["matched"] = True
seen_old_comments.add(old_index)
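        # Old comments that no new comment matched are still reported, so that
        # missed issues from the evaluation dataset show up in the results.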
        for i, row in diff_evaluated_comments.iterrows():
if i in seen_old_comments:
continue
results.append(
{
"old_comment": raw["comment"],
"evaluation": raw["evaluation"],
"old_comments_count": 1,
}
)
self.print_evaluation_matches(results)
return results
@staticmethod
def print_evaluation_matches(matching_results: list[dict]):
print(
tabulate(
[
(
result.get("new_comment", ""),
result.get("old_comment", ""),
result.get("evaluation", ""),
)
for result in matching_results
],
tablefmt="mixed_grid",
headers=[
"New Comment",
"Old Comment",
"Evaluation",
],
maxcolwidths=[
60,
60,
20,
],
)
)
def get_tool_variants(
llm,
variants: list[str] | None = None,
) -> list[tuple[str, code_review.CodeReviewTool]]:
"""Returns a list of tool variants to evaluate.
Returns:
List of tuples, where each tuple contains the name of the variant and
        an instance of the code review tool to evaluate.
"""
def is_variant_selected(*target_variants):
return variants is None or any(
target_variant in variants for target_variant in target_variants
)
# Step 1: we start with instantiating the dependencies based on the selected
# variants.
if is_variant_selected(
"CONTEXT", "RAG and CONTEXT", "RAG and CONTEXT and REJECTED_COMMENTS"
):
def get_file(commit_hash, path):
r = utils.get_session("hgmo").get(
f"https://hg.mozilla.org/mozilla-unified/raw-file/{commit_hash}/{path}",
headers={
"User-Agent": utils.get_user_agent(),
},
)
r.raise_for_status()
return r.text
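        # Assumes a local mozilla-unified checkout sitting next to the bugbug
        # repository; adjust the path if your checkout lives elsewhere.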
repo_dir = "../mozilla-unified"
function_search = FunctionSearchMozilla(repo_dir, get_file, True)
if is_variant_selected(
"RAG", "RAG and CONTEXT", "RAG and CONTEXT and REJECTED_COMMENTS", "llm-gpt-4.1"
):
review_comments_db = code_review.ReviewCommentsDB(
QdrantVectorDB("diff_comments")
)
if is_variant_selected("RAG and CONTEXT and REJECTED_COMMENTS", "llm-gpt-4.1"):
suggestions_feedback_db = code_review.SuggestionsFeedbackDB(
QdrantVectorDB("suggestions_feedback")
)
# Step 2: we create the selected tool variants.
tool_variants = []
if is_variant_selected("RAG"):
tool_variants.append(
(
"RAG",
code_review.CodeReviewTool(
comment_gen_llms=[llm],
function_search=None,
review_comments_db=review_comments_db,
verbose=VERBOSE_CODE_REVIEW,
),
)
)
if is_variant_selected("CONTEXT"):
tool_variants.append(
(
"CONTEXT",
code_review.CodeReviewTool(
comment_gen_llms=[llm],
function_search=function_search,
review_comments_db=None,
verbose=VERBOSE_CODE_REVIEW,
),
)
)
if is_variant_selected("RAG and CONTEXT"):
tool_variants.append(
(
"RAG and CONTEXT",
code_review.CodeReviewTool(
comment_gen_llms=[llm],
function_search=function_search,
review_comments_db=review_comments_db,
verbose=VERBOSE_CODE_REVIEW,
),
)
)
if is_variant_selected("RAG and CONTEXT and REJECTED_COMMENTS"):
tool_variants.append(
(
"RAG and CONTEXT and REJECTED_COMMENTS",
code_review.CodeReviewTool(
comment_gen_llms=[llm],
function_search=function_search,
review_comments_db=review_comments_db,
suggestions_feedback_db=suggestions_feedback_db,
verbose=VERBOSE_CODE_REVIEW,
),
),
)
if is_variant_selected("llm-gpt-4.1"):
tool_variants.append(
(
"llm-gpt-4.1",
code_review.CodeReviewTool(
comment_gen_llms=[
generative_model_tool.create_openai_llm(
model_name="gpt-4.1-2025-04-14"
)
],
# function_search=function_search,
review_comments_db=review_comments_db,
suggestions_feedback_db=suggestions_feedback_db,
verbose=VERBOSE_CODE_REVIEW,
),
)
)
return tool_variants
def get_review_requests_sample(since: timedelta, limit: int):
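    """Yields IDs of mozilla-central revisions created within the given time window.

    A negative `limit` disables the cap: the `n >= limit >= 0` check only
    triggers for non-negative limits.
    """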
assert db.download(phabricator.REVISIONS_DB)
start_date = (datetime.now() - since).timestamp()
MOZILLA_CENTRAL_PHID = "PHID-REPO-saax4qdxlbbhahhp2kg5"
n = 0
for review_request in phabricator.get_revisions():
if (
review_request["fields"]["repositoryPHID"] != MOZILLA_CENTRAL_PHID
or review_request["fields"]["dateCreated"] <= start_date
):
continue
if n >= limit >= 0:
break
yield review_request["id"]
n += 1
def print_prettified_comments(comments: list[code_review.InlineComment]):
if not comments:
print("No comments to show.")
return
print(
tabulate(
[
(
comment.filename,
comment.end_line,
comment.content,
)
for comment in comments
],
headers=[
"File",
"Line",
"Comment",
],
maxcolwidths=[
30,
10,
100,
],
),
)
def get_latest_evaluation_results_file(results_dir: str | None):
    import glob
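    # Note: the pattern below only matches files whose names contain a "#",
    # while this script writes `evaluation_results_<timestamp>.csv` without
    # one, so previous results may need to be renamed (e.g. by appending a
    # "#<label>" suffix) before the "same" evaluation strategy can find them.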
files = glob.glob("evaluation_results_*#*.csv", root_dir=results_dir)
if not files:
raise FileNotFoundError("No evaluation results file found.")
    latest_file = max(files)
    if results_dir:
        return os.path.join(results_dir, latest_file)
    return latest_file
def main(args):
review_platform = "phabricator"
review_data: code_review.ReviewData = code_review.review_data_classes[
review_platform
]()
tool_variants = get_tool_variants(
generative_model_tool.create_llm_from_args(args), args.variants
)
evaluator = FeedbackEvaluator(args.evaluation_dataset)
is_first_result = True
result_file = os.path.join(
args.results_dir,
"code_review_tool_evaluator.csv",
)
evaluation_results_file = os.path.join(
args.results_dir,
f"evaluation_results_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.csv",
)
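    # Two output files are produced: `result_file` accumulates the generated
    # comments side by side per variant, while `evaluation_results_file` keeps
    # one row per (variant, comment) with its matching outcome. Both files are
    # appended to after every review request.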
result_unique_columns = ["Review Request ID", "File", "Line", "Comment Number"]
result_all_columns = result_unique_columns + [
f"{title} ({variant_name})"
for variant_name, _ in tool_variants
for title in ("Comment", "Evaluation")
]
evaluation_result_all_columns = [
"variant_name",
"revision_id",
"diff_id",
"new_comment",
"old_comments_count",
"matched",
"old_comment",
"evaluation",
]
selected_review_requests = []
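    # Review requests can be selected in four ways: explicit diff IDs, explicit
    # revision IDs, a random sample of diffs that have a CORRECT comment in the
    # evaluation dataset, or the same set used by the latest previous run.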
if args.diff_ids:
selected_review_requests = (
("n/a", code_review.ReviewRequest(diff_id)) for diff_id in args.diff_ids
)
elif args.review_request_ids:
selected_review_requests = (
(review_request_id, review_data.get_review_request_by_id(review_request_id))
for review_request_id in args.review_request_ids
)
elif args.evaluation_strategy == "random":
print("No review request IDs specified. Selecting a random sample.")
selected_review_requests = (
(revision_id, code_review.ReviewRequest(diff_id))
for revision_id, diff_id in evaluator.evaluated_comments.query(
"evaluation == 'CORRECT'"
)[["revision_id", "diff_id"]]
.drop_duplicates()
.sample(20)
.itertuples(index=False)
)
elif args.evaluation_strategy == "same":
selected_review_requests = (
(revision_id, code_review.ReviewRequest(diff_id))
for revision_id, diff_id in pd.read_csv(
get_latest_evaluation_results_file(args.results_dir),
)[["revision_id", "diff_id"]]
.drop_duplicates()
.itertuples(name=None, index=False)
)
else:
raise ValueError(
"Please specify either --diff-id or --revision-id. Alternatively, use --evaluation-strategy."
)
for review_request_id, review_request in selected_review_requests:
print("---------------------------------------------------------")
print(f"Review Request ID: {review_request_id}")
print(f"Patch ID: {review_request.patch_id}")
patch = review_data.get_patch_by_id(review_request.patch_id)
print("---------------------------------------------------------")
if len(patch.raw_diff) > 20_000:
print("Skipping the patch because it is too large.")
continue
all_variants_results = []
all_variants_evaluation_results = []
for variant_name, tool in tool_variants:
print(f"\n\nVariant: {variant_name}\n")
try:
comments = tool.run(patch)
except code_review.FileNotInPatchError as e:
print("Error while running the tool:", e)
continue
except code_review.LargeDiffError:
print("Skipping the patch because it is too large.")
continue
print_prettified_comments(comments)
comment_per_line_counter = defaultdict(int)
evaluation = evaluator.evaluate_diff_comments(
review_request.patch_id, comments
)
all_variants_evaluation_results.extend(
{
"variant_name": variant_name,
"revision_id": review_request_id,
"diff_id": review_request.patch_id,
**row,
}
for row in evaluation
)
for i, comment in enumerate(comments):
key = (review_request_id, comment.filename, comment.end_line)
comment_per_line_counter[key] += 1
all_variants_results.append(
{
"Review Request ID": review_request_id,
"File": comment.filename,
"Line": comment.end_line,
"Comment Number": comment_per_line_counter[key],
f"Comment ({variant_name})": comment.content,
f"Evaluation ({variant_name})": evaluation[i].get("evaluation"),
}
)
df = (
pd.DataFrame(all_variants_results, columns=result_all_columns)
.groupby(result_unique_columns)
.first()
)
df.to_csv(
result_file,
header=is_first_result,
mode="w" if is_first_result else "a",
)
df = pd.DataFrame(
all_variants_evaluation_results, columns=evaluation_result_all_columns
)
df.to_csv(
evaluation_results_file,
index=False,
header=is_first_result,
mode="w" if is_first_result else "a",
)
if is_first_result:
is_first_result = False
print("You can find the results in the file:", result_file)
print("\n\n\n")
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
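    # generative_model_tool contributes its own command line arguments for
    # selecting the LLM, which are consumed later by create_llm_from_args.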
generative_model_tool.create_llm_to_args(parser)
parser.add_argument(
"-v",
"--variant",
dest="variants",
action="append",
help="if specified, run only the selected variant(s)",
metavar="VARIANT",
)
parser.add_argument(
"-r",
"--revision-id",
dest="review_request_ids",
action="append",
help="if specified, run only the selected Revision ID(s)",
metavar="REVISION_ID",
type=int,
)
parser.add_argument(
"-d",
"--diff-id",
dest="diff_ids",
action="append",
help="if specified, run only the selected Diff ID(s)",
metavar="DIFF_ID",
type=int,
)
parser.add_argument(
"--evaluation-data",
dest="evaluation_dataset",
action="store",
help="the path or the URL to a evaluation dataset in CSV format",
)
parser.add_argument(
"--results-dir",
dest="results_dir",
action="store",
help="the directory to store the results and read previous results",
)
parser.add_argument(
"--evaluation-strategy",
dest="evaluation_strategy",
action="store",
help="the evaluation strategy to use",
)
args = parser.parse_args()
if args.diff_ids and args.review_request_ids:
parser.error("Please specify either --diff-id or --revision-id, not both.")
main(args)