backend/code_review_backend/issues/management/commands/load_in_patch.py (76 lines of code) (raw):
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
import logging
from multiprocessing import Pool
import requests
from django import db
from django.core.management.base import BaseCommand
from parsepatch.patch import Patch
from code_review_backend.app.settings import BACKEND_USER_AGENT
from code_review_backend.issues.models import Diff, IssueLink
# Set up basic root logging at INFO level when this module is imported
# (``basicConfig`` does nothing if the root logger already has handlers).
logging.basicConfig(level=logging.INFO)
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
def load_hgmo_patch(diff, timeout=60):
    """Download the patch behind a diff and list the lines it modifies.

    The revision metadata is fetched first: when its description starts with
    ``try_task_config``, the revision is the try-task-config commit, so the
    patch is read from its first parent instead of the revision itself.

    Args:
        diff: Object exposing ``repository.url`` and ``mercurial_hash``.
        timeout: Timeout in seconds handed to ``requests.get`` for each of
            the two HTTP requests, so a stalled server cannot block the
            caller forever.

    Returns:
        A dict mapping each filename found in the patch to the concatenation
        of its ``"touched"`` and ``"added"`` line lists.

    Raises:
        requests.HTTPError: If one of the requests gets an error status.
        AssertionError: If the parsed patch is empty.
    """
    headers = {"user-agent": BACKEND_USER_AGENT}

    # Load the parent info as we may have the try-task-config commit
    url = f"{diff.repository.url}/json-rev/{diff.mercurial_hash}"
    logging.info(f"Downloading {url}")
    resp = requests.get(url, headers=headers, timeout=timeout)
    resp.raise_for_status()
    meta = resp.json()
    if meta["desc"].startswith("try_task_config"):
        # Reuse the already decoded payload instead of parsing the body twice
        patch_rev = meta["parents"][0]
    else:
        patch_rev = diff.mercurial_hash

    # Load the raw patch of the selected revision
    url = f"{diff.repository.url}/raw-rev/{patch_rev}"
    logging.info(f"Downloading {url}")
    resp = requests.get(url, headers=headers, timeout=timeout)
    resp.raise_for_status()

    patch = Patch.parse_patch(resp.text, skip_comments=False)
    if not patch:
        # Raised explicitly (not through ``assert``) so the check is still
        # performed under ``python -O``; the exception type is kept as-is
        # for backward compatibility.
        raise AssertionError("Empty patch")

    return {
        # Use all changes in new files
        filename: changes.get("touched", []) + changes.get("added", [])
        for filename, changes in patch.items()
    }
def detect_in_patch(issue_link, lines):
    """Flag an issue link as inside or outside the patch.

    Adapted from the ``contains()`` method in the code-review bot's
    ``revisions.py``.

    Args:
        issue_link: Object exposing ``issue.path``, ``issue.line`` and
            ``issue.nb_lines``; its ``in_patch`` attribute is set in place.
        lines: Dict mapping a file path to the lines modified in that file.

    Returns:
        The same ``issue_link`` object, with ``in_patch`` updated.
    """
    issue = issue_link.issue
    touched = lines.get(issue.path)

    if touched is None:
        # The file is not part of the patch at all
        found = False
    elif issue.line is None:
        # No line means the issue applies to the full file
        found = True
    else:
        # The issue is in the patch when any line it spans was modified
        first_line = issue.line
        spanned = set(range(first_line, first_line + issue.nb_lines))
        found = bool(spanned.intersection(touched))

    issue_link.in_patch = found
    return issue_link
def process_diff(diff: Diff):
    """Compute and store the ``in_patch`` flag of every issue link of a diff.

    This function needs to be on the top level in order to be usable by the
    pool.

    The patch of the diff is downloaded, each issue link of the diff is
    checked against the modified lines, and the resulting ``in_patch``
    values are saved with a single bulk update.

    Any exception is caught and logged so that one failing diff does not
    interrupt the processing of the other diffs.

    Args:
        diff: The diff whose issue links must be processed.
    """
    try:
        lines = load_hgmo_patch(diff)
        issue_links = [
            detect_in_patch(issue_link, lines) for issue_link in diff.issue_links.all()
        ]
        nb_in_patch = sum(1 for issue_link in issue_links if issue_link.in_patch)
        logging.info(f"Found {nb_in_patch} issue link in patch for {diff.id}")
        IssueLink.objects.bulk_update(issue_links, ["in_patch"])
    except Exception as e:
        # Log at ERROR level with the traceback: at INFO level and without
        # it, failures are easy to miss and hard to diagnose.
        logging.exception(f"Failure on diff {diff.id}: {e}")
class Command(BaseCommand):
    """Management command filling ``in_patch`` on unprocessed issue links.

    Diffs having at least one issue link whose ``in_patch`` is still null
    are selected and handed to ``process_diff`` through a process pool.
    """

    help = "Detect which issue links are located in the patch of their diff"

    def add_arguments(self, parser):
        """Register the ``--nb-processes`` option (defaults to 1)."""
        parser.add_argument(
            "--nb-processes",
            type=int,
            help="Number of processes used to process the diffs",
            default=1,
        )

    def handle(self, *args, **options):
        """Select the diffs to process and run them through the pool."""
        # Only apply on diffs with issues that are not already processed
        diffs = (
            Diff.objects.filter(issue_links__in_patch__isnull=True)
            .order_by("id")
            .distinct()
        )
        # Only run the COUNT query when the message will actually be emitted
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(f"Will process {diffs.count()} diffs")

        # Close all DB connections so each process gets its own
        db.connections.close_all()

        # Process all the diffs in parallel
        with Pool(processes=options["nb_processes"]) as pool:
            pool.map(process_diff, diffs, chunksize=20)