# bot/code_review_bot/revisions.py
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
import os
import random
import time
import urllib.parse
from datetime import timedelta
from pathlib import Path
import requests
import rs_parsepatch
import structlog
from libmozdata.phabricator import PhabricatorAPI
from code_review_bot import Issue, stats, taskcluster
from code_review_bot.config import (
REPO_AUTOLAND,
REPO_MOZILLA_CENTRAL,
GetAppUserAgent,
settings,
)
from code_review_bot.tasks.base import AnalysisTask
logger = structlog.get_logger(__name__)
class ImprovementPatch:
"""
An improvement patch built by the bot
"""
def __init__(self, analyzer, patch_name, content):
assert isinstance(analyzer, AnalysisTask)
# Build name from analyzer and revision
self.analyzer = analyzer
self.name = f"{self.analyzer.name}-{patch_name}.diff"
self.content = content
self.url = None
self.path = None
def __str__(self):
return f"{self.analyzer.name}: {self.url or self.path or self.name}"
def write(self):
"""
Write patch on local FS, for dev & tests only
"""
self.path = os.path.join(settings.taskcluster.results_dir, self.name)
with open(self.path, "w") as f:
length = f.write(self.content)
logger.info("Improvement patch saved", path=self.path, length=length)
def publish(self, days_ttl=30):
"""
        Push through the Taskcluster API to set the content-type header
        so the patch displays nicely in browsers
"""
assert (
not settings.taskcluster.local
), "Only publish on online Taskcluster tasks"
self.url = taskcluster.upload_artifact(
f"public/patch/{self.name}",
self.content.encode(),
content_type="text/plain; charset=utf-8", # Displays instead of download
ttl=timedelta(days=days_ttl - 1),
)
logger.info("Improvement patch published", url=self.url)
class Revision:
"""
A Phabricator revision to analyze and report on
"""
def __init__(
self,
phabricator_id=None,
phabricator_phid=None,
diff_id=None,
diff_phid=None,
revision=None,
diff=None,
build_target_phid=None,
head_changeset=None,
base_changeset=None,
head_repository=None,
repository_try_name=None,
base_repository=None,
base_repository_conf=None,
phabricator_repository=None,
url=None,
patch=None,
):
# Identification
self.phabricator_id = phabricator_id
self.phabricator_phid = phabricator_phid
self.diff_id = diff_id
self.diff_phid = diff_phid
self.build_target_phid = build_target_phid
self.head_changeset = head_changeset
self.base_changeset = base_changeset
self.revision = revision
self.diff = diff
self.url = url
# a try repo where the revision is stored
self.head_repository = head_repository
# the name of the try repo where the revision is stored
self.repository_try_name = repository_try_name
# the target repo where the patch may land
self.base_repository = base_repository
# the target repo configuration where the patch may land
self.base_repository_conf = base_repository_conf
# the phabricator repository payload for later identification
self.phabricator_repository = phabricator_repository
        # URL returned by the backend to list or create issues linked to this revision in bulk (diff is optional)
self.issues_url = None
# Patches built later on
self.improvement_patches = []
# Patch analysis
self.patch = patch
self.files = []
self.lines = {}
@property
def namespaces(self):
return [
f"phabricator.{self.phabricator_id}",
f"phabricator.diff.{self.diff_id}",
f"phabricator.phabricator_phid.{self.phabricator_phid}",
f"phabricator.diffphid.{self.diff_phid}",
]
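    # Sample namespaces output (hypothetical identifiers):
    #   ["phabricator.12345", "phabricator.diff.67890",
    #    "phabricator.phabricator_phid.PHID-DREV-aaaa",
    #    "phabricator.diffphid.PHID-DIFF-bbbb"]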
@property
    def before_after_feature(self):
        """
        Randomly enable the before/after feature depending on a configured ratio.
        All the diffs of a revision must be analyzed with or without the feature.
        """
        if getattr(self, "id", None) is None:
            logger.debug(
                "Backend ID must be set to determine if using the before/after feature. Skipping."
            )
            return False
        # Seed the random module with the revision ID so successive calls to
        # random.random return the same value for a given revision
        random.seed(self.id)
        enabled = random.random() < taskcluster.secrets.get("BEFORE_AFTER_RATIO", 0)
        # Reset the random module seed so later callers get non-deterministic values
        random.seed(os.urandom(128))
        return enabled
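    # Determinism sketch: for a revision whose backend ID is e.g. 1234, every
    # call seeds with 1234 and compares the same pseudo-random float to the
    # ratio (0.1 is a hypothetical BEFORE_AFTER_RATIO):
    #
    #   random.seed(1234)
    #   random.random() < 0.1  # identical verdict on every call for this revision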
def __repr__(self):
if self.diff_phid:
# Most revisions have a Diff from Phabricator
            return self.diff_phid
elif self.head_changeset:
# Autoland revisions have no diff
return f"{self.head_changeset}@{self.head_repository}"
else:
# Fallback
return "Unknown revision"
def __str__(self):
return f"Phabricator #{self.diff_id} - {self.diff_phid}"
@staticmethod
def from_try_task(try_task: dict, decision_task: dict, phabricator: PhabricatorAPI):
"""
Load identifiers from Phabricator, using the remote task description
"""
# Load build target phid from the task env
code_review = try_task["extra"]["code-review"]
build_target_phid = code_review.get("phabricator-diff") or code_review.get(
"phabricator-build-target"
)
assert (
build_target_phid is not None
), "Missing phabricator-build-target or phabricator-diff declaration"
assert build_target_phid.startswith("PHID-HMBT-")
# And get the diff from the phabricator api
buildable = phabricator.find_target_buildable(build_target_phid)
diff_phid = buildable["fields"]["objectPHID"]
assert diff_phid.startswith("PHID-DIFF-")
# Load diff details to get the diff revision
# We also load the commits list in order to get the email of the author of the
# patch for sending email if builds are failing.
diffs = phabricator.search_diffs(
diff_phid=diff_phid, attachments={"commits": True}
)
assert len(diffs) == 1, f"No diff available for {diff_phid}"
diff = diffs[0]
diff_id = diff["id"]
phid = diff["revisionPHID"]
revision = phabricator.load_revision(phid)
# Load repository detailed information
repos = phabricator.request(
"diffusion.repository.search",
constraints={"phids": [revision["fields"]["repositoryPHID"]]},
)
assert len(repos["data"]) == 1, "Repository not found on Phabricator"
# Load target patch from Phabricator for Try mode
patch = phabricator.load_raw_diff(diff_id)
# The parent decision task should exist
assert decision_task is not None, "Missing parent decision task"
logger.info("Found decision task", name=decision_task["metadata"]["name"])
# Match the decision task environment to get the mercurial information
decision_env = decision_task["payload"]["env"]
        head_repository = base_repository = None
        head_changeset = base_changeset = repository_try_name = None
for prefix in settings.decision_env_prefixes:
head_repository_key = f"{prefix}_HEAD_REPOSITORY"
base_repository_key = f"{prefix}_BASE_REPOSITORY"
head_changeset_key = f"{prefix}_HEAD_REV"
base_changeset_key = f"{prefix}_BASE_REV"
if (
head_repository_key not in decision_env
or base_repository_key not in decision_env
or head_changeset_key not in decision_env
or base_changeset_key not in decision_env
):
continue
head_repository = decision_env[head_repository_key]
base_repository = decision_env[base_repository_key]
head_changeset = decision_env[head_changeset_key]
base_changeset = decision_env[base_changeset_key]
repository_try_name = (
urllib.parse.urlparse(head_repository)
.path.rstrip("/")
.rsplit("/", 1)[-1]
)
break
        # Check the mercurial information was properly retrieved
assert all(
attr is not None
for attr in [
head_repository,
base_repository,
head_changeset,
base_changeset,
]
), "Unsupported parent decision task, missing mercurial information in its environment"
logger.info(
"Using mercurial changeset",
head_changeset=head_changeset,
head_repository=head_repository,
base_repository=base_repository,
)
# Build a revision without repositories as they are retrieved later
# when analyzing the full task group
return Revision(
phabricator_id=revision["id"],
phabricator_phid=phid,
diff_id=diff_id,
diff_phid=diff_phid,
build_target_phid=build_target_phid,
revision=revision,
phabricator_repository=repos["data"][0],
diff=diff,
url="https://{}/D{}".format(phabricator.hostname, revision["id"]),
patch=patch,
head_changeset=head_changeset,
base_changeset=base_changeset,
head_repository=head_repository,
repository_try_name=repository_try_name,
base_repository=base_repository,
)
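    # Sketch of the decision task environment from_try_task expects (GECKO
    # prefix shown; the accepted prefixes come from
    # settings.decision_env_prefixes, values are hypothetical):
    #
    #   "env": {
    #       "GECKO_HEAD_REPOSITORY": "https://hg.mozilla.org/try",
    #       "GECKO_BASE_REPOSITORY": "https://hg.mozilla.org/mozilla-unified",
    #       "GECKO_HEAD_REV": "abcdef123456",
    #       "GECKO_BASE_REV": "123456abcdef",
    #   }
    #
    # repository_try_name is the last path segment of the head repository URL,
    # "try" in this example.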
@staticmethod
def from_decision_task(task: dict, phabricator: PhabricatorAPI):
"""
Build a revision from a Mozilla decision task (e.g. from Autoland or Mozilla-central).
No Phabricator reference nor diff is saved.
"""
# Load repositories
head_repository = task["payload"]["env"]["GECKO_HEAD_REPOSITORY"]
base_repository = task["payload"]["env"]["GECKO_BASE_REPOSITORY"]
assert head_repository in (
REPO_AUTOLAND,
REPO_MOZILLA_CENTRAL,
), "Decision task must be on autoland or mozilla-central"
# Load mercurial changesets
head_changeset = task["payload"]["env"]["GECKO_HEAD_REV"]
base_changeset = task["payload"]["env"]["GECKO_BASE_REV"]
return Revision(
head_changeset=head_changeset,
base_changeset=base_changeset,
head_repository=head_repository,
base_repository=base_repository,
)
@staticmethod
def from_phabricator_trigger(build_target_phid: str, phabricator: PhabricatorAPI):
assert build_target_phid.startswith("PHID-HMBT-")
# This is the very first call on Phabricator API for that build, so we need to retry
# a few times as the revision may not be immediately public
buildable = None
for i in range(5):
try:
buildable = phabricator.find_target_buildable(build_target_phid)
break
except Exception as e:
                logger.info(
                    f"Failed to load Harbormaster build on try {i + 1}/5, will retry in 30 seconds",
                    error=str(e),
                )
time.sleep(30)
if buildable is None:
raise Exception("Failed to load Habormaster build, no more tries left")
diff_phid = buildable["fields"]["objectPHID"]
assert diff_phid.startswith("PHID-DIFF-")
# Load diff details to get the diff revision
# We also load the commits list in order to get the email of the author of the
# patch for sending email if builds are failing.
diffs = phabricator.search_diffs(
diff_phid=diff_phid, attachments={"commits": True}
)
assert len(diffs) == 1, f"No diff available for {diff_phid}"
diff = diffs[0]
logger.info("Found diff", id=diff["id"], phid=diff["phid"])
revision_phid = diff["revisionPHID"]
# Load revision details from Phabricator
revision = phabricator.load_revision(revision_phid)
logger.info("Found revision", id=revision["id"], phid=revision["phid"])
# Lookup repository details and match with a known repo from configuration
repo_phid = revision["fields"]["repositoryPHID"]
repos = phabricator.request(
"diffusion.repository.search", constraints={"phids": [repo_phid]}
)
        assert (
            len(repos["data"]) == 1
        ), f"No repository found on Phabricator for {repo_phid}"
phab_repo = repos["data"][0]
repo_name = phab_repo["fields"]["name"]
known_repos = {r.name: r for r in settings.repositories}
repository = known_repos.get(repo_name)
if repository is None:
raise Exception(
f"No repository found in configuration for {repo_name} - {repo_phid}"
)
logger.info("Found repository", name=repo_name, phid=repo_phid)
return Revision(
phabricator_id=revision["id"],
phabricator_phid=revision_phid,
diff_id=diff["id"],
diff_phid=diff["phid"],
diff=diff,
build_target_phid=build_target_phid,
url="https://{}/D{}".format(phabricator.hostname, revision["id"]),
revision=revision,
base_changeset="tip",
base_repository=repository.url,
base_repository_conf=repository,
repository_try_name=repository.try_name,
)
def analyze_patch(self):
"""
Analyze loaded patch to extract modified lines
and statistics
"""
assert self.patch is not None, "Missing patch"
assert isinstance(self.patch, str), "Invalid patch type"
# List all modified lines from current revision changes
patch_stats = rs_parsepatch.get_lines(self.patch)
assert len(patch_stats) > 0, "Empty patch"
self.lines = {stat["filename"]: stat["added_lines"] for stat in patch_stats}
# Shortcut to files modified
self.files = self.lines.keys()
# Report nb of files and lines analyzed
stats.add_metric("analysis.files", len(self.files))
        stats.add_metric(
            "analysis.lines", sum(len(lines) for lines in self.lines.values())
        )
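    # Shape sketch: rs_parsepatch.get_lines yields one entry per touched file;
    # only the keys used above are shown, values hypothetical:
    #
    #   [{"filename": "dom/base/Foo.cpp", "added_lines": [10, 11, 42]}, ...]
    #
    # so self.lines maps "dom/base/Foo.cpp" -> [10, 11, 42].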
def load_file(self, path):
"""
Load a file content at current revision from remote HGMO
"""
# Check in hgmo cache first
cache_path = os.path.join(settings.hgmo_cache, path)
if Path(settings.hgmo_cache) not in Path(cache_path).resolve().parents:
logger.info("Element is not valid for caching, skipping", path=path)
return
if os.path.exists(cache_path):
with open(cache_path) as f:
return f.read()
# Retrieve remote file
url = urllib.parse.urljoin(
"https://hg.mozilla.org",
f"{self.head_repository}/raw-file/{self.head_changeset}/{path}",
)
logger.info("Downloading HGMO file", url=url)
response = requests.get(url, headers=GetAppUserAgent())
response.raise_for_status()
# Store in cache
content = response.content.decode("utf-8")
os.makedirs(os.path.dirname(cache_path), exist_ok=True)
with open(cache_path, "w") as f:
f.write(content)
return content
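    # The resolve().parents guard above rejects cache keys escaping hgmo_cache,
    # e.g. with a hypothetical cache root /cache:
    #
    #   Path("/cache/../etc/passwd").resolve()  # -> /etc/passwd, whose parents
    #                                           # do not include /cache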
def has_file(self, path):
"""
Check if the path is in this patch
"""
assert isinstance(path, str)
return path in self.files
def contains(self, issue):
"""
Check if the issue (path+lines) is in this patch
"""
assert isinstance(issue, Issue)
# Get modified lines for this issue
modified_lines = self.lines.get(issue.path)
if modified_lines is None:
return False
        # A missing line means the issue applies to the whole file
if issue.line is None:
return True
# Detect if this issue is in the patch
lines = set(range(issue.line, issue.line + issue.nb_lines))
return not lines.isdisjoint(modified_lines)
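    # Overlap sketch (hypothetical issue starting at line 10 and spanning 3
    # lines, in a file whose modified lines are [11, 30]):
    #
    #   lines = set(range(10, 10 + 3))   # {10, 11, 12}
    #   not lines.isdisjoint([11, 30])   # True: line 11 is part of the patch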
@property
def has_clang_files(self):
"""
Check if this revision has any file that might
be a C/C++ file
"""
def _is_clang(filename):
_, ext = os.path.splitext(filename)
return ext.lower() in settings.cpp_extensions
return any(_is_clang(f) for f in self.files)
@property
def has_clang_header_files(self):
"""
Check if this revision has any file that might
be a C/C++ header file
"""
def _is_clang_header(filename):
_, ext = os.path.splitext(filename)
return ext.lower() in settings.cpp_header_extensions
return any(_is_clang_header(f) for f in self.files)
@property
def has_idl_files(self):
"""
        Check if this revision has any IDL files
"""
def _is_idl(filename):
_, ext = os.path.splitext(filename)
return ext.lower() in settings.idl_extensions
return any(_is_idl(f) for f in self.files)
@property
def is_blacklisted(self):
"""Check if the revision author is in the black-list"""
author = settings.user_blacklist.get(self.revision["fields"]["authorPHID"])
if author is None:
return False
logger.info("Revision from a blacklisted user", revision=self, author=author)
return True
def add_improvement_patch(self, analyzer, content):
"""
Save an improvement patch, and make it available
as a Taskcluster artifact
"""
assert isinstance(content, str)
assert len(content) > 0
self.improvement_patches.append(ImprovementPatch(analyzer, repr(self), content))
def reset(self):
"""
Reset temporary data in BEFORE mode
* improvement patches
"""
self.improvement_patches = []
@property
def bugzilla_id(self):
if self.revision is None:
return None
try:
return int(self.revision["fields"].get("bugzilla.bug-id"))
except (TypeError, ValueError):
logger.info("No bugzilla id available for this revision")
return None
@property
def title(self):
if self.revision:
return self.revision["fields"].get("title")
if self.head_changeset is None:
return None
title = f"Changeset {self.head_changeset[:12]}"
if self.head_repository:
title += f" ({self.head_repository})"
return title
def as_dict(self):
"""
Outputs a serializable representation of this revision
"""
return {
"diff_phid": self.diff_phid,
"phid": self.phabricator_phid,
"diff_id": self.diff_id,
"id": self.phabricator_id,
"url": self.url,
"has_clang_files": self.has_clang_files,
# Extra infos for frontend
"title": self.title,
"bugzilla_id": self.bugzilla_id,
# Extra infos for backend
"repository": self.head_repository,
"target_repository": self.base_repository,
"mercurial_revision": self.head_changeset,
# New names that should be used instead of the old
# repository, target_repository and mercurial_revision ones
"head_repository": self.head_repository,
"base_repository": self.base_repository,
"head_changeset": self.head_changeset,
"base_changeset": self.base_changeset,
}