asfyaml/dataobjects.py (146 lines of code) (raw):
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import pathlib
import asfyaml.mappings as mappings
import os
import subprocess
# Fully qualified ref used as the fallback by Repository.default_branch when the
# repository directory has no HEAD file.
DEFAULT_BRANCH = "refs/heads/main"
# Placeholder branch name. NOTE(review): not referenced anywhere in this module;
# presumably consumed by importers of this module - confirm before removing.
UNKNOWN_BRANCH = "--unknown-branch--"
# Absolute path of the git executable that gitcmd() runs.
GIT_CMD = "/usr/bin/git"
# COMMIT_FIELDS represent the data points we collect from commits when iterating over the change-sets.
# Each entry is (attribute name set on Commit, git pretty-format placeholder). Commit.__init__
# joins the placeholders with NUL separators and assigns the resulting fields by position,
# so the order of this list must not be changed independently of that parsing code.
COMMIT_FIELDS = [
("commit", "%h"),  # abbreviated commit hash
("parents", "%p"),  # abbreviated parent hashes, space separated
("tree", "%t"),  # abbreviated tree hash
("author", "%aN <%ae>"),  # author name (mailmap-aware) and email
("authored", "%ad"),  # author date
("author_name", "%aN"),  # author name (mailmap-aware)
("author_email", "%ae"),  # author email
("committer", "%cN <%ce>"),  # committer name (mailmap-aware) and email
("committer_email", "%ce"),  # committer email
("committed", "%cd"),  # committer date
("committed_unix", "%ct"),  # committer date as a UNIX timestamp; converted to int by Commit
("ref_names", "%d"),  # ref names decorating the commit
("subject", "%s"),  # first line of the commit message
("body", "%B"),  # raw commit message (subject and body)
]
def gitcmd(*args):
    """Run ``GIT_CMD`` with the given arguments and return its stdout as text.

    The call is best-effort: when git exits with a non-zero status, the
    captured stderr is printed and an empty string is returned instead of
    raising. Only ``subprocess.CalledProcessError`` is handled here; any
    other error (for instance a missing git binary) propagates to the caller.
    """
    command = [GIT_CMD, *args]
    try:
        return subprocess.check_output(command, stderr=subprocess.PIPE, universal_newlines=True)
    except subprocess.CalledProcessError as err:
        # Surface git's own error message, then fall back to "no output".
        print(err.stderr)
        return ""
class Committer:
    """Simple info class for the committer (pusher) of code."""

    def __init__(self, username):
        #: str: The ASF email address of the person that pushed this commit, for instance :samp:`humbedooh@apache.org`.
        self.email = "{}@apache.org".format(username)
        #: str: The ASF user id of the person that pushed this commit, for instance :samp:`humbedooh`
        self.username = username
class Commit:
    """A single git commit, populated by parsing ``git show`` output.

    Besides ``ref``, ``sha`` and ``stats``, every name listed in
    ``COMMIT_FIELDS`` (``commit``, ``parents``, ``author``, ``subject``, ...)
    becomes a string attribute of the instance. ``committed_unix`` is converted
    to ``int``, and ``committer_uname`` / ``committer_domain`` are derived from
    ``committer_email``.

    Commits order by their committer timestamp (``committed_unix``), so a list
    of commits can be passed to ``sorted()``, ``min()`` or ``max()``. Equality
    and hashing are left at the default identity semantics.
    """

    def __init__(self, ref, sha):
        """Load the commit ``sha`` from the repository in the current working directory.

        :parameter ref: The ref update this commit belongs to; ``ChangeSet.commits``
            passes the ChangeSet itself.
        :parameter sha: The commit id handed to ``git show``.
        :raises IndexError: if git produced fewer fields than expected, for
            instance because ``gitcmd`` failed and returned an empty string.
        """
        self.ref = ref
        self.sha = sha

        # Ask git for every COMMIT_FIELDS placeholder, each one followed by a NUL
        # byte. Whatever comes after the final NUL is the --stat summary.
        fmt = "--format=format:%s%%x00" % r"%x00".join([s for _, s in COMMIT_FIELDS])
        args = ["show", "--stat=75", fmt, self.sha]
        parts = gitcmd(*args).split("\x00")

        # The last piece is the diffstat; drop its empty lines.
        self.stats = "\n".join(filter(None, parts.pop(-1).splitlines()))
        # The remaining pieces line up positionally with COMMIT_FIELDS.
        for pos, (key, _) in enumerate(COMMIT_FIELDS):
            setattr(self, key, parts[pos])

        self.committed_unix = int(self.committed_unix)

        # Split "user@domain" into its two halves; the domain is empty when the
        # address contains no "@".
        parts = self.committer_email.split("@")
        self.committer_uname = parts[0]
        if len(parts) > 1:
            self.committer_domain = parts[1]
        else:
            self.committer_domain = ""

    def __cmp__(self, other):
        """Three-way comparison on the committer timestamp: returns -1, 0 or 1.

        Python 3 never calls ``__cmp__`` implicitly; it is retained for callers
        that invoke it directly. Ordering operators are served by ``__lt__``.
        """
        return (self.committed_unix > other.committed_unix) - (self.committed_unix < other.committed_unix)

    def __lt__(self, other):
        """Order commits chronologically by committer timestamp."""
        if not isinstance(other, Commit):
            return NotImplemented
        return self.committed_unix < other.committed_unix

    @property
    def is_merge(self):
        """True if this commit has more than one parent."""
        return len(self.parents.split()) > 1

    @property
    def files(self):
        """List of file paths touched by this commit (runs ``git show --name-only``)."""
        files = gitcmd("show", "--name-only", "--format=format:", self.sha)
        return [line.strip() for line in files.splitlines() if line.strip()]

    def diff(self, fname):
        """Return this commit's diff restricted to the path ``fname``, without the commit header."""
        args = ["show", "--format=format:", self.sha, "--", fname]
        return gitcmd(*args).lstrip()
class ChangeSet:
    """One ref update from a push: the ref ``name`` moved from ``oldsha`` to ``newsha``.

    An all-zero object id on either side marks the ref as newly created
    (``oldsha``) or deleted (``newsha``).
    """

    # All-zero object ids git uses for "no object": 40 hex digits in SHA-1
    # repositories, 64 in SHA-256 repositories.
    _NULL_SHAS = ("0" * 40, "0" * 64)

    def __init__(self, name, oldsha, newsha):
        """
        :parameter name: Full ref name, for instance ``refs/heads/main``.
        :parameter oldsha: Object id the ref pointed to before the update.
        :parameter newsha: Object id the ref points to after the update.
        """
        self.name = name
        self.oldsha = oldsha
        self.newsha = newsha

    @property
    def created(self):
        """Returns True if this tag or branch was just created"""
        return self.oldsha in self._NULL_SHAS

    @property
    def deleted(self):
        """Returns True if this branch or tag was deleted"""
        return self.newsha in self._NULL_SHAS

    @property
    def is_tag(self):
        """Returns True if this ref is a tag rather than a branch"""
        return self.name.startswith("refs/tags/")

    @property
    def is_branch(self):
        """Returns True if this ref is a branch rather than a tag"""
        return self.name.startswith("refs/heads/")

    @property
    def is_rewrite(self):
        """Returns true if this is a history rewrite, i.e. the merge base differs from ``oldsha``.

        With the null-sha handling in ``merge_base`` this is False for a newly
        created ref and True for a deleted one.
        """
        return self.merge_base != self.oldsha

    @property
    def commits(self, num=None, reverse=False):
        """Yields the commits in this ref update as Commit objects.

        This is a property, so ``num`` and ``reverse`` always take their
        defaults when accessed as ``changeset.commits``; they are kept only so
        the getter's signature stays unchanged.
        """
        # Deleted refs have no commits.
        if self.deleted:
            return

        args = ["rev-list"]
        if num is not None:
            args += ["-n", str(num)]
        if reverse:
            args.append("--reverse")

        if self.created:
            # Only report commits that aren't reachable from any other branch.
            # The ref listing is needed for this case alone, so it is not
            # fetched for ordinary updates.
            for refname in gitcmd("for-each-ref", "--format=%(refname)").splitlines():
                refname = refname.strip()
                if refname != self.name and refname.startswith("refs/heads/"):
                    args.append("^%s" % refname)
            args.append(self.newsha)
        else:
            args.append("%s..%s" % (self.oldsha, self.newsha))

        for line in gitcmd(*args).splitlines():
            sha = line.strip()
            if sha:  # never hand an empty id to `git show`
                yield Commit(self, sha)

    @property
    def merge_base(self):
        """finds the best common ancestor(s) between two commits to use in a three-way merge.

        When either side of the update is the null object id, that null id is
        returned without invoking git.
        """
        for sha in (self.oldsha, self.newsha):
            if sha in self._NULL_SHAS:
                return sha
        sha = gitcmd("merge-base", self.oldsha, self.newsha)
        return sha.strip()
class Repository:
    """Simple class that holds information about the repository (and branch) being processed.

    :parameter path: The filesystem path to the .git directory for this repository.
    :parameter reflog: Ref update log for the push being processed, one
        ``oldsha newsha refname`` entry per line. Optional.
    :parameter org_id: The GitHub organization the repository belongs to.

    Example::

        import dataobjects
        repo = dataobjects.Repository("/x1/repos/asf/tomcat/tomcat9.git")
        assert repo.is_private is False, "This should not be a private repo!"
        website = f"https://{repo.hostname}.apache.org/"
    """

    def __init__(self, path, reflog="", org_id: str = "apache"):
        #: str|Pathlike: The filesystem path to this repository directory
        self.path = pathlib.Path(path)
        #: str: The name of this repository (sans the .git part), for instance :samp:`whimsy-website`.
        self.name = self.path.name.removesuffix(".git")
        #: str: Ref update log, if found. Follows standard git syntax, one entry per line
        # with: "oldsha newsha refname". Used for populating self.changesets
        self._reflog = reflog or ""
        #: str: The GitHub organization this repository belongs to, by default `apache`.
        self.org_id = org_id

    @property
    def is_private(self):
        """Set to True if the repository is a private repository, False if it is public.

        A repository counts as private when one of its path components is
        exactly ``private``.
        """
        return "private" in self.path.parts

    @property
    def project(self):
        """Returns the LDAP name of the project owning this repository, for instance httpd or openoffice"""
        match = mappings.REPO_RE.match(self.name)
        if match:
            return match.group(1)
        return "infrastructure"  # Weird repo name, default to infra owning it.

    @property
    def hostname(self):
        """Returns the hostname for the project. httpd for httpd, but whimsical for whimsy."""
        return mappings.LDAP_TO_HOSTNAME.get(self.project, self.project)

    @property
    def default_branch(self):
        """Returns the default branch for this repository.

        The name is read from the repository's ``HEAD`` file with the
        ``ref: refs/heads/`` prefix removed; when there is no such file, the
        branch named by ``DEFAULT_BRANCH`` is returned instead.
        """
        head_path = self.path / "HEAD"
        if head_path.is_file():
            # Context manager so the file handle is closed deterministically.
            with open(head_path) as head_file:
                return head_file.read().removeprefix("ref: refs/heads/").strip()
        return DEFAULT_BRANCH.removeprefix("refs/heads/")

    @property
    def changesets(self):
        """Yields a ChangeSet for each ref update seen in this push.

        Each ChangeSet can have several commits bundled. Blank lines in the
        ref update log are skipped; a non-blank line with fewer than three
        whitespace-separated fields raises ``ValueError``.
        """
        for line in self._reflog.splitlines():
            if not line.strip():
                continue  # tolerate empty or whitespace-only lines in the log
            oldsha, newsha, name = line.split(None, 2)
            yield ChangeSet(name.strip(), oldsha, newsha)