# sync/base.py

from __future__ import annotations

import json
import sys
import weakref
from collections import defaultdict
from collections.abc import Mapping

import git
import pygit2

from . import log
from . import commit as sync_commit
from .env import Environment
from .lock import MutGuard, RepoLock, mut, constructor
from .repos import pygit2_get

from typing import (Any,
                    DefaultDict,
                    Iterator,
                    Optional,
                    Set,
                    Tuple,
                    TYPE_CHECKING)

from git.refs.reference import Reference
from git.repo.base import Repo

if TYPE_CHECKING:
    from pygit2.repository import Repository
    from pygit2 import Commit as PyGit2Commit, TreeEntry
    from sync.commit import Commit
    from sync.landing import LandingSync
    from sync.lock import SyncLock
    from sync.sync import SyncPointName

# Nested index: obj_type -> subtype -> obj_id -> set of ProcessName
ProcessNameIndexData = DefaultDict[str, DefaultDict[str, DefaultDict[str, Set]]]
# (obj_type, subtype, obj_id, seq_id)
ProcessNameKey = Tuple[str, str, str, str]

env = Environment()

logger = log.get_logger(__name__)


class IdentityMap(type):
    """Metaclass for objects that implement an identity map.

    Identity mapped objects return the same object when created
    with the same input data, so that there can only be a single
    object with given properties active at a time.

    Typically one would expect the identity in the identity map to be
    determined by the full set of arguments to the constructor. However
    we have various situations where either global singletons are passed
    in via the constructor (e.g. the repos), or associated class data
    (notably the commit type). To avoid refactoring everything when
    introducing this feature, we instead define the following protocol:

    A class implementing this metaclass must define a _cache_key method
    that is called with the constructor arguments. It returns some hashable
    object that is used as a key in the instance cache. If an existing
    instance of the same class exists with the same key that is returned,
    otherwise a new instance is constructed and cached.

    A class may also define a _cache_verify property. If this method exists
    it is called after an instance is retrieved from the cache and may be
    used to check that any arguments that are not part of the constructor
    key are consistent between the provided values and the instance values.
    This is clearly a hack; in general the code should be refactored so that
    such associated values are determined based on data that forms part of
    the instance key rather than passed in explicitly."""

    # Weak values so cached instances can be garbage collected once all
    # external references are dropped.
    _cache: weakref.WeakValueDictionary = weakref.WeakValueDictionary()

    def __init__(cls, name, bases, cls_dict):
        if not hasattr(cls, "_cache_key"):
            raise ValueError("Class is missing _cache_key method")
        super().__init__(name, bases, cls_dict)

    def __call__(cls, *args, **kwargs):
        cache = IdentityMap._cache
        cache_key = cls._cache_key(*args, **kwargs)
        if cache_key is None:
            raise ValueError
        key = (cls, cache_key)
        value = cache.get(key)
        if value is None:
            value = super().__call__(*args, **kwargs)
            cache[key] = value
        if hasattr(value, "_cache_verify") and not value._cache_verify(*args, **kwargs):
            raise ValueError("Cached instance didn't match non-key arguments")
        return value


def iter_tree(pygit2_repo: Repository,
              root_path: str = "",
              rev: PyGit2Commit | None = None,
              ) -> Iterator[tuple[tuple[str, ...], TreeEntry]]:
    """Iterator over all paths in a tree.

    :param pygit2_repo: Repository to read from.
    :param root_path: Optional subtree path to restrict iteration to.
    :param rev: Commit to read the tree from; defaults to the commit the
                configured sync ref points at.
    :yields: (path components relative to root_path, tree entry) pairs.
    """
    if rev is None:
        ref_name = env.config["sync"]["ref"]
        ref = pygit2_repo.references[ref_name]
        rev_obj = ref.peel()
    else:
        rev_obj = pygit2_repo[rev.id]
    assert isinstance(rev_obj, pygit2.Commit)
    root_obj = rev_obj.tree

    if root_path:
        root_tree = root_obj[root_path]
    else:
        root_tree = root_obj

    # Depth-first walk without recursion.
    stack = []
    stack.append((root_path, root_tree))
    while stack:
        path, tree = stack.pop()
        assert isinstance(tree, pygit2.Tree)
        for item in tree:
            item_path = f"{path}/{item.name}"
            if isinstance(item, pygit2.Tree):
                stack.append((item_path, item))
            else:
                name = tuple(part
                             for part in item_path[len(root_path):].split("/")
                             if part)
                yield name, item


def iter_process_names(pygit2_repo: Repository,
                       kind: list[str] | None = None,
                       ) -> Iterator[ProcessName]:
    """Iterator over all ProcessName objects.

    :param pygit2_repo: Repository to read from.
    :param kind: Top-level tree paths to search; defaults to
                 ["sync", "try"]. (A None sentinel is used instead of a
                 mutable default argument.)
    """
    if kind is None:
        kind = ["sync", "try"]
    ref = pygit2_repo.references[env.config["sync"]["ref"]]
    root = ref.peel().tree
    stack = []
    for root_path in kind:
        try:
            tree = root[root_path]
        except KeyError:
            continue

        stack.append((root_path, tree))

    while stack:
        path, tree = stack.pop()
        for item in tree:
            item_path = f"{path}/{item.name}"
            if isinstance(item, pygit2.Tree):
                stack.append((item_path, item))
            else:
                process_name = ProcessName.from_path(item_path)
                if process_name is not None:
                    yield process_name


class ProcessNameIndex(metaclass=IdentityMap):
    """In-memory index of all ProcessName objects in a repository,
    keyed by (obj_type, subtype, obj_id). Built lazily on first query."""

    def __init__(self, repo: Repo) -> None:
        self.repo = repo
        self.pygit2_repo = pygit2_get(repo)
        self.reset()

    @classmethod
    def _cache_key(cls, repo: Repo) -> tuple[Repo]:
        return (repo,)

    def reset(self) -> None:
        # Discard any built state; the index will be rebuilt on demand.
        self._all: set[ProcessName] = set()
        self._data: ProcessNameIndexData = defaultdict(
            lambda: defaultdict(
                lambda: defaultdict(set)))
        self._built = False

    def build(self) -> None:
        for process_name in iter_process_names(self.pygit2_repo):
            self.insert(process_name)
        self._built = True

    def insert(self, process_name: ProcessName) -> None:
        self._all.add(process_name)
        self._data[
            process_name.obj_type][
                process_name.subtype][
                    process_name.obj_id].add(process_name)

    def has(self, process_name: ProcessName) -> bool:
        if not self._built:
            self.build()
        return process_name in self._all

    def get(self, obj_type: str,
            subtype: str | None = None,
            obj_id: str | None = None) -> set[ProcessName]:
        """Return all ProcessNames matching the given keys; a None key
        widens the query to everything below the preceding key."""
        if not self._built:
            self.build()
        target = self._data
        for key in [obj_type, subtype, obj_id]:
            # Check for the None sentinel before asserting the type,
            # otherwise a widened query would raise AssertionError.
            if key is None:
                break
            assert isinstance(key, str)
            target = target[key]  # type: ignore

        # Collect every ProcessName in the remaining subtree.
        rv: set[ProcessName] = set()
        stack = [target]
        while stack:
            item = stack.pop()
            if isinstance(item, set):
                rv |= item
            else:
                stack.extend(item.values())  # type: ignore
        return rv


class ProcessName(metaclass=IdentityMap):
    """Representation of a name that is used to identify a sync operation.

    This has the general form <obj type>/<subtype>/<obj_id>[/<seq_id>].

    Here <obj type> represents the type of process e.g upstream or downstream,
    <obj_id> is an identifier for the sync,
    typically either a bug number or PR number, and <seq_id> is an optional
    id to cover cases where we might have multiple processes with the same
    obj_id.
    """

    def __init__(self, obj_type: str, subtype: str, obj_id: str, seq_id: str | int) -> None:
        assert obj_type is not None
        assert subtype is not None
        assert obj_id is not None
        assert seq_id is not None
        self._obj_type = obj_type
        self._subtype = subtype
        self._obj_id = str(obj_id)
        self._seq_id = str(seq_id)

    @classmethod
    def _cache_key(cls,
                   obj_type: str,
                   subtype: str,
                   obj_id: str,
                   seq_id: str | int,
                   ) -> tuple[str, str, str, str]:
        return (obj_type, subtype, str(obj_id), str(seq_id))

    def __str__(self) -> str:
        # Dead Python 2 encode branch removed; this module is Python 3 only
        # (f-strings, __future__ annotations).
        return "%s/%s/%s/%s" % self.as_tuple()

    def key(self) -> tuple[str, str, str, str]:
        return self._cache_key(self._obj_type,
                               self._subtype,
                               self._obj_id,
                               self._seq_id)

    def path(self) -> str:
        return "%s/%s/%s/%s" % self.as_tuple()

    def __eq__(self, other: Any) -> bool:
        if self is other:
            return True
        if self.__class__ != other.__class__:
            return False
        return self.as_tuple() == other.as_tuple()

    def __hash__(self) -> int:
        return hash(self.key())

    @property
    def obj_type(self) -> str:
        return self._obj_type

    @property
    def subtype(self) -> str:
        return self._subtype

    @property
    def obj_id(self) -> str:
        return self._obj_id

    @property
    def seq_id(self) -> int:
        return int(self._seq_id)

    def as_tuple(self) -> tuple[str, str, str, int]:
        return (self.obj_type, self.subtype, self.obj_id, self.seq_id)

    @classmethod
    def from_path(cls, path: str) -> ProcessName | None:
        return cls.from_tuple(path.split("/"))

    @classmethod
    def from_tuple(cls, parts: list[str]) -> ProcessName | None:
        if parts[0] not in ["sync", "try"]:
            return None
        if len(parts) != 4:
            return None
        return cls(*parts)

    @classmethod
    def with_seq_id(cls, repo: Repo, obj_type: str, subtype: str,
                    obj_id: str) -> ProcessName:
        """Create a ProcessName with a seq_id one higher than any existing
        process with the same (obj_type, subtype, obj_id)."""
        existing = ProcessNameIndex(repo).get(obj_type, subtype, obj_id)
        last_id = -1
        for process_name in existing:
            if (process_name.seq_id is not None and
                int(process_name.seq_id) > last_id):
                last_id = process_name.seq_id
        seq_id = last_id + 1
        return cls(obj_type, subtype, obj_id, str(seq_id))


class VcsRefObject(metaclass=IdentityMap):
    """Representation of a named reference to a git object associated with a
    specific process_name.

    This is typically either a tag or a head (i.e. branch), but can be any
    git object."""

    # Subclasses set this to e.g. "heads" to control the ref namespace.
    ref_prefix: str | None = None

    def __init__(self,
                 repo: Repo,
                 name: ProcessName | SyncPointName,
                 commit_cls: type = sync_commit.Commit) -> None:
        self.repo = repo
        self.pygit2_repo = pygit2_get(repo)
        if self.get_path(name) not in self.pygit2_repo.references:
            raise ValueError("No ref found in %s with path %s" %
                             (repo.working_dir, self.get_path(name)))
        self.name = name
        self.commit_cls = commit_cls
        self._lock = None

    def as_mut(self, lock: SyncLock) -> MutGuard:
        return MutGuard(lock, self)

    @property
    def lock_key(self) -> tuple[str, str]:
        return (self.name.subtype, self.name.obj_id)

    @classmethod
    def _cache_key(cls,
                   repo: Repo,
                   process_name: ProcessName | SyncPointName,
                   commit_cls: type = sync_commit.Commit,
                   ) -> tuple[Repo, ProcessNameKey | tuple[str, str]]:
        return (repo, process_name.key())

    def _cache_verify(self,
                      repo: Repo,
                      process_name: ProcessName | SyncPointName,
                      commit_cls: type = sync_commit.Commit) -> bool:
        return commit_cls == self.commit_cls

    @classmethod
    @constructor(lambda args: (args["name"].subtype, args["name"].obj_id))
    def create(cls,
               lock: SyncLock,
               repo: Repo,
               name: ProcessName,
               obj: str,
               commit_cls: type = sync_commit.Commit,
               force: bool = False) -> VcsRefObject:
        path = cls.get_path(name)
        logger.debug("Creating ref %s" % path)
        pygit2_repo = pygit2_get(repo)
        if path in pygit2_repo.references:
            if not force:
                raise ValueError(f"Ref {path} exists")
        pygit2_repo.references.create(path,
                                      pygit2_repo.revparse_single(obj).id,
                                      force=force)
        return cls(repo, name, commit_cls)

    def __str__(self) -> str:
        return self.path

    def delete(self) -> None:
        self.pygit2_repo.references[self.path].delete()

    @classmethod
    def get_path(cls, name: ProcessName | SyncPointName) -> str:
        return f"refs/{cls.ref_prefix}/{name.path()}"

    @property
    def path(self) -> str:
        return self.get_path(self.name)

    @property
    def ref(self) -> Reference | None:
        if self.path in self.pygit2_repo.references:
            return git.Reference(self.repo, self.path)
        return None

    @property
    def commit(self) -> Commit | None:
        ref = self.ref
        if ref is not None:
            commit = self.commit_cls(self.repo, ref.commit)
            return commit
        return None

    @commit.setter  # type: ignore
    @mut()
    def commit(self, commit: Commit | str) -> None:
        if isinstance(commit, sync_commit.Commit):
            sha1 = commit.sha1
        else:
            sha1 = commit
        sha1 = self.pygit2_repo.revparse_single(sha1).id
        self.pygit2_repo.references[self.path].set_target(sha1)


class BranchRefObject(VcsRefObject):
    ref_prefix = "heads"


class CommitBuilder:
    def __init__(self,
                 repo: Repo,
                 message: str,
                 ref: str | None = None,
                 commit_cls: type = sync_commit.Commit,
                 initial_empty: bool = False) -> None:
        """Object to be used as a context manager for committing changes to
        the repo.

        This class provides low-level access to the git repository in order
        to make commits without requiring a checkout. It also enforces
        locking so that only one process may make a commit at a time.

        In order to use the object, one initialises it and then invokes it
        as a context manager e.g.

        with CommitBuilder(repo, "Some commit message" ref=ref) as
        commit_builder:
            # Now we have acquired the lock so that the commit ref points to
            # is fixed
            commit_builder.add_tree({"some/path": "Some file data"})
            commit_builder.delete(["another/path"])
        # On exiting the context, the commit is created, the ref updated to
        # point at the new commit and the lock released

        # To get the created commit we call get
        commit = commit_builder.get()

        The class may be used reentrantly. This is to support a pattern where
        a method may be called either with an existing commit_builder
        instance or create a new instance, and in either case use a with
        block.

        In order to improve the performance of the low-level access here, we
        use libgit2 to access the repository.
        """
        # Class state
        self.repo = repo
        self.pygit2_repo = pygit2_get(repo)
        self.message = message if message is not None else ""
        self.commit_cls = commit_cls
        self.initial_empty = initial_empty
        if not ref:
            self.ref = None
        else:
            self.ref = ref
        # Reentrancy count: the lock/commit only happen at depth 0.
        self._count = 0

        # State set for the life of the context manager
        self.lock = RepoLock(repo)
        self.parents: list[str] | None = None
        self.commit = None
        self.index: pygit2.Index | None = None
        self.has_changes = False

    def __enter__(self) -> CommitBuilder:
        self._count += 1
        if self._count != 1:
            return self
        self.lock.__enter__()
        # First we create an empty index
        self.index = pygit2.Index()

        if self.ref is not None:
            try:
                ref = self.pygit2_repo.lookup_reference(self.ref)
            except KeyError:
                self.parents = []
            else:
                self.parents = [ref.peel().id]
                if not self.initial_empty:
                    self.index.read_tree(ref.peel().tree)
        else:
            self.parents = []
        return self

    def __exit__(self, *args: Any, **kwargs: Any) -> None:
        self._count -= 1
        if self._count != 0:
            return

        # The lock must be released on every path out of the outermost
        # context, including the early return when there is nothing to
        # commit (the original code leaked the lock on that path).
        try:
            if not self.has_changes:
                if not self.parents:
                    return None
                sha1 = self.parents[0]
            else:
                tree_id = self.index.write_tree(self.pygit2_repo)
                sha1 = self.pygit2_repo.create_commit(
                    self.ref,
                    self.pygit2_repo.default_signature,
                    self.pygit2_repo.default_signature,
                    self.message.encode("utf8"),
                    tree_id,
                    self.parents)
        finally:
            self.lock.__exit__(*args, **kwargs)
        self.commit = self.commit_cls(self.repo, sha1)

    def add_tree(self, tree: dict[str, bytes]) -> None:
        self.has_changes = True
        for path, data in tree.items():
            blob = self.pygit2_repo.create_blob(data)
            index_entry = pygit2.IndexEntry(path, blob, pygit2.GIT_FILEMODE_BLOB)
            self.index.add(index_entry)

    def delete(self, delete: list[str]) -> None:
        self.has_changes = True
        if delete:
            for path in delete:
                self.index.remove(path)

    def get(self) -> Any | None:
        return self.commit


class ProcessData(metaclass=IdentityMap):
    """JSON data associated with a ProcessName, stored as a blob at the
    process name's path under the configured sync ref."""

    obj_type: str = ""

    def __init__(self, repo: Repo, process_name: ProcessName) -> None:
        assert process_name.obj_type == self.obj_type
        self.repo = repo
        self.pygit2_repo = pygit2_get(repo)
        self.process_name = process_name
        self.ref = git.Reference(repo, env.config["sync"]["ref"])
        self.path = self.get_path(process_name)
        self._data = self._load()
        self._lock = None
        self._updated: set[str] = set()
        self._deleted: set[str] = set()
        self._delete = False

    def __repr__(self) -> str:
        return f"<{self.__class__.__name__} {self.process_name}>"

    def __hash__(self) -> int:
        return hash(self.process_name)

    def __eq__(self, other: Any) -> bool:
        if type(self) is not type(other):
            return False
        return (self.repo == other.repo and
                self.process_name == other.process_name)

    def as_mut(self, lock: SyncLock) -> MutGuard:
        return MutGuard(lock, self)

    def exit_mut(self) -> None:
        """Flush any pending updates/deletes in a single commit, and
        refresh the on-disk indexes."""
        message = "Update %s\n\n" % self.path
        with CommitBuilder(self.repo, message=message, ref=self.ref.path) as commit:
            from . import index
            if self._delete:
                self._delete_data("Delete %s" % self.path)
                self._delete = False
            elif self._updated or self._deleted:
                message_parts = []
                if self._updated:
                    message_parts.append("Updated: {}\n".format(", ".join(self._updated)))
                if self._deleted:
                    message_parts.append("Deleted: {}\n".format(", ".join(self._deleted)))
                self._save(self._data,
                           message=" ".join(message_parts),
                           commit_builder=commit)
            self._updated = set()
            self._deleted = set()

            for idx_cls in index.indicies:
                idx = idx_cls(self.repo)
                idx.save(commit_builder=commit)

    @classmethod
    @constructor(lambda args: (args["process_name"].subtype,
                               args["process_name"].obj_id))
    def create(cls,
               lock: SyncLock,
               repo: Repo,
               process_name: ProcessName,
               data: dict[str, Any],
               message: str = "Sync data",
               commit_builder: Optional[CommitBuilder] = None) -> ProcessData:
        assert process_name.obj_type == cls.obj_type
        path = cls.get_path(process_name)
        ref = git.Reference(repo, env.config["sync"]["ref"])
        try:
            ref.commit.tree[path]
        except KeyError:
            pass
        else:
            raise ValueError(f"{cls.__name__} already exists at path {path}")

        if commit_builder is None:
            commit_builder = CommitBuilder(repo, message, ref=ref.path)
        else:
            assert commit_builder.ref == ref.path
        with commit_builder as commit:
            commit.add_tree({path: json.dumps(data).encode("utf8")})
        ProcessNameIndex(repo).insert(process_name)
        return cls(repo, process_name)

    @classmethod
    def _cache_key(cls, repo: Repo, process_name: ProcessName) -> tuple[Repo, ProcessNameKey]:
        return (repo, process_name.key())

    @classmethod
    def get_path(cls, process_name: ProcessName) -> str:
        return process_name.path()

    @classmethod
    def load_by_obj(cls,
                    repo: Repo,
                    subtype: str,
                    obj_id: int,
                    seq_id: int | None = None) -> set[ProcessData]:
        process_names = ProcessNameIndex(repo).get(cls.obj_type,
                                                   subtype,
                                                   str(obj_id))
        if seq_id is not None:
            process_names = {item for item in process_names
                             if item.seq_id == seq_id}
        return {cls(repo, process_name) for process_name in process_names}

    @classmethod
    def load_by_status(cls, repo: Repo, subtype: str, status: str) -> set[ProcessData]:
        from . import index
        process_names = index.SyncIndex(repo).get((cls.obj_type,
                                                   subtype,
                                                   status))
        rv = set()
        for process_name in process_names:
            rv.add(cls(repo, process_name))
        return rv

    def _save(self,
              data: dict[str, Any],
              message: str,
              commit_builder: CommitBuilder | None = None) -> Any | None:
        if commit_builder is None:
            commit_builder = CommitBuilder(self.repo,
                                           message=message,
                                           ref=self.ref.path)
        else:
            commit_builder.message += message
        tree = {self.path: json.dumps(data).encode("utf8")}
        with commit_builder as commit:
            commit.add_tree(tree)
        return commit.get()

    def _delete_data(self,
                     message: str,
                     commit_builder: CommitBuilder | None = None) -> None:
        if commit_builder is None:
            commit_builder = CommitBuilder(self.repo,
                                           message=message,
                                           ref=self.ref.path)
        with commit_builder as commit:
            commit.delete([self.path])

    def _load(self) -> dict[str, Any]:
        ref = self.pygit2_repo.references[self.ref.path]
        repo = self.pygit2_repo
        try:
            data = repo[repo[ref.peel().tree.id][self.path].id].data
        except KeyError:
            return {}
        return json.loads(data)

    @property
    def lock_key(self) -> tuple[str, str]:
        return (self.process_name.subtype, self.process_name.obj_id)

    def __getitem__(self, key: str) -> Any:
        return self._data[key]

    def __contains__(self, key: str) -> bool:
        return key in self._data

    def get(self, key: str, default: Any = None) -> Any:
        return self._data.get(key, default)

    def items(self) -> Iterator[tuple[str, Any]]:
        yield from self._data.items()

    @mut()
    def __setitem__(self, key: str, value: Any) -> None:
        if key not in self._data or self._data[key] != value:
            self._data[key] = value
            self._updated.add(key)

    @mut()
    def __delitem__(self, key: str) -> None:
        if key in self._data:
            del self._data[key]
            self._deleted.add(key)

    @mut()
    def delete(self) -> None:
        self._delete = True


class FrozenDict(Mapping):
    """Immutable mapping; use copy(**kwargs) to derive an updated copy."""

    def __init__(self, **kwargs: Any) -> None:
        self._data = {}
        for key, value in kwargs.items():
            self._data[key] = value

    def __getitem__(self, key: str) -> Any:
        return self._data[key]

    def __contains__(self, key: Any) -> bool:
        return key in self._data

    def copy(self, **kwargs: Any) -> FrozenDict:
        new_data = self._data.copy()
        for key, value in kwargs.items():
            new_data[key] = value
        return self.__class__(**new_data)

    def __iter__(self) -> Iterator[str]:
        yield from self._data

    def __len__(self) -> int:
        return len(self._data)

    def as_dict(self) -> dict[str, Any]:
        return self._data.copy()


class entry_point:
    """Decorator that runs the wrapped function only when the named task is
    enabled in the sync configuration."""

    def __init__(self, task):
        self.task = task

    def __call__(self, f):
        def inner(*args: Any, **kwargs: Any) -> LandingSync | None:
            logger.info(f"Called entry point {f.__module__}.{f.__name__}")
            logger.debug(f"Called args {args!r} kwargs {kwargs!r}")

            if self.task in env.config["sync"]["enabled"]:
                return f(*args, **kwargs)
            logger.debug("Skipping disabled task %s" % self.task)
            return None

        inner.__name__ = f.__name__
        inner.__doc__ = f.__doc__
        return inner