client/scripts/verify-mo.py (90 lines of code) (raw):

#!/usr/bin/env python3
"""
Verify the reproducibility of gettext machine objects (.mo) from catalogs
(.po).

Due to tool- and library-level idiosyncrasies, this happens in three stages:

1. Via polib: Overwrite metadata .mo → .po.
2. Via translate: Recompile the entire catalog .po → .mo.
3. Via diffoscope: Diff the new .mo against the old, heavily masked and
   filtered to avoid false positives from stray entries in the "fuzzy"
   and "obsolete" states.

In other words, the new .mo file should be identical (modulo stray entries) to
the original, meaning that the original .po/.mo pair differed only in their
metadata.
"""

import argparse
import os
import shlex
import subprocess
from collections.abc import Iterator
from pathlib import Path

import polib
from translate.tools.pocompile import convertmo

parser = argparse.ArgumentParser(
    """Verify the reproducibility of gettext machine objects (.mo) from catalogs (.po)."""
)
parser.add_argument(
    "locale",
    nargs="+",
    help="""one or more locale directories, each of which must contain an "LC_MESSAGES" directory""",
)
parser.add_argument(
    "--domain", default="messages", help="""the gettext domain to load (defaults to "messages")"""
)
args = parser.parse_args()


class CatalogVerifier:
    """Wrapper class for proving .mo → .po → .mo reproducibility.

    Intended to be used as a context manager: entering it chooses the path of
    the temporary "<domain>.mo.new" file, and leaving it deletes that file.
    """

    def __init__(self, path: Path, domain: str):
        """Set up the .po/.mo pair.

        Args:
            path: Locale directory containing an "LC_MESSAGES" subdirectory.
            domain: Gettext domain, i.e. the basename of the .po/.mo files.
        """

        self.path = path
        self.po = polib.pofile(str(path / "LC_MESSAGES" / f"{domain}.po"))
        self.mo = polib.mofile(str(path / "LC_MESSAGES" / f"{domain}.mo"))

    def __enter__(self) -> "CatalogVerifier":
        """Prepare to generate the new .mo file to diff."""

        # The regenerated catalog is written next to the original .mo file.
        self.mo_target = Path(f"{self.mo.fpath}.new")
        return self

    def __exit__(
        self,
        exc_type: object,
        exc_value: object,
        traceback: object,
    ) -> None:
        """Clean up the regenerated .mo file, if it was ever written.

        Returns None, so an exception raised inside the `with` body still
        propagates.
        """

        self.mo_target.unlink(missing_ok=True)

    @property
    def strays(self) -> set[str]:
        """Return the set of stray (fuzzy or obsolete) entries to mask when
        diffing this catalog.

        Each element is one line of a stray entry's string form, prefixed
        with "^" so that it anchors at the start of a diff line.
        """

        # NOTE(review): these strings are handed to diffoscope's --diff-mask,
        # which presumably treats them as regular expressions; message text is
        # not regex-escaped here.  Confirm whether that matters in practice.
        fuzzy = {
            f"^{line.replace('#| ', '')}"  # strip fuzzy marker
            for e in self.po.fuzzy_entries()
            for line in str(e).splitlines()
        }
        obsolete = {
            f"^{line.replace('#~ ', '')}"  # strip obsolete marker
            for e in self.po.obsolete_entries()
            for line in str(e).splitlines()
        }

        return fuzzy | obsolete

    def diffoscope_args(self, a: Path, b: Path, filtered: bool = True) -> Iterator[str]:
        """Build up a diffoscope invocation that (with `filtered`) removes
        false positives from the msgunfmt diff.

        Args:
            a: First file to compare.
            b: Second file to compare.
            filtered: When true, also yield the mask options and the `grep`
                pipeline stages; when false, yield only the bare command.

        Yields:
            Fragments of a shell command line, to be joined with spaces.
        """

        # The joined result is run through a shell, so the paths must be
        # quoted just like the masks below; otherwise a directory name with
        # whitespace or shell metacharacters would split or alter the command.
        yield f"diffoscope {shlex.quote(str(a))} {shlex.quote(str(b))}"

        if not filtered:
            return

        yield "--diff-mask '^$'"  # tell diffoscope to mask empty lines
        for stray in self.strays:
            yield f"--diff-mask {shlex.quote(stray)}"  # tell diffoscope to mask strays
        yield "| grep -Fv '[masked]'"  # ignore things we've masked
        yield "| grep -E '│ (-|\\+)msg(id|str)'"  # ignore context; we only care about real diffs

    def diffoscope_call(
        self, a: Path, b: Path, filtered: bool = True
    ) -> subprocess.CompletedProcess:
        """Call diffoscope and return the subprocess.CompletedProcess result
        for further processing, *without* first checking whether it was
        successful.

        Args:
            a: First file to compare.
            b: Second file to compare.
            filtered: Passed through to `diffoscope_args()`.

        Returns:
            The completed process, with stdout and stderr captured as bytes.
        """

        cmd = " ".join(self.diffoscope_args(a, b, filtered))

        # We silence Bandit and Semgrep warnings on `shell=True`
        # because we want to inherit the Python virtual environment
        # in which we're invoked.
        return subprocess.run(  # noqa: S602
            cmd,
            capture_output=True,
            env=os.environ,
            # nosemgrep: python.lang.security.audit.subprocess-shell-true.subprocess-shell-true
            shell=True,
            check=False,
        )

    def reproduce(self) -> None:
        """Overwrite metadata .mo → .po.  Then rewrite the entire file .po →
        .mo.

        Note that this saves the modified catalog back over the original .po
        file on disk; the recompiled output goes to `self.mo_target`.
        """

        self.po.metadata = self.mo.metadata
        self.po.save(self.po.fpath)

        with open(self.mo_target, "wb") as mo_target:
            # NOTE(review): the empty third argument is presumably the unused
            # template file; confirm against the translate-toolkit API.
            convertmo(self.po.fpath, mo_target, "")

    def verify(self) -> None:
        """Run diffoscope for this catalog and error if there's any unmasked
        diff.

        Raises:
            subprocess.CalledProcessError: If the unfiltered diffoscope run
                exits with a status other than 0 or 1.
            Exception: If the filtered run prints anything; the message is
                the decoded output.
        """

        original = Path(self.mo.fpath)
        regenerated = Path(self.mo_target)

        # Without filtering, diffoscope should return either 0 (no differences)
        # or 1 (differences); anything else is an error.
        test = self.diffoscope_call(original, regenerated, filtered=False)
        if test.returncode not in (0, 1):
            test.check_returncode()

        # With filtering, since diffoscope will return 1 on differences
        # (pre-filtering), and grep will return 1 on *no* differences
        # (post-filtering), we can't count on result.returncode here.
        result = self.diffoscope_call(original, regenerated)
        print(f"--> Verifying {self.path}: {result.args}")
        if result.stdout:
            raise Exception(result.stdout.decode("utf-8"))


print(f"--> Reproducing {len(args.locale)} path(s)")
for path in args.locale:
    locale_dir = Path(path).resolve()
    if not locale_dir.is_dir():
        print(f'--> Skipping "{locale_dir}"')
        continue

    with CatalogVerifier(locale_dir, args.domain) as catalog:
        catalog.reproduce()
        catalog.verify()