scripts/asciicheck.py
#!/usr/bin/env python3
"""
Utility script that takes a list of files and returns non-zero if any of them
contain non-ASCII characters other than those in the allowed list.

If --fix is used, it will attempt to replace non-ASCII characters with ASCII
equivalents.

The motivation behind this script is that characters like U+00A0 (non-breaking
space) can cause regexes not to match and can result in surprising anchor
values for headings when GitHub renders Markdown as HTML.
"""
import argparse
import sys
from pathlib import Path
"""
When --fix is used, perform the following substitutions.
"""
substitutions: dict[int, str] = {
    0x00A0: " ",  # non-breaking space
    0x2011: "-",  # non-breaking hyphen
    0x2013: "-",  # en dash
    0x2014: "-",  # em dash
    0x2018: "'",  # left single quote
    0x2019: "'",  # right single quote
    0x201C: '"',  # left double quote
    0x201D: '"',  # right double quote
    0x2026: "...",  # ellipsis
    0x202F: " ",  # narrow non-breaking space
}
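
# Sketch of the effect of --fix under the table above: a line containing an
# en dash and an ellipsis, "naive\u2013approach\u2026", would be rewritten
# as "naive-approach...".
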
"""
Unicode codepoints that are allowed in addition to ASCII.
Be conservative with this list.
Note that it is always an option to use the hex HTML representation
instead of the character itself so the source code is ASCII-only.
For example, U+2728 (sparkles) can be written as `✨`.
"""
allowed_unicode_codepoints = {
0x2728, # sparkles
}

def main() -> int:
    parser = argparse.ArgumentParser(
        description="Check for non-ASCII characters in files."
    )
    parser.add_argument(
        "--fix",
        action="store_true",
        help="Rewrite files, replacing non-ASCII characters with ASCII equivalents, where possible.",
    )
    parser.add_argument(
        "files",
        nargs="+",
        help="Files to check for non-ASCII characters.",
    )
    args = parser.parse_args()

    has_errors = False
    for filename in args.files:
        path = Path(filename)
        has_errors |= lint_utf8_ascii(path, fix=args.fix)
    return 1 if has_errors else 0

def lint_utf8_ascii(filename: Path, fix: bool) -> bool:
    """Returns True if an error was printed."""
    try:
        with open(filename, "rb") as f:
            raw = f.read()
        text = raw.decode("utf-8")
    except UnicodeDecodeError as e:
        print("UTF-8 decoding error:")
        print(f" byte offset: {e.start}")
        print(f" reason: {e.reason}")
        # Attempt to find line/column of the offending byte.
        partial = raw[: e.start]
        line = partial.count(b"\n") + 1
        # rfind() returns -1 when there is no newline, so the arithmetic
        # yields a 1-indexed column in both branches.
        col = e.start - (partial.rfind(b"\n") if b"\n" in partial else -1)
        print(f" location: line {line}, column {col}")
        return True

    errors = []
    for lineno, line in enumerate(text.splitlines(keepends=True), 1):
        for colno, char in enumerate(line, 1):
            codepoint = ord(char)
            # Newlines are kept by keepends=True; they are fine, skip them.
            if char == "\n":
                continue
            # Flag anything outside printable ASCII that is not whitelisted.
            if (
                not (0x20 <= codepoint <= 0x7E)
                and codepoint not in allowed_unicode_codepoints
            ):
                errors.append((lineno, colno, char, codepoint))

    if errors:
        for lineno, colno, char, codepoint in errors:
            safe_char = repr(char)[1:-1]  # nicely escape things like \u202f
            print(
                f"Invalid character at line {lineno}, column {colno}: U+{codepoint:04X} ({safe_char})"
            )

    if errors and fix:
        print(f"Attempting to fix {filename}...")
        num_replacements = 0
        new_contents = ""
        for char in text:
            codepoint = ord(char)
            if codepoint in substitutions:
                num_replacements += 1
                new_contents += substitutions[codepoint]
            else:
                new_contents += char
        with open(filename, "w", encoding="utf-8") as f:
            f.write(new_contents)
        # Characters without a substitution are left in place, so some
        # errors may remain after fixing.
        print(f"Fixed {num_replacements} of {len(errors)} errors in {filename}.")

    return bool(errors)
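
# Illustrative diagnostic, assuming a non-breaking space at line 3, column 7
# of some input file (the format follows the print call above):
#
#   Invalid character at line 3, column 7: U+00A0 (\xa0)
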

if __name__ == "__main__":
    sys.exit(main())