in scripts/asciicheck.py [0:0]
def lint_utf8_ascii(filename: Path, fix: bool) -> bool:
    """Lint *filename* for UTF-8 validity and disallowed non-ASCII characters.

    The file must decode as UTF-8, and every character must be printable
    ASCII (0x20-0x7E), a newline, or a member of the module-level
    ``allowed_unicode_codepoints`` set.  When ``fix`` is True and errors
    were found, characters with a known replacement in the module-level
    ``substitutions`` mapping are rewritten in place.

    Returns True if an error was printed.
    """
    try:
        with open(filename, "rb") as f:
            raw = f.read()
        text = raw.decode("utf-8")
    except UnicodeDecodeError as e:
        print("UTF-8 decoding error:")
        print(f" byte offset: {e.start}")
        print(f" reason: {e.reason}")
        # Attempt to find line/column of the offending byte.
        partial = raw[: e.start]
        line = partial.count(b"\n") + 1
        # bytes.rfind returns -1 when no newline exists, which also yields
        # the correct 1-based column for the first line — no guard needed.
        col = e.start - partial.rfind(b"\n")
        print(f" location: line {line}, column {col}")
        return True

    errors = []
    for lineno, line in enumerate(text.splitlines(keepends=True), 1):
        for colno, char in enumerate(line, 1):
            if char == "\n":
                continue
            codepoint = ord(char)
            if (
                not (0x20 <= codepoint <= 0x7E)
                and codepoint not in allowed_unicode_codepoints
            ):
                errors.append((lineno, colno, char, codepoint))

    for lineno, colno, char, codepoint in errors:
        safe_char = repr(char)[1:-1]  # nicely escape things like \u202f
        print(
            f"Invalid character at line {lineno}, column {colno}: "
            f"U+{codepoint:04X} ({safe_char})"
        )

    if errors and fix:
        # Restored {filename} placeholder (message previously printed a
        # literal "(unknown)" and the f-string had no interpolation).
        print(f"Attempting to fix {filename}...")
        num_replacements = 0
        # Build via list + join rather than repeated str concatenation,
        # which is quadratic in the worst case.
        parts: list[str] = []
        for char in text:
            replacement = substitutions.get(ord(char))
            if replacement is not None:
                num_replacements += 1
                parts.append(replacement)
            else:
                parts.append(char)
        with open(filename, "w", encoding="utf-8") as f:
            f.write("".join(parts))
        # num_replacements can differ from len(errors): only characters
        # with a known substitution are fixed.
        print(f"Fixed {num_replacements} of {len(errors)} errors in {filename}.")

    return bool(errors)