in project/paperbench/paperbench/judge/utils.py [0:0]
def sanitize_line(line: str) -> str:
    """Replace ephemeral tokens in a log line with stable placeholders.

    Lines that differ only in volatile details then compare equal, which makes
    repeated patterns easier to detect.

    Substitutions, applied in this order:

    * ISO 8601 date-times (``2025-01-28T18:47:06.1465140Z``, and also the
      naive or UTC-offset forms such as ``2025-01-28T18:47:06+00:00``)
      become ``<TIMESTAMP>``.
    * Standalone ``YYYY-MM-DD`` dates become ``<DATE>``.
    * Standalone ``HH:MM:SS`` times become ``<TIME>``.
    * A line that looks like a progress bar or transfer-rate readout is
      replaced *entirely* by ``<PROGRESS_BAR>``.
    * Dotted-quad IPv4-style addresses become ``<IP>``.
    * Runs of 8 or more hex digits become ``<HEX>``.

    Args:
        line: A single line of text.

    Returns:
        The line with the tokens above replaced by placeholders.
    """
    # ISO 8601 date-times. The first alternative is the permissive "...Z" form;
    # it is tried first so every input it matches is handled the same way as
    # before. The second alternative covers timestamps with no zone designator
    # or with a numeric UTC offset. Those would otherwise slip through
    # untouched: the <DATE>/<TIME> patterns below are anchored with \b, and
    # there is no word boundary between a digit and the "T" separator.
    line = re.sub(
        r"\d{4}-\d{2}-\d{2}T"
        r"(?:[0-9:.]+Z"
        r"|\d{2}:\d{2}(?::\d{2}(?:\.\d+)?)?(?:[+-]\d{2}:?\d{2})?)",
        "<TIMESTAMP>",
        line,
    )

    # Plain date / time strings (e.g. 2025-01-28 18:47:06 or 18:47:06).
    line = re.sub(r"\b\d{4}-\d{2}-\d{2}\b", "<DATE>", line)
    line = re.sub(r"\b\d{2}:\d{2}:\d{2}\b", "<TIME>", line)

    # TQDM-style progress bars and transfer-rate readouts: any one of these
    # markers collapses the whole line to a single placeholder.
    progress_markers = (
        r"\d+%?\|[█=]+",  # "45%|████" or "45|===="
        r"[KMG]?B/s",  # transfer rate such as "MB/s"
        r"\d+%\s*\|",  # percentage followed by a bar delimiter
        r"[▏▎▍▌▋▊▉]{2,}",  # run of partial-block bar glyphs
    )
    if any(re.search(marker, line) for marker in progress_markers):
        # The placeholder contains nothing the IP/hex masks below could match,
        # so returning here gives the same result as falling through.
        return "<PROGRESS_BAR>"

    # Dotted-quad addresses: (1-3 digits).(1-3).(1-3).(1-3)
    line = re.sub(r"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b", "<IP>", line)

    # Long hex strings (common in commit hashes, container IDs, etc.)
    line = re.sub(r"\b[0-9a-fA-F]{8,}\b", "<HEX>", line)
    return line