def sanitize_line()

in project/paperbench/paperbench/judge/utils.py [0:0]


def sanitize_line(line: str) -> str:
    """
    Replace ephemeral fragments of a log line with stable placeholders.

    Lines that differ only in timestamps, progress output, IP addresses or
    hash-like identifiers collapse to the same text, so repeated patterns can
    be detected more easily.

    Placeholders produced:
        <TIMESTAMP>     ISO8601 date-time joined by ``T`` (``Z``, numeric
                        offset, or no zone designator at all)
        <DATE>          stand-alone ``YYYY-MM-DD``
        <TIME>          stand-alone ``HH:MM:SS``
        <PROGRESS_BAR>  replaces the WHOLE line when it looks like progress
                        output (tqdm-style bar, percentage + pipe, or a
                        ``B/s`` transfer rate)
        <IP>            dotted-quad IPv4-looking tokens
        <HEX>           runs of 8+ hex digits (this also covers purely
                        decimal runs of 8+ digits)

    Args:
        line: A single line of text.

    Returns:
        The sanitized line, or exactly ``"<PROGRESS_BAR>"`` if the line was
        classified as progress output.
    """

    # Mask ISO8601 timestamps, e.g. 2025-01-28T18:47:06.1465140Z,
    # 2025-01-28T18:47:06 or 2025-01-28T18:47:06.146+00:00.
    # The zone suffix is optional: without this rule a "T"-joined timestamp
    # that lacks "Z" would slip through entirely, because the \b anchors of
    # the date/time rules below cannot match next to the letter "T".
    # The time part must end on a digit so trailing punctuation is preserved.
    line = re.sub(
        r"\d{4}-\d{2}-\d{2}T[0-9:.]*\d(?:Z|[+-]\d{2}:?\d{2})?",
        "<TIMESTAMP>",
        line,
    )

    # Mask typical date/time strings (e.g. 2025-01-28 18:47:06 or 18:47:06)
    line = re.sub(r"\b\d{4}-\d{2}-\d{2}\b", "<DATE>", line)
    line = re.sub(r"\b\d{2}:\d{2}:\d{2}\b", "<TIME>", line)

    # TQDM or other progress bars: collapse the entire line when it contains
    # a percentage/bar combination, a transfer rate, or repeated partial-block
    # progress symbols.
    progress_patterns = (
        r"\d+%?\|[█=]+",
        r"[KMG]?B/s",
        r"\d+%\s*\|",
        r"[▏▎▍▌▋▊▉]{2,}",
    )
    if any(re.search(pattern, line) for pattern in progress_patterns):
        line = "<PROGRESS_BAR>"

    # IP addresses  (1-3 digits).(1-3).(1-3).(1-3)
    line = re.sub(r"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b", "<IP>", line)

    # Mask long hex strings (common in commit hashes, container IDs, etc.)
    line = re.sub(r"\b[0-9a-fA-F]{8,}\b", "<HEX>", line)

    return line