def preprocess_text()

in src/suggest_cls_streamlit.py [0:0]


def preprocess_text(text: str) -> str:
    """
    Drop a short trailing segment (presumed to be domain/site information) from ``text``.

    The text is split wherever a run of ``|``, ``–`` or ``-`` characters has whitespace
    on both sides, e.g. ``"Example - Domain"`` or ``"Example | Domain"``. A hyphen inside
    a word (``"well-known"``) is not a delimiter. When the final segment is short and the
    segments before it carry enough text, only those leading segments are kept.

    Preprocessing is skipped entirely when the module-level ``PREPROCESS`` flag (defined
    outside this function) is falsy.

    Args:
        text (str): The text to be processed.

    Returns:
        str: The leading segments, each stripped and joined by single spaces, when the
        trailing segment was removed; otherwise ``text`` exactly as it was passed in.
    """
    if not PREPROCESS:
        return text

    # re.split always yields at least one item, so the last segment can be peeled off
    # unconditionally; `leading` is empty when no delimiter was found.
    *leading, tail = re.split(r"(?<=\s)[|–-]+(?=\s)", text)

    # No delimiter in the text: there is nothing that could be a domain suffix.
    if not leading:
        return text

    # The leading part must be longer than 5 characters. The length is measured on the
    # raw segments joined by single spaces, i.e. surrounding whitespace is counted.
    if len(" ".join(leading)) <= 5:
        return text

    # The trailing part only qualifies as domain info when it is shorter than
    # 20 characters (again measured on the raw, unstripped segment).
    if len(tail) >= 20:
        return text

    # Trim every kept segment, discard the ones that were only whitespace, and rejoin.
    return " ".join(filter(None, map(str.strip, leading)))