in src/suggest_cls_streamlit.py [0:0]
def preprocess_text(text: str) -> str:
    """Drop a short trailing chunk (assumed to be domain-related) from *text*.

    The text is split on runs of the characters ``|``, ``–`` (en dash) or
    ``-`` that have whitespace on both sides, e.g. ``"Example - Domain"`` or
    ``"Example | Domain"``. Hyphens inside words (``"state-of-the-art"``) are
    not delimiters because they are not surrounded by whitespace.

    The last chunk is removed only when both of the following hold, with
    lengths measured on the whitespace-stripped text:

    * the remaining chunks, joined by single spaces, are longer than
      5 characters, and
    * the last chunk is shorter than 20 characters.

    When the chunk is removed, the delimiters between the remaining chunks
    are dropped as well: ``"A title | Section | site"`` becomes
    ``"A title Section"``.

    Processing is skipped entirely when the ``PREPROCESS`` flag (defined
    outside this function) is falsy.

    Args:
        text (str): The text to be processed.

    Returns:
        str: The cleaned text, or the original text unchanged if the
        conditions are not met or preprocessing is disabled.
    """
    if not PREPROCESS:
        return text

    # The trailing "-" in the character class keeps the hyphen literal.
    # Lookarounds require whitespace on both sides without consuming it.
    delimiter = r"(?<=\s)[|–-]+(?=\s)"

    # Strip once up front so the length checks below measure real content
    # instead of the padding that surrounded each delimiter.
    chunks = [chunk.strip() for chunk in re.split(delimiter, text)]
    if len(chunks) < 2:
        # No delimiter found: there is no trailing chunk to remove.
        return text

    *leading, tail = chunks
    # Empty chunks arise from adjacent delimiters such as "a - | b".
    head = " ".join(chunk for chunk in leading if chunk)

    min_head_chars = 5   # head must be strictly longer than this
    max_tail_chars = 20  # tail must be strictly shorter than this

    # Validate the exact string that would be returned, so a result that is
    # too short to be informative is never produced.
    if len(head) > min_head_chars and len(tail) < max_tail_chars:
        return head

    # Otherwise leave the input untouched.
    return text