in src/embeddings_streamlit.py [0:0]
def preprocess(text):
    """Lowercase *text*, dropping a short trailing delimited segment.

    The text is split on ``|``, en dash (``–``) and hyphen (``-``). The last
    segment is dropped when it is shorter than 20 characters and more than
    5 characters of the input lie beyond the first segment. Presumably the
    dropped segment is a site/domain suffix such as ``"Title | Site"`` --
    confirm against callers.

    Args:
        text: The string to normalize.

    Returns:
        The lowercased text. When a trailing segment is dropped, the remaining
        segments are trimmed and joined with single spaces. If nothing would
        remain, the whole input is lowercased and returned instead.
    """
    segments = re.split(r"[|–-]", text)

    # re.split always yields at least one element, so segments[0] is safe.
    chars_after_first = len(text) - len(segments[0])
    has_short_suffix = len(segments) > 1 and len(segments[-1]) < 20

    if chars_after_first > 5 and has_short_suffix:
        # Trim each piece so a removed delimiter does not leave a run of
        # spaces behind, and skip pieces that are empty or whitespace-only.
        trimmed = (segment.strip() for segment in segments[:-1])
        kept = " ".join(piece for piece in trimmed if piece)
        # Input such as "- headline" has nothing before the last segment;
        # fall through rather than returning an empty string.
        if kept:
            return kept.lower()

    return text.lower()