# Source: src/suggest_cls_streamlit.py [0:0]
def preprocess_url(url):
    """Flatten a URL into a space-separated string of its components.

    When the module-level ``PREPROCESS`` flag is falsy, ``url`` is returned
    unchanged. Otherwise the result is assembled, in this order, from:

    * the network location with a leading ``www.`` removed,
    * the path with empty segments dropped (no leading, trailing or
      doubled slashes),
    * one ``key=value`` token per query value, skipping the tracking
      parameters ``utm_source``, ``utm_medium``, ``utm_campaign`` and
      ``ref``,
    * the fragment.

    Empty components are omitted and the scheme is discarded. Finally every
    occurrence of ``.html`` is removed from the joined text.

    Args:
        url: The URL string to preprocess.

    Returns:
        The flattened string, or ``url`` itself when preprocessing is
        disabled.
    """
    if not PREPROCESS:
        return url

    parsed = urlparse(url)
    tracking_keys = {"utm_source", "utm_medium", "utm_campaign", "ref"}

    # parse_qs maps each key to a list of values, e.g. {"id": ["123"]};
    # emit one "id=123" token per value. With its default arguments,
    # parse_qs leaves out parameters whose value is blank.
    query_items = [
        f"{key}={value}"
        for key, values in parse_qs(parsed.query).items()
        if key not in tracking_keys
        for value in values
    ]

    # Strip "www." only when it is the leading label. A blanket
    # str.replace would also mangle hosts that merely contain the text,
    # e.g. "awww.example.com" or "shop.www.example.com".
    domain = parsed.netloc
    if domain.startswith("www."):
        domain = domain[len("www."):]

    path_str = "/".join(segment for segment in parsed.path.split("/") if segment)

    parts = [piece for piece in (domain, path_str) if piece]
    parts.extend(query_items)
    if parsed.fragment:
        parts.append(parsed.fragment)

    # NOTE: this removes ".html" from the whole joined string, so it also
    # affects query values and fragments, not just the path.
    return " ".join(parts).replace(".html", "")