def preprocess_url()

in src/suggest_cls_streamlit.py [0:0]


def preprocess_url(url):
    """Flatten a URL into a space-separated string of its components.

    When the module-level ``PREPROCESS`` flag is falsy the URL is returned
    unchanged. Otherwise the result is assembled, in this order, from:

    * the network location with a leading ``www.`` host prefix removed,
    * the path with empty segments dropped (no leading, trailing or
      doubled slashes),
    * one ``key=value`` token per remaining query value, after discarding
      the tracking parameters ``utm_source``, ``utm_medium``,
      ``utm_campaign`` and ``ref``,
    * the fragment, if present.

    Finally, every occurrence of ``.html`` is removed from the joined
    string.

    Args:
        url: The URL string to normalise.

    Returns:
        The flattened string, or ``url`` itself when preprocessing is
        disabled.
    """
    if not PREPROCESS:
        return url

    parsed = urlparse(url)
    tracking_keys = {"utm_source", "utm_medium", "utm_campaign", "ref"}

    # Flatten query parameters like {"id": ["123"]} => ["id=123"].
    # Note: parse_qs percent-decodes keys/values and, with its default
    # settings, omits parameters whose value is blank.
    query_items = [
        f"{key}={value}"
        for key, values in parse_qs(parsed.query).items()
        if key not in tracking_keys
        for value in values
    ]

    # Strip "www." only where it prefixes the host. A blanket str.replace
    # would also mangle hosts that merely contain the substring, e.g.
    # "awww.example.com" -> "aexample.com". Any "user:pass@" userinfo is
    # kept in front of the host, as before.
    userinfo, at_sign, host = parsed.netloc.rpartition("@")
    if host.startswith("www."):
        host = host[len("www."):]
    domain = f"{userinfo}{at_sign}{host}"

    path_str = "/".join(segment for segment in parsed.path.split("/") if segment)

    parts = [piece for piece in (domain, path_str) if piece]
    parts.extend(query_items)
    if parsed.fragment:
        parts.append(parsed.fragment)

    # The ".html" removal deliberately stays global (it applies to the whole
    # joined string, not just the path) to keep the existing output format.
    return " ".join(parts).replace(".html", "")