def find_domain()

in src/jobs/util/labeled_data_utils.py [0:0]


def find_domain(article_url, top_domain=True):
    """
    Returns a domain name from the article URL.

    A leading ``http://`` or ``https://`` scheme is removed, the host is cut
    at the first path, query, or fragment delimiter, a trailing port is
    dropped, and a leading ``www`` label is stripped.

    Args:
        article_url: URL (or bare host) of the article.
        top_domain: When True (default), reduce the host to its registrable
            part using a heuristic: keep the last three labels if the final
            label has two characters (likely a country code, e.g.
            ``bbc.co.uk``), otherwise keep the last two. When False, return
            the full host minus any leading ``www``.

    Returns:
        The domain as a dot-joined string; an empty string when nothing is
        left after stripping (e.g. empty input or a bare ``www`` host).
    """
    # Strip only a leading scheme; slicing (unlike str.replace) cannot touch
    # later occurrences of the same text.
    for scheme in ("https://", "http://"):
        if article_url.startswith(scheme):
            article_url = article_url[len(scheme):]
            break

    # The host ends at the first path, query, or fragment delimiter, so a
    # URL such as "example.com?x=1" does not leak its query into the result.
    host = article_url
    for delimiter in ("/", "?", "#"):
        host = host.split(delimiter, 1)[0]

    # Drop a trailing ":port" so it neither ends up in the result nor
    # defeats the two-character country-code check below.
    name, colon, port = host.rpartition(":")
    if colon and (not port or port.isdigit()):
        host = name

    domain_items = host.split(".")
    if domain_items[0] == "www":
        domain_items = domain_items[1:]
    if not top_domain:
        return ".".join(domain_items)

    # Test the length first: domain_items is empty for a bare "www" host and
    # indexing [-1] on it would raise IndexError.
    if len(domain_items) <= 2 or len(domain_items[-1]) == 2:
        # Short host, or a 2-char final label that is likely a country
        # code -- keep up to the last three labels.
        return ".".join(domain_items[-3:])
    return ".".join(domain_items[-2:])