in src/jobs/util/labeled_data_utils.py [0:0]
def find_domain(article_url: str, top_domain: bool = True) -> str:
    """
    Return the domain name of an article URL.

    The host is isolated by removing a leading ``http://`` / ``https://``
    scheme (matched case-insensitively), anything from the first ``/``,
    ``?`` or ``#`` onwards, any ``user:password@`` prefix and any trailing
    ``:port``. A leading ``www`` label is then dropped. The case of the
    host itself is left untouched.

    Args:
        article_url: URL (or bare host) of the article.
        top_domain: When False, return the full host (minus ``www``).
            When True, reduce the host to its registrable-looking part
            using the heuristic described below.

    Returns:
        The domain as a dot-joined string; an empty string when no host
        labels remain (e.g. the input is ``""`` or just ``"www"``).

    The ``top_domain`` heuristic keeps the last three labels when the final
    label is two characters long (likely a country code such as ``co.uk``)
    and the last two labels otherwise. It is only a heuristic: a host like
    ``news.example.de`` is returned whole.
    """
    # Strip only a *leading* scheme; str.replace would also rewrite URLs
    # embedded later in the string (e.g. inside a query parameter).
    for scheme in ("https://", "http://"):
        if article_url[: len(scheme)].lower() == scheme:
            article_url = article_url[len(scheme):]
            break

    # The authority ends at the first path, query or fragment delimiter.
    host = article_url
    for delimiter in ("/", "?", "#"):
        host = host.partition(delimiter)[0]

    # Drop "user:password@" credentials, if present.
    host = host.rpartition("@")[2]

    # Drop a trailing ":port" (numeric or empty) so it is not mistaken for
    # part of the last domain label.
    name, colon, port = host.rpartition(":")
    if colon and (not port or port.isdigit()):
        host = name

    labels = host.split(".")
    if labels[0] == "www":
        labels = labels[1:]

    if not top_domain:
        return ".".join(labels)

    # The length check comes first so an empty label list (host was exactly
    # "www") short-circuits instead of raising IndexError on labels[-1].
    if len(labels) <= 2 or len(labels[-1]) == 2:
        # Short host, or a 2-char suffix that is likely a country domain:
        # keep up to three labels.
        return ".".join(labels[-3:])
    return ".".join(labels[-2:])