in tools/unsupervised_dataset/sql_crawler/extractor.py [0:0]
def extract_links(html):
    """Extracts links from the HTML content of a site.

    Args:
        html: The HTML response object (e.g. a requests.Response) providing
            `.text` (the document body) and `.url` (the base URL used to
            resolve relative links).

    Returns:
        A set of absolute URL strings with any `#fragment` stripped.
        Links pointing at github.com are excluded. Returns an empty set
        if the document cannot be parsed.
    """
    try:
        content = bs4.BeautifulSoup(html.text, "html.parser")
    except Exception:
        # Parsing failed: report the offending URL and bail out with no
        # links. (Previously execution fell through to use an undefined
        # `content`, raising NameError.)
        print(html.url)
        return set()

    links = set()
    for link in content.find_all("a"):
        if not link.has_attr('href'):
            continue
        # Resolve relative paths and same-page anchors against the page URL.
        absolute_path = urllib.parse.urljoin(html.url, link['href'])
        if "github.com" in absolute_path:
            continue
        # Strip the fragment so anchors within one page deduplicate to a
        # single URL; a no-op when no "#" is present.
        links.add(absolute_path.split("#", 1)[0])
    return links