def extract_links()

in tools/unsupervised_dataset/sql_crawler/extractor.py [0:0]


def extract_links(html):
    """Extract the outgoing links from the HTML content of a page.

    Every ``<a>`` tag carrying an ``href`` attribute is resolved against
    ``html.url`` so relative paths become absolute URLs. Any ``#fragment``
    suffix is dropped so anchor links into the same page collapse onto a
    single URL. URLs containing ``github.com`` are skipped.

    Args:
        html: The HTTP response object. Its ``text`` attribute is parsed as
            HTML and its ``url`` attribute is used as the base for
            resolving relative links.

    Returns:
        A set of absolute URLs (strings) with fragments removed. The set is
        empty when the page has no usable links or could not be parsed.
    """
    try:
        content = bs4.BeautifulSoup(html.text, "html.parser")
    except Exception:
        # Best-effort: report the page that failed and give the caller an
        # empty result instead of falling through to an unbound `content`.
        print(html.url)
        return set()

    links = set()
    for link in content.find_all("a"):
        if not link.has_attr("href"):
            continue

        # Fix relative paths by resolving them against the page URL.
        try:
            absolute_path = urllib.parse.urljoin(html.url, link["href"])
        except ValueError:
            # A malformed href (e.g. unmatched IPv6 brackets) should not
            # discard every other link on the page.
            continue

        if "github.com" in absolute_path:
            continue

        # Strip anchor fragments; partition() returns the whole string
        # unchanged when there is no "#".
        links.add(absolute_path.partition("#")[0])

    return links