tools/unsupervised_dataset/sql_crawler/extractor.py:

""" Contains the logic to extract links and SQL queries from HTML content. This file contains functions to extract links or queries when given an HTML response. It also dynamically extracts links based on the URL of the response. """ import urllib import bs4 import re import sql_crawler.extraction_modules.generic_extraction_module as generic_extraction import sql_crawler.extraction_modules.google_extraction_module as google_extraction GOOGLE_CLOUD = "cloud.google.com" def extract_links(html): """ Extracts links from HTML content of a site. Args: html: The HTML response which contains the HTML text. Returns: A list of URLs (strings). """ try: content = bs4.BeautifulSoup(html.text, "html.parser") except Exception as e: print(html.url) link_tags = content.find_all("a") links = set([]) for link in link_tags: if link.has_attr('href'): # Fix relative paths and anchor links absolute_path = urllib.parse.urljoin(html.url, link['href']) if "github.com" in absolute_path: continue if "#" in absolute_path: trimmed = absolute_path.split("#", 1)[0] links.add(trimmed) else: links.add(absolute_path) return links def extract_queries(html): """ Extracts queries from HTML content of a site. Args: html: The HTML response which contains the HTML text. Returns: A list of queries (strings) """ extractor_module = retrieve_module(html.url) found_queries = extractor_module.find_queries(html) cleaned_queries = [re.sub("\s+", " ", query) for query in found_queries] return cleaned_queries def retrieve_module(url): """ Retrieves the correct module to use for extracting queries from a specific site. If there is no module for pages under this domain, it returns a generic module. Args: url: The URL for the site being crawled. Returns: A extraction module, which contains a findQueries function for extracting queries. """ if GOOGLE_CLOUD in url: return google_extraction.GoogleExtractionModule else: return generic_extraction.GenericExtractionModule