tools/unsupervised_dataset/sql_crawler/crawler.py

import queue

import requests

from sql_crawler import cq_node
from sql_crawler import extractor
from sql_crawler import crawler_log


class Crawler(object):
    """ Contains the functions and logic to run and coordinate the crawling
    process. Given initial starting URLs and a maximum size, the crawler will
    explore websites to look for SQL queries.
    """

    def __init__(self, links, max_depth=3, max_size=100, gcs=None, bq=None,
                 stream=False):
        """ Initializes the crawler and instance variables.

        Args:
            links: The root URLs to begin crawling from. Can be one or more.
            max_depth: The maximum depth for the crawler to explore.
            max_size: The maximum number of links the crawler should explore.
            gcs: Optional location for storing results in Google Cloud Storage.
            bq: Optional location for storing results in Google BigQuery.
            stream: Whether the crawler log should stream results as they
                are found.
        """
        self.link_queue = queue.Queue()
        self.seen = set()
        self.max_depth = max_depth
        self.max_size = max_size
        self.log = crawler_log.CrawlerLog(stream)
        if gcs:
            self.log.set_gcs(gcs)
        if bq:
            self.log.set_bq(bq)
        self.count = 0
        for link in links:
            self.link_queue.put(cq_node.CQNode(link, 0))
            self.seen.add(link)

    def crawl(self):
        """ Begins the crawling process using variables set earlier. Extracts
        queries by locating website-specific HTML tags or searching for common
        expression patterns. Writes queries to output after finishing each
        site.
        """
        while not self.link_queue.empty():
            # Retrieve the next link in the queue.
            next_node = self.link_queue.get()
            node_url = next_node.get_url()
            node_depth = next_node.get_depth()

            # Check if the crawler has exceeded the maximum depth or the
            # maximum number of pages.
            if node_depth >= self.max_depth or self.count >= self.max_size:
                self.log.close()
                return

            html_response = self.get_html(node_url)
            if html_response is None:
                continue

            # Queue every outgoing link, then extract and log any queries
            # found on the current page.
            links = extractor.extract_links(html_response)
            for link in links:
                self.add_new_link(link, node_depth)

            queries = extractor.extract_queries(html_response)
            if queries:
                self.log.log_queries(queries, node_url)

            self.log.log_page(node_url, len(queries))
            self.count += 1

        self.log.close()

    def add_new_link(self, link, parent_depth):
        """ Adds a new link to the queue with increased depth. Checks for
        duplicates against the seen set, and does not add the link if it has
        been seen before.

        Args:
            link: The link to be added to the queue.
            parent_depth: The depth of the parent link. The child will be
                added to the queue with parent_depth + 1.
        """
        if link in self.seen:
            return

        self.link_queue.put(cq_node.CQNode(link, parent_depth + 1))
        self.seen.add(link)

    def get_html(self, url):
        """ Fetches HTML content for a webpage. Logs an error and moves on
        if there is an exception.

        Args:
            url: The URL of the webpage being requested.

        Returns:
            The HTTP response containing the page HTML, or None if there is
            an exception.
        """
        try:
            req = requests.get(url)
            req.raise_for_status()
            return req
        except requests.exceptions.RequestException as err:
            self.log.log_error(str(err))
            return None
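

# Usage sketch (illustrative addition, not part of the original module):
# assumes the sql_crawler package is importable as laid out above and that
# CrawlerLog writes results locally when no GCS or BigQuery destination is
# configured. The seed URL below is a placeholder, not a real crawl target.
if __name__ == "__main__":
    seed_links = ["https://example.com/sql-docs"]  # hypothetical seed URL
    crawler = Crawler(seed_links, max_depth=2, max_size=25)
    crawler.crawl()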