in tools/unsupervised_dataset/sql_crawler/crawler.py
def crawl(self):
    """Begins the crawling process using the configuration set during
    initialization. Extracts queries by locating website-specific HTML
    tags or by searching for common regular-expression patterns, and
    writes the queries to output after finishing each site.
    """
while not self.link_queue.empty():
# Retrieve the next link in the queue
next_node = self.link_queue.get()
node_url = next_node.get_url()
node_depth = next_node.get_depth()
        # Stop once the maximum crawl depth or maximum page count is reached
if node_depth >= self.max_depth or self.count >= self.max_size:
self.log.close()
return
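        # Fetch the page contents; skip this URL if the request failed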
html_response = self.get_html(node_url)
if html_response is None:
continue
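        # Extract outgoing links and add them to the crawl queue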
links = extractor.extract_links(html_response)
for link in links:
self.add_new_link(link, node_depth)
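        # Extract any SQL queries on the page, then log them and the page stats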
queries = extractor.extract_queries(html_response)
if queries:
self.log.log_queries(queries, node_url)
self.log.log_page(node_url, len(queries))
self.count += 1
self.log.close()
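
For context, a minimal driver sketch follows. The constructor parameters and
the import path shown here are assumptions for illustration; only the crawl()
loop above comes from the source.

    # Hypothetical driver for the crawler above; the Crawler constructor
    # signature and module path are assumed, not taken from crawler.py.
    from sql_crawler.crawler import Crawler  # assumed import path

    crawler = Crawler(
        urls=["https://example.com/sql-tutorials"],  # assumed seed list
        max_depth=3,    # bound checked against node_depth in crawl()
        max_size=1000,  # bound checked against self.count in crawl()
    )
    crawler.crawl()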
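
The docstring above mentions extracting queries via website-specific HTML tags
or common regular-expression patterns. A minimal sketch of such an extractor,
assuming BeautifulSoup for tag lookup, is shown below; the <pre>/<code> tag
names and the SELECT-only pattern are illustrative stand-ins for whatever rules
the project's extractor module actually uses.

    # Sketch of a tags-plus-regex query extractor; the tag names and the
    # pattern are assumptions, not the project's verified extraction rules.
    import re
    from bs4 import BeautifulSoup

    QUERY_PATTERN = re.compile(r"SELECT\s.+?\sFROM\s\S+",
                               re.IGNORECASE | re.DOTALL)

    def extract_queries(html_text):
        soup = BeautifulSoup(html_text, "html.parser")
        # Look in code-like tags first, where sites commonly embed queries
        blocks = soup.find_all(["pre", "code"])
        queries = []
        for block in blocks:
            queries.extend(QUERY_PATTERN.findall(block.get_text()))
        return queries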