def crawl()

in tools/unsupervised_dataset/sql_crawler/crawler.py


    def crawl(self):
        """ Begins the crawling process using variables set earlier. Extracts
            queries by locating website-specific HTML tags or searching for
            common expression patterns. Writes queries to output after
            finishing each site.
        """

        while not self.link_queue.empty():
            # Retrieve the next link in the queue
            next_node = self.link_queue.get()
            node_url = next_node.get_url()
            node_depth = next_node.get_depth()

            # Stop once the crawler reaches the maximum depth or the
            # maximum page count
            if node_depth >= self.max_depth or self.count >= self.max_size:
                self.log.close()
                return

            # Fetch the page, skipping it if the request failed
            html_response = self.get_html(node_url)
            if html_response is None:
                continue

            # Queue any outbound links found on the page
            links = extractor.extract_links(html_response)
            for link in links:
                self.add_new_link(link, node_depth)

            # Extract any queries on the page and log them
            queries = extractor.extract_queries(html_response)
            if queries:
                self.log.log_queries(queries, node_url)

            # Record the visit and count the page toward max_size
            self.log.log_page(node_url, len(queries))
            self.count += 1

        self.log.close()
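
For context, here is a minimal usage sketch. The constructor arguments are
assumptions inferred from the attributes crawl() reads (link_queue, max_depth,
max_size, log); check crawler.py for the actual Crawler signature.

    # Hypothetical usage sketch; the constructor parameters below are
    # assumptions, not the documented API of Crawler.
    from tools.unsupervised_dataset.sql_crawler.crawler import Crawler

    crawler = Crawler(
        urls=["https://example.com/sql-tutorial"],  # assumed seed-URL list
        max_depth=3,     # stop expanding links deeper than this
        max_size=1000,   # stop after visiting this many pages
    )
    crawler.crawl()  # runs until the queue is empty or a limit is hit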