tools/unsupervised_dataset/run_crawler.py:

""" Script to initialize the SQL crawler on a website of the user's choice """ import sys import argparse import logging from sql_crawler import crawler def start_crawler(): """ Parses command-line args and starts the crawler. """ parser = argparse.ArgumentParser(description="SQL Web Crawler") parser.add_argument("urls", help="A space-separated list of URLs to be crawled", nargs='+') parser.add_argument("--max_depth", help="The max depth of the crawler (default=3)", type=int, default=3) parser.add_argument("--max_size", help="The maximum number of links to be crawled (default=100)", type=int, default=100) parser.add_argument("--cloud_storage", help="Project and bucket to store in GCS. Formatted as project_id.bucket (default=None)", default=None) parser.add_argument("--bigquery", help="Project and dataset to store in BQ. Formatted as project_id.dataset (default=None)", default=None) parser.add_argument("--stream", help="Only stream data instead of saving locally. Simply put '--stream' to set this; no variable required afterward. Requires --bigquery variable to be set as well", action='store_true', default=False) args = parser.parse_args() if args.stream and args.bigquery is None: logging.error("Need to specify BigQuery table if streaming data") return new_crawler = crawler.Crawler(args.urls, max_size=args.max_size, max_depth=args.max_depth, gcs=args.cloud_storage, bq=args.bigquery, stream=args.stream) new_crawler.crawl() def main(): start_crawler() if __name__ == "__main__": main()