in lib/ramble/ramble/util/web.py [0:0]
def spider(root_urls, depth=0, concurrency=32):
"""Get web pages from root URLs.
If depth is specified (e.g., depth=2), then this will also follow
up to <depth> levels of links from each root.
Args:
root_urls (str | list): root urls used as a starting point
for spidering
depth (int): level of recursion into links
concurrency (int): number of simultaneous requests that can be sent
Returns:
A dict of pages visited (URL) mapped to their full text and the
set of visited links.
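
    Example (a minimal sketch; the URL below is only a placeholder)::

        pages, links = spider("https://example.com/downloads", depth=1)
        for url, text in pages.items():
            print(url, len(text))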
"""
# Cache of visited links, meant to be captured by the closure below
_visited = set()
def _spider(url, collect_nested):
"""Fetches URL and any pages it links to.
Prints out a warning only if the root can't be fetched; it ignores
errors with pages that the root links to.
Args:
url (str): url being fetched and searched for links
collect_nested (bool): whether we want to collect arguments
for nested spidering on the links found in this url
Returns:
A tuple of:
- pages: dict of pages visited (URL) mapped to their full text.
- links: set of links encountered while visiting the pages.
- spider_args: argument for subsequent call to spider
"""
pages = {} # dict from page URL -> text content.
links = set() # set of all links seen on visited pages.
        subcalls = []  # (url,) argument tuples for links to spider at the next depth.
try:
response_url, _, response = read_from_url(url, "text/html")
if not response_url or not response:
return pages, links, subcalls
page = codecs.getreader("utf-8")(response).read()
pages[response_url] = page
# Parse out the links in the page
link_parser = LinkParser()
link_parser.feed(page)
while link_parser.links:
raw_link = link_parser.links.pop()
abs_link = url_util.join(response_url, raw_link.strip(), resolve_href=True)
links.add(abs_link)
# Skip stuff that looks like an archive
if any(raw_link.endswith(s) for s in ALLOWED_ARCHIVE_TYPES):
continue
# Skip already-visited links
if abs_link in _visited:
continue
# If we're not at max depth, follow links.
if collect_nested:
subcalls.append((abs_link,))
_visited.add(abs_link)
except URLError as e:
logger.debug(str(e))
if hasattr(e, "reason") and isinstance(e.reason, ssl.SSLError):
logger.warn(
"Ramble was unable to fetch url list due to a "
"certificate verification problem. You can try "
"running ramble -k, which will not check SSL "
"certificates. Use this at your own risk."
)
except HTMLParseError as e:
# This error indicates that Python's HTML parser sucks.
msg = "Got an error parsing HTML."
# Pre-2.7.3 Pythons in particular have rather prickly HTML parsing.
if sys.version_info[:3] < (2, 7, 3):
msg += " Use Python 2.7.3 or newer for better HTML parsing."
logger.warn(msg, url, "HTMLParseError: " + str(e))
except Exception as e:
# Other types of errors are completely ignored,
# except in debug mode
logger.debug(f"Error in _spider: {type(e)}:{str(e)}", traceback.format_exc())
finally:
logger.debug(f"SPIDER: [url={url}]")
return pages, links, subcalls
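
    # Accept a single root URL given as a plain string as well as a list.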
if isinstance(root_urls, str):
root_urls = [root_urls]
    # Clear the cache of visited links before starting the search
_visited.clear()
current_depth = 0
pages, links, spider_args = {}, set(), []
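    # Only collect links for further spidering while we are below max depth.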
collect = current_depth < depth
for root in root_urls:
root = url_util.parse(root)
spider_args.append((root, collect))
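
    # Fetch pages breadth-first: each pass through the loop below fetches one
    # depth level's worth of URLs in parallel on the thread pool, then queues
    # the links found there as arguments for the next level.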
tp = multiprocessing.pool.ThreadPool(processes=concurrency)
try:
while current_depth <= depth:
logger.debug(
"SPIDER: [depth={}, max_depth={}, urls={}]".format(
current_depth, depth, len(spider_args)
)
)
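            # star() adapts _spider so each (url, collect) tuple in
            # spider_args is unpacked into positional arguments.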
results = tp.map(llnl.util.lang.star(_spider), spider_args)
spider_args = []
collect = current_depth < depth
for sub_pages, sub_links, sub_spider_args in results:
sub_spider_args = [x + (collect,) for x in sub_spider_args]
pages.update(sub_pages)
links.update(sub_links)
spider_args.extend(sub_spider_args)
current_depth += 1
finally:
tp.terminate()
tp.join()
return pages, links