mozregression/network.py

""" network functions utilities for mozregression. """ from __future__ import absolute_import import re from urllib.parse import urljoin import redo import requests from bs4 import BeautifulSoup def retry_get(url, **karwgs): """ More robust `requests.get` equivalent function. This is equivalent to the requests.get function, except that it will retry the requests call three times in case of HTTPError or ConnectionError. """ return redo.retry( get_http_session().get, attempts=3, sleeptime=1, retry_exceptions=( requests.exceptions.HTTPError, requests.exceptions.ConnectionError, requests.exceptions.Timeout, ), args=(url,), kwargs=karwgs, ) SESSION = None def set_http_session(session=None, get_defaults=None): """ Define a cache http session. :param cache_session: a customized request session or None to use a simple request session. :param: get_defaults: if defined, it must be a dict that will provide default values for calls to cache_session.get. """ global SESSION if get_defaults: if session is None: session = requests.Session() # monkey patch to set default values to a session.get calls # I don't see other ways to do this globally for timeout for example _get = session.get def _default_get(*args, **kwargs): for k, v in get_defaults.items(): kwargs.setdefault(k, v) return _get(*args, **kwargs) session.get = _default_get SESSION = session def get_http_session(): """ Returns the defined http session. """ return SESSION or requests def url_links(url, regex=None, auth=None): """ Returns a list of links that can be found on a given web page. """ response = retry_get(url, auth=auth) response.raise_for_status() soup = BeautifulSoup(response.text, features="html.parser") if regex: if isinstance(regex, str): regex = re.compile(regex) match = regex.match else: def match(_): return True # do not return a generator but an array, so we can store it for later use result = [] for link in soup.findAll("a"): href = link.get("href") ahref = urljoin(url, href) if match(ahref): result.append(ahref) return result