def parse

def parse_url()

in lib/lambda/layers/aws-sap-odp-extractor/python/urllib3/util/url.py [0:0]
62 lines of code
27 McCabe index (conditional complexity)

def parse_url(url):
    """
    Given a url, return a parsed :class:`.Url` namedtuple. Best-effort is
    performed to parse incomplete urls. Fields not provided will be None.
    This parser is RFC 3986 compliant.

    :param str url: URL to parse into a :class:`.Url` namedtuple.

    :returns: A :class:`.Url` built from the parsed components. ``port`` is
        converted to ``int`` when present. When ``url`` is a binary string,
        the textual components are encoded to UTF-8 bytes so that the output
        type follows the input type.

    :raises LocationParseError: If the URL cannot be split or encoded, if
        the authority was present in the input but dropped during encoding,
        if component validation fails, or if a non-ASCII host name cannot
        be IDNA-encoded (including when the ``idna`` module is missing).

    Partly backwards-compatible with :mod:`urlparse`.

    Example::

        >>> parse_url('http://google.com/mail/')
        Url(scheme='http', host='google.com', port=None, path='/mail/', ...)
        >>> parse_url('google.com:80')
        Url(scheme=None, host='google.com', port=80, path=None, ...)
        >>> parse_url('/foo?bar')
        Url(scheme=None, host=None, port=None, path='/foo', query='bar', ...)
    """
    if not url:
        # Empty
        return Url()

    # Remember the input type so the result components can be converted
    # back to it at the end (see ``to_input_type`` below).
    is_string = not isinstance(url, six.binary_type)

    # RFC 3986 doesn't like URLs that have a host but don't start
    # with a scheme and we support URLs like that so we need to
    # detect that problem and add an empty scheme indication.
    # We don't get hurt on path-only URLs here as it's stripped
    # off and given an empty scheme anyways.
    if not SCHEME_REGEX.search(url):
        url = "//" + url

    def idna_encode(name):
        # Only names containing non-ASCII characters need IDNA encoding.
        # ASCII ends at code point 127, so 128 itself must be included
        # in the non-ASCII range.
        if name and any(ord(x) >= 128 for x in name):
            # Imported lazily so the 'idna' package is only required when
            # a non-ASCII host name is actually encountered.
            try:
                import idna
            except ImportError:
                raise LocationParseError("Unable to parse URL without the 'idna' module")
            try:
                return idna.encode(name.lower(), strict=True, std3_rules=True)
            except idna.IDNAError:
                raise LocationParseError(u"Name '%s' is not a valid IDNA label" % name)
        return name

    try:
        split_iri = misc.IRI_MATCHER.match(compat.to_str(url)).groupdict()
        iri_ref = rfc3986.IRIReference(
            split_iri['scheme'], split_iri['authority'],
            _encode_invalid_chars(split_iri['path'], PATH_CHARS),
            _encode_invalid_chars(split_iri['query'], QUERY_CHARS),
            _encode_invalid_chars(split_iri['fragment'], FRAGMENT_CHARS)
        )
        # Recorded before encoding so a dropped authority can be detected.
        has_authority = iri_ref.authority is not None
        uri_ref = iri_ref.encode(idna_encoder=idna_encode)
    except (ValueError, RFC3986Exception):
        return six.raise_from(LocationParseError(url), None)

    # rfc3986 strips the authority if it's invalid
    if has_authority and uri_ref.authority is None:
        raise LocationParseError(url)

    # Only normalize schemes we understand to not break http+unix
    # or other schemes that don't follow RFC 3986.
    if uri_ref.scheme is None or uri_ref.scheme.lower() in NORMALIZABLE_SCHEMES:
        uri_ref = uri_ref.normalize()

    # Validate all URIReference components and ensure that all
    # components that were set before are still set after
    # normalization has completed.
    validator = Validator()
    try:
        validator.check_validity_of(
            *validator.COMPONENT_NAMES
        ).validate(uri_ref)
    except ValidationError:
        return six.raise_from(LocationParseError(url), None)

    # For the sake of backwards compatibility we put empty
    # string values for path if there are any defined values
    # beyond the path in the URL.
    # TODO: Remove this when we break backwards compatibility.
    path = uri_ref.path
    if not path:
        if (uri_ref.query is not None
                or uri_ref.fragment is not None):
            path = ""
        else:
            path = None

    # Ensure that each part of the URL is a `str` for
    # backwards compatibility.
    def to_input_type(x):
        # None stays None; for binary input, any component that is not
        # already binary is encoded to UTF-8; otherwise it is returned as-is.
        if x is None:
            return None
        elif not is_string and not isinstance(x, six.binary_type):
            return x.encode('utf-8')
        return x

    return Url(
        scheme=to_input_type(uri_ref.scheme),
        auth=to_input_type(uri_ref.userinfo),
        host=to_input_type(uri_ref.host),
        port=int(uri_ref.port) if uri_ref.port is not None else None,
        path=to_input_type(path),
        query=to_input_type(uri_ref.query),
        fragment=to_input_type(uri_ref.fragment)
    )