def sanitize_uri_value()

in bleach/sanitizer.py [0:0]


    def sanitize_uri_value(self, value, allowed_protocols):
        """Decide whether a uri value may be kept

        The decision is made on a scrubbed, lowercased copy of the value;
        the copy is only ever inspected, never returned.

        :arg value: the uri value to check
        :arg allowed_protocols: set of protocols that are permitted

        :returns: ``value`` exactly as passed in when it is allowed,
            otherwise ``None``

        """
        # Build the comparison form in one pass: decode character entities,
        # strip backticks / whitespace / control characters, drop REPLACEMENT
        # characters, then lowercase. The result is intentionally lossy.
        scrubbed = (
            re.sub(
                r"[`\000-\040\177-\240\s]+",
                "",
                html5lib_shim.convert_entities(value),
            )
            .replace("\ufffd", "")
            .lower()
        )

        try:
            parts = parse_shim.urlparse(scrubbed)
        except ValueError:
            # An unparseable uri is never allowed
            return None

        scheme = parts.scheme
        if scheme:
            # urlparse identified a scheme, so that alone decides the outcome
            return value if scheme in allowed_protocols else None

        # No scheme was detected from here on.

        # A bare anchor is always fine
        if scrubbed.startswith("#"):
            return value

        # urlparse doesn't recognize some protocols (e.g. "myprotocol"), so
        # check whatever precedes the first colon by hand
        prefix, colon, _ = scrubbed.partition(":")
        if colon and prefix in allowed_protocols:
            return value

        # With no protocol given at all, treat the value as "http"/"https"
        # and allow it only if one of those is permitted
        if any(proto in allowed_protocols for proto in ("http", "https")):
            return value

        return None