in bleach/sanitizer.py [0:0]
def sanitize_uri_value(self, value, allowed_protocols):
"""Checks a uri value to see if it's allowed
:arg value: the uri value to sanitize
:arg allowed_protocols: set of allowed protocols
:returns: allowed value or None
"""
# NOTE(willkg): This transforms the value into a normalized one that's
# easier to match and verify, but shouldn't get returned since it's
# vastly different than the original value.
# Convert all character entities in the value
normalized_uri = html5lib_shim.convert_entities(value)
# Nix backtick, space characters, and control characters
normalized_uri = re.sub(r"[`\000-\040\177-\240\s]+", "", normalized_uri)
# Remove REPLACEMENT characters
normalized_uri = normalized_uri.replace("\ufffd", "")
# Lowercase it--this breaks the value, but makes it easier to match
# against
normalized_uri = normalized_uri.lower()
try:
# Drop attributes with uri values that have protocols that aren't
# allowed
parsed = parse_shim.urlparse(normalized_uri)
except ValueError:
# URI is impossible to parse, therefore it's not allowed
return None
if parsed.scheme:
# If urlparse found a scheme, check that
if parsed.scheme in allowed_protocols:
return value
else:
# Allow uris that are just an anchor
if normalized_uri.startswith("#"):
return value
# Handle protocols that urlparse doesn't recognize like "myprotocol"
if (
":" in normalized_uri
and normalized_uri.split(":")[0] in allowed_protocols
):
return value
# If there's no protocol/scheme specified, then assume it's "http" or
# "https" and see if that's allowed
if "http" in allowed_protocols or "https" in allowed_protocols:
return value
return None