python-threatexchange/threatexchange/common.py (36 lines of code) (raw):
#!/usr/bin/env python
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
A place to put simple helpers that don't seem like they go anywhere else.
If this file starts getting large, break it up.
"""
import argparse
import typing as t
import re
from urllib.parse import urlparse
import unicodedata
def class_name_to_human_name(name: str, suffix: str) -> str:
    """
    Helper to make human-friendly names using a class name as a template.

    If `name` ends with `suffix`, that suffix is dropped. The remainder is
    then converted from camel-case to lowercase+underscore notation with
    camel_case_to_underscore().

    An empty `suffix` leaves `name` untouched before conversion.
    """
    # Every string "ends with" the empty string, and name[:-0] is name[:0],
    # i.e. "". Without the truthiness guard an empty suffix would erase the
    # whole name instead of stripping nothing.
    if suffix and name.endswith(suffix):
        name = name[: -len(suffix)]
    return camel_case_to_underscore(name)
def camel_case_to_underscore(name: str) -> str:
    """
    Turn a camel-case identifier into its lowercase, underscore-separated form.

    For example, AbcXyz becomes abc_xyz.
    """
    # Two passes, applied in order:
    #   1. put "_" before any capitalized word (upper followed by lowers)
    #      that has a character in front of it;
    #   2. put "_" between a lowercase letter or digit and an uppercase
    #      letter that directly follows it.
    passes = (
        r"(.)([A-Z][a-z]+)",
        r"([a-z0-9])([A-Z])",
    )
    converted = name
    for pattern in passes:
        converted = re.sub(pattern, r"\1_\2", converted)
    return converted.lower()
def normalize_string(s: str) -> str:
    """
    Strip parts of the raw string to try and make matching more effective.

    There are many redundant parts of input strings, or parts that don't
    meaningfully contribute to whether it's a match or not. Try and strip
    as much of that as possible.

    Steps, in order: lowercase, drop combining marks (accents) after NFKD
    decomposition, collapse runs of 3+ identical characters down to 2, then
    remove every non-alphanumeric character (including spaces and "_").

    Returns the normalized string (possibly empty).
    """
    # Lowercase
    # CrAzY cAsE => crazy case
    s = s.lower()
    # Strip accent characters
    # ãóë => aoe
    s = "".join(
        c for c in unicodedata.normalize("NFKD", s) if not unicodedata.combining(c)
    )
    # Collapse repeats of 3+ down to exactly 2
    # w0000000t => w00t
    # The patterns MUST be raw strings: in a plain string literal "\1" is the
    # control character U+0001 rather than a regex backreference, which made
    # this step a silent no-op.
    # NOTE(review): values normalized before this fix may differ for inputs
    # containing runs of 3+ identical characters - confirm whether stored
    # normalized values need to be regenerated.
    s = re.sub(r"(.)\1+", r"\1\1", s)
    # Strip non alphanumerics (including spaces and underscores)
    # hello, big_world! => hellobigworld
    s = re.sub(r"[\W_]", "", s)
    return s
def normalize_url(url: str) -> bytes:
    """
    Normalize the URL and strip the scheme from the URL to make matching more effective.

    The whole URL is lowercased, a leading "<scheme>://" is removed if
    present, and the result is returned "utf-8" encoded.

    Only a scheme prefix at the very start of the URL is removed; any
    "://" appearing later (for example inside a query parameter) is kept.
    """
    # Lowercase
    # HtTPs://wWw.faCeBook.cOM => https://www.facebook.com
    url = url.lower()

    # Parse the URL into its constituent parts
    parsed = urlparse(url)
    normalized = parsed.geturl()

    # Remove the scheme and its trailing punctuation from the front
    # https://www.facebook.com => www.facebook.com
    # Anchoring to the start matters: with no scheme the prefix would be just
    # "://", and an unanchored replace would strip the first "://" found
    # anywhere in the URL (e.g. inside ?u=http://...).
    if parsed.scheme:
        prefix = "%s://" % parsed.scheme
        if normalized.startswith(prefix):
            normalized = normalized[len(prefix) :]

    # Ensure URL is utf-8 encoded
    return normalized.encode("utf-8")
def argparse_choices_pre_type(
    choices: t.List[str], type: t.Callable[[str], t.Any]
) -> t.Callable[[str], t.Any]:
    """
    Argparse parses choices after type, which is sometimes undesirable.
    So fix it with duct tape. type=argparse_choices_pre_type()

    Returns a callable suitable for argparse's `type=` argument. It first
    checks the raw string against `choices`, and only then applies `type`
    to it.

    The returned callable raises argparse.ArgumentTypeError when the raw
    string is not one of `choices`.
    """

    def ret(s: str) -> t.Any:
        if s not in choices:
            # Exceptions do not %-interpolate their arguments, so the message
            # has to be formatted here; otherwise argparse would display the
            # raw (format, arg, arg) tuple to the user.
            raise argparse.ArgumentTypeError(
                "invalid choice: %s (choose from %s)"
                % (s, ", ".join(repr(c) for c in choices))
            )
        return type(s)

    return ret