python-threatexchange/threatexchange/common.py

#!/usr/bin/env python
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

"""
A place to put simple helpers that don't seem like they go anywhere else.

If this file starts getting large, break it up.
"""

import argparse
import typing as t
import re
from urllib.parse import urlparse
import unicodedata

# Compiled once at import time; every pattern is a raw string so that
# backreferences (\1) and character classes (\W) reach the regex engine
# instead of being interpreted as Python string escapes.
_CAMEL_WORD_RE = re.compile(r"(.)([A-Z][a-z]+)")
_CAMEL_BOUNDARY_RE = re.compile(r"([a-z0-9])([A-Z])")
_REPEATED_CHAR_RE = re.compile(r"(.)\1+")
_NON_ALPHANUMERIC_RE = re.compile(r"[\W_]")


def class_name_to_human_name(name: str, suffix: str) -> str:
    """
    Helper to make human-friendly names using a class name as a template.

    If `name` ends with `suffix`, the suffix is removed; the remainder is
    then converted from camel-case to lowercase+underscore notation.
    """
    # Guard against an empty suffix: every string "ends with" "", and
    # name[:-0] is name[:0], which would erase the whole name.
    if suffix and name.endswith(suffix):
        name = name[: -len(suffix)]
    return camel_case_to_underscore(name)


def camel_case_to_underscore(name: str) -> str:
    """
    Convert name in camel-case notation into lowercase+underscore notation.

    For example, AbcXyz will be converted into abc_xyz.
    """
    # First pass: put an underscore before each capitalized word that
    # follows another character (e.g. HTTPServer => HTTP_Server).
    with_word_breaks = _CAMEL_WORD_RE.sub(r"\1_\2", name)
    # Second pass: split remaining lowercase/digit => uppercase boundaries
    # (e.g. getHTTP => get_HTTP), then lowercase everything.
    return _CAMEL_BOUNDARY_RE.sub(r"\1_\2", with_word_breaks).lower()


def normalize_string(s: str) -> str:
    """
    Strip parts of the raw string to try and make matching more effective.

    There are many redundant parts of input strings, or parts that don't
    meaningfully contribute to whether it's a match or not. Try and strip
    as much of that as possible.

    Steps are applied in this order: lowercase, strip accents, collapse
    runs of a repeated character down to two, remove everything that is
    not alphanumeric (including spaces and underscores).
    """
    # Lowercase
    # CrAzY cAsE => crazy case
    s = s.lower()
    # Strip accent characters
    # ãóë => aoe
    s = "".join(
        c for c in unicodedata.normalize("NFKD", s) if not unicodedata.combining(c)
    )
    # Strip repeats of 2+
    # w0000000t => w00t
    s = _REPEATED_CHAR_RE.sub(r"\1\1", s)
    # Strip non alphanumerics (including spaces)
    # hello, world_1 => helloworld1
    s = _NON_ALPHANUMERIC_RE.sub("", s)
    return s


def normalize_url(url: str) -> bytes:
    """
    Normalize the URL and strip the scheme from the URL to make matching
    more effective.

    URLs will be normalized to lowercase and the initial scheme removed,
    then "utf-8" encoded. A URL without a scheme is only lowercased and
    encoded.
    """
    # Lowercase
    # HtTPs://wWw.faCeBook.cOM => https://www.facebook.com
    url = url.lower()
    # Parse the URL into its constituent parts
    parsed = urlparse(url)
    url = parsed.geturl()
    if parsed.scheme:
        # Identify the scheme and trailing punctuation
        # e.g. scheme = "http://"
        scheme = parsed.scheme + "://"
        # Remove the scheme from the full url, but only when it really is
        # the leading prefix, so a "://" later in the URL (for instance
        # inside a query string) is never touched.
        # https://www.facebook.com => www.facebook.com
        if url.startswith(scheme):
            url = url[len(scheme) :]
    # Ensure URL is utf-8 encoded
    return url.encode("utf-8")


def argparse_choices_pre_type(
    choices: t.List[str], type: t.Callable[[str], t.Any]
) -> t.Callable[[str], t.Any]:
    """
    Argparse parses choices after type, which is sometimes undesirable.

    So fix it with duct tape.
      type=argparse_choices_pre_type()

    Returns a callable suitable for argparse's `type=` argument. It checks
    the raw string against `choices` first and raises
    argparse.ArgumentTypeError when it is not one of them; otherwise it
    returns `type(s)`.
    """

    def ret(s: str) -> t.Any:
        if s not in choices:
            # Interpolate here: ArgumentTypeError does not format its
            # arguments, so passing them separately would surface a raw
            # tuple to the user.
            raise argparse.ArgumentTypeError(
                "invalid choice: %s (choose from %s)"
                % (s, ", ".join(repr(c) for c in choices))
            )
        return type(s)

    return ret

python-threatexchange/threatexchange/common.py (36 lines of code) (raw):