`tokenize()` — defined in decontamination/decontaminate.py

def tokenize(text):
    """Normalize *text* by stripping diacritics, then split it into word tokens.

    The string is NFD-decomposed so accented characters become a base
    character plus combining marks; the marks (Unicode category "Mn") are
    dropped, and the lowercased remainder is split on ``\\w+`` runs.

    Args:
        text: Input string (may be empty).

    Returns:
        list[str]: Lowercased, diacritic-free word tokens; ``[]`` when the
        input contains no word characters.
    """
    # NFD turns e.g. "é" into "e" + U+0301; dropping "Mn" marks removes accents.
    stripped = "".join(
        ch for ch in unicodedata.normalize("NFD", text)
        if unicodedata.category(ch) != "Mn"
    )
    # Raw string: "\w" is an invalid string escape (SyntaxWarning since 3.12).
    return re.findall(r"\w+", stripped.lower())