in sourcecode/scoring/topic_model.py [0:0]
def custom_tokenizer(self, text):
    """Tokenize *text* for vectorization, preserving period-containing seed terms.

    Ordinary tokens are words of at least 2 word-characters. Seed terms from
    ``get_seed_term_with_periods()`` (e.g. ``help.x.com`` or ``x.com/tos``) are
    matched as a unit even when preceded by ``http(s)://`` and/or followed by a
    path, and the bare seed term (scheme and trailing path stripped) is emitted
    as the token.

    Args:
      text: Raw document text.

    Returns:
      List of string tokens.
    """
    # Apply the same preprocessing CountVectorizer would use on its own
    # (unicode accent stripping + lowercasing) so our tokens line up with
    # the vectorizer's vocabulary.
    default_preprocessor = CountVectorizer(
        strip_accents="unicode", lowercase=True
    ).build_preprocessor()
    text = default_preprocessor(text)
    # One alternative per seed term, each allowing an optional scheme prefix
    # and an optional trailing path; the seed term itself is the capture group.
    # re.escape makes the periods literal — without it "help.x.com" would also
    # match unrelated strings like "helpaxbcom".
    seed_patterns = [
        r"(?:https?://)?(" + re.escape(term) + r")(?:/[^\s]+)?|"
        for term in get_seed_term_with_periods()
    ]
    # Seed alternatives come first so they take precedence over the generic
    # word fallback, which tokenizes words of at least 2 characters.
    pattern_string = r"(?i)" + "".join(seed_patterns + [r"\b\w\w+\b"])
    pattern = re.compile(pattern_string)
    tokens = []
    for match in pattern.finditer(text):
        # The first non-None capture group is a matched seed term; emit just
        # the term (scheme/path stripped). Otherwise emit the whole word match,
        # which has no groups.
        seed_term = next((g for g in match.groups() if g is not None), None)
        if seed_term is not None:
            tokens.append(seed_term)
        else:
            tokens.append(match.group(0))
    return tokens