def custom_tokenizer()

in sourcecode/scoring/topic_model.py [0:0]


  def custom_tokenizer(self, text):
    """Tokenize preprocessed text, preserving seed terms that contain periods.

    The pattern captures seed terms such as help.x.com or x.com/tos even if
    preceded by http(s):// and with optional trailing paths; otherwise it
    falls back to matching words of at least 2 characters.

    Args:
      text: Raw document text to tokenize.

    Returns:
      List of string tokens.  A match on a seed term yields just the term
      (scheme and trailing path stripped); any other match yields the whole
      matched word.
    """
    # Build the preprocessor and the compiled pattern once and cache them on
    # the instance: this tokenizer is invoked once per document by
    # CountVectorizer, and both objects are invariant across calls, so
    # rebuilding them per call is pure repeated overhead.
    preprocessor = getattr(self, "_custom_tokenizer_preprocessor", None)
    pattern = getattr(self, "_custom_tokenizer_pattern", None)
    if preprocessor is None or pattern is None:
      preprocessor = CountVectorizer(
        strip_accents="unicode", lowercase=True
      ).build_preprocessor()
      # NOTE(review): seed terms are interpolated into the regex unescaped,
      # so a literal "." in a term matches any character — presumably
      # acceptable for these terms; confirm against
      # get_seed_term_with_periods(), or wrap terms in re.escape if exact
      # matching matters.
      seed_patterns = [
        r"(?:https?://)?(" + term + r")(?:/[^\s]+)?|"
        for term in get_seed_term_with_periods()
      ]
      pattern = re.compile(r"(?i)" + "".join(seed_patterns + [r"\b\w\w+\b"]))
      self._custom_tokenizer_preprocessor = preprocessor
      self._custom_tokenizer_pattern = pattern

    text = preprocessor(text)

    # For any match with a captured seed-term group, return just the group
    # (dropping scheme/path).  Else return the whole matched word.
    tokens = []
    for match in pattern.finditer(text):
      # Look for the first non-None seed term group in the match groups.
      seed_term = next((g for g in match.groups() if g is not None), None)
      tokens.append(seed_term if seed_term is not None else match.group(0))
    return tokens