in sourcecode/scoring/topic_model.py [0:0]
def _get_stop_words(self, texts: np.ndarray) -> List[str]:
  """Find vocabulary tokens that embed a seed term and should be excluded.

  Any extracted token that contains one of the (normalized) seed terms is
  flagged as a stop word so the downstream model cannot train on the very
  tokens that were used to label the data.

  Args:
    texts: array containing strings for topic assignment

  Returns:
    List specifying which tokens to exclude from the features.
  """
  # Build the candidate vocabulary with the same tokenizer used for features.
  vectorizer = CountVectorizer(tokenizer=self.custom_tokenizer, token_pattern=None)
  vectorizer.fit(texts)
  vocabulary = vectorizer.vocabulary_.keys()
  logger.info(f" Initial vocabulary length: {len(vocabulary)}")

  # Normalize seed terms into the set of substrings to block.  Each term
  # contributes two variants: one with escaped-whitespace sequences removed,
  # and one with escaped periods converted to literal periods.
  blocked = set()
  for termGroup in self._seedTerms.values():
    for term in termGroup:
      stripped = term.strip()
      blocked.add(re.sub(r"\\s", "", stripped))
      blocked.add(re.sub(r"\\.", ".", stripped))
  logger.info(f" Total tokens to filter: {len(blocked)}")

  # A vocabulary token is a stop word if any blocked substring occurs in it.
  stopWords = [token for token in vocabulary if any(b in token for b in blocked)]
  logger.info(f" Total identified stopwords: {len(stopWords)}")
  return stopWords