in sourcecode/scoring/topic_model.py [0:0]
def _make_seed_labels(self, texts: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """Label each text with a seed topic by scanning it for seed terms.

    Every text is lower-cased and searched with ``self._compiled_regex``. Each
    named group that captured something in a match is looked up in ``Topics``
    and its value is collected as a candidate topic for that text.

    Args:
      texts: array containing the strings to assign topics to

    Returns:
      Tuple[0]: int64 array with one entry per text. An entry stays 0 when no
        seed term matched, holds the matched topic's value when exactly one
        topic matched, and holds Topics.Unassigned.value when several distinct
        topics matched.
      Tuple[1]: bool array that is True for texts left unassigned because
        their matches conflicted.
    """
    numTexts = texts.shape[0]
    conflictedTexts = np.zeros(numTexts, dtype=bool)
    labels = np.zeros(numTexts, dtype=np.int64)

    for idx, text in enumerate(texts):
        # Gather the distinct topic values hit by any seed term in this text.
        topicValues = set()
        for match in self._compiled_regex.finditer(text.lower()):
            topicValues.update(
                Topics[name].value
                for name, captured in match.groupdict().items()
                if captured
            )

        if not topicValues:
            # No seed term matched: keep the zero-initialised label.
            continue
        if len(topicValues) > 1:
            # Several topics matched: mark the text as conflicted.
            labels[idx] = Topics.Unassigned.value
            conflictedTexts[idx] = True
            continue
        # Exactly one topic matched.
        (labels[idx],) = topicValues

    unassigned_count = conflictedTexts.sum()
    logger.info(f" Notes unassigned due to multiple matches: {unassigned_count}")
    return labels, conflictedTexts