in src/dfcx_scrapi/tools/nlu_util.py
# Module-level imports this excerpt relies on (defined at the top of
# nlu_util.py):
import numpy as np
import pandas as pd

def find_new_groups(self, utterances):
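    """Finds groups of utterances more similar to each other than to any
    existing training phrase.

    Args:
        utterances: Iterable of utterance strings to analyze; duplicates
            are removed before searching.

    Returns:
        A pandas DataFrame with one row per group and columns
        "Utterances", "Nearest Training Phrase", "Nearest Intent", and
        "Similarity" (the similarity between the group's seed utterance
        and its closest training phrase).
    """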
    # Deduplicate the input and embed each unique utterance.
    utterances = np.unique(utterances)
    embeddings = self.generate_embeddings(utterances)
    # For each utterance, find its nearest existing training phrases,
    # sorted by decreasing similarity.
    train_nearest_idx, train_similarities = self.searcher.search_batched(
        embeddings
    )
    # Build a second index over the new utterances themselves, so each
    # utterance's nearest new-utterance neighbors are also known.
    new_searcher = self._build_searcher(embeddings)
    new_nearest_idx, new_similarities = new_searcher.search_batched(
        embeddings
    )
    # For each utterance, count how many of its new-utterance neighbors
    # (usually including the utterance itself) are more similar to it
    # than its closest training phrase. train_similarities[:, :1] keeps
    # the top training similarity per row so it broadcasts across
    # new_similarities.
    closer_count = np.sum(
        new_similarities > train_similarities[:, :1], axis=1
    )
    # Pull out the groups, starting with the largest.
    grouped_utterance_ids = set()
    groups = []
    similar_training_phrases = []
    similar_intents = []
    training_phrase_similarities = []
    for utterance_idx in np.argsort(closer_count)[::-1]:
        if utterance_idx in grouped_utterance_ids:
            continue
        # Utterances are visited in decreasing closer_count order, so
        # once a count drops below 2 (the utterance plus at least one
        # neighbor) no further groups remain.
        if closer_count[utterance_idx] < 2:
            break
        group_utterances = []
        for other_idx in new_nearest_idx[
            utterance_idx, : closer_count[utterance_idx]
        ]:
            if other_idx in grouped_utterance_ids:
                # Part of this candidate group was already assigned to
                # an earlier group, so discard the whole group. Members
                # added below remain marked as grouped and cannot seed
                # or join a later group.
                break
            grouped_utterance_ids.add(other_idx)
            group_utterances.append(utterances[other_idx])
        else:
            # The inner loop finished without collisions: record the
            # group along with its nearest training phrase and intent.
            match_idx = train_nearest_idx[utterance_idx, 0]
            similar_training_phrases.append(
                self.training_phrases[match_idx]
            )
            similar_intents.append(self.training_intents[match_idx])
            training_phrase_similarities.append(
                train_similarities[utterance_idx, 0]
            )
            groups.append('"' + '", "'.join(group_utterances) + '"')
    df = pd.DataFrame(
        {
            "Utterances": groups,
            "Nearest Training Phrase": similar_training_phrases,
            "Nearest Intent": similar_intents,
            "Similarity": training_phrase_similarities,
        }
    )
    return df
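
A minimal usage sketch, assuming `util` is an instance of the class in
nlu_util.py that defines `find_new_groups` (construction details omitted);
the utterances below are hypothetical:

    new_utterances = [
        "i want to cancel my order",
        "cancel my order please",
        "how do i cancel an order",
        "what is your return policy",
    ]
    df = util.find_new_groups(new_utterances)
    # One row per group of utterances that are closer to each other than
    # to any existing training phrase, with the nearest intent to review.
    print(df[["Utterances", "Nearest Intent", "Similarity"]])

The method also assumes a specific searcher contract: `search_batched` must
return `(neighbor_indices, similarities)` sorted by decreasing similarity,
so the `>` comparison and the `[:, :1]` slice are meaningful. A plausible
sketch of `_build_searcher` under that contract, using ScaNN with
dot-product similarity (the actual implementation in nlu_util.py may
differ):

    import numpy as np
    import scann

    def _build_searcher(self, embeddings, num_neighbors=10):
        # Normalize so the dot product equals cosine similarity; an exact
        # brute-force scorer is adequate for small utterance sets.
        normalized = embeddings / np.linalg.norm(
            embeddings, axis=1, keepdims=True
        )
        return (
            scann.scann_ops_pybind.builder(
                normalized, num_neighbors, "dot_product"
            )
            .score_brute_force()
            .build()
        )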