def find_new_groups()

in src/dfcx_scrapi/tools/nlu_util.py [0:0]


    def find_new_groups(self, utterances):
        """Find clusters of utterances that resemble each other more than
        they resemble any existing training phrase.

        The utterances are de-duplicated with ``np.unique`` and embedded with
        ``self.generate_embeddings``. Each one is then looked up twice: once
        in ``self.searcher`` (the training-phrase index) and once in a fresh
        index built from the utterances themselves via
        ``self._build_searcher``. For every utterance, the neighbours whose
        similarity exceeds that of its best training-phrase match form a
        candidate group. Candidates are visited from largest to smallest and
        kept only when none of their members already belongs to an accepted
        group.

        NOTE(review): slicing the first ``closer_count`` neighbours assumes
        ``search_batched`` returns neighbours ordered from most to least
        similar -- confirm against the searcher implementation.

        Args:
            utterances: collection of utterance strings to analyse.

        Returns:
            A ``pd.DataFrame`` with one row per accepted group and columns
            ``"Utterances"`` (the members, each double-quoted and joined by
            ``", "``), ``"Nearest Training Phrase"``, ``"Nearest Intent"``
            and ``"Similarity"`` (the latter three describe the best training
            match of the utterance that seeded the group). The frame is empty
            when there are no utterances or no group has at least two
            members.
        """
        utterances = np.unique(utterances)

        result = {
            "Utterances": [],
            "Nearest Training Phrase": [],
            "Nearest Intent": [],
            "Similarity": [],
        }
        if utterances.size == 0:
            # Nothing to group; skip embedding and index construction.
            return pd.DataFrame(result)

        embeddings = self.generate_embeddings(utterances)

        train_nearest_idx, train_similarities = self.searcher.search_batched(
            embeddings
        )

        new_searcher = self._build_searcher(embeddings)
        new_nearest_idx, new_similarities = new_searcher.search_batched(
            embeddings
        )

        # Count how many new utterances are more similar
        # than any training phrase (column 0 holds the best training match;
        # the [:, :1] slice keeps it 2-D so it broadcasts across each row).
        closer_count = np.sum(
            new_similarities > train_similarities[:, :1], axis=1
        )

        # Pull out the largest groups first.
        grouped_utterance_ids = set()
        for utterance_idx in np.argsort(closer_count)[::-1]:
            if utterance_idx in grouped_utterance_ids:
                continue

            if closer_count[utterance_idx] < 2:
                # Candidates are visited in descending size, so every
                # remaining one is too small to be a group as well.
                break

            member_ids = new_nearest_idx[
                utterance_idx, : closer_count[utterance_idx]
            ]
            if any(idx in grouped_utterance_ids for idx in member_ids):
                # Some of the utterances in this group were assigned to
                # another group already, ignore this group. Nothing has been
                # claimed yet, so the remaining members stay available to
                # seed or join a later group.
                continue

            # Found a new group: claim its members and record it.
            grouped_utterance_ids.update(member_ids)
            match_idx = train_nearest_idx[utterance_idx, 0]
            result["Utterances"].append(
                '"' + '", "'.join(utterances[idx] for idx in member_ids) + '"'
            )
            result["Nearest Training Phrase"].append(
                self.training_phrases[match_idx]
            )
            result["Nearest Intent"].append(self.training_intents[match_idx])
            result["Similarity"].append(train_similarities[utterance_idx, 0])

        return pd.DataFrame(result)