def find_similar_training_phrases_in_different_intents()

in src/dfcx_scrapi/tools/nlu_util.py [0:0]


    def find_similar_training_phrases_in_different_intents(
        self, similarity_threshold: float = 0.8
    ):
        """Finds similar training-phrase pairs that belong to different intents.

        Runs a batched nearest-neighbor search over the training-phrase
        embeddings and reports every cross-intent pair whose similarity
        exceeds the threshold, with (i, j)/(j, i) duplicates removed.

        Args:
          similarity_threshold: Minimum similarity score (exclusive) for a
            pair to be reported. Defaults to 0.8, matching the original
            behavior.

        Returns:
          A pandas DataFrame with columns "Training phrase 1",
          "Training phrase 2", "Intent 1", "Intent 2", and "Similarity",
          sorted by descending similarity.
        """
        num_utterances = len(self.training_phrases)
        all_idx_2, similarities = self.searcher.search_batched(
            self.training_embeddings
        )
        # Derive the neighbor count from the search result instead of
        # hard-coding 10, so a searcher configured for a different number
        # of neighbors per query still produces correctly shaped arrays.
        num_neighbors = all_idx_2.shape[1]
        all_idx_1 = np.tile(np.arange(num_utterances)[:, None], num_neighbors)

        # Only keep pairs in different intents. Fancy indexing is a fully
        # vectorized replacement for the original np.vectorize scalar loop.
        intents = np.asarray(self.training_intents)
        different_intent_mask = intents[all_idx_1] != intents[all_idx_2]

        mismatch_mask = different_intent_mask & (
            similarities > similarity_threshold
        )
        mismatch_idx_1 = all_idx_1[mismatch_mask]
        mismatch_idx_2 = all_idx_2[mismatch_mask]
        mismatch_similarities = similarities[mismatch_mask]

        # Canonicalize each pair so the smaller index comes first, then
        # drop duplicate (i, j)/(j, i) pairs via np.unique over columns.
        sort_mask = mismatch_idx_1 > mismatch_idx_2
        sort_vals_1 = mismatch_idx_1[sort_mask]
        mismatch_idx_1[sort_mask] = mismatch_idx_2[sort_mask]
        mismatch_idx_2[sort_mask] = sort_vals_1
        (unique_idx_1, unique_idx_2), unique_index = np.unique(
            [mismatch_idx_1, mismatch_idx_2], axis=1, return_index=True
        )
        unique_similarities = mismatch_similarities[unique_index]

        df = (
            pd.DataFrame(
                {
                    "Training phrase 1": self.training_phrases[unique_idx_1],
                    "Training phrase 2": self.training_phrases[unique_idx_2],
                    "Intent 1": self.training_intents[unique_idx_1],
                    "Intent 2": self.training_intents[unique_idx_2],
                    "Similarity": unique_similarities,
                }
            )
            .sort_values("Similarity", ascending=False)
            .reset_index(drop=True)
        )

        return df