in src/dfcx_scrapi/tools/nlu_util.py [0:0]
def find_similar_training_phrases_in_different_intents(
    self, similarity_threshold: float = 0.8
):
    """Find pairs of similar training phrases belonging to different intents.

    Runs a batched nearest-neighbor search over the training-phrase
    embeddings and reports phrase pairs whose similarity exceeds the
    threshold but whose intents differ — likely intent-confusion
    candidates worth reviewing.

    Args:
        similarity_threshold: Minimum similarity score for a pair to be
            reported. Defaults to 0.8 (the previously hard-coded value).

    Returns:
        A pandas DataFrame with columns "Training phrase 1",
        "Training phrase 2", "Intent 1", "Intent 2", and "Similarity",
        sorted by descending similarity, with duplicate (a, b)/(b, a)
        pairs collapsed to one row.
    """
    # Row i of all_idx_2 holds the neighbor indices of phrase i;
    # similarities holds the matching scores, same shape.
    all_idx_2, similarities = self.searcher.search_batched(
        self.training_embeddings
    )
    # Derive the neighbor count from the search result instead of
    # hard-coding it, so this stays correct if the searcher is
    # configured with a different number of neighbors.
    num_neighbors = all_idx_2.shape[1]
    num_utterances = len(self.training_phrases)
    # all_idx_1[i, j] == i, pairing each phrase with each of its neighbors.
    all_idx_1 = np.tile(
        np.arange(num_utterances)[:, None], num_neighbors
    )

    # Only keep pairs whose phrases belong to different intents.
    # np.vectorize keeps this working for any scalar-indexable container
    # of intents — assumes training_intents supports integer indexing.
    def intents_differ(idx_1, idx_2):
        return self.training_intents[idx_1] != self.training_intents[idx_2]

    different_intent_mask = np.vectorize(intents_differ)(
        all_idx_1, all_idx_2
    )
    mismatch_mask = different_intent_mask & (
        similarities > similarity_threshold
    )
    mismatch_idx_1 = all_idx_1[mismatch_mask]
    mismatch_idx_2 = all_idx_2[mismatch_mask]
    mismatch_similarities = similarities[mismatch_mask]

    # Canonicalize each pair as (low, high) so (a, b) and (b, a)
    # collapse to a single entry, then deduplicate. Similarity is
    # symmetric, so only the indices need swapping.
    sort_mask = mismatch_idx_1 > mismatch_idx_2
    sort_vals_1 = mismatch_idx_1[sort_mask]
    mismatch_idx_1[sort_mask] = mismatch_idx_2[sort_mask]
    mismatch_idx_2[sort_mask] = sort_vals_1
    (unique_idx_1, unique_idx_2), unique_index = np.unique(
        [mismatch_idx_1, mismatch_idx_2], axis=1, return_index=True
    )
    unique_similarities = mismatch_similarities[unique_index]

    return (
        pd.DataFrame(
            {
                "Training phrase 1": self.training_phrases[unique_idx_1],
                "Training phrase 2": self.training_phrases[unique_idx_2],
                "Intent 1": self.training_intents[unique_idx_1],
                "Intent 2": self.training_intents[unique_idx_2],
                "Similarity": unique_similarities,
            }
        )
        .sort_values("Similarity", ascending=False)
        .reset_index(drop=True)
    )