def batched_sentence_overlap()

in src/utils/str.py [0:0]


def batched_sentence_overlap(text_list1, text_list2, show_progress=True, text_line_counter1=None):
    """Compute pairwise sentence-level overlap between two collections of texts.

    Every text is split into sentences with NLTK's ``sent_tokenize`` and
    turned into a ``Counter`` (a multiset of sentences).  For each pair
    ``(i, j)`` the score is the multiset Jaccard ratio: the size of the
    counter intersection (``&``, minimum counts) divided by the size of the
    counter union (``|``, maximum counts).  Pairs that share no sentence keep
    a score of ``0``.

    Args:
        text_list1: Iterable of texts for the matrix rows.  It is not read
            when ``text_line_counter1`` is supplied.
        text_list2: Iterable of texts for the matrix columns.
        show_progress: When true, wrap the tokenization passes and the outer
            comparison loop in ``tqdm`` progress bars.
        text_line_counter1: Optional precomputed list of sentence counters
            for the rows (for example the second element returned by an
            earlier call), which skips re-tokenizing ``text_list1``.

    Returns:
        A tuple ``(res_matrix, text_line_counter1, repeat_sents)``:

        * ``res_matrix`` -- ``numpy`` array with one row per row counter and
          one column per text in ``text_list2`` holding the overlap ratios.
        * ``text_line_counter1`` -- the sentence counters used for the rows
          (either the ones passed in or the freshly computed ones).
        * ``repeat_sents`` -- list of the sentences shared by each
          overlapping pair, appended pair by pair; a sentence appears once
          for every pair that shares it.
    """
    from nltk.tokenize import sent_tokenize

    def count_sentences(texts, desc):
        # Tokenize each text and count how often each sentence occurs.
        iterable = tqdm(texts, desc=desc) if show_progress else texts
        return [Counter(sent_tokenize(text)) for text in iterable]

    if text_line_counter1 is None:
        text_line_counter1 = count_sentences(text_list1, 'processing text1')
    text_line_counter2 = count_sentences(text_list2, 'processing text2')

    # Size the matrix from the counters that are actually iterated below:
    # when precomputed row counters are passed, ``text_list1`` may be None or
    # of a different length, and generator inputs have no ``len`` at all.
    res_matrix = np.zeros((len(text_line_counter1), len(text_line_counter2)))

    if show_progress:
        rows = tqdm(text_line_counter1, desc='sentence computing')
    else:
        rows = text_line_counter1

    repeat_sents = []
    for i, line_counter1 in enumerate(rows):
        for j, line_counter2 in enumerate(text_line_counter2):
            overlap = line_counter1 & line_counter2
            num_same = sum(overlap.values())
            if num_same <= 0:
                continue

            # Only build the union for pairs that really overlap; a non-empty
            # intersection guarantees the union is non-empty as well.
            num_union = sum((line_counter1 | line_counter2).values())
            res_matrix[i, j] = num_same / num_union

            # ``&`` keeps only positive counts, so every key is a shared sentence.
            repeat_sents.extend(overlap)

    return res_matrix, text_line_counter1, repeat_sents