in src/utils/str.py [0:0]
def batched_sentence_overlap(text_list1, text_list2, show_progress=True, text_line_counter1=None):
    """Compute pairwise sentence-level overlap between two collections of texts.

    Every text is split into sentences with NLTK's ``sent_tokenize`` and
    represented as a ``Counter`` (multiset) of those sentences. The score of
    a pair is the multiset Jaccard ratio: the size of the multiset
    intersection divided by the size of the multiset union.

    Args:
        text_list1: Iterable of strings that form the rows of the matrix.
            It is not read when ``text_line_counter1`` is supplied.
        text_list2: Iterable of strings that form the columns of the matrix.
        show_progress: When true, wrap tokenization and the comparison loop
            in ``tqdm`` progress bars.
        text_line_counter1: Optional precomputed list of per-text sentence
            ``Counter`` objects to use for the rows instead of tokenizing
            ``text_list1`` again (for example, the second value returned by
            an earlier call).

    Returns:
        A tuple ``(res_matrix, text_line_counter1, repeated_sents)`` where
        ``res_matrix`` is a NumPy array of shape ``(rows, columns)`` holding
        the overlap ratios (0 where two texts share no sentence),
        ``text_line_counter1`` is the list of row counters that was used,
        and ``repeated_sents`` lists the shared sentences, appended once per
        overlapping pair (so a sentence may appear several times).
    """
    from nltk.tokenize import sent_tokenize

    def _count_sentences(texts, desc):
        # Tokenize each text and count its sentences, optionally with a progress bar.
        iterator = tqdm(texts, desc=desc) if show_progress else texts
        return [Counter(sent_tokenize(text)) for text in iterator]

    if text_line_counter1 is None:
        text_line_counter1 = _count_sentences(text_list1, 'processing text1')
    text_line_counter2 = _count_sentences(text_list2, 'processing text2')

    # Size the matrix from the counters that are actually iterated below.
    # The raw inputs may be generators (no len()) and, when a cached
    # text_line_counter1 is passed in, text_list1 may be absent or stale.
    res_matrix = np.zeros((len(text_line_counter1), len(text_line_counter2)))

    rows = tqdm(text_line_counter1, desc='sentence computing') if show_progress else text_line_counter1
    repeated_sents = []
    for i, counter1 in enumerate(rows):
        for j, counter2 in enumerate(text_line_counter2):
            overlap = counter1 & counter2
            num_same = sum(overlap.values())
            if not num_same:
                continue
            # The union is only needed for overlapping pairs; a non-empty
            # intersection guarantees that it is non-empty too.
            num_union = sum((counter1 | counter2).values())
            res_matrix[i, j] = num_same / num_union
            # Counter intersection keeps only positive counts, so every key
            # of `overlap` is a sentence present in both texts.
            repeated_sents.extend(overlap)
    return res_matrix, text_line_counter1, repeated_sents