def batched_ngram_overlap()

in src/utils/str.py [0:0]


def batched_ngram_overlap(text_list1, text_list2, n, show_progress=True, ngram_list1=None):
    """Compute pairwise n-gram overlap ratios between two collections of texts.

    Each text is turned into a multiset (``Counter``) of its contiguous
    n-grams.  For every pair ``(i, j)`` the score is::

        shared_ngram_count / (total_ngrams_i + total_ngrams_j)

    where ``shared_ngram_count`` is the size of the multiset intersection.
    With this normalisation two identical, non-empty n-gram multisets score
    ``0.5``.  A pair in which both sides have no n-grams at all (e.g. both
    texts are shorter than ``n``) scores ``0.0``.

    Args:
        text_list1: Iterable of sliceable sequences (strings yield character
            n-grams, token lists yield token n-grams).  Ignored, and may be
            ``None``, when ``ngram_list1`` is supplied.
        text_list2: Iterable of sliceable sequences to compare against.
        n: Length of the n-grams.
        show_progress: When true, wrap the processing loops in ``tqdm``
            progress bars.
        ngram_list1: Optional precomputed list of n-gram ``Counter`` objects
            for the first collection (as returned by a previous call), used
            to skip recomputation.

    Returns:
        A tuple ``(res_matrix, ngram_list1, repeated)``:
        ``res_matrix`` is a ``len(ngram_list1) x len(ngram_list2)`` NumPy
        array of overlap ratios, ``ngram_list1`` is the list of Counters for
        the first collection (reusable as the ``ngram_list1`` argument), and
        ``repeated`` is a flat list of every n-gram shared by some pair
        (one entry per pair it occurs in, so duplicates are possible).
    """
    def generate_ngrams(text, n):
        # zip over n shifted views of the sequence yields its contiguous n-grams.
        return Counter(zip(*[text[i:] for i in range(n)]))

    if ngram_list1 is None:
        source1 = tqdm(text_list1, desc='processing text1') if show_progress else text_list1
        ngram_list1 = [generate_ngrams(text, n) for text in source1]

    source2 = tqdm(text_list2, desc='processing text2') if show_progress else text_list2
    ngram_list2 = [generate_ngrams(text, n) for text in source2]

    # Size the matrix from the n-gram lists that are actually iterated below;
    # text_list1 may be None or out of sync when a cached ngram_list1 is used.
    res_matrix = np.zeros((len(ngram_list1), len(ngram_list2)))

    # Totals are loop-invariant per Counter, so compute each exactly once.
    totals2 = [sum(ngram2.values()) for ngram2 in ngram_list2]

    pbar = tqdm(ngram_list1, desc=f'{n}-gram computing') if show_progress else ngram_list1
    repeated_ngrams = []
    for i, ngram1 in enumerate(pbar):
        total1 = sum(ngram1.values())
        for j, ngram2 in enumerate(ngram_list2):
            overlap = ngram1 & ngram2
            shared = sum(overlap.values())
            if not shared:
                # No common n-grams: the cell stays 0.0.  This also covers the
                # case where both Counters are empty and the denominator
                # would be zero.
                continue
            res_matrix[i, j] = shared / float(total1 + totals2[j])
            # Counter intersection only retains positive counts, so every key
            # left in `overlap` is a genuinely shared n-gram.
            repeated_ngrams.extend(overlap)

    return res_matrix, ngram_list1, repeated_ngrams