def is_overlapping()

in muss/mining/filtering.py [0:0]


def is_overlapping(source, simplification, overlapping_ratio_threshold=0.25):
    '''Return True if the end of one text is exactly the beginning of the other.

    The check is done in both directions (source followed by simplification,
    and simplification followed by source).

    Args:
        source: First text.
        simplification: Second text.
        overlapping_ratio_threshold: Minimum ratio of overlapping characters,
            relative to the shorter of the two texts, for the shared span to be
            considered an overlap.

    Returns:
        bool: True when a long enough suffix of one text is a prefix of the
        other, False otherwise (including when either text is empty).
    '''

    def ordered_is_overlap(previous_text_window, next_text_window):
        '''Check if the end of previous_text_window is exactly the beginning of next_text_window'''
        # An overlap needs at least one shared character, so an empty text can never overlap.
        # Without this guard the empty seed below would match at every position.
        if not previous_text_window or not next_text_window:
            return False
        min_length = min(len(previous_text_window), len(next_text_window))
        # Always require at least one character, whatever the threshold value is
        min_overlap_length = max(1, int(min_length * overlapping_ratio_threshold) + 1)
        # Any valid overlap must start with this prefix of the next window
        min_possible_overlapping_text = next_text_window[:min_overlap_length]
        # Start searching at position 0 so that a previous window that is entirely
        # a prefix of the next one (e.g. identical texts) is detected as well.
        index = previous_text_window.find(min_possible_overlapping_text)
        while index != -1:
            # The candidate overlap is everything from this occurrence to the end
            if next_text_window.startswith(previous_text_window[index:]):
                return True
            index = previous_text_window.find(min_possible_overlapping_text, index + 1)
        return False

    return ordered_is_overlap(source, simplification) or ordered_is_overlap(simplification, source)