def compute_word_repetition_ratio()

in obelics/processors/web_document_filtering.py [0:0]


    def compute_word_repetition_ratio(text, strip_characters, word_repetition_length):
        """Return the share of word n-gram occurrences that belong to a repeated n-gram.

        The text is split into words through `FilteringFunctions.get_words_from_text`
        (called with `lower_case=True` and `strip_words=True`). Every window of
        `word_repetition_length` consecutive words is joined with a single space,
        and the occurrences of each distinct n-gram are counted.

        Args:
            text: Text to analyse.
            strip_characters: Forwarded unchanged to `get_words_from_text`.
            word_repetition_length: Number of consecutive words per n-gram.

        Returns:
            The number of n-gram occurrences whose n-gram appears more than once,
            divided by the total number of n-gram occurrences. Returns `0` when
            no n-gram could be built.
        """
        tokens = FilteringFunctions.get_words_from_text(
            text=text, lower_case=True, strip_words=True, strip_characters=strip_characters
        )
        window = word_repetition_length
        # One key per distinct n-gram; the value is how many windows produced it.
        ngram_counts = Counter(
            " ".join(tokens[start : start + window]) for start in range(len(tokens) - window + 1)
        )
        if not ngram_counts:
            # No window could be formed, so there is nothing to divide by.
            return 0

        total_occurrences = 0
        repeated_occurrences = 0
        for count in ngram_counts.values():
            total_occurrences += count
            # Only n-grams seen at least twice contribute to the numerator.
            if count > 1:
                repeated_occurrences += count
        return repeated_occurrences / total_occurrences