def compute_character_repetition_ratio()

in obelics/processors/web_document_filtering.py [0:0]


    def compute_character_repetition_ratio(text, character_repetition_length):
        """Return the share of character n-gram occurrences held by the most repeated n-grams.

        Every contiguous window of ``character_repetition_length`` characters in
        ``text`` is counted. With ``d`` distinct n-grams, the ``k`` most frequent
        ones are selected, where ``k`` is the smaller of ``int(sqrt(d))`` and the
        number of distinct n-grams that occur more than once. The result is the
        sum of those ``k`` counts divided by the total number of n-gram
        occurrences.

        Args:
            text: The sequence to analyse (sliced with ``text[i : i + n]``).
            character_repetition_length: Size ``n`` of the character n-grams.

        Returns:
            ``0`` when no n-gram can be formed (for instance when ``text`` is
            shorter than ``character_repetition_length``); otherwise a float in
            ``[0, 1]``.
        """
        window = character_repetition_length
        ngram_counts = Counter(text[start : start + window] for start in range(len(text) - window + 1))
        if not ngram_counts:
            return 0

        # Occurrence counts, most frequent first, so a prefix slice yields the top-k.
        counts_desc = sorted(ngram_counts.values(), reverse=True)

        # Counts are always >= 1, so "not equal to 1" means the n-gram is repeated.
        num_repeated = sum(1 for count in counts_desc if count != 1)

        # Never select more n-grams than are actually repeated: n-grams seen
        # only once must not contribute to the repetition mass.
        top_k = min(int(np.sqrt(len(counts_desc))), num_repeated)

        return sum(counts_desc[:top_k]) / sum(counts_desc)