in obelics/processors/web_document_filtering.py [0:0]
def compute_character_repetition_ratio(text, character_repetition_length):
    """Measure how much of the text is made of its most repeated character n-grams.

    The text is cut into overlapping character n-grams of length
    ``character_repetition_length`` and the occurrences of each distinct
    n-gram are counted. The ratio returned is the number of occurrences
    belonging to the most frequent n-grams divided by the total number of
    n-gram occurrences.

    The number of "most frequent" n-grams taken into account is the smaller of:
      * the integer part of the square root of the number of distinct n-grams;
      * the number of distinct n-grams that occur more than once.

    Args:
        text: The string to analyse.
        character_repetition_length: Length of the character n-grams.

    Returns:
        The repetition ratio, or ``0`` when no n-gram can be extracted
        (e.g. the text is shorter than the n-gram length).
    """
    ngram_len = character_repetition_length

    # Slide a window of `ngram_len` characters over the text, one step at a time.
    ngram_counts = Counter(
        text[start : start + ngram_len] for start in range(len(text) - ngram_len + 1)
    )
    if not ngram_counts:
        return 0

    # Occurrence counts of every distinct n-gram, most frequent first.
    counts_desc = sorted(ngram_counts.values(), reverse=True)
    num_distinct = len(counts_desc)

    # N-grams seen exactly once are never treated as repetitions.
    num_repeated = num_distinct - counts_desc.count(1)

    # Cap the number of n-grams considered so the ratio focuses on the few
    # dominant ones rather than on every n-gram that happens to occur twice.
    top_k = min(int(np.sqrt(num_distinct)), num_repeated)

    return sum(counts_desc[:top_k]) / sum(counts_desc)