in obelics/processors/web_document_filtering.py [0:0]
def compute_word_repetition_ratio(text, strip_characters, word_repetition_length):
    """Measure how much of a text is made of repeated word n-grams.

    The text is split into words by ``FilteringFunctions.get_words_from_text``
    (called with ``lower_case=True`` and ``strip_words=True``), then every
    window of ``word_repetition_length`` consecutive words is joined with a
    single space and counted.

    Args:
        text: The text to analyse.
        strip_characters: Forwarded unchanged to ``get_words_from_text``.
        word_repetition_length: Number of consecutive words per n-gram.

    Returns:
        The summed occurrences of n-grams seen more than once, divided by the
        total number of n-gram occurrences. Returns ``0`` when no n-gram can
        be formed (for example, when there are fewer words than
        ``word_repetition_length``).
    """
    tokens = FilteringFunctions.get_words_from_text(
        text=text, lower_case=True, strip_words=True, strip_characters=strip_characters
    )
    window = word_repetition_length

    # One entry per sliding window; the range is empty when the text is
    # shorter than the window, which leaves the counter empty.
    ngram_counts = Counter(
        " ".join(tokens[start : start + window]) for start in range(len(tokens) - window + 1)
    )
    if not ngram_counts:
        return 0

    # Single pass: accumulate every occurrence, and separately those that
    # belong to an n-gram appearing at least twice.
    total_occurrences = 0
    repeated_occurrences = 0
    for count in ngram_counts.values():
        total_occurrences += count
        if count > 1:
            repeated_occurrences += count

    return repeated_occurrences / total_occurrences