in obelics/processors/web_document_filtering.py [0:0]
def get_words_from_text(text, lower_case=True, strip_words=True, strip_characters=SPECIAL_CHARACTERS):
    """Break a text into a list of words.

    The transformation is lossy: the text is split into tokens, the
    tokens are optionally lower-cased, and optionally stripped of
    ``strip_characters``, so the original text cannot be rebuilt from
    the result. Handy for word-level ratios such as the stopword ratio.

    Args:
        text: The text to split into words.
        lower_case: When true, each token is converted with ``str.lower``.
        strip_words: When true, each token is passed through
            ``FilteringFunctions.strip`` and tokens left empty afterwards
            are dropped.
        strip_characters: Characters handed to ``FilteringFunctions.strip``;
            must not be ``None`` when ``strip_words`` is true.

    Returns:
        The list of words after the requested normalisation steps.

    Raises:
        ValueError: If ``strip_words`` is true and ``strip_characters``
            is ``None``.
    """
    if strip_words and strip_characters is None:
        raise ValueError("strip_characters must be provided if strip_words is True.")

    # Splitting is delegated to the shared helper with its new-line and
    # tab options switched on.
    tokens = FilteringFunctions.split_on_whitespace(text=text, new_line=True, tab=True)

    # Lower-casing runs before stripping, so the strip step sees the
    # already-normalised tokens.
    if lower_case:
        tokens = [token.lower() for token in tokens]

    if not strip_words:
        return tokens

    stripped_tokens = [FilteringFunctions.strip(token, strip_characters) for token in tokens]
    # Discard whatever the strip step reduced to an empty element.
    return FilteringFunctions.remove_empty_el_from_list(stripped_tokens)