def get_words_from_text()

in obelics/processors/web_document_filtering.py [0:0]


    def get_words_from_text(text, lower_case=True, strip_words=True, strip_characters=SPECIAL_CHARACTERS):
        """Break a text into a list of words.

        The text is first split with ``FilteringFunctions.split_on_whitespace``
        (new lines and tabs enabled). The resulting pieces are then optionally
        lower-cased and optionally stripped of ``strip_characters``.
        The transformation is lossy: the original text cannot be rebuilt from
        the returned words. Handy for ratio-style statistics such as the
        stopword ratio.

        Args:
            text: The text to split into words.
            lower_case: When true, every word is converted to lower case.
            strip_words: When true, every word is passed through
                ``FilteringFunctions.strip`` and the result is cleaned with
                ``FilteringFunctions.remove_empty_el_from_list``.
            strip_characters: Characters handed to ``FilteringFunctions.strip``;
                must not be ``None`` when ``strip_words`` is true.

        Returns:
            The list of words after the requested transformations.

        Raises:
            ValueError: If ``strip_words`` is true and ``strip_characters`` is
                ``None``.
        """
        # Fail fast, before any splitting work, on an unusable configuration.
        if strip_words and strip_characters is None:
            raise ValueError("strip_characters must be provided if strip_words is True.")

        tokens = FilteringFunctions.split_on_whitespace(text=text, new_line=True, tab=True)

        # Lower-casing happens before stripping, so the strip step sees
        # already lower-cased tokens.
        if lower_case:
            tokens = [token.lower() for token in tokens]

        if not strip_words:
            return tokens

        stripped_tokens = [FilteringFunctions.strip(token, strip_characters) for token in tokens]
        # Stripping may leave some tokens empty; hand them to the cleanup helper.
        return FilteringFunctions.remove_empty_el_from_list(stripped_tokens)