in lambdachecker/sms_spam_classifier_utilities.py [0:0]
def text_to_word_sequence(text,
filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
lower=True, split=" "):
"""Converts a text to a sequence of words (or tokens).
# Arguments
text: Input text (string).
filters: list (or concatenation) of characters to filter out, such as
punctuation. Default: `!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n`,
includes basic punctuation, tabs, and newlines.
lower: boolean. Whether to convert the input to lowercase.
split: str. Separator for word splitting.
# Returns
A list of words (or tokens).
"""
if lower:
text = text.lower()
if sys.version_info < (3,):
if isinstance(text, unicode):
translate_map = dict((ord(c), unicode(split)) for c in filters)
text = text.translate(translate_map)
elif len(split) == 1:
translate_map = maketrans(filters, split * len(filters))
text = text.translate(translate_map)
else:
for c in filters:
text = text.replace(c, split)
else:
translate_dict = dict((c, split) for c in filters)
translate_map = maketrans(translate_dict)
text = text.translate(translate_map)
seq = text.split(split)
return [i for i in seq if i]