# in iep/preprocess.py [0:0]
def tokenize(s, delim=' ',
             add_start_token=True, add_end_token=True,
             punct_to_keep=None, punct_to_remove=None):
    """
    Tokenize a sequence, converting a string s into a list of (string) tokens
    by splitting on the specified delimiter. Optionally keep or remove certain
    punctuation marks and add start and end tokens.

    Inputs:
    - s: The string to tokenize.
    - delim: Separator passed to ``str.split``. It must be non-empty, since
      ``str.split`` raises ValueError for an empty separator.
    - add_start_token: If truthy, '<START>' is placed at the front of the
      returned list.
    - add_end_token: If truthy, '<END>' is placed at the end of the returned
      list.
    - punct_to_keep: Optional iterable of substrings. Every occurrence is
      prefixed with delim, so a new token begins at the mark (the mark stays
      attached to any text that follows it up to the next delimiter).
    - punct_to_remove: Optional iterable of substrings deleted from s before
      splitting.

    Returns:
    - A new list of string tokens.

    Notes:
    - Replacements are applied one after another in the order given, "keep"
      marks first and "remove" marks second, so a later mark also matches
      text inserted by an earlier replacement.
    - Empty tokens are not filtered out: adjacent delimiters (including a
      kept mark that already follows a delimiter) produce '' entries, and an
      empty s yields a single '' token.
    """
    if punct_to_keep is not None:
        for p in punct_to_keep:
            # Insert the delimiter in front of the mark so that the split
            # below starts a new token at the mark instead of dropping it.
            s = s.replace(p, '%s%s' % (delim, p))

    if punct_to_remove is not None:
        for p in punct_to_remove:
            s = s.replace(p, '')

    tokens = s.split(delim)
    if add_start_token:
        tokens.insert(0, '<START>')
    if add_end_token:
        tokens.append('<END>')
    return tokens