# drqa/tokenizers/regexp_tokenizer.py
def tokenize(self, text):
    data = []
    matches = list(self._regexp.finditer(text))
    for i in range(len(matches)):
        # Get text
        token = matches[i].group()

        # Make normalizations for special token types
        if self.substitutions:
            groups = matches[i].groupdict()
            if groups['sdquote']:
                token = "``"
            elif groups['edquote']:
                token = "''"
            elif groups['ssquote']:
                token = "`"
            elif groups['esquote']:
                token = "'"
            elif groups['dash']:
                token = '--'
            elif groups['ellipses']:
                token = '...'

        # Get whitespace: a token's trailing whitespace runs up to the
        # start of the next match (or to the token's own end if it is last).
        span = matches[i].span()
        start_ws = span[0]
        if i + 1 < len(matches):
            end_ws = matches[i + 1].span()[0]
        else:
            end_ws = span[1]

        # Format data as (token, token + trailing whitespace, character span)
        data.append((
            token,
            text[start_ws:end_ws],
            span,
        ))
    return Tokens(data, self.annotators)
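To make the whitespace bookkeeping concrete, here is a minimal, self-contained sketch of the same idea outside the class. It is hypothetical and not part of DrQA: `simple_tokenize` and `_SIMPLE_PATTERN` are names introduced for illustration, and the simplified regexp (words or single non-space characters) has none of the named groups ('sdquote', 'dash', 'ellipses', ...) that drive the substitution branch above, so no normalization happens.

import re

# Hypothetical, simplified stand-in for the tokenizer's named-group regexp.
_SIMPLE_PATTERN = re.compile(r'\w+|\S', re.UNICODE)

def simple_tokenize(text):
    """Return (token, token + trailing whitespace, span) triples."""
    matches = list(_SIMPLE_PATTERN.finditer(text))
    data = []
    for i, m in enumerate(matches):
        span = m.span()
        # As above: trailing whitespace ends where the next match begins.
        end_ws = matches[i + 1].span()[0] if i + 1 < len(matches) else span[1]
        data.append((m.group(), text[span[0]:end_ws], span))
    return data

print(simple_tokenize('Hello, world!'))
# [('Hello', 'Hello', (0, 5)), (',', ', ', (5, 6)),
#  ('world', 'world', (7, 12)), ('!', '!', (12, 13))]

The middle element of each triple carries the token plus its trailing whitespace, so concatenating those elements reproduces the input text exactly (any whitespace before the first token is dropped, since `start_ws` is the token's own start offset).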