in drqa/tokenizers/corenlp_tokenizer.py
def tokenize(self, text):
    # Since we're feeding text to the command line, we detect the end of
    # the output by waiting for the NLP> prompt. Hacky!
    if 'NLP>' in text:
        raise RuntimeError('Bad token (NLP>) in text!')

    # Sending 'q' on its own would cause the CoreNLP process to quit --
    # handle it manually and return a single-token result.
    if text.lower().strip() == 'q':
        token = text.strip()
        index = text.index(token)
        data = [(token, text[index:], (index, index + 1), 'NN', 'q', 'O')]
        return Tokens(data, self.annotators)
    # Minor cleanup before tokenizing: the interactive shell treats a
    # newline as the end of an input.
    clean_text = text.replace('\n', ' ')

    # self.corenlp is a pexpect child running the CoreNLP shell; send the
    # text and block until the prompt reappears.
    self.corenlp.sendline(clean_text.encode('utf-8'))
    self.corenlp.expect_exact('NLP>', searchwindowsize=100)

    # Skip to the start of the JSON output; everything captured before the
    # prompt may include stderr logging messages.
    output = self.corenlp.before
    start = output.find(b'{"sentences":')
    output = json.loads(output[start:].decode('utf-8'))
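
    # For reference, the JSON that CoreNLP emits looks roughly like this
    # (which fields are present depends on the annotators requested):
    #
    #   {"sentences": [{"tokens": [{"word": "Obama",
    #                               "characterOffsetBegin": 0,
    #                               "characterOffsetEnd": 5,
    #                               "pos": "NNP",
    #                               "lemma": "Obama",
    #                               "ner": "PERSON"}, ...]}, ...]}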

    data = []
    tokens = [t for s in output['sentences'] for t in s['tokens']]
    for i in range(len(tokens)):
        # Get the whitespace-inclusive span: from this token's start to
        # the start of the next token (or this token's end if it is last).
        start_ws = tokens[i]['characterOffsetBegin']
        if i + 1 < len(tokens):
            end_ws = tokens[i + 1]['characterOffsetBegin']
        else:
            end_ws = tokens[i]['characterOffsetEnd']

        data.append((
            self._convert(tokens[i]['word']),
            text[start_ws: end_ws],
            (tokens[i]['characterOffsetBegin'],
             tokens[i]['characterOffsetEnd']),
            tokens[i].get('pos', None),
            tokens[i].get('lemma', None),
            tokens[i].get('ner', None)
        ))
    return Tokens(data, self.annotators)
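
# Usage sketch (illustrative, not part of this file), assuming the public
# DrQA tokenizer API and a CoreNLP jar on the local classpath:
#
#   from drqa.tokenizers import CoreNLPTokenizer
#   tok = CoreNLPTokenizer(annotators={'pos', 'lemma', 'ner'})
#   tokens = tok.tokenize('Obama was born in Hawaii.')
#   tokens.words()    # ['Obama', 'was', 'born', 'in', 'Hawaii', '.']
#   tokens.offsets()  # [(0, 5), (6, 9), (10, 14), (15, 17), (18, 24), (24, 25)]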