in scripts/reader/preprocess.py [0:0]
def process_dataset(data, tokenizer, workers=None):
    """Tokenize and annotate (lemma, POS, NER) the dataset with a pool of
    worker processes, yielding one processed example per question."""
    tokenizer_class = tokenizers.get_class(tokenizer)

    # Bind the pool size and per-worker initializer once; the two passes below
    # differ only in which annotators the workers are initialized with.
    make_pool = partial(Pool, workers, initializer=init)

    # First pass: questions only need tokens and lemmas.
    workers = make_pool(initargs=(tokenizer_class, {'annotators': {'lemma'}}))
    q_tokens = workers.map(tokenize, data['questions'])
    workers.close()
    workers.join()

    # Second pass: contexts additionally need POS and NER tags.
    workers = make_pool(
        initargs=(tokenizer_class, {'annotators': {'lemma', 'pos', 'ner'}})
    )
    c_tokens = workers.map(tokenize, data['contexts'])
    workers.close()
    workers.join()

    # Join each question with its context via qid2cid and assemble an example.
    for idx in range(len(data['qids'])):
        question = q_tokens[idx]['words']
        qlemma = q_tokens[idx]['lemma']
        document = c_tokens[data['qid2cid'][idx]]['words']
        offsets = c_tokens[data['qid2cid'][idx]]['offsets']
        lemma = c_tokens[data['qid2cid'][idx]]['lemma']
        pos = c_tokens[data['qid2cid'][idx]]['pos']
        ner = c_tokens[data['qid2cid'][idx]]['ner']

        # Map character-level answer spans onto token index spans using the
        # context token offsets; spans that do not align cleanly are dropped.
        ans_tokens = []
        if len(data['answers']) > 0:
            for ans in data['answers'][idx]:
                found = find_answer(offsets,
                                    ans['answer_start'],
                                    ans['answer_start'] + len(ans['text']))
                if found:
                    ans_tokens.append(found)

        yield {
            'id': data['qids'][idx],
            'question': question,
            'document': document,
            'offsets': offsets,
            'answers': ans_tokens,
            'qlemma': qlemma,
            'lemma': lemma,
            'pos': pos,
            'ner': ner,
        }
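

# A minimal driver sketch, assuming a SQuAD-style JSON input: it flattens the
# file into the dict of parallel lists that process_dataset() reads ('qids',
# 'questions', 'contexts', 'qid2cid', 'answers') and writes one JSON record
# per processed example. The file paths, the flattening loop, and the
# 'corenlp' tokenizer name are illustrative assumptions, not the module's
# actual entry point.
if __name__ == '__main__':
    import json

    with open('SQuAD-v1.1-train.json') as f:
        squad = json.load(f)['data']

    # qid2cid maps each question index to the index of its source context.
    data = {'qids': [], 'questions': [], 'answers': [],
            'contexts': [], 'qid2cid': []}
    for article in squad:
        for paragraph in article['paragraphs']:
            data['contexts'].append(paragraph['context'])
            for qa in paragraph['qas']:
                data['qids'].append(qa['id'])
                data['questions'].append(qa['question'])
                data['qid2cid'].append(len(data['contexts']) - 1)
                data['answers'].append(qa.get('answers', []))

    # Each yielded example is a JSON-serializable dict, so the processed
    # dataset can be streamed out as one record per line.
    with open('SQuAD-v1.1-train-processed.txt', 'w') as f:
        for ex in process_dataset(data, tokenizer='corenlp', workers=4):
            f.write(json.dumps(ex) + '\n')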