in scripts/distant/generate.py
def process(questions, answers, outfile, opts):
    """Generate distantly supervised examples for all question-answer pairs."""
    logger.info('Processing %d question answer pairs...' % len(questions))
    logger.info('Will save to %s.dstrain and %s.dsdev' % (outfile, outfile))

    # Load ranker and retrieve the top-n candidate documents per question
    ranker = opts['ranker_class'](strict=False)
    logger.info('Ranking documents (top %d per question)...' % opts['n_docs'])
    ranked = ranker.batch_closest_docs(questions, k=opts['n_docs'])
    ranked = [r[0] for r in ranked]

    # Start pool of tokenizers with NER enabled to pre-process the questions
    workers = Pool(opts['workers'], initializer=init,
                   initargs=(opts['tokenizer_class'], {'annotators': {'ner'}}))

    logger.info('Pre-tokenizing questions...')
    q_tokens = workers.map(tokenize_text, questions)
    q_ner = workers.map(nltk_entity_groups, questions)
    q_tokens = list(zip(q_tokens, q_ner))
    workers.close()
    workers.join()

    # Start pool of simple tokenizers + db connections for document search
    workers = Pool(opts['workers'], initializer=init,
                   initargs=(opts['tokenizer_class'], {},
                             opts['db_class'], {}))

    logger.info('Searching documents...')
    cnt = 0
    inputs = [(ranked[i], q_tokens[i], answers[i]) for i in range(len(ranked))]
    search_fn = partial(search_docs, max_ex=opts['max_ex'], opts=opts['search'])

    # Write examples as JSON lines, randomly splitting them into train/dev files
    with open(outfile + '.dstrain', 'w') as f_train, \
         open(outfile + '.dsdev', 'w') as f_dev:
        for res in workers.imap_unordered(search_fn, inputs):
            for ex in res:
                cnt += 1
                f = f_dev if random.random() < opts['dev_split'] else f_train
                f.write(json.dumps(ex))
                f.write('\n')
                if cnt % 1000 == 0:
                    logger.info('%d results so far...' % cnt)
    workers.close()
    workers.join()
    logger.info('Finished. Total = %d' % cnt)
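
For reference, `process` expects `opts` to carry the keys it reads above (ranker, tokenizer, and DB classes, plus search parameters). Below is a minimal usage sketch, assuming a DrQA-style layout where `retriever.get_class`, `tokenizers.get_class`, and `retriever.DocDB` supply those classes; the module paths, parameter values, and output prefix are illustrative assumptions, not taken from this file.

# Illustrative only: class lookups and values below are assumptions.
from drqa import retriever, tokenizers

opts = {
    'ranker_class': retriever.get_class('tfidf'),        # document ranker class (assumed)
    'tokenizer_class': tokenizers.get_class('corenlp'),  # tokenizer class for worker pools (assumed)
    'db_class': retriever.DocDB,                         # document store class (assumed)
    'n_docs': 5,        # top documents to rank per question
    'workers': 8,       # size of the multiprocessing pools
    'max_ex': 5,        # cap on examples kept per question
    'search': {},       # options forwarded to search_docs
    'dev_split': 0.1,   # fraction of examples written to the .dsdev file
}
process(questions, answers, '/path/to/output/prefix', opts)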