def process()

in scripts/distant/generate.py


def process(questions, answers, outfile, opts):
    """Generate examples for all questions."""
    logger.info('Processing %d question answer pairs...' % len(questions))
    logger.info('Will save to %s.dstrain and %s.dsdev' % (outfile, outfile))

    # Load ranker
    ranker = opts['ranker_class'](strict=False)
    logger.info('Ranking documents (top %d per question)...' % opts['n_docs'])
    ranked = ranker.batch_closest_docs(questions, k=opts['n_docs'])
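    # batch_closest_docs returns (doc_ids, scores) per question; keep only the doc ids.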
    ranked = [r[0] for r in ranked]

    # Start pool of tokenizers with ner enabled
    workers = Pool(opts['workers'], initializer=init,
                   initargs=(opts['tokenizer_class'], {'annotators': {'ner'}}))

    logger.info('Pre-tokenizing questions...')
    q_tokens = workers.map(tokenize_text, questions)
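    # Also extract NER entity groups from the raw question text.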
    q_ner = workers.map(nltk_entity_groups, questions)
    q_tokens = list(zip(q_tokens, q_ner))
    workers.close()
    workers.join()

    # Start pool of simple tokenizers + db connections
    workers = Pool(opts['workers'], initializer=init,
                   initargs=(opts['tokenizer_class'], {},
                             opts['db_class'], {}))

    logger.info('Searching documents...')
    cnt = 0
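    # Pair each question's ranked docs with its tokens/NER and its answer set.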
    inputs = [(ranked[i], q_tokens[i], answers[i]) for i in range(len(ranked))]
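    # Fix the static search options; workers receive only the per-question tuple.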
    search_fn = partial(search_docs, max_ex=opts['max_ex'], opts=opts['search'])
    with open(outfile + '.dstrain', 'w') as f_train, \
         open(outfile + '.dsdev', 'w') as f_dev:
        for res in workers.imap_unordered(search_fn, inputs):
            for ex in res:
                cnt += 1
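                # Randomly route each example to the dev or train file.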
                f = f_dev if random.random() < opts['dev_split'] else f_train
                f.write(json.dumps(ex))
                f.write('\n')
                if cnt % 1000 == 0:
                    logger.info('%d results so far...' % cnt)
    workers.close()
    workers.join()
    logger.info('Finished. Total = %d' % cnt)
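
For reference, a minimal sketch of how process() might be invoked. It assumes this excerpt comes from the DrQA repository (as the file path suggests); the drqa imports, the get_class choices, the option values, and the example data below are illustrative assumptions rather than the script's actual command-line handling, and opts['search'] would normally carry the real matching options expected by search_docs.

from drqa import retriever, tokenizers  # assumed DrQA package layout

if __name__ == '__main__':
    # Toy data; answers are assumed to be lists of acceptable answer strings.
    questions = ['Who wrote the play Hamlet?']
    answers = [['William Shakespeare', 'Shakespeare']]

    opts = {
        'ranker_class': retriever.get_class('tfidf'),        # document ranker
        'tokenizer_class': tokenizers.get_class('corenlp'),  # NER-capable tokenizer
        'db_class': retriever.get_class('sqlite'),           # document database
        'n_docs': 5,        # top documents to retrieve per question
        'workers': 8,       # multiprocessing pool size
        'max_ex': 5,        # max examples kept per question
        'dev_split': 0.1,   # fraction of examples written to .dsdev
        'search': {},       # placeholder; real matching options go here
    }
    process(questions, answers, '/tmp/distant-example', opts)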