in distant_supervision/synthetic_data_creator.py [0:0]
def _construct_dataset_sample(self, *, qpa, hit, hit_phrases, es_query, es_rank, backfill_article, backfill_sent, rng):
    """Assemble one ``DsDatum`` from an answer-bearing article and a retrieved hit.

    The context and the answer string are taken from ``qpa``; the styled
    questions are produced by ``self._make_styled_questions`` from ``qpa``
    and ``hit``. All arguments are keyword-only.

    Args:
        qpa: Object exposing ``article_raw``, ``article_id``,
            ``article_title`` and ``phrase`` (with ``phrase_str`` and
            ``phrase_category``).
        hit: Mapping with a ``'_score'`` entry and a ``'_source'`` mapping
            holding ``'article_id'`` and ``'article_title'`` (presumably an
            Elasticsearch hit -- confirm against the caller).
        hit_phrases: Stored as-is under ``meta["question"]["phrases"]``.
        es_query: Query forwarded to ``self._compute_answer_start`` and
            recorded in the metadata.
        es_rank: Recorded in the metadata unchanged.
        backfill_article: Recorded as ``meta["backfill_nb_articles"]``.
        backfill_sent: Recorded as ``meta["backfill_nb_sents"]``.
        rng: Forwarded to ``self._make_styled_questions``.

    Returns:
        A ``DsDatum`` whose ``meta`` attribute carries the retrieval details.

    Raises:
        DsDatasetCreationError: If ``self._compute_answer_start`` yields no
            start positions for the answer.
    """
    sample_id = random_str(16)
    passage = qpa.article_raw
    answer_text = qpa.phrase.phrase_str

    offsets = self._compute_answer_start(
        answer_str=answer_text,
        es_query=es_query,
        context=passage,
    )
    # A sample without a locatable answer is unusable, so bail out early.
    if len(offsets) == 0:
        message = 'Did not find any answer_start answer={}: {}\n{}'.format(
            answer_text, es_query, passage)
        raise DsDatasetCreationError(message)

    # One answer record per position reported for the answer string.
    answer_entries = []
    for offset in offsets:
        answer_entries.append({'text': answer_text, 'answer_start': offset})

    questions = self._make_styled_questions(
        qpa=qpa,
        es_hit=hit,
        answer_str=answer_text,
        rng=rng,
    )

    sample = DsDatum(
        qid=sample_id,
        styled_questions=questions,
        answers=answer_entries,
        context=passage,
    )

    # Gather the metadata pieces in a fixed order before assembling them.
    score = hit['_score']
    category = qpa.phrase.phrase_category
    context_info = {
        "article_id": qpa.article_id,
        "article_title": qpa.article_title,
    }
    source = hit['_source']
    question_info = {
        "article_id": int(source['article_id']),
        "article_title": source['article_title'],
        "phrases": hit_phrases,
    }

    sample.meta = {
        "es_query": es_query,
        "es_rank": es_rank,
        "es_score": score,
        "answer_phrase_category": category,
        "context": context_info,
        "question": question_info,
        "backfill_nb_articles": backfill_article,
        "backfill_nb_sents": backfill_sent,
    }
    return sample