in distant_supervision/synthetic_data_creator.py [0:0]
def _compute_answer_start(self, *, answer_str, es_query, context):
    """Return the offsets in ``context`` where ``answer_str`` starts.

    The answer is located relative to ``es_query`` first, and ``es_query``
    is then located inside ``context``; each combination of the two offsets
    yields one absolute start position.  Occurrences of ``answer_str`` that
    lie outside of an ``es_query`` match are therefore not reported.

    NOTE(review): ``find_all(haystack, needle)`` is defined elsewhere; it is
    presumed to return a list of integer start offsets of ``needle`` within
    ``haystack`` -- confirm against its definition.

    Args:
        answer_str: The answer text to locate.
        es_query: The text (expected to contain ``answer_str``) that is
            itself searched for inside ``context``.
        context: The full text in which absolute offsets are computed.

    Returns:
        A list of absolute start offsets, grouped by ``es_query`` occurrence
        and, within each group, ordered like the offsets of ``answer_str``
        inside ``es_query``.

    Raises:
        DsDatasetCreationError: If ``answer_str`` is not found in
            ``es_query``, if ``es_query`` is not found in ``context``, or if
            a computed offset does not actually point at ``answer_str``.
    """
    answer_offsets = find_all(es_query, answer_str)
    if not answer_offsets:
        message = 'Cannot find start position for answer="{}" in es_query="{}"'
        raise DsDatasetCreationError(message.format(answer_str, es_query))

    # In practice the query is expected to occur only once in the context,
    # but every occurrence is handled.
    query_offsets = find_all(context, es_query)
    if not query_offsets:
        message = 'Cannot find es_query="{}" in the following:\n{}'
        raise DsDatasetCreationError(message.format(es_query, context))

    # Absolute offset = offset of the answer inside the query + offset of
    # the query inside the context.
    absolute_offsets = [
        answer_offset + query_offset
        for query_offset in query_offsets
        for answer_offset in answer_offsets
    ]

    # Sanity check: every computed offset must really point at the answer.
    answer_len = len(answer_str)
    for offset in absolute_offsets:
        if context[offset:offset + answer_len] == answer_str:
            continue
        details = {
            'start_pos': offset,
            'sentence_start_pos_lst': query_offsets,
            'within_sent_start_pos_lst': answer_offsets,
            'answer_str': answer_str,
            'es_query': es_query,
            'context': context,
        }
        raise DsDatasetCreationError(
            'inconsistent start_pos found {}'.format(str(details)))

    return absolute_offsets