in utils_nlp/models/transformers/question_answering.py [0:0]
def _create_qa_example(qa_input, is_training):
    """Initial preprocessing to create a _QAExample for feature extraction.

    Whitespace-tokenizes the document text, maps the character-level answer
    start offset to token indices, and packages the result as a ``_QAExample``
    namedtuple.

    Args:
        qa_input: An object with attributes ``doc_text``, ``question_text``,
            ``answer_start``, ``answer_text``, ``qa_id``, and
            ``is_impossible``. ``answer_start``/``answer_text`` may each be a
            scalar or a (possibly empty) list.
        is_training (bool): If True, token-level start/end answer positions
            are computed and the answer text is validated against the
            document tokens; otherwise both positions are left as ``None``.

    Returns:
        A ``_QAExample`` namedtuple, or ``None`` when (training mode only)
        the answer text cannot be exactly recovered from the document tokens.

    Raises:
        Exception: If ``answer_start`` is a list but ``answer_text`` is not,
            or if a training-mode answerable question does not have exactly
            one answer.
    """
    # _QAExample is a data structure representing a unique
    # document-question-answer triplet.
    # Fields:
    #     qa_id (int): A unique id identifying the document-question pair.
    #         This is used to map prediction results to ground truth answers
    #         during evaluation, because the data order is not preserved
    #         during pre-processing and post-processing.
    #     doc_tokens (list): White-space tokenized tokens of the document
    #         text. This is used to generate the final answer based on
    #         predicted start and end token indices during post-processing.
    #     question_text (str): Text of the question.
    #     orig_answer_text (str): Text of the ground truth answer if available.
    #     start_position (int): Index of the starting token of the answer
    #         span, if available.
    #     end_position (int): Index of the ending token of the answer span,
    #         if available.
    #     is_impossible (bool): If the question is impossible to answer based
    #         on the given document.
    _QAExample = collections.namedtuple(
        "_QAExample",
        [
            "qa_id",
            "doc_tokens",
            "question_text",
            "orig_answer_text",
            "start_position",
            "end_position",
            "is_impossible",
        ],
    )

    def _is_whitespace(c):
        # 0x202F is the narrow no-break space; treat it as whitespace too.
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    d_text = qa_input.doc_text
    q_text = qa_input.question_text
    a_start = qa_input.answer_start
    a_text = qa_input.answer_text
    q_id = qa_input.qa_id
    impossible = qa_input.is_impossible

    # Whitespace-tokenize the document, recording for every character the
    # index of the token it falls in, so the character-level answer offset
    # can be mapped to a token index below.
    d_tokens = []
    char_to_word_offset = []
    prev_is_whitespace = True
    for c in d_text:
        if _is_whitespace(c):
            prev_is_whitespace = True
        else:
            if prev_is_whitespace:
                d_tokens.append(c)
            else:
                d_tokens[-1] += c
            prev_is_whitespace = False
        char_to_word_offset.append(len(d_tokens) - 1)

    if _is_iterable_but_not_string(a_start):
        if not _is_iterable_but_not_string(a_text):
            raise Exception(
                "The answer text must be a list when answer start is a list."
            )
        if len(a_start) != 1 and is_training and not impossible:
            raise Exception("For training, each question should have exactly 1 answer.")
        # Guard against an empty answer list (e.g. an unanswerable question,
        # which legitimately reaches this point during evaluation or when
        # impossible is True): indexing [0] would raise IndexError. Fall back
        # to the same sentinels used for impossible questions below.
        if a_start:
            a_start = a_start[0]
            a_text = a_text[0]
        else:
            a_start = -1
            a_text = ""

    start_position = None
    end_position = None
    if is_training:
        if not impossible:
            answer_length = len(a_text)
            start_position = char_to_word_offset[a_start]
            end_position = char_to_word_offset[a_start + answer_length - 1]
            # Only add answers where the text can be exactly recovered from the
            # document. If this CAN'T happen it's likely due to weird Unicode
            # stuff so we will just skip the example.
            #
            # Note that this means for training mode, every example is NOT
            # guaranteed to be preserved.
            actual_text = " ".join(d_tokens[start_position : (end_position + 1)])
            cleaned_answer_text = " ".join(whitespace_tokenize(a_text))
            if actual_text.find(cleaned_answer_text) == -1:
                logger.warning(
                    "Could not find answer: '%s' vs. '%s'",
                    actual_text,
                    cleaned_answer_text,
                )
                # Skip this example entirely; caller treats None as "dropped".
                return None
        else:
            # Unanswerable question: conventional sentinel positions.
            start_position = -1
            end_position = -1

    return _QAExample(
        qa_id=q_id,
        doc_tokens=d_tokens,
        question_text=q_text,
        orig_answer_text=a_text,
        start_position=start_position,
        end_position=end_position,
        is_impossible=impossible,
    )