in utils_nlp/models/transformers/question_answering.py [0:0]
def _create_qa_example(qa_input, is_training):
    """Initial preprocessing to create a _QAExample for feature extraction.

    Whitespace-tokenizes the document text, maps the character-level answer
    start offset to token indices, and packages the result as a ``_QAExample``
    namedtuple.

    Args:
        qa_input: An object with attributes ``doc_text``, ``question_text``,
            ``answer_start``, ``answer_text``, ``qa_id``, and
            ``is_impossible``. ``answer_start``/``answer_text`` may each be a
            scalar or a (possibly empty) list.
        is_training (bool): If True, token-level start/end answer positions
            are computed and the answer text is validated against the
            document tokens; otherwise both positions are left as ``None``.

    Returns:
        A ``_QAExample`` namedtuple, or ``None`` when (training mode only)
        the answer text cannot be exactly recovered from the document tokens.

    Raises:
        Exception: If ``answer_start`` is a list but ``answer_text`` is not,
            or if a training-mode answerable question does not have exactly
            one answer.
    """
    # _QAExample is a data structure representing a unique
    # document-question-answer triplet.
    # Fields:
    #     qa_id (int): A unique id identifying the document-question pair.
    #         This is used to map prediction results to ground truth answers
    #         during evaluation, because the data order is not preserved
    #         during pre-processing and post-processing.
    #     doc_tokens (list): White-space tokenized tokens of the document
    #         text. This is used to generate the final answer based on
    #         predicted start and end token indices during post-processing.
    #     question_text (str): Text of the question.
    #     orig_answer_text (str): Text of the ground truth answer if available.
    #     start_position (int): Index of the starting token of the answer
    #         span, if available.
    #     end_position (int): Index of the ending token of the answer span,
    #         if available.
    #     is_impossible (bool): If the question is impossible to answer based
    #         on the given document.
    _QAExample = collections.namedtuple(
        "_QAExample",
        [
            "qa_id",
            "doc_tokens",
            "question_text",
            "orig_answer_text",
            "start_position",
            "end_position",
            "is_impossible",
        ],
    )

    def _is_whitespace(c):
        # 0x202F is the narrow no-break space; treat it as whitespace too.
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    d_text = qa_input.doc_text
    q_text = qa_input.question_text
    a_start = qa_input.answer_start
    a_text = qa_input.answer_text
    q_id = qa_input.qa_id
    impossible = qa_input.is_impossible

    # Whitespace-tokenize the document, recording for every character the
    # index of the token it falls in, so the character-level answer offset
    # can be mapped to a token index below.
    d_tokens = []
    char_to_word_offset = []
    prev_is_whitespace = True
    for c in d_text:
        if _is_whitespace(c):
            prev_is_whitespace = True
        else:
            if prev_is_whitespace:
                d_tokens.append(c)
            else:
                d_tokens[-1] += c
            prev_is_whitespace = False
        char_to_word_offset.append(len(d_tokens) - 1)

    if _is_iterable_but_not_string(a_start):
        if not _is_iterable_but_not_string(a_text):
            raise Exception(
                "The answer text must be a list when answer start is a list."
            )
        if len(a_start) != 1 and is_training and not impossible:
            raise Exception("For training, each question should have exactly 1 answer.")
        # Guard against an empty answer list (e.g. an unanswerable question,
        # which legitimately reaches this point during evaluation or when
        # impossible is True): indexing [0] would raise IndexError. Fall back
        # to the same sentinels used for impossible questions below.
        if a_start:
            a_start = a_start[0]
            a_text = a_text[0]
        else:
            a_start = -1
            a_text = ""

    start_position = None
    end_position = None
    if is_training:
        if not impossible:
            answer_length = len(a_text)
            start_position = char_to_word_offset[a_start]
            end_position = char_to_word_offset[a_start + answer_length - 1]
            # Only add answers where the text can be exactly recovered from the
            # document. If this CAN'T happen it's likely due to weird Unicode
            # stuff so we will just skip the example.
            #
            # Note that this means for training mode, every example is NOT
            # guaranteed to be preserved.
            actual_text = " ".join(d_tokens[start_position : (end_position + 1)])
            cleaned_answer_text = " ".join(whitespace_tokenize(a_text))
            if actual_text.find(cleaned_answer_text) == -1:
                logger.warning(
                    "Could not find answer: '%s' vs. '%s'",
                    actual_text,
                    cleaned_answer_text,
                )
                # Skip this example entirely; caller treats None as "dropped".
                return None
        else:
            # Unanswerable question: conventional sentinel positions.
            start_position = -1
            end_position = -1

    return _QAExample(
        qa_id=q_id,
        doc_tokens=d_tokens,
        question_text=q_text,
        orig_answer_text=a_text,
        start_position=start_position,
        end_position=end_position,
        is_impossible=impossible,
    )