def _convert_examples_to_features()

in parsers/MovieReview/MovieReview_Finetune_Preprocess.py


def _convert_examples_to_features(examples, seq_length, tokenizer):
    """Loads a data file into a list of `InputFeature`s."""

    # One entry per example: a (target label, list of feature objects) tuple
    dataset_features = []
    example_no = 0
    for example in examples:
        # get example unique ID
        example_unique_id = example.unique_id

        # get the target label associated with the document
        example_target = example.target

        # get the sentences
        sentences = example.text_a  # text_b always None

        # instantiate the list of features built for this example
        example_features = []

        # The parsed example: wordpiece tokens from all of its sentences, with
        # the <pos>/<neg> annotation tags recombined into single tokens
        parsed_example = []

        for sentence in sentences:
            tokens = tokenizer.tokenize(sentence)

            # ------ Finite State Machine to replace specific substrings  ------ #
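            # The wordpiece tokenizer splits the annotation tags into pieces,
            # e.g. "<pos>" -> ['<', 'po', '##s', '>'] and "</neg>" ->
            # ['<', '/', 'ne', '##g', '>']. The state machine buffers a
            # possible tag in `left_out_tokens`; on a full match it emits the
            # recombined tag as a single token, otherwise the buffered pieces
            # are flushed back as ordinary tokens.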
            left_out_tokens = []
            possible_match = False

            for token in tokens:
                if not possible_match:
                    if token == '<':
                        possible_match = True  # start tracking possible tag
                        left_out_tokens.append(token)
                    else:
                        parsed_example.append(token)
                else:
                    if (left_out_tokens == ['<'] and token in ['/', 'ne', 'po']) or \
                            (left_out_tokens == ['<', '/'] and token in ['ne', 'po']) or \
                            (left_out_tokens == ['<', 'po'] and token == '##s') or \
                            (left_out_tokens == ['<', 'ne'] and token == '##g') or \
                            (left_out_tokens == ['<', '/', 'po'] and token == '##s') or \
                            (left_out_tokens == ['<', '/', 'ne'] and token == '##g'):
                        left_out_tokens.append(token)
                    elif left_out_tokens == ['<', '/', 'po', '##s'] and token == '>':
                        parsed_example.append('</pos>')
                        possible_match = False
                        left_out_tokens = []
                    elif left_out_tokens == ['<', 'po', '##s'] and token == '>':
                        parsed_example.append('<pos>')
                        possible_match = False
                        left_out_tokens = []
                    elif left_out_tokens == ['<', '/', 'ne', '##g'] and token == '>':
                        parsed_example.append('</neg>')
                        possible_match = False
                        left_out_tokens = []
                    elif left_out_tokens == ['<', 'ne', '##g'] and token == '>':
                        parsed_example.append('<neg>')
                        possible_match = False
                        left_out_tokens = []
                    else:
                        # No tag matched: flush the buffered pieces back as
                        # ordinary tokens and reprocess the current token,
                        # which may itself open a new tag
                        parsed_example.extend(left_out_tokens)
                        left_out_tokens = []
                        possible_match = token == '<'
                        if possible_match:
                            left_out_tokens.append(token)
                        else:
                            parsed_example.append(token)

            # ----------------- End of finite state machine ------------------ #

            # Truncate so the example fits once [CLS] and [SEP] are added
            # (seq_length - 3 leaves one spare position); parsed_example
            # accumulates across sentences, so this caps the whole example
            if len(parsed_example) > seq_length - 3:
                parsed_example = parsed_example[0:(seq_length - 3)]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids:   0   0  0    0    0     0      0   0    1  1  1   1  1   1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids:   0   0   0   0  0     0   0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
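        #
        # For this task every example is a single sequence, so all type_ids
        # are 0; the parallel `annotations` list built below is 1 only for
        # wordpieces that fall inside a <pos>...</pos> or <neg>...</neg>
        # span, e.g.:
        #  tokens:       [CLS] the movie was great . [SEP]
        #  type_ids:       0    0    0    0    0   0   0
        #  annotations:    0    0    0    0    1   0   0   (if "great" was tagged)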

        # We now prepare the data for BERT

        annotate_as_neg = False  # Needed to associate an annotation with each token
        annotate_as_pos = False  # Needed to associate an annotation with each token

        input_type_ids = []
        annotations = []
        tokens = []

        tokens.append("[CLS]")
        input_type_ids.append(0)
        annotations.append(0)

        for token in parsed_example:
            if token == '<neg>':
                # print(f'found {token}!')
                assert not annotate_as_pos
                annotate_as_neg = True
            elif token == '<pos>':
                # print(f'found {token}!')
                assert not annotate_as_neg
                annotate_as_pos = True
            elif token == '</neg>':
                # print(f'found {token}!')
                assert annotate_as_neg
                assert not annotate_as_pos
                annotate_as_neg = False
            elif token == '</pos>':
                # print(f'found {token}!')
                assert annotate_as_pos, sentence
                assert not annotate_as_neg
                annotate_as_pos = False
            else:
                if annotate_as_neg or annotate_as_pos:
                    annotations.append(1)
                else:
                    annotations.append(0)
                tokens.append(token)
                input_type_ids.append(0)

        tokens.append("[SEP]")
        input_type_ids.append(0)
        annotations.append(0)

        # also derive a sentence-level unique ID from the example ID
        sentence_unique_id = example_unique_id

        # Map wordpiece tokens to vocabulary ids (the actual BERT input)
        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        while len(input_ids) < seq_length:
            input_ids.append(0)
            input_mask.append(0)
            input_type_ids.append(0)

        # print(len(input_ids), len(input_mask), len(input_type_ids))

        assert len(input_ids) == seq_length, (len(input_ids), seq_length)
        assert len(input_mask) == seq_length, (len(input_mask), seq_length)
        assert len(input_type_ids) == seq_length, (len(input_type_ids), seq_length)
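        # `tokens` and `annotations` are not padded, so they are checked
        # against each other rather than against seq_length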
        assert len(tokens) == len(annotations), (len(tokens), len(annotations))

        # print(f'Sentence unique id is {sentence_unique_id}')
        example_features.append(
            MovieReviewInputFeatures(
                unique_example_id=example_unique_id,
                unique_sentence_id=sentence_unique_id,
                tokens=tokens,
                annotations=annotations,
                input_ids=input_ids,
                input_mask=input_mask,
                input_type_ids=input_type_ids)
        )
        sentence_unique_id += 1

        dataset_features.append((example_target, example_features))

        example_no += 1
        print(f'Parsed example {example_no}')
    return dataset_features
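

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only): the `_Example` container below is
# an assumption based on the attributes this function reads (`unique_id`,
# `target`, `text_a`); the repo's actual example class, tokenizer and
# pretrained vocabulary may differ.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    from dataclasses import dataclass
    from typing import List, Optional

    # Assumption: any WordPiece tokenizer exposing tokenize() and
    # convert_tokens_to_ids() works; Hugging Face's BertTokenizer is one.
    from transformers import BertTokenizer

    @dataclass
    class _Example:  # hypothetical stand-in for the repo's example class
        unique_id: int
        target: int
        text_a: List[str]  # list of sentences with inline <pos>/<neg> tags
        text_b: Optional[str] = None

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    examples = [_Example(unique_id=0, target=1,
                         text_a=['The movie was <pos> really great </pos> .'])]

    features = _convert_examples_to_features(examples, seq_length=128,
                                             tokenizer=tokenizer)
    target, sentence_features = features[0]  # (label, [MovieReviewInputFeatures, ...])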