# parsers/MovieReview/MovieReview_Finetune_Preprocess.py
def _convert_examples_to_features(examples, seq_length, tokenizer):
"""Loads a data file into a list of `InputFeature`s."""
# This variable holds a list of examples. Each example is a list of sentences in the form of "features"
dataset_features = []
example_no = 0
for example in examples:
        # get the example's unique ID
        example_unique_id = example.unique_id
        # get the target label associated with the document
        example_target = example.target
        # get the sentences
        sentences = example.text_a  # text_b is always None
        # instantiate the list of features collected for this example
        example_features = []
        # the parsed tokens, with the <pos> and <neg> annotation tags recombined from their word pieces
        parsed_example = []
        for sentence in sentences:
            tokens = tokenizer.tokenize(sentence)
            tokens_sentence = []  # unused here; the recombined tokens are accumulated in parsed_example
            # ------ Finite state machine to recombine the <pos>/<neg> tag word pieces ------ #
            left_out_tokens = []
            possible_match = False
            for token in tokens:
                if not possible_match:
                    if token == '<':
                        possible_match = True  # start tracking a possible tag
                        left_out_tokens.append(token)
                    else:
                        parsed_example.append(token)
                else:
                    if (left_out_tokens == ['<'] and token in ['/', 'ne', 'po']) or \
                            (left_out_tokens == ['<', '/'] and token in ['ne', 'po']) or \
                            (left_out_tokens == ['<', 'po'] and token == '##s') or \
                            (left_out_tokens == ['<', 'ne'] and token == '##g') or \
                            (left_out_tokens == ['<', '/', 'po'] and token == '##s') or \
                            (left_out_tokens == ['<', '/', 'ne'] and token == '##g'):
                        left_out_tokens.append(token)
                    elif left_out_tokens == ['<', '/', 'po', '##s'] and token == '>':
                        parsed_example.append('</pos>')
                        possible_match = False
                        left_out_tokens = []
                    elif left_out_tokens == ['<', 'po', '##s'] and token == '>':
                        parsed_example.append('<pos>')
                        possible_match = False
                        left_out_tokens = []
                    elif left_out_tokens == ['<', '/', 'ne', '##g'] and token == '>':
                        parsed_example.append('</neg>')
                        possible_match = False
                        left_out_tokens = []
                    elif left_out_tokens == ['<', 'ne', '##g'] and token == '>':
                        parsed_example.append('<neg>')
                        possible_match = False
                        left_out_tokens = []
                    else:
                        # no tag was completed: flush the held-out pieces and keep the current token
                        parsed_example.extend(left_out_tokens)
                        parsed_example.append(token)
                        possible_match = False
                        left_out_tokens = []
            # ----------------- End of finite state machine ------------------ #
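            # For illustration: the WordPiece tokenizer splits a tag such as "<pos>" into the
            # pieces ['<', 'po', '##s', '>'] (the sequences handled by the state machine above),
            # so a tokenized span like ['<', 'po', '##s', '>', 'great', '<', '/', 'po', '##s', '>']
            # is recombined into ['<pos>', 'great', '</pos>']; all other tokens pass through unchanged.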
        # Truncate to leave room for [CLS] and [SEP] (the "- 3" keeps one spare position)
        if len(parsed_example) > seq_length - 3:
            parsed_example = parsed_example[0:(seq_length - 3)]
        # The convention in BERT is:
        # (a) For sequence pairs:
        #   tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #   type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
        # (b) For single sequences:
        #   tokens:   [CLS] the dog is hairy . [SEP]
        #   type_ids: 0     0   0   0  0     0 0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
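        # Only the single-sequence case (b) applies here, since `text_b` is always None:
        # every position therefore receives type_id 0 below.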
        # We now prepare the data for BERT
        annotate_as_neg = False  # needed to associate an annotation with each token
        annotate_as_pos = False  # needed to associate an annotation with each token
        input_type_ids = []
        annotations = []
        tokens = []
        tokens.append("[CLS]")
        input_type_ids.append(0)
        annotations.append(0)
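        # The loop below drops the <pos>/<neg>/</pos>/</neg> marker tokens themselves and flags
        # every token enclosed in such a span with annotation 1 (all other tokens get 0).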
        for token in parsed_example:
            if token == '<neg>':
                # print(f'found {token}!')
                assert not annotate_as_pos
                annotate_as_neg = True
            elif token == '<pos>':
                # print(f'found {token}!')
                assert not annotate_as_neg
                annotate_as_pos = True
            elif token == '</neg>':
                # print(f'found {token}!')
                assert annotate_as_neg
                assert not annotate_as_pos
                annotate_as_neg = False
            elif token == '</pos>':
                # print(f'found {token}!')
                assert annotate_as_pos, sentence
                assert not annotate_as_neg
                annotate_as_pos = False
            else:
                if annotate_as_neg or annotate_as_pos:
                    annotations.append(1)
                else:
                    annotations.append(0)
                tokens.append(token)
                input_type_ids.append(0)
        tokens.append("[SEP]")
        input_type_ids.append(0)
        annotations.append(0)
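        # For illustration, a sentence parsed as "<pos> great movie </pos>" yields
        #   tokens:      [CLS] great movie [SEP]
        #   annotations:   0     1     1     0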
        # also keep a per-sentence ID (derived from the example ID); it may be useful downstream
        sentence_unique_id = example_unique_id
        # convert the tokens to vocabulary ids: this is the actual input BERT receives
        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)
        # Zero-pad up to the sequence length.
        while len(input_ids) < seq_length:
            input_ids.append(0)
            input_mask.append(0)
            input_type_ids.append(0)
        # print(len(input_ids), len(input_mask), len(input_type_ids))
        assert len(input_ids) == seq_length, (len(input_ids), seq_length)
        assert len(input_mask) == seq_length, (len(input_mask), seq_length)
        assert len(input_type_ids) == seq_length, (len(input_type_ids), seq_length)
        assert len(tokens) == len(annotations), (len(tokens), len(annotations))
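        # Note: `annotations` stays aligned with the unpadded `tokens` list rather than with the
        # padded `input_ids`; padding it to seq_length, if required, is presumably handled downstream.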
        # print(f'Sentence unique id is {sentence_unique_id}')
        example_features.append(
            MovieReviewInputFeatures(
                unique_example_id=example_unique_id,
                unique_sentence_id=sentence_unique_id,
                tokens=tokens,
                annotations=annotations,
                input_ids=input_ids,
                input_mask=input_mask,
                input_type_ids=input_type_ids)
        )
        sentence_unique_id += 1
        dataset_features.append((example_target, example_features))
        example_no += 1
        print(f'Parsed example {example_no}')
    return dataset_features
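
# Usage sketch (illustrative only; the caller names and seq_length value are assumptions):
#   features = _convert_examples_to_features(examples, seq_length=128, tokenizer=tokenizer)
#   first_target, first_example_features = features[0]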