in src/train_ner.py
def _tokenize_function(self, example):
    # tokenize and truncate text
    # self.tokenizer.truncation_side = "right"
    tokenized_inputs = self.tokenizer(
        example['tokens'],
        is_split_into_words=True,
        truncation=True,
        padding='max_length',
        max_length=64,
    )
    word_ids = tokenized_inputs.word_ids()
    aligned_labels = []
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            aligned_labels.append(-100)  # Special tokens ([CLS], [SEP], padding, etc.)
        elif word_idx != previous_word_idx:
            aligned_labels.append(example['ner_tags'][word_idx])  # Assign the label to the first sub-token of each word
        else:
            aligned_labels.append(-100)  # Remaining sub-tokens get label -100
        previous_word_idx = word_idx
    tokenized_inputs["labels"] = aligned_labels
    return tokenized_inputs
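
    # --- Usage sketch (not part of the original file; names below are assumptions) ---
    # Assuming the enclosing class stores a Hugging Face `datasets.Dataset` in
    # `self.dataset` with 'tokens' and 'ner_tags' columns, the alignment above is
    # typically applied per example with `Dataset.map`:
    #
    #   tokenized_dataset = self.dataset.map(
    #       self._tokenize_function,
    #       remove_columns=self.dataset.column_names,  # keep only input_ids, attention_mask, labels
    #   )
    #
    # Each resulting example then carries `input_ids`, `attention_mask`, and the aligned
    # `labels`, which a token-classification model (e.g. AutoModelForTokenClassification)
    # consumes directly; the -100 entries are skipped by PyTorch's CrossEntropyLoss
    # (its default ignore_index), so special tokens and sub-token continuations do not
    # contribute to the training loss.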