def _tokenize_function()

in src/train_ner.py


    def _tokenize_function(self, example):
        """Tokenize one pre-split example and align its NER tags to the resulting sub-word tokens."""
        # Tokenize the pre-split words, truncating/padding to a fixed length
        # self.tokenizer.truncation_side = "right"
        tokenized_inputs = self.tokenizer(
            example['tokens'],
            is_split_into_words=True,
            truncation=True,
            padding='max_length',
            max_length=64,
        )
        # word_ids() maps each sub-word position back to the index of its source
        # word (None for special tokens), which drives the label alignment below.
        word_ids = tokenized_inputs.word_ids()
        aligned_labels = []

        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                aligned_labels.append(-100)  # Special tokens ([CLS], [SEP], etc.)
            elif word_idx != previous_word_idx:
                aligned_labels.append(example['ner_tags'][word_idx])  # Assign the label to the first token of each word
            else:
                aligned_labels.append(-100)  # Subword tokens get label -100

            previous_word_idx = word_idx

        tokenized_inputs["labels"] = aligned_labels
        return tokenized_inputs
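
For context, the sketch below replays the same alignment rule on a concrete sentence so the effect of word_ids() and the -100 sentinel is visible. It is illustrative only: the checkpoint name ("bert-base-cased") and the toy tag scheme (0 = O, 1 = B-PER) are assumptions, not values read from train_ner.py.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

tokens = ["Johanson", "lives", "in", "Berlin"]
ner_tags = [1, 0, 0, 1]  # toy scheme (assumed): 0 = O, 1 = B-PER

enc = tokenizer(
    tokens,
    is_split_into_words=True,
    truncation=True,
    padding="max_length",
    max_length=16,
)
word_ids = enc.word_ids()

# Same rule as _tokenize_function: special tokens and sub-word continuations
# get -100; the first piece of each word gets that word's tag.
labels, prev = [], None
for idx in word_ids:
    if idx is None:
        labels.append(-100)
    elif idx != prev:
        labels.append(ner_tags[idx])
    else:
        labels.append(-100)
    prev = idx

for piece, wid, lab in zip(tokenizer.convert_ids_to_tokens(enc["input_ids"]), word_ids, labels):
    print(f"{piece:12} word_id={str(wid):5} label={lab}")
# If a word splits into several pieces (e.g. a rare name), only its first piece
# keeps the tag; the remaining pieces, [CLS], [SEP], and [PAD] all print -100.

In the training pipeline this method would typically be applied one example at a time via datasets' Dataset.map (batched=False). The -100 entries match the default ignore_index of PyTorch's cross-entropy loss, so padding, special tokens, and sub-word continuations contribute nothing to the NER training signal.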