def _load_vocab()

in eland/ml/pytorch/transformers.py

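Extracts the tokenizer's vocabulary into a dictionary: always a "vocabulary" list of tokens ordered by token id, plus "merges" (BPE merge rules) and "scores" (SentencePiece token scores) when the underlying tokenizer provides them. Eland uploads this object to Elasticsearch alongside the traced model so the cluster can tokenize text consistently with the original tokenizer.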

    # Module-level imports assumed: from typing import Dict, List; import transformers
    def _load_vocab(self) -> Dict[str, List[str]]:
        # Sort tokens by id so that list position corresponds to token id.
        vocab_items = self._tokenizer.get_vocab().items()
        vocabulary = [k for k, _ in sorted(vocab_items, key=lambda kv: kv[1])]
        vocab_obj = {
            "vocabulary": vocabulary,
        }
        # Byte-pair-encoding tokenizers expose their merge rules via bpe_ranks;
        # emit them in rank order as space-joined pairs ("token_a token_b").
        ranks = getattr(self._tokenizer, "bpe_ranks", {})
        if len(ranks) > 0:
            merges = [
                " ".join(m) for m, _ in sorted(ranks.items(), key=lambda kv: kv[1])
            ]
            vocab_obj["merges"] = merges

        # SentencePiece-based tokenizers carry an sp_model attribute;
        # DebertaV2Tokenizer wraps its SentencePieceProcessor one level deeper.
        if isinstance(self._tokenizer, transformers.DebertaV2Tokenizer):
            sp_model = self._tokenizer._tokenizer.spm
        else:
            sp_model = getattr(self._tokenizer, "sp_model", None)

        if sp_model:
            # Fairseq-derived tokenizers (e.g. XLM-RoBERTa) shift token ids by
            # fairseq_offset; pad the shifted-in slots with neutral 0.0 scores.
            id_correction = getattr(self._tokenizer, "fairseq_offset", 0)
            scores = [0.0] * id_correction
            for token_id in range(id_correction, len(vocabulary)):
                try:
                    scores.append(sp_model.get_score(token_id - id_correction))
                except IndexError:
                    # Tokens added beyond the SentencePiece model's range have
                    # no learned score; fall back to 0.0.
                    scores.append(0.0)
            if len(scores) > 0:
                vocab_obj["scores"] = scores
        return vocab_obj
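
For illustration, here is a rough standalone sketch of the same extraction applied directly to a Hugging Face tokenizer (not eland API); "roberta-base" is an arbitrary example model, and use_fast=False selects the slow Python tokenizer, which is the kind that exposes bpe_ranks:

    from transformers import AutoTokenizer

    # Arbitrary example model; any slow GPT-2-style BPE tokenizer works here.
    tokenizer = AutoTokenizer.from_pretrained("roberta-base", use_fast=False)

    # Same ordering step as above: tokens listed by ascending id.
    vocabulary = [
        k for k, _ in sorted(tokenizer.get_vocab().items(), key=lambda kv: kv[1])
    ]

    # Slow BPE tokenizers expose bpe_ranks, so "merges" is populated here.
    ranks = getattr(tokenizer, "bpe_ranks", {})
    merges = [" ".join(m) for m, _ in sorted(ranks.items(), key=lambda kv: kv[1])]

    vocab_obj = {"vocabulary": vocabulary, "merges": merges}
    print(len(vocab_obj["vocabulary"]), vocab_obj["merges"][:2])

A SentencePiece tokenizer such as XLM-RoBERTa would instead take the sp_model branch above and add a "scores" list alongside "vocabulary".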