in eland/ml/pytorch/transformers.py
def _create_tokenization_config(self) -> NlpTokenizationConfig:
    # Prefer an explicitly supplied model input size; otherwise infer the
    # maximum sequence length from the tokenizer/model configuration.
    if self._max_model_input_size:
        _max_sequence_length = self._max_model_input_size
    else:
        _max_sequence_length = self._find_max_sequence_length()

    # Map the Hugging Face tokenizer class to the matching Elasticsearch
    # tokenization config.
    if isinstance(self._tokenizer, transformers.MPNetTokenizer):
        return NlpMPNetTokenizationConfig(
            do_lower_case=getattr(self._tokenizer, "do_lower_case", None),
            max_sequence_length=_max_sequence_length,
        )
    elif isinstance(
        self._tokenizer, (transformers.RobertaTokenizer, transformers.BartTokenizer)
    ):
        return NlpRobertaTokenizationConfig(
            add_prefix_space=getattr(self._tokenizer, "add_prefix_space", None),
            max_sequence_length=_max_sequence_length,
        )
    elif isinstance(self._tokenizer, transformers.XLMRobertaTokenizer):
        return NlpXLMRobertaTokenizationConfig(
            max_sequence_length=_max_sequence_length
        )
    elif isinstance(self._tokenizer, transformers.DebertaV2Tokenizer):
        return NlpDebertaV2TokenizationConfig(
            max_sequence_length=_max_sequence_length,
            do_lower_case=getattr(self._tokenizer, "do_lower_case", None),
        )
    else:
        # BERT-style tokenizers: use the Japanese config when the tokenizer
        # relies on a morphological analyzer (currently MeCab), otherwise
        # fall back to the plain BERT config.
        japanese_morphological_tokenizers = ["mecab"]
        if (
            hasattr(self._tokenizer, "word_tokenizer_type")
            and self._tokenizer.word_tokenizer_type
            in japanese_morphological_tokenizers
        ):
            return NlpBertJapaneseTokenizationConfig(
                do_lower_case=getattr(self._tokenizer, "do_lower_case", None),
                max_sequence_length=_max_sequence_length,
            )
        else:
            return NlpBertTokenizationConfig(
                do_lower_case=getattr(self._tokenizer, "do_lower_case", None),
                max_sequence_length=_max_sequence_length,
            )