in torchtext/data/utils.py [0:0]
def get_tokenizer(tokenizer, language='en'):
r"""
Generate a tokenizer function for a string sentence.
Args:
tokenizer: the name of the tokenizer function. If None, it returns the split()
function, which splits the string sentence by space.
If basic_english, it returns the _basic_english_normalize() function,
which normalizes the string first and then splits it by space. If a callable
function is passed, it will return that function. If a tokenizer library is named
(e.g. spacy, moses, toktok, revtok, subword), it returns the corresponding
tokenize function from that library.
language: the language of the tokenizer. Only 'en' is supported for
basic_english; for spacy it is passed to spacy.load() as the model name;
the remaining library tokenizers ignore this argument. Default: en
Examples:
>>> import torchtext
>>> from torchtext.data import get_tokenizer
>>> tokenizer = get_tokenizer("basic_english")
>>> tokens = tokenizer("You can now install TorchText using pip!")
>>> tokens
['you', 'can', 'now', 'install', 'torchtext', 'using', 'pip', '!']
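A library-backed tokenizer is requested by name. The lines below are a minimal
sketch, assuming the spaCy package and its "en_core_web_sm" model are installed
(the language argument is passed through to spacy.load()):
>>> spacy_tokenizer = get_tokenizer("spacy", language="en_core_web_sm")
>>> spacy_tokenizer("You can now install TorchText using pip!")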
"""
# default tokenizer is string.split(), added as a module function for serialization
if tokenizer is None:
return _split_tokenizer
if tokenizer == "basic_english":
if language != 'en':
raise ValueError("Basic normalization is only available for Enlish(en)")
return _basic_english_normalize
# simply return if a function is passed
if callable(tokenizer):
return tokenizer
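# the remaining names dispatch to optional third-party tokenizer libraries,
# each of which must be installed separately (hence the ImportError handling below)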
if tokenizer == "spacy":
try:
import spacy
try:
# bind the loaded pipeline to a separate name so the module `spacy`
# stays accessible for the fallback below
spacy_pipeline = spacy.load(language)
except IOError:
# Model shortcuts no longer work in spaCy 3.0+, try using fullnames
# List is from https://github.com/explosion/spaCy/blob/b903de3fcb56df2f7247e5b6cfa6b66f4ff02b62/spacy/errors.py#L789
OLD_MODEL_SHORTCUTS = spacy.errors.OLD_MODEL_SHORTCUTS if hasattr(spacy.errors, 'OLD_MODEL_SHORTCUTS') else {}
if language not in OLD_MODEL_SHORTCUTS:
raise
import warnings
warnings.warn(f'Spacy model "{language}" could not be loaded, trying "{OLD_MODEL_SHORTCUTS[language]}" instead')
spacy_pipeline = spacy.load(OLD_MODEL_SHORTCUTS[language])
return partial(_spacy_tokenize, spacy=spacy_pipeline)
except ImportError:
print("Please install SpaCy. "
"See the docs at https://spacy.io for more information.")
raise
except AttributeError:
print("Please install SpaCy and the SpaCy {} tokenizer. "
"See the docs at https://spacy.io for more "
"information.".format(language))
raise
elif tokenizer == "moses":
try:
from sacremoses import MosesTokenizer
moses_tokenizer = MosesTokenizer()
return moses_tokenizer.tokenize
except ImportError:
print("Please install SacreMoses. "
"See the docs at https://github.com/alvations/sacremoses "
"for more information.")
raise
elif tokenizer == "toktok":
try:
from nltk.tokenize.toktok import ToktokTokenizer
toktok = ToktokTokenizer()
return toktok.tokenize
except ImportError:
print("Please install NLTK. "
"See the docs at https://nltk.org for more information.")
raise
elif tokenizer == 'revtok':
try:
import revtok
return revtok.tokenize
except ImportError:
print("Please install revtok.")
raise
elif tokenizer == 'subword':
try:
import revtok
return partial(revtok.tokenize, decap=True)
except ImportError:
print("Please install revtok.")
raise
raise ValueError("Requested tokenizer {}, valid choices are a "
"callable that takes a single string as input, "
"\"revtok\" for the revtok reversible tokenizer, "
"\"subword\" for the revtok caps-aware tokenizer, "
"\"spacy\" for the SpaCy English tokenizer, or "
"\"moses\" for the NLTK port of the Moses tokenization "
"script.".format(tokenizer))