in src/datatrove/utils/word_tokenizers.py [0:0]
def load_tokenizer_assignments() -> dict[str, Callable[[], WordTokenizer]]:
    """Build the language-code -> tokenizer-factory registry from the assets CSV.

    Reads ``tokenizer_assignment.csv`` (under ``ASSETS_PATH``) and returns a dict
    mapping language keys to zero-argument factories that lazily instantiate the
    assigned :class:`WordTokenizer`. For each CSV row the following keys may be
    registered (first row to claim a key wins — later rows never overwrite):

    - ``"{code_3}_{script}"`` — always, when the row has a tokenizer type.
    - ``"{code_3}"`` — additionally, when ``default_script`` is set for that row.
    - ``"{code_1}_{script}"`` — when the row has a ``code_1`` and ``default_code_1``.
    - ``"{code_1}"`` — when additionally ``default_script`` is set.

    Returns:
        dict[str, Callable[[], WordTokenizer]]: mapping of language keys to
        factories; calling a factory constructs the tokenizer (with the row's
        ``tok_code`` as argument, if non-empty).

    Raises:
        ValueError: (when a factory is invoked) if the CSV names an unknown
            tokenizer class.
    """
    # Dispatch table: CSV "type" column value -> tokenizer class.
    # Replaces a ten-branch if/elif chain with an O(1) lookup.
    tok_classes = {
        "SpaCyTokenizer": SpaCyTokenizer,
        "StanzaTokenizer": StanzaTokenizer,
        "ThaiTokenizer": ThaiTokenizer,
        "IndicNLPTokenizer": IndicNLPTokenizer,
        "KiwiTokenizer": KiwiTokenizer,
        "KhmerTokenizer": KhmerTokenizer,
        "LaoTokenizer": LaoTokenizer,
        "TibetanTokenizer": TibetanTokenizer,
        "BurmeseTokenizer": BurmeseTokenizer,
        "WhitespaceTokenizer": WhitespaceTokenizer,
    }

    def tok_factory_wrapper(class_name: str, arg: str) -> WordTokenizer:
        """Instantiate the tokenizer class named *class_name*, passing *arg* if non-empty."""
        try:
            tok_class = tok_classes[class_name]
        except KeyError:
            raise ValueError(f'Invalid tokenizer class "{class_name}"') from None
        return tok_class(arg) if arg else tok_class()

    word_tokenizer_factories: dict[str, Callable[[], WordTokenizer]] = {}
    with open(os.path.join(ASSETS_PATH, "tokenizer_assignment.csv")) as f:
        reader = csv.DictReader(f)
        for row in reader:
            code_3, code_1, script, tok_class_name, tok_code, default_script, default_code_1 = (
                row["code_3"],
                row["code_1"],
                row["script"],
                row["type"],
                row["tok_code"],
                row["default_script"],
                row["default_code_1"],
            )
            # Rows with no tokenizer assignment carry no factory at all.
            if not tok_class_name:
                continue
            # partial (rather than a lambda) binds the row's values eagerly,
            # avoiding the late-binding-closure pitfall across loop iterations.
            tok_factory = partial(tok_factory_wrapper, tok_class_name, tok_code)
            code_3_script = f"{code_3}_{script}"
            if code_3_script not in word_tokenizer_factories:
                word_tokenizer_factories[code_3_script] = tok_factory
                # Bare code_3 alias only when this row is the script default
                # AND it was the first to claim the code_3_script key.
                if default_script:
                    word_tokenizer_factories[code_3] = tok_factory
            code_1_script = f"{code_1}_{script}"
            if code_1 and default_code_1 and code_1_script not in word_tokenizer_factories:
                word_tokenizer_factories[code_1_script] = tok_factory
                if default_script:
                    word_tokenizer_factories[code_1] = tok_factory
    return word_tokenizer_factories