def load_tokenizer_assignments()

in src/datatrove/utils/word_tokenizers.py [0:0]


def load_tokenizer_assignments() -> dict[str, Callable[[], WordTokenizer]]:
    """Build the language-code -> tokenizer-factory table from the bundled CSV.

    Reads ``tokenizer_assignment.csv`` from ``ASSETS_PATH`` and, for every row
    whose ``type`` column is non-empty, creates a zero-argument factory that
    instantiates the tokenizer class named in ``type`` (passing ``tok_code`` as
    its single argument when that column is non-empty).

    Each such row may register its factory under up to four keys:

    * ``"{code_3}_{script}"`` -- only if that key is not registered yet, so the
      first row in the file wins.
    * ``"{code_3}"`` -- when the row was registered above and ``default_script``
      is set; this assignment replaces any earlier entry for the bare code.
    * ``"{code_1}_{script}"`` and ``"{code_1}"`` -- same rules, applied only
      when both ``code_1`` and ``default_code_1`` are non-empty.

    The flag columns are tested for truthiness only, so any non-empty text
    counts as "set".

    Returns:
        A dict mapping the keys described above to callables that take no
        arguments and return a new tokenizer instance.

    Raises:
        KeyError: if an expected column is missing from the CSV.

    Note:
        An unknown class name in the ``type`` column does not fail here; the
        ``ValueError`` is raised when the corresponding factory is called.
    """

    def tok_factory_wrapper(class_name, arg):
        """Instantiate the tokenizer class called ``class_name``.

        ``arg`` is forwarded as the only constructor argument when it is
        truthy; otherwise the class is constructed without arguments.

        Raises:
            ValueError: if ``class_name`` is not one of the known tokenizers.
        """
        # Built inside the wrapper so the classes are looked up only when a
        # factory is actually invoked, not while the CSV is being loaded.
        tok_classes = {
            "SpaCyTokenizer": SpaCyTokenizer,
            "StanzaTokenizer": StanzaTokenizer,
            "ThaiTokenizer": ThaiTokenizer,
            "IndicNLPTokenizer": IndicNLPTokenizer,
            "KiwiTokenizer": KiwiTokenizer,
            "KhmerTokenizer": KhmerTokenizer,
            "LaoTokenizer": LaoTokenizer,
            "TibetanTokenizer": TibetanTokenizer,
            "BurmeseTokenizer": BurmeseTokenizer,
            "WhitespaceTokenizer": WhitespaceTokenizer,
        }
        tok_class = tok_classes.get(class_name)
        if tok_class is None:
            raise ValueError(f'Invalid tokenizer class "{class_name}"')

        if arg:
            return tok_class(arg)
        return tok_class()

    word_tokenizer_factories = {}
    csv_path = os.path.join(ASSETS_PATH, "tokenizer_assignment.csv")
    # Explicit encoding keeps parsing independent of the platform locale;
    # newline="" is what the csv module asks for when given a file object.
    with open(csv_path, encoding="utf-8", newline="") as f:
        for row in csv.DictReader(f):
            code_3 = row["code_3"]
            code_1 = row["code_1"]
            script = row["script"]
            tok_class_name = row["type"]
            tok_code = row["tok_code"]
            default_script = row["default_script"]
            default_code_1 = row["default_code_1"]

            # Rows without a tokenizer type carry no assignment.
            if not tok_class_name:
                continue

            tok_factory = partial(tok_factory_wrapper, tok_class_name, tok_code)

            # First row seen for a (code_3, script) pair wins.
            code_3_script = f"{code_3}_{script}"
            if code_3_script not in word_tokenizer_factories:
                word_tokenizer_factories[code_3_script] = tok_factory
                if default_script:
                    # The default script also answers for the bare code.
                    word_tokenizer_factories[code_3] = tok_factory

            # The two-letter alias is registered only for rows flagged as the
            # default owner of that code_1.
            code_1_script = f"{code_1}_{script}"
            if code_1 and default_code_1 and code_1_script not in word_tokenizer_factories:
                word_tokenizer_factories[code_1_script] = tok_factory
                if default_script:
                    word_tokenizer_factories[code_1] = tok_factory

    return word_tokenizer_factories