in src/tokenizers.js [2577:2678]
constructor(tokenizerJSON, tokenizerConfig) {
    super();

    this.config = tokenizerConfig;

    // Construct parts of the tokenizer from the JSON
    this.normalizer = Normalizer.fromConfig(tokenizerJSON.normalizer);
    this.pre_tokenizer = PreTokenizer.fromConfig(tokenizerJSON.pre_tokenizer);
    this.model = TokenizerModel.fromConfig(tokenizerJSON.model, tokenizerConfig);
    this.post_processor = PostProcessor.fromConfig(tokenizerJSON.post_processor);
    this.decoder = Decoder.fromConfig(tokenizerJSON.decoder);
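    // Note: each `fromConfig` helper returns `null` when its section is absent from
    // tokenizer.json, so any of these components may be missing (cf. the decoder check below).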
    // Add added_tokens to model
    this.special_tokens = [];
    this.all_special_ids = [];

    /** @type {AddedToken[]} */
    this.added_tokens = [];
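    // Each entry of `tokenizerJSON.added_tokens` is a plain object; in a standard
    // tokenizer.json it looks roughly like this (values illustrative):
    //   { "id": 0, "content": "<s>", "single_word": false, "lstrip": false,
    //     "rstrip": false, "normalized": false, "special": true }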
    for (const addedToken of tokenizerJSON.added_tokens) {
        const token = new AddedToken(addedToken);
        this.added_tokens.push(token);

        this.model.tokens_to_ids.set(token.content, token.id);
        this.model.vocab[token.id] = token.content;

        if (token.special) {
            this.special_tokens.push(token.content);
            this.all_special_ids.push(token.id);
        }
    }
    // Update additional_special_tokens
    this.additional_special_tokens = tokenizerConfig.additional_special_tokens ?? [];
    this.special_tokens.push(...this.additional_special_tokens);
    this.special_tokens = [...new Set(this.special_tokens)]; // Remove duplicates
    if (this.decoder) {
        // Slight hack, but it prevents code duplication:
        this.decoder.added_tokens = this.added_tokens;

        // Another slight hack to add `end_of_word_suffix` (if present) to the decoder.
        // This is needed for cases where a BPE model and ByteLevel decoder are used.
        // For more information, see https://github.com/huggingface/transformers.js/issues/74
        // TODO: save this to the decoder when exporting?
        this.decoder.end_of_word_suffix = this.model.end_of_word_suffix;
    }
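    // Used during encoding to split raw text on added-token boundaries, so that added
    // tokens can be matched verbatim instead of being passed through the model.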
    this.added_tokens_splitter = new DictionarySplitter(
        this.added_tokens.map(x => x.content),
    );

    /** @type {Map<string, AddedToken>} */
    this.added_tokens_map = new Map(this.added_tokens.map(x => [x.content, x]));
    // Look up commonly-used special tokens from the config; each may be undefined
    // (which is fine). Note that `pad_token` falls back to `eos_token` when unset.
    this.mask_token = this.getToken('mask_token');
    this.mask_token_id = this.model.tokens_to_ids.get(this.mask_token);

    this.pad_token = this.getToken('pad_token', 'eos_token');
    this.pad_token_id = this.model.tokens_to_ids.get(this.pad_token);

    this.sep_token = this.getToken('sep_token');
    this.sep_token_id = this.model.tokens_to_ids.get(this.sep_token);

    this.unk_token = this.getToken('unk_token');
    this.unk_token_id = this.model.tokens_to_ids.get(this.unk_token);

    this.bos_token = this.getToken('bos_token');
    this.bos_token_id = this.model.tokens_to_ids.get(this.bos_token);

    this.eos_token = this.getToken('eos_token');
    this.eos_token_id = this.model.tokens_to_ids.get(this.eos_token);
    this.model_max_length = tokenizerConfig.model_max_length;

    /** @type {boolean} Whether or not to strip the text when tokenizing (removing excess spaces before and after the string). */
    this.remove_space = tokenizerConfig.remove_space;

    this.clean_up_tokenization_spaces = tokenizerConfig.clean_up_tokenization_spaces ?? true;
    this.do_lowercase_and_remove_accent = tokenizerConfig.do_lowercase_and_remove_accent ?? false;

    if (tokenizerConfig.padding_side) {
        this.padding_side = tokenizerConfig.padding_side;
    }
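    // `padding_side` is expected to be 'left' or 'right'; when absent from the config,
    // whatever default the class defines is kept.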
    this.legacy = false;
    this.chat_template = tokenizerConfig.chat_template ?? null;
    if (Array.isArray(this.chat_template)) {
        // Chat templates are stored as lists of dicts with fixed key names;
        // we reconstruct that into a single dict while loading them.
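        // For example (illustrative values):
        //   [ { "name": "default",  "template": "{{ ... }}" },
        //     { "name": "tool_use", "template": "{{ ... }}" } ]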
        const chat_template = Object.create(null);
        for (const { name, template } of this.chat_template) {
            if (typeof name !== 'string' || typeof template !== 'string') {
                throw new Error('Chat template must be a list of objects with "name" and "template" properties');
            }
            chat_template[name] = template;
        }
        this.chat_template = chat_template;
    }
    this._compiled_template_cache = new Map();
}
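// A minimal usage sketch, assuming this is the constructor of the exported
// `PreTrainedTokenizer` class and that `tokenizer.json` / `tokenizer_config.json`
// have already been fetched and parsed:
//
//   const tokenizerJSON = await (await fetch('.../tokenizer.json')).json();
//   const tokenizerConfig = await (await fetch('.../tokenizer_config.json')).json();
//   const tokenizer = new PreTrainedTokenizer(tokenizerJSON, tokenizerConfig);
//
// In practice, most callers go through `AutoTokenizer.from_pretrained(...)`, which
// loads both files and dispatches to the appropriate tokenizer class.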