in src/tokenizers.js [2577:2678]
constructor(tokenizerJSON, tokenizerConfig) {
    super();

    this.config = tokenizerConfig;

    // Construct parts of the tokenizer from the JSON
    this.normalizer = Normalizer.fromConfig(tokenizerJSON.normalizer);
    this.pre_tokenizer = PreTokenizer.fromConfig(tokenizerJSON.pre_tokenizer);
    this.model = TokenizerModel.fromConfig(tokenizerJSON.model, tokenizerConfig);
    this.post_processor = PostProcessor.fromConfig(tokenizerJSON.post_processor);
    this.decoder = Decoder.fromConfig(tokenizerJSON.decoder);
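    // Note: each `fromConfig` helper returns `null` when its section is absent from
    // tokenizer.json, so any of these components may be missing (cf. the decoder check below).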
    // Add added_tokens to model
    this.special_tokens = [];
    this.all_special_ids = [];

    /** @type {AddedToken[]} */
    this.added_tokens = [];
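    // Each entry of `tokenizerJSON.added_tokens` is a plain object; in a standard
    // tokenizer.json it looks roughly like this (values illustrative):
    //   { "id": 0, "content": "<s>", "single_word": false, "lstrip": false,
    //     "rstrip": false, "normalized": false, "special": true }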
    for (const addedToken of tokenizerJSON.added_tokens) {
        const token = new AddedToken(addedToken);
        this.added_tokens.push(token);

        this.model.tokens_to_ids.set(token.content, token.id);
        this.model.vocab[token.id] = token.content;

        if (token.special) {
            this.special_tokens.push(token.content);
            this.all_special_ids.push(token.id);
        }
    }
    // Update additional_special_tokens
    this.additional_special_tokens = tokenizerConfig.additional_special_tokens ?? [];
    this.special_tokens.push(...this.additional_special_tokens);
    this.special_tokens = [...new Set(this.special_tokens)]; // Remove duplicates
    if (this.decoder) {
        // Slight hack, but it prevents code duplication:
        this.decoder.added_tokens = this.added_tokens;

        // Another slight hack to add `end_of_word_suffix` (if present) to the decoder.
        // This is needed for cases where a BPE model and ByteLevel decoder are used.
        // For more information, see https://github.com/huggingface/transformers.js/issues/74
        // TODO: save this to the decoder when exporting?
        this.decoder.end_of_word_suffix = this.model.end_of_word_suffix;
    }
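    // Used during encoding to split raw text on added-token boundaries, so that added
    // tokens can be matched verbatim instead of being passed through the model.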
    this.added_tokens_splitter = new DictionarySplitter(
        this.added_tokens.map(x => x.content),
    );

    /** @type {Map<string, AddedToken>} */
    this.added_tokens_map = new Map(this.added_tokens.map(x => [x.content, x]));
    // Look up commonly-used special tokens from the config; each may be undefined
    // (which is fine). Note that `pad_token` falls back to `eos_token` when unset.
    this.mask_token = this.getToken('mask_token');
    this.mask_token_id = this.model.tokens_to_ids.get(this.mask_token);

    this.pad_token = this.getToken('pad_token', 'eos_token');
    this.pad_token_id = this.model.tokens_to_ids.get(this.pad_token);

    this.sep_token = this.getToken('sep_token');
    this.sep_token_id = this.model.tokens_to_ids.get(this.sep_token);

    this.unk_token = this.getToken('unk_token');
    this.unk_token_id = this.model.tokens_to_ids.get(this.unk_token);

    this.bos_token = this.getToken('bos_token');
    this.bos_token_id = this.model.tokens_to_ids.get(this.bos_token);

    this.eos_token = this.getToken('eos_token');
    this.eos_token_id = this.model.tokens_to_ids.get(this.eos_token);
    this.model_max_length = tokenizerConfig.model_max_length;

    /** @type {boolean} Whether or not to strip the text when tokenizing (removing excess spaces before and after the string). */
    this.remove_space = tokenizerConfig.remove_space;

    this.clean_up_tokenization_spaces = tokenizerConfig.clean_up_tokenization_spaces ?? true;
    this.do_lowercase_and_remove_accent = tokenizerConfig.do_lowercase_and_remove_accent ?? false;

    if (tokenizerConfig.padding_side) {
        this.padding_side = tokenizerConfig.padding_side;
    }
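    // `padding_side` is expected to be 'left' or 'right'; when absent from the config,
    // whatever default the class defines is kept.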
    this.legacy = false;
    this.chat_template = tokenizerConfig.chat_template ?? null;
    if (Array.isArray(this.chat_template)) {
        // Chat templates are stored as lists of dicts with fixed key names;
        // we reconstruct that into a single dict while loading them.
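        // For example (illustrative values):
        //   [ { "name": "default",  "template": "{{ ... }}" },
        //     { "name": "tool_use", "template": "{{ ... }}" } ]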
        const chat_template = Object.create(null);
        for (const { name, template } of this.chat_template) {
            if (typeof name !== 'string' || typeof template !== 'string') {
                throw new Error('Chat template must be a list of objects with "name" and "template" properties');
            }
            chat_template[name] = template;
        }
        this.chat_template = chat_template;
    }
    this._compiled_template_cache = new Map();
}
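// A minimal usage sketch, assuming this is the constructor of the exported
// `PreTrainedTokenizer` class and that `tokenizer.json` / `tokenizer_config.json`
// have already been fetched and parsed:
//
//   const tokenizerJSON = await (await fetch('.../tokenizer.json')).json();
//   const tokenizerConfig = await (await fetch('.../tokenizer_config.json')).json();
//   const tokenizer = new PreTrainedTokenizer(tokenizerJSON, tokenizerConfig);
//
// In practice, most callers go through `AutoTokenizer.from_pretrained(...)`, which
// loads both files and dispatches to the appropriate tokenizer class.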