in src/tokenizers.js [2925:2977]
_encode_text(text) {
if (text === null) return null;
// Actual function which does encoding, for a single text
// First, we take care of special tokens. Needed to avoid issues arising from
// normalization and/or pretokenization (which may not preserve special tokens)
const sections = this.added_tokens_splitter.split(text);
// Process left/right stripping of added tokens
for (let i = 0; i < sections.length; ++i) {
const addedToken = this.added_tokens_map.get(sections[i]);
if (addedToken) {
if (addedToken.lstrip && i > 0) {
sections[i - 1] = sections[i - 1].trimEnd();
}
if (addedToken.rstrip && i < sections.length - 1) {
sections[i + 1] = sections[i + 1].trimStart();
}
}
}
const tokens = sections.flatMap((x, section_index) => {
if (x.length === 0) return [];
if (this.added_tokens_map.has(x)) return [x]; // Return added tokens unchanged
if (this.remove_space === true) {
x = x.trim().split(/\s+/).join(' ');
}
if (this.do_lowercase_and_remove_accent) {
x = lowercase_and_remove_accent(x);
}
if (this.normalizer !== null) {
x = this.normalizer(x);
}
// If, after normalization, this section is empty (e.g., trimming whitespace),
// we return an empty array
if (x.length === 0) {
return [];
}
const sectionTokens = (this.pre_tokenizer !== null) ? this.pre_tokenizer(x, {
section_index,
}) : [x];
const tokens = this.model(sectionTokens);
return tokens;
});
return tokens;
}