_encode_text()

in src/tokenizers.js [2925:2977]


    _encode_text(text) {
        if (text === null) return null;

        // Actual function which does encoding, for a single text
        // First, we take care of special tokens. Needed to avoid issues arising from
        // normalization and/or pretokenization (which may not preserve special tokens)
        const sections = this.added_tokens_splitter.split(text);

        // Process left/right stripping of added tokens
        for (let i = 0; i < sections.length; ++i) {
            const addedToken = this.added_tokens_map.get(sections[i]);
            if (addedToken) {
                if (addedToken.lstrip && i > 0) {
                    sections[i - 1] = sections[i - 1].trimEnd();
                }
                if (addedToken.rstrip && i < sections.length - 1) {
                    sections[i + 1] = sections[i + 1].trimStart();
                }
            }
        }

        const tokens = sections.flatMap((x, section_index) => {
            if (x.length === 0) return [];
            if (this.added_tokens_map.has(x)) return [x]; // Return added tokens unchanged

            if (this.remove_space === true) {
                x = x.trim().split(/\s+/).join(' ');
            }
            if (this.do_lowercase_and_remove_accent) {
                x = lowercase_and_remove_accent(x);
            }

            if (this.normalizer !== null) {
                x = this.normalizer(x);
            }

            // If, after normalization, this section is empty (e.g., trimming whitespace),
            // we return an empty array
            if (x.length === 0) {
                return [];
            }

            const sectionTokens = (this.pre_tokenizer !== null) ? this.pre_tokenizer(x, {
                section_index,
            }) : [x];

            const tokens = this.model(sectionTokens);

            return tokens;
        });

        return tokens;
    }