bpe()

in src/tokenizers.js [761:885]


    bpe(token) {
        if (token.length === 0) {
            return [];
        }

        const cached = this.cache.get(token);
        if (cached !== undefined) {
            return cached;
        }

        const word = Array.from(token);
        if (this.end_of_word_suffix) {
            word[word.length - 1] += this.end_of_word_suffix;
        }

        let result = [];
        if (word.length > 1) {
            // Create a priority queue to store the nodes that will be merged.
            // The comparator function compares the scores of the nodes.
            const queue = new PriorityQueue((a, b) => a.score < b.score);

            // Construct a doubly-linked list of nodes that will be inserted into the priority queue,
            // starting with the individual characters. We also populate each node with a positional
            // bias to break ties in the priority queue.
            let startingNode = {
                token: word[0],
                bias: 0,
                prev: null,
                next: null,
            }

            let previousNode = startingNode
            for (let i = 1; i < word.length; ++i) {
                const currentNode = {
                    bias: i / word.length, // Add fractional component to break ties
                    token: word[i],
                    prev: previousNode,
                    next: null,
                }
                previousNode.next = currentNode
                this._add_node(queue, previousNode)
                previousNode = currentNode
            }

            while (!queue.isEmpty()) {
                // Get the next node with the highest priority
                const node = queue.pop();

                // Check that this merge is still possible
                if (node.deleted || !node.next || node.next.deleted) continue;

                // Here, we mark the current node (left side of the merge) and the next node (right side of the merge) as deleted.
                // This is because they will both be replaced by a new node representing the merge result.
                node.deleted = true;
                node.next.deleted = true;

                // Next, we fix the node that comes before the current node (i.e., left side of the merge).
                if (node.prev) {

                    // Make a shallow copy of the previous node
                    const newPreviousNode = { ...node.prev };

                    // Mark the old previous node as deleted. This avoids erroneous merges later,
                    // because there may still be references to this node in the priority queue.
                    node.prev.deleted = true;
                    node.prev = newPreviousNode;

                    // Update the reference of the previous node, by pointing its previous node to this new previous node.
                    if (newPreviousNode.prev) {
                        newPreviousNode.prev.next = newPreviousNode;
                    } else {
                        // If the previous of the previous node does not exist, it means that
                        // `newPreviousNode` must be the new `startingNode`.
                        startingNode = newPreviousNode;
                    }
                }

                // Create a new node which represents the result of the merge.
                const merged = {
                    token: node.token + node.next.token,
                    bias: node.bias,
                    prev: node.prev,
                    next: node.next.next,
                }

                // We now consider where we can add the new merged node to the priority queue:
                // 1. prev <-> merged
                if (merged.prev) {
                    merged.prev.next = merged;
                    this._add_node(queue, merged.prev);
                } else {
                    // If `merged.prev` does not exist, then `merged` must be the new `startingNode`.
                    startingNode = merged;
                }

                // 2. merged <-> next
                if (merged.next) {
                    merged.next.prev = merged;
                    this._add_node(queue, merged);
                }
            }

            // Traverse the linked list, starting from the `startingNode`, and collect the tokens.
            for (let currentNode = startingNode; currentNode !== null; currentNode = currentNode.next) {
                result.push(currentNode.token);
            }
        } else {
            result = word;
        }

        // Possibly append suffix
        if (this.continuing_subword_suffix) {
            // Do not append suffix to the last token
            for (let i = 0; i < result.length - 1; ++i) {
                result[i] += this.continuing_subword_suffix;
            }
        }

        if (token.length < this.max_length_to_cache) {
            // Save the result to the cache
            this.cache.put(token, result);
        }

        return result;
    }