in src/tokenizers.js [761:885]
bpe(token) {
if (token.length === 0) {
return [];
}
const cached = this.cache.get(token);
if (cached !== undefined) {
return cached;
}
const word = Array.from(token);
if (this.end_of_word_suffix) {
word[word.length - 1] += this.end_of_word_suffix;
}
let result = [];
if (word.length > 1) {
// Create a priority queue to store the nodes that will be merged.
// The comparator function compares the scores of the nodes.
const queue = new PriorityQueue((a, b) => a.score < b.score);
// Construct a doubly-linked list of nodes that will be inserted into the priority queue,
// starting with the individual characters. We also populate each node with a positional
// bias to break ties in the priority queue.
let startingNode = {
token: word[0],
bias: 0,
prev: null,
next: null,
}
let previousNode = startingNode
for (let i = 1; i < word.length; ++i) {
const currentNode = {
bias: i / word.length, // Add fractional component to break ties
token: word[i],
prev: previousNode,
next: null,
}
previousNode.next = currentNode
this._add_node(queue, previousNode)
previousNode = currentNode
}
while (!queue.isEmpty()) {
// Get the next node with the highest priority
const node = queue.pop();
// Check that this merge is still possible
if (node.deleted || !node.next || node.next.deleted) continue;
// Here, we mark the current node (left side of the merge) and the next node (right side of the merge) as deleted.
// This is because they will both be replaced by a new node representing the merge result.
node.deleted = true;
node.next.deleted = true;
// Next, we fix the node that comes before the current node (i.e., left side of the merge).
if (node.prev) {
// Make a shallow copy of the previous node
const newPreviousNode = { ...node.prev };
// Mark the old previous node as deleted. This avoids erroneous merges later,
// because there may still be references to this node in the priority queue.
node.prev.deleted = true;
node.prev = newPreviousNode;
// Update the reference of the previous node, by pointing its previous node to this new previous node.
if (newPreviousNode.prev) {
newPreviousNode.prev.next = newPreviousNode;
} else {
// If the previous of the previous node does not exist, it means that
// `newPreviousNode` must be the new `startingNode`.
startingNode = newPreviousNode;
}
}
// Create a new node which represents the result of the merge.
const merged = {
token: node.token + node.next.token,
bias: node.bias,
prev: node.prev,
next: node.next.next,
}
// We now consider where we can add the new merged node to the priority queue:
// 1. prev <-> merged
if (merged.prev) {
merged.prev.next = merged;
this._add_node(queue, merged.prev);
} else {
// If `merged.prev` does not exist, then `merged` must be the new `startingNode`.
startingNode = merged;
}
// 2. merged <-> next
if (merged.next) {
merged.next.prev = merged;
this._add_node(queue, merged);
}
}
// Traverse the linked list, starting from the `startingNode`, and collect the tokens.
for (let currentNode = startingNode; currentNode !== null; currentNode = currentNode.next) {
result.push(currentNode.token);
}
} else {
result = word;
}
// Possibly append suffix
if (this.continuing_subword_suffix) {
// Do not append suffix to the last token
for (let i = 0; i < result.length - 1; ++i) {
result[i] += this.continuing_subword_suffix;
}
}
if (token.length < this.max_length_to_cache) {
// Save the result to the cache
this.cache.put(token, result);
}
return result;
}