in csrc/suffix_decoding/suffix_tree.cc [33:165]
void SuffixTree::append(int seq_id, int token) {
// Initialize the sequence if it doesn't exist.
_seqs.try_emplace(seq_id);
_active_nodes.try_emplace(seq_id);
// Insert a new active node at the root.
_active_nodes[seq_id].push_back(_root.get());
_root->endpoints[seq_id] = static_cast<int>(_seqs[seq_id].size());
_root->count += 1;
// Ensure the number of active nodes doesn't exceed max_depth.
if (_active_nodes[seq_id].size() > static_cast<size_t>(_max_depth)) {
_active_nodes[seq_id].pop_front();
}
_seqs[seq_id].push_back(token);
// Iterate over all active nodes for this sequence.
for (size_t i = 0; i < _active_nodes[seq_id].size(); ++i) {
Node* node = _active_nodes[seq_id][i];
Node* child = nullptr;
if (node->children.contains(token)) {
child = node->children[token].get();
}
assert(node->endpoints.contains(seq_id));
assert(node->endpoints[seq_id] == _seqs[seq_id].size() - 1);
if (child == nullptr) {
// No existing child node for the new token.
if (node->count == 1 && node != _root.get()) {
// The active node has count = 1, which means the only suffix that ends here is the
// one that's being extended right now. Then this node should be a leaf node, and
// we can simply extend the length of this node.
assert(node->children.empty());
assert(node->ref_seq == seq_id);
node->length += 1;
node->endpoints[seq_id] += 1;
} else {
// Either this is the root node, or the current suffix is not the only one that
// ends here. Either case, we need to extend the current suffix into a new child.
Node* new_child = new Node();
new_child->token = token;
new_child->parent = node;
new_child->count = 1;
new_child->endpoints[seq_id] = static_cast<int>(_seqs[seq_id].size());
new_child->ref_seq = seq_id;
new_child->ref_idx = static_cast<int>(_seqs[seq_id].size()) - 1;
new_child->length = 1;
node->children.emplace(token, new_child);
node->endpoints.erase(seq_id);
_active_nodes[seq_id][i] = new_child;
}
}
else if (node->count == child->count + 1 && node != _root.get()) {
// The active node has a child for the new token, and the child's count is exactly one
// fewer than the active node's count. Since the suffix for the active node ends here,
// that means all other suffixes that pass through this node must go to that child.
assert(node->children.size() == 1); // The active node should have only one child.
assert(node->endpoints.size() == 1); // Only the current suffix should end here.
if (child->length == 1) {
// The child only has length 1. If we append the new token to the current suffix,
// then it will perfectly overlap with the child. In this case, we should just fuse
// the current suffix into the child and eliminate the current node.
Node* parent = node->parent;
// Update child to take the place of the current node.
child->token = node->token;
child->count += 1; // Current suffix extends into the child
child->length = node->length + 1;
child->endpoints[seq_id] = static_cast<int>(_seqs[seq_id].size());
child->ref_seq = seq_id;
child->ref_idx = static_cast<int>(_seqs[seq_id].size()) - child->length;
child->parent = parent;
// Give ownership of child pointer to parent and should also free the current node.
assert(parent->children.contains(child->token));
assert(parent->children[child->token].get() == node);
Node* tmp = node->children[token].release();
parent->children[child->token].reset(tmp);
// Replace active node with child node.
_active_nodes[seq_id][i] = child;
} else {
// The child has length > 1. If we append the new token to the current suffix, then
// it still does not reach the child node. In this case, we keep both nodes but
// extend the length of the current node by 1 into the child node.
node->length += 1;
node->endpoints[seq_id] += 1;
node->ref_seq = seq_id;
node->ref_idx = static_cast<int>(_seqs[seq_id].size()) - node->length;
child->length -= 1;
child->ref_idx += 1;
// The child node's first token should be updated to its second token.
child->token = _seqs[child->ref_seq][child->ref_idx];
if (child->token != token) {
Node* tmp = node->children[token].release();
node->children.emplace(child->token, tmp);
node->children.erase(token);
}
}
}
else {
// There is a child for the new token, and should move the active node into that child.
if (child->length == 1) {
// The child node has length 1, just update the active node pointer to it.
node->endpoints.erase(seq_id);
child->count += 1;
child->endpoints[seq_id] = static_cast<int>(_seqs[seq_id].size());
child->ref_seq = seq_id;
child->ref_idx = static_cast<int>(_seqs[seq_id].size()) - 1;
_active_nodes[seq_id][i] = child;
} else {
// The child node has length > 1. If we extend the current suffix into it, then it
// must be split into a segment of length 1 and another segment with the remainder.
Node* new_node = new Node();
new_node->token = token;
new_node->count = child->count + 1;
new_node->parent = node;
new_node->length = 1;
new_node->endpoints[seq_id] = static_cast<int>(_seqs[seq_id].size());
new_node->ref_seq = seq_id;
new_node->ref_idx = static_cast<int>(_seqs[seq_id].size()) - new_node->length;
// The child node's first token should be updated to its second token.
child->token = _seqs[child->ref_seq][child->ref_idx + 1];
Node* tmp = node->children[token].release();
new_node->children.emplace(child->token, tmp);
node->children[token].reset(new_node);
node->endpoints.erase(seq_id);
child->parent = new_node;
child->length -= 1;
child->ref_idx += 1;
_active_nodes[seq_id][i] = new_node;
}
}
}
}