void SuffixTree::append()

in csrc/suffix_decoding/suffix_tree.cc [33:165]


void SuffixTree::append(int seq_id, int token) {
    // Initialize the sequence if it doesn't exist.
    _seqs.try_emplace(seq_id);
    _active_nodes.try_emplace(seq_id);

    // Insert a new active node at the root.
    _active_nodes[seq_id].push_back(_root.get());
    _root->endpoints[seq_id] = static_cast<int>(_seqs[seq_id].size());
    _root->count += 1;

    // Ensure the number of active nodes doesn't exceed max_depth.
    if (_active_nodes[seq_id].size() > static_cast<size_t>(_max_depth)) {
        _active_nodes[seq_id].pop_front();
    }
    _seqs[seq_id].push_back(token);
    
    // Iterate over all active nodes for this sequence.
    for (size_t i = 0; i < _active_nodes[seq_id].size(); ++i) {
        Node* node = _active_nodes[seq_id][i];
        Node* child = nullptr;
        if (node->children.contains(token)) {
            child = node->children[token].get();
        }

        assert(node->endpoints.contains(seq_id));
        assert(node->endpoints[seq_id] == _seqs[seq_id].size() - 1);

        if (child == nullptr) {
            // No existing child node for the new token.
            if (node->count == 1 && node != _root.get()) {
                // The active node has count = 1, which means the only suffix that ends here is the
                // one that's being extended right now. Then this node should be a leaf node, and
                // we can simply extend the length of this node.
                assert(node->children.empty());
                assert(node->ref_seq == seq_id);
                node->length += 1;
                node->endpoints[seq_id] += 1;
            } else {
                // Either this is the root node, or the current suffix is not the only one that
                // ends here. Either case, we need to extend the current suffix into a new child.
                Node* new_child = new Node();
                new_child->token = token;
                new_child->parent = node;
                new_child->count = 1;
                new_child->endpoints[seq_id] = static_cast<int>(_seqs[seq_id].size());
                new_child->ref_seq = seq_id;
                new_child->ref_idx = static_cast<int>(_seqs[seq_id].size()) - 1;
                new_child->length = 1;
                node->children.emplace(token, new_child);
                node->endpoints.erase(seq_id);
                _active_nodes[seq_id][i] = new_child;
            }
        }
        else if (node->count == child->count + 1 && node != _root.get()) {
            // The active node has a child for the new token, and the child's count is exactly one
            // fewer than the active node's count. Since the suffix for the active node ends here,
            // that means all other suffixes that pass through this node must go to that child.
            assert(node->children.size() == 1);  // The active node should have only one child.
            assert(node->endpoints.size() == 1);  // Only the current suffix should end here.
            if (child->length == 1) {
                // The child only has length 1. If we append the new token to the current suffix,
                // then it will perfectly overlap with the child. In this case, we should just fuse
                // the current suffix into the child and eliminate the current node.
                Node* parent = node->parent;
                // Update child to take the place of the current node.
                child->token = node->token;
                child->count += 1;  // Current suffix extends into the child
                child->length = node->length + 1;
                child->endpoints[seq_id] = static_cast<int>(_seqs[seq_id].size());
                child->ref_seq = seq_id;
                child->ref_idx = static_cast<int>(_seqs[seq_id].size()) - child->length;
                child->parent = parent;
                // Give ownership of child pointer to parent and should also free the current node.
                assert(parent->children.contains(child->token));
                assert(parent->children[child->token].get() == node);
                Node* tmp = node->children[token].release();
                parent->children[child->token].reset(tmp);
                // Replace active node with child node.
                _active_nodes[seq_id][i] = child;
            } else {
                // The child has length > 1. If we append the new token to the current suffix, then
                // it still does not reach the child node. In this case, we keep both nodes but
                // extend the length of the current node by 1 into the child node.
                node->length += 1;
                node->endpoints[seq_id] += 1;
                node->ref_seq = seq_id;
                node->ref_idx = static_cast<int>(_seqs[seq_id].size()) - node->length;
                child->length -= 1;
                child->ref_idx += 1;
                // The child node's first token should be updated to its second token.
                child->token = _seqs[child->ref_seq][child->ref_idx];
                if (child->token != token) {
                    Node* tmp = node->children[token].release();
                    node->children.emplace(child->token, tmp);
                    node->children.erase(token);
                }
            }
        }
        else {
            // There is a child for the new token, and should move the active node into that child.
            if (child->length == 1) {
                // The child node has length 1, just update the active node pointer to it.
                node->endpoints.erase(seq_id);
                child->count += 1;
                child->endpoints[seq_id] = static_cast<int>(_seqs[seq_id].size());
                child->ref_seq = seq_id;
                child->ref_idx = static_cast<int>(_seqs[seq_id].size()) - 1;
                _active_nodes[seq_id][i] = child;
            } else {
                // The child node has length > 1. If we extend the current suffix into it, then it
                // must be split into a segment of length 1 and another segment with the remainder.
                Node* new_node = new Node();
                new_node->token = token;
                new_node->count = child->count + 1;
                new_node->parent = node;
                new_node->length = 1;
                new_node->endpoints[seq_id] = static_cast<int>(_seqs[seq_id].size());
                new_node->ref_seq = seq_id;
                new_node->ref_idx = static_cast<int>(_seqs[seq_id].size()) - new_node->length;
                // The child node's first token should be updated to its second token.
                child->token = _seqs[child->ref_seq][child->ref_idx + 1];
                Node* tmp = node->children[token].release();
                new_node->children.emplace(child->token, tmp);
                node->children[token].reset(new_node);
                node->endpoints.erase(seq_id);
                child->parent = new_node;
                child->length -= 1;
                child->ref_idx += 1;
                _active_nodes[seq_id][i] = new_node;
            }
        }
    }
}