std::string SuffixTree::_check_node_integrity()

in csrc/suffix_decoding/suffix_tree.cc [342:400]


std::string SuffixTree::_check_node_integrity(Node* node) {
    int64_t children_count = 0;
    for (const auto& [token, child] : node->children) {
        // Do all my children have me as their parent?
        CHECK_OR_RETURN(child->parent == node, "child node has incorrect parent pointer");
        children_count++;
    }
    // Is my counter at least the sum of my childrens' counters?
    CHECK_OR_RETURN(children_count <= node->count, "node count is less than sum children counts");
    if (node == _root.get()) {
        // Root node can stop here after some simple checks.
        CHECK_OR_RETURN(node->count >= 0, "root node has negative count");
        CHECK_OR_RETURN(node->parent == nullptr, "root node has non-null parent pointer");
        CHECK_OR_RETURN(node->length == 0, "root node has non-zero length");
        CHECK_OR_RETURN(node->endpoints.empty(), "root node has non-empty endpoints");
        CHECK_OR_RETURN(node->ref_idx == -1, "root node has invalid ref_idx");
        return "";
    }
    // Is my length positive? Otherwise, I shouldn't exist.
    CHECK_OR_RETURN(node->length > 0, "internal node has non-positive length");
    // Is my count positive? Otherwise, I shouldn't exist.
    CHECK_OR_RETURN(node->count > 0, "internal node has non-positive count");
    // Are all my children's counts less than mine? If equal, then we should have been merged.
    for (const auto& [token, child] : node->children) {
        CHECK_OR_RETURN(
            child->count < node->count, "internal node count is not greater than child count");
    }
    // Check my reference sequence and index.
    CHECK_OR_RETURN(_seqs.count(node->ref_seq), "internal node has invalid ref_seq");
    CHECK_OR_RETURN(node->ref_idx >= 0, "internal node has invalid ref_idx");
    CHECK_OR_RETURN(node->ref_idx + node->length <= _seqs[node->ref_seq].size(),
                    "internal node has invalid token range");
    // Check my first token is correct.
    CHECK_OR_RETURN(node->token == _seqs[node->ref_seq][node->ref_idx],
                    "internal node has incorrect first token");
    // Check I am my parent's child.
    CHECK_OR_RETURN(node->parent->children.contains(node->token),
                    "internal node is not a child of parent node");
    CHECK_OR_RETURN(node->parent->children[node->token].get() == node,
                    "parent node has incorrect child pointer");
    // Check all my endpoint references are correct.
    for (auto [seq_id, end_idx] : node->endpoints) {
        CHECK_OR_RETURN(_seqs.count(seq_id), "node endpoint refers to nonexistent sequence");
        CHECK_OR_RETURN(end_idx > 0 && end_idx <= _seqs[seq_id].size(), "invalid endpoint index");
        // Check all tokens from the start of the suffix to the endpoint.
        Node* n = node;
        int idx = end_idx;
        do {
            CHECK_OR_RETURN(n->length <= idx, "invalid endpoint length");
            idx -= n->length;
            for (int i = 0; i < n->length; ++i) {
                int tok = _seqs[n->ref_seq][n->ref_idx + i];
                CHECK_OR_RETURN(_seqs[seq_id][idx + i] == tok, "invalid endpoint token");
            }
            n = n->parent;
        } while (n != nullptr);
    }
    return "";
}