in csrc/suffix_decoding/suffix_tree.cc [342:400]
std::string SuffixTree::_check_node_integrity(Node* node) {
int64_t children_count = 0;
for (const auto& [token, child] : node->children) {
// Do all my children have me as their parent?
CHECK_OR_RETURN(child->parent == node, "child node has incorrect parent pointer");
children_count++;
}
// Is my counter at least the sum of my childrens' counters?
CHECK_OR_RETURN(children_count <= node->count, "node count is less than sum children counts");
if (node == _root.get()) {
// Root node can stop here after some simple checks.
CHECK_OR_RETURN(node->count >= 0, "root node has negative count");
CHECK_OR_RETURN(node->parent == nullptr, "root node has non-null parent pointer");
CHECK_OR_RETURN(node->length == 0, "root node has non-zero length");
CHECK_OR_RETURN(node->endpoints.empty(), "root node has non-empty endpoints");
CHECK_OR_RETURN(node->ref_idx == -1, "root node has invalid ref_idx");
return "";
}
// Is my length positive? Otherwise, I shouldn't exist.
CHECK_OR_RETURN(node->length > 0, "internal node has non-positive length");
// Is my count positive? Otherwise, I shouldn't exist.
CHECK_OR_RETURN(node->count > 0, "internal node has non-positive count");
// Are all my children's counts less than mine? If equal, then we should have been merged.
for (const auto& [token, child] : node->children) {
CHECK_OR_RETURN(
child->count < node->count, "internal node count is not greater than child count");
}
// Check my reference sequence and index.
CHECK_OR_RETURN(_seqs.count(node->ref_seq), "internal node has invalid ref_seq");
CHECK_OR_RETURN(node->ref_idx >= 0, "internal node has invalid ref_idx");
CHECK_OR_RETURN(node->ref_idx + node->length <= _seqs[node->ref_seq].size(),
"internal node has invalid token range");
// Check my first token is correct.
CHECK_OR_RETURN(node->token == _seqs[node->ref_seq][node->ref_idx],
"internal node has incorrect first token");
// Check I am my parent's child.
CHECK_OR_RETURN(node->parent->children.contains(node->token),
"internal node is not a child of parent node");
CHECK_OR_RETURN(node->parent->children[node->token].get() == node,
"parent node has incorrect child pointer");
// Check all my endpoint references are correct.
for (auto [seq_id, end_idx] : node->endpoints) {
CHECK_OR_RETURN(_seqs.count(seq_id), "node endpoint refers to nonexistent sequence");
CHECK_OR_RETURN(end_idx > 0 && end_idx <= _seqs[seq_id].size(), "invalid endpoint index");
// Check all tokens from the start of the suffix to the endpoint.
Node* n = node;
int idx = end_idx;
do {
CHECK_OR_RETURN(n->length <= idx, "invalid endpoint length");
idx -= n->length;
for (int i = 0; i < n->length; ++i) {
int tok = _seqs[n->ref_seq][n->ref_idx + i];
CHECK_OR_RETURN(_seqs[seq_id][idx + i] == tok, "invalid endpoint token");
}
n = n->parent;
} while (n != nullptr);
}
return "";
}