in libraries/hvvr/raycaster/traverse_avx.h [412:527]
__forceinline uint32_t traverseTiles(uint32_t* triIndices,
uint32_t maxTriCount,
TileFrame& frame,
uint32_t top,
const Frustum& frustum) {
(void)maxTriCount;
if (frustum.plane[0].w == -std::numeric_limits<float>::infinity()) {
return 0;
}
auto negateMask = m256(-0.0f);
uint32_t triCount = 0;
unsigned mask, leaf;
--top;
goto POP_LAST;
POP:
// pop the node stack
if (!top)
return triCount;
--top;
// if we know the stack has at least one entry, and top is already pointing to the next entry, we can skip
// the setup work in POP
POP_LAST:
// grab a node to test the children of
auto node = (frame.stack + top)->node;
TEST:
// tile frustum vs 4x AABB of children, corresponding bit is set if AABB is hit
mask = frustum.testBVHNodeChildren(*node);
// no children hit? Move onto the next node
if (!mask)
goto POP;
// were any of the hit children leaf nodes? If so, add the children to the output list.
leaf = mask & node->boxData.leafMask;
if (leaf) {
// spend some additional time to refine the test results
// this is a CPU vs GPU tradeoff
// potentially more time culling on the CPU, in exchange for fewer triangles on the GPU
mask = frustum.testBVHNodeChildrenRefine(*node, mask);
leaf = mask & node->boxData.leafMask;
if (leaf) {
uint32_t emitMask = leaf;
do {
auto k = tzcnt(emitMask);
auto tri = node->boxData.leaf.triIndex[k], e = (node->boxData.leaf.triIndex + 1)[k];
do {
triIndices[triCount++] = tri;
// MAX_TRI_INDICES_TO_INTERSECT too small for these parameters, try increasing TILE_SIZE
assert(triCount <= maxTriCount);
} while (++tri != e);
emitMask = clearLowestBit(emitMask);
} while (emitMask);
}
}
// strip leaf nodes from the hit mask, now we only care about the remaining hit internal nodes
mask &= ~node->boxData.leafMask;
// if there's nothing left, move onto the next stack entry
if (!mask)
goto POP;
// This is needed because block cull inserts nodes that contain a child mix of hit leaf nodes and hit internal
// nodes up to 4 times into the stack. Leaf node children are grouped into a single entry w/ hit mask, internal node
// children each get their own entry. This test prevents the leaf entry from spawning duplicate copies of the
// prepopulated sibling internal nodes.
// mask will be 0 for internal nodes, or the leaf hit mask for nodes which contain leaf children
if ((frame.stack + top)->mask)
goto POP;
// conservative min ray bundle distance to children along the primary traversal axis (stick it into a temp)
store(frame.tMin,
_mm_xor_ps(m128(negateMask), load_m128((float*)((uintptr_t)node + frustum.distanceEstimateOffset))));
auto k = tzcnt(mask); // index of first hit child node
// If I only hit one child, avoid stack
if (!clearLowestBit(mask)) { // only one bit set?
// if we only hit one child node, avoid inserting into the stack. Directly replace the current node
// with the hit node, then jump to testing it.
node = node + node->boxData.children.offset[k];
goto TEST;
}
// otherwise, we hit multiple child nodes and need to insert each of them into the stack, one at a time,
// sorted by their distance estimates
goto CHILD_LOOP_ENTRY;
do {
++top; // make room on the stack
k = tzcnt(mask); // index of first hit child node
CHILD_LOOP_ENTRY:
__m128 tMinTemp = _mm_load_ss(frame.tMin + k); // grab the temp distance estimate we stored earlier
// search for an insertion point, bubbling up existing entries with smaller distances
// such that the top of the stack has the element with the least distance
__m128 entry;
auto i = top;
goto PUSH_LOOP_ENTRY;
do {
store(&(frame.stack + i)->tMin, entry); // this is a 128-bit store containing a whole stack entry
--i;
PUSH_LOOP_ENTRY:
entry = load_m128(&(frame.stack - 1 + i)->tMin);
} while (_mm_comilt_ss(entry, tMinTemp));
// we've found our insertion point, place the child there
(frame.stack + i)->node = node + node->boxData.children.offset[k];
(frame.stack + i)->mask = 0;
_mm_store_ss(&(frame.stack + i)->tMin, tMinTemp);
mask = clearLowestBit(mask);
} while (mask);
goto POP_LAST;
}