in libraries/hvvr/raycaster/traverse_avx.h [316:410]
// Traverses the BVH starting at `node` against the block frustum `frustum`, using
// frame.cullStack as an explicit work stack that is kept sorted by tDelta.
//
// Each pass through TEST checks the (up to four) children of the current node against the frustum:
//   - hit leaf children are recorded as ONE stack entry holding the parent node, the bitmask of
//     hit leaves (in tDeltaBytes) and the parent's tMin;
//   - hit internal children are pushed individually, keyed by their extent along the primary
//     traversal axis (tDelta), together with a conservative min distance (tMinTemp).
// Each pass through POP either returns or pops the top entry and tests it in turn.
//
// Parameters:
//   frame   - scratch state; this function writes frame.cullStack, frame.tMin and frame.tDeltaMem.
//   node    - node whose children are tested first (tested directly, never pushed).
//   frustum - block frustum; supplies the child tests and distanceEstimateOffset.
//
// Returns the number of entries left on frame.cullStack. This is 0 when the frustum is flagged
// with plane[0].w == -infinity or when nothing was hit. Traversal stops when the stack is empty,
// when the top entry is a leaf entry, or when fewer than 4 free slots would remain.
//
// NOTE(review): both insertion loops read cullStack[i - 1] with i going down to 0, i.e. they read
// cullStack[-1]. This presumably relies on a sentinel entry stored just before cullStack whose
// tDelta never compares greater than the key -- confirm in BlockFrame.
// NOTE(review): tDeltaBytes presumably aliases the bits of tDelta, and the __m128 load/store at
// &tDelta presumably moves one whole 16-byte stack entry -- confirm in the stack entry definition.
__forceinline uint32_t traverseBlocks(BlockFrame& frame,
const BVHNode* node,
const Frustum& frustum) {
// plane[0].w == -infinity is treated as "nothing to traverse"; presumably the marker for an
// empty/degenerate frustum -- confirm where Frustum is constructed.
if (frustum.plane[0].w == -std::numeric_limits<float>::infinity()) {
return 0;
}
// -0.0f has only the sign bit set, so XOR-ing a float with it flips the float's sign
auto negateMask = m256(-0.0f);
// number of entries currently on frame.cullStack
uint32_t top = 0;
// tMin associated with the node currently being tested; the starting node uses 0
auto tMin = 0.0f;
// the starting node is tested directly, without going through the stack
goto TEST;
POP:
// up to 4 entries can replace the current entry:
// 3 internal nodes + 1 set of leaf nodes, or
// 4 internal nodes
if (top >= BlockFrame::stackSize - 4 || // Don't flatten too much of the hierarchy before tile cull, that could
// break the advantage of a BVH.
top == 0 || // We processed everything, and nothing remains
(frame.cullStack - 1 + top)->tDeltaBytes <= 0xf) { // We processed everything, and only leaf nodes remain
return top;
}
// pop the top entry (the one with the largest tDelta) and make it the current node
--top;
node = (frame.cullStack + top)->node;
tMin = (frame.cullStack + top)->tMinTemp;
TEST:
// block frustum vs 4x AABB of children, corresponding bit is set if AABB is hit
unsigned int mask = frustum.testBVHNodeChildren(*node);
// spend some additional time to refine the test results
mask = frustum.testBVHNodeChildrenRefine(*node, mask);
// mask is currently bitmask of which children are hit by the block frustum
if (!mask)
goto POP;
// subset of the hit children that are leaves
auto leaf = mask & node->boxData.leafMask;
if (leaf) {
mask &= ~leaf;
// insert leaf nodes into the bottom of the stack
__m128 entry;
auto i = top;
// The loop is entered in the middle so that the first step only loads and compares.
// Every entry whose tDelta compares greater than 0 is shifted up by one slot, which
// leaves slot i free below all of them.
goto PUSH_LEAF_LOOP_ENTRY;
do {
store(&(frame.cullStack + i)->tDelta, entry);
--i;
PUSH_LEAF_LOOP_ENTRY:
entry = load_m128(&(frame.cullStack - 1 + i)->tDelta);
} while (_mm_comigt_ss(entry, m128(0)));
++top;
// one entry stands for all hit leaf children: it stores the parent node, the leaf bitmask
// (at most 0xf, which is what the POP check tests for) and the parent's tMin
(frame.cullStack + i)->node = node;
(frame.cullStack + i)->tDeltaBytes = leaf;
(frame.cullStack + i)->tMinTemp = tMin;
}
// mask is currently bitmask of which non-leaf children are hit by the block frustum
if (!mask)
goto POP;
// conservative min ray bundle distance to children along the primary traversal axis (stick it into a temp)
// distanceEstimateOffset is used as a byte offset from the node to a 4-float array; the loaded
// values are sign-flipped by the XOR with negateMask
store(frame.tMin,
_mm_xor_ps(m128(negateMask), load_m128((float*)((uintptr_t)node + frustum.distanceEstimateOffset))));
// size of children along the primary traversal axis
// assumes BVHNode stores float xMax[4] at a multiple of 32 bytes, immediately followed 16 bytes later by xNegMin[4]
// (and the same for y and z)
int maxOffset =
frustum.distanceEstimateOffset & ~16; // remove neg/pos sign choice from distanceEstimateOffset to get Max
int negMinOffset = maxOffset + 16; // offset to get NegMin
// Max + NegMin == Max - Min, i.e. the extent of each child along that axis
store(frame.tDeltaMem,
load_m128((float*)((uintptr_t)node + maxOffset)) + load_m128((float*)((uintptr_t)node + negMinOffset)));
// insert each child into the stack, sorting such that the top of the stack is the node
// with the largest dimensions along the primary traversal axis
// The loop is entered at CHILD_LOOP_ENTRY, so `top` is incremented between children here and
// once more after the loop: exactly one increment per inserted child.
goto CHILD_LOOP_ENTRY;
do {
++top;
CHILD_LOOP_ENTRY:
// k: index of the next hit child (presumably the lowest set bit of mask)
uint32_t k = tzcnt(mask);
__m128 entry, tTempDelta = _mm_load_ss(frame.tDeltaMem + k);
auto i = top;
// sorted insert: shift up every entry whose tDelta is greater than this child's, again
// entering the loop in the middle so the first step only loads and compares
goto PUSH_LOOP_ENTRY;
do {
store(&(frame.cullStack + i)->tDelta, entry);
--i;
PUSH_LOOP_ENTRY:
entry = load_m128(&(frame.cullStack - 1 + i)->tDelta);
} while (_mm_comigt_ss(entry, tTempDelta));
// child nodes are addressed relative to the parent node pointer
(frame.cullStack + i)->node = node + node->boxData.children.offset[k];
_mm_store_ss(&(frame.cullStack + i)->tDelta, tTempDelta);
(frame.cullStack + i)->tMinTemp = frame.tMin[k];
mask = clearLowestBit(mask);
} while (mask);
// account for the last child inserted by the loop above
++top;
goto POP;
}