__forceinline uint32_t traverseBlocks()

in libraries/hvvr/raycaster/traverse_avx.h [316:410]


// Traverses the BVH rooted at `node` against a block frustum and builds a sorted
// work stack in frame.cullStack.
//
// Starting from `node`, each visited node has its children tested against `frustum`.
// Hit leaf children are recorded as a single entry (the node plus a bitmask of the hit
// leaves) inserted toward the bottom of the stack; hit internal children are inserted
// so that the entry with the largest extent along the primary traversal axis ends up
// on top. The top entry is then popped and expanded in turn, until one of the stop
// conditions under POP is met.
//
// Parameters:
//   frame   - scratch state; this function writes frame.cullStack, frame.tMin and
//             frame.tDeltaMem.
//   node    - BVH node at which traversal starts.
//   frustum - block frustum used for the child AABB tests and for the byte offset
//             (distanceEstimateOffset) that selects the traversal-axis data in a node.
//
// Returns the number of entries left on frame.cullStack (0 if nothing was hit or the
// frustum is flagged by the early-out below).
//
// NOTE(review): the insertion loops read (frame.cullStack - 1 + i) with i possibly 0,
// i.e. one entry *before* cullStack. This presumably relies on a sentinel slot in
// BlockFrame whose tDelta never compares greater than the inserted value - confirm
// against the BlockFrame definition.
// NOTE(review): entries are moved with 128-bit load/store starting at &entry->tDelta,
// which assumes a stack entry is 16 bytes with tDelta as its first member, and that
// tDelta and tDeltaBytes alias the same storage - confirm against BlockFrame.
__forceinline uint32_t traverseBlocks(BlockFrame& frame,
                                      const BVHNode* node,
                                      const Frustum& frustum) {
    // Early out: a plane[0].w of -infinity is treated as "nothing to traverse".
    // (Presumably this marks a degenerate/culled frustum - verify against the caller.)
    if (frustum.plane[0].w == -std::numeric_limits<float>::infinity()) {
        return 0;
    }
    // -0.0f has only the sign bit set; XOR-ing with it below flips the sign of each lane.
    auto negateMask = m256(-0.0f);
    // Number of valid entries currently on frame.cullStack.
    uint32_t top = 0;
    // tMin associated with the node currently being tested (0 for the starting node).
    auto tMin = 0.0f;
    // The starting node is tested directly; nothing has been pushed yet.
    goto TEST;

POP:
    // up to 4 entries can replace the current entry:
    // 3 internal nodes + 1 set of leaf nodes, or
    // 4 internal nodes
    if (top >= BlockFrame::stackSize - 4 || // Don't flatten too much of the hierarchy before tile cull, that could
                                            // break the advantage of a BVH.
        top == 0 ||                         // We processed everything, and nothing remains
        (frame.cullStack - 1 + top)->tDeltaBytes <= 0xf) { // We processed everything, and only leaf nodes remain
        return top;
    }

    // Pop the top entry (an internal node, given the check above) and expand it next.
    --top;
    node = (frame.cullStack + top)->node;
    tMin = (frame.cullStack + top)->tMinTemp;
TEST:
    // block frustum vs 4x AABB of children, corresponding bit is set if AABB is hit
    unsigned int mask = frustum.testBVHNodeChildren(*node);
    // spend some additional time to refine the test results
    mask = frustum.testBVHNodeChildrenRefine(*node, mask);

    // mask is currently bitmask of which children are hit by ray
    if (!mask)
        goto POP;

    // Split the hit children into leaves and internal nodes.
    auto leaf = mask & node->boxData.leafMask;
    if (leaf) {
        mask &= ~leaf;

        // insert leaf nodes into the bottom of the stack
        // The loop is entered in the middle (at the load): starting from the top, every
        // entry whose tDelta is > 0 is shifted up by one slot, and the scan stops at the
        // first entry whose tDelta is not > 0. Slot i is then free for the leaf entry.
        // NOTE(review): existing leaf entries hold a small integer (<= 0xf) in
        // tDeltaBytes; whether those bits compare as > 0 when read as a float depends
        // on denormal handling - presumably denormals-are-zero is enabled. TODO confirm.
        __m128 entry;
        auto i = top;
        goto PUSH_LEAF_LOOP_ENTRY;
        do {
            store(&(frame.cullStack + i)->tDelta, entry);
            --i;
        PUSH_LEAF_LOOP_ENTRY:
            entry = load_m128(&(frame.cullStack - 1 + i)->tDelta);
        } while (_mm_comigt_ss(entry, m128(0)));
        ++top;
        // One entry represents all hit leaf children of this node: the node itself plus
        // the bitmask of hit leaves stored in tDeltaBytes. A value <= 0xf in tDeltaBytes
        // is what the POP check uses to recognize a leaf entry.
        (frame.cullStack + i)->node = node;
        (frame.cullStack + i)->tDeltaBytes = leaf;
        (frame.cullStack + i)->tMinTemp = tMin;
    }

    // mask is currently bitmask of which non-leaf children are hit by ray
    if (!mask)
        goto POP;

    // conservative min ray bundle distance to children along the primary traversal axis (stick it into a temp)
    // The four floats at byte offset distanceEstimateOffset inside the node are
    // sign-flipped (XOR with the sign-bit mask) before being stored to frame.tMin.
    store(frame.tMin,
          _mm_xor_ps(m128(negateMask), load_m128((float*)((uintptr_t)node + frustum.distanceEstimateOffset))));
    // size of children along the primary traversal axis
    // assumes BVHNode stores float xMax[4] at a multiple of 32 bytes, immediately followed 16 bytes later by xNegMin[4]
    // (and the same for y and z)
    int maxOffset =
        frustum.distanceEstimateOffset & ~16; // remove neg/pos sign choice from distanceEstimateOffset to get Max
    int negMinOffset = maxOffset + 16;        // offset to get NegMin
    // Max + NegMin == Max - Min, i.e. the per-child extent along the traversal axis.
    store(frame.tDeltaMem,
          load_m128((float*)((uintptr_t)node + maxOffset)) + load_m128((float*)((uintptr_t)node + negMinOffset)));

    // insert each child into the stack, sorting such that the top of the stack is the node
    // with the largest dimensions along the primary traversal axis
    // The outer loop is entered at CHILD_LOOP_ENTRY so that `top` is incremented only
    // between children here; the increment for the last child happens after the loop.
    goto CHILD_LOOP_ENTRY;
    do {
        ++top;
    CHILD_LOOP_ENTRY:
        // k = index of the lowest set bit in mask, i.e. the next hit internal child.
        uint32_t k = tzcnt(mask);
        __m128 entry, tTempDelta = _mm_load_ss(frame.tDeltaMem + k);
        auto i = top;
        // Insertion step (also entered at the load): shift up every entry whose tDelta
        // is greater than this child's extent, leaving slot i free for the child.
        goto PUSH_LOOP_ENTRY;
        do {
            store(&(frame.cullStack + i)->tDelta, entry);
            --i;
        PUSH_LOOP_ENTRY:
            entry = load_m128(&(frame.cullStack - 1 + i)->tDelta);
        } while (_mm_comigt_ss(entry, tTempDelta));
        // Child address is computed relative to the parent, in units of BVHNode.
        (frame.cullStack + i)->node = node + node->boxData.children.offset[k];
        _mm_store_ss(&(frame.cullStack + i)->tDelta, tTempDelta);
        (frame.cullStack + i)->tMinTemp = frame.tMin[k];
        mask = clearLowestBit(mask);
    } while (mask);
    // Account for the last child inserted above, then continue with the new top entry.
    ++top;
    goto POP;
}