__forceinline uint32_t traverseTiles()

in libraries/hvvr/raycaster/traverse_avx.h [412:527]


__forceinline uint32_t traverseTiles(uint32_t* triIndices,
                                     uint32_t maxTriCount,
                                     TileFrame& frame,
                                     uint32_t top,
                                     const Frustum& frustum) {
    (void)maxTriCount;
    if (frustum.plane[0].w == -std::numeric_limits<float>::infinity()) {
        return 0;
    }
    auto negateMask = m256(-0.0f);
    uint32_t triCount = 0;
    unsigned mask, leaf;
    --top;
    goto POP_LAST;

POP:
    // pop the node stack
    if (!top)
        return triCount;
    --top;
// if we know the stack has at least one entry, and top is already pointing to the next entry, we can skip
// the setup work in POP
POP_LAST:
    // grab a node to test the children of
    auto node = (frame.stack + top)->node;

TEST:
    // tile frustum vs 4x AABB of children, corresponding bit is set if AABB is hit
    mask = frustum.testBVHNodeChildren(*node);

    // no children hit? Move onto the next node
    if (!mask)
        goto POP;

    // were any of the hit children leaf nodes? If so, add the children to the output list.
    leaf = mask & node->boxData.leafMask;
    if (leaf) {
        // spend some additional time to refine the test results
        // this is a CPU vs GPU tradeoff
        // potentially more time culling on the CPU, in exchange for fewer triangles on the GPU
        mask = frustum.testBVHNodeChildrenRefine(*node, mask);

        leaf = mask & node->boxData.leafMask;
        if (leaf) {
            uint32_t emitMask = leaf;
            do {
                auto k = tzcnt(emitMask);
                auto tri = node->boxData.leaf.triIndex[k], e = (node->boxData.leaf.triIndex + 1)[k];
                do {
                    triIndices[triCount++] = tri;

                    // MAX_TRI_INDICES_TO_INTERSECT too small for these parameters, try increasing TILE_SIZE
                    assert(triCount <= maxTriCount);
                } while (++tri != e);
                emitMask = clearLowestBit(emitMask);
            } while (emitMask);
        }
    }

    // strip leaf nodes from the hit mask, now we only care about the remaining hit internal nodes
    mask &= ~node->boxData.leafMask;
    // if there's nothing left, move onto the next stack entry
    if (!mask)
        goto POP;

    // This is needed because block cull inserts nodes that contain a child mix of hit leaf nodes and hit internal
    // nodes up to 4 times into the stack. Leaf node children are grouped into a single entry w/ hit mask, internal node
    // children each get their own entry. This test prevents the leaf entry from spawning duplicate copies of the
    // prepopulated sibling internal nodes.
    // mask will be 0 for internal nodes, or the leaf hit mask for nodes which contain leaf children
    if ((frame.stack + top)->mask)
        goto POP;

    // conservative min ray bundle distance to children along the primary traversal axis (stick it into a temp)
    store(frame.tMin,
          _mm_xor_ps(m128(negateMask), load_m128((float*)((uintptr_t)node + frustum.distanceEstimateOffset))));

    auto k = tzcnt(mask);        // index of first hit child node
                                 // If I only hit one child, avoid stack
    if (!clearLowestBit(mask)) { // only one bit set?
        // if we only hit one child node, avoid inserting into the stack. Directly replace the current node
        // with the hit node, then jump to testing it.
        node = node + node->boxData.children.offset[k];
        goto TEST;
    }
    // otherwise, we hit multiple child nodes and need to insert each of them into the stack, one at a time,
    // sorted by their distance estimates
    goto CHILD_LOOP_ENTRY;
    do {
        ++top;           // make room on the stack
        k = tzcnt(mask); // index of first hit child node

    CHILD_LOOP_ENTRY:
        __m128 tMinTemp = _mm_load_ss(frame.tMin + k); // grab the temp distance estimate we stored earlier
        // search for an insertion point, bubbling up existing entries with smaller distances
        // such that the top of the stack has the element with the least distance
        __m128 entry;
        auto i = top;
        goto PUSH_LOOP_ENTRY;
        do {
            store(&(frame.stack + i)->tMin, entry); // this is a 128-bit store containing a whole stack entry
            --i;
        PUSH_LOOP_ENTRY:
            entry = load_m128(&(frame.stack - 1 + i)->tMin);
        } while (_mm_comilt_ss(entry, tMinTemp));

        // we've found our insertion point, place the child there
        (frame.stack + i)->node = node + node->boxData.children.offset[k];
        (frame.stack + i)->mask = 0;
        _mm_store_ss(&(frame.stack + i)->tMin, tMinTemp);

        mask = clearLowestBit(mask);
    } while (mask);

    goto POP_LAST;
}