uint64_t Raycaster::buildTileTriangleLists()

in libraries/hvvr/raycaster/traversal.cpp [213:330]


// Culls the scene against every block frustum in `rayHierarchy` on the thread pool and gathers the
// per-task results into the host-side buffers of `streamed`.
//
// Work split: blocks are handed out `blocksPerThread` at a time, one task per group. Each task is given
// its own equally sized, disjoint slice of streamed->triIndices to write into, so tasks never share
// output space.
//
// Gather (performed in task order, after waiting on each task's future):
//   - empty / occupied tile indices are appended to streamed->tileIndexRemapEmpty / tileIndexRemapOccupied,
//     advancing streamed->tileCountEmpty / tileCountOccupied. Those counters are NOT reset here; the
//     function appends starting from their current values (presumably the caller zeroes them - verify).
//   - each task-local TileTriRange is rebased onto the full triIndices buffer and written to
//     streamed->tileTriRanges, starting at index 0.
//
// Returns the sum of triIndexCount over all tasks.
//
// Not reentrant: the per-task scratch data lives in function-static storage shared by every call.
uint64_t Raycaster::buildTileTriangleLists(const RayHierarchy& rayHierarchy, Camera_StreamedData* streamed) {
    const BVHNode* nodes = _nodes.data();
    ArrayView<uint32_t> triIndices(streamed->triIndices.dataHost(), streamed->triIndices.size());

#if DEBUG_STATS
    std::vector<double> blockFrustaAngle(rayHierarchy.blockFrusta.size());
    for (size_t i = 0; i < rayHierarchy.blockFrusta.size(); ++i) {
        blockFrustaAngle[i] = solidAngle(rayHierarchy.blockFrusta[i]);
    }
    size_t validBlocks;
    vector4 m4X = minMaxMeanMedian(blockFrustaAngle, validBlocks);
    printf("Block: Min,Max,Mean,Median: %g, %g, %g, %g\n", m4X.x, m4X.y, m4X.z, m4X.w);
    // mean * count == total solid angle; 4*pi is the full sphere
    printf("Percent of sphere covered by block frusta: %g\n", 100.0 * m4X.z * validBlocks / (4 * M_PI));

    std::vector<double> tileFrustaAngle(rayHierarchy.tileFrusta.size());
    for (size_t i = 0; i < rayHierarchy.tileFrusta.size(); ++i) {
        // Stored exactly as returned by solidAngle(); no unit conversion is applied here
        tileFrustaAngle[i] = solidAngle(rayHierarchy.tileFrusta[i]);
    }
    size_t validTiles;
    m4X = minMaxMeanMedian(tileFrustaAngle, validTiles);
    printf("Tile: Min,Max,Mean,Median: %g, %g, %g, %g\n", m4X.x, m4X.y, m4X.z, m4X.w);
    printf("Percent of sphere covered by tile frusta: %g\n", 100.0 * m4X.z * validTiles / (4 * M_PI));
#endif

#if DEBUG_STATS || TIME_BLOCK_CULL
    Timer timer;
#endif

#if VERIFY_TRAVERSAL
    impBlockStackSum = 0;
    refBlockStackSum = 0;
    impTileTriSum = 0;
    refTileTriSum = 0;
#endif

    enum { maxTasks = 4096 };
    // workload per task
    // 1 is too low, and incurs overhead from switching tasks too frequently inside the thread pool
    // 2 seems the fastest (though this could vary depending on the scene and sample distribution)
    // 3+ seems to become less efficient due to workload balancing
    enum { blocksPerThread = 2 };
    uint32_t blockCount = uint32_t(rayHierarchy.blockFrusta.size());
    uint32_t numTasks = (blockCount + blocksPerThread - 1) / blocksPerThread;
    // NOTE(review): with asserts compiled out, the clamp below means blocks past
    // maxTasks * blocksPerThread are never culled - confirm callers cannot exceed that limit.
    assert(numTasks <= maxTasks);
    numTasks = min<uint32_t>(maxTasks, numTasks);
    // An empty hierarchy yields zero tasks; guard the division so that case falls through to the
    // (empty) loops below instead of dividing by zero.
    uint32_t triSpacePerThread = (numTasks != 0) ? uint32_t(triIndices.size() / numTasks) : 0;
    assert(triSpacePerThread * numTasks <= triIndices.size());

    std::future<void> taskResults[maxTasks];
    // Function-static: shared across calls, hence the non-reentrancy noted in the header comment.
    static TaskData taskData[maxTasks];
    for (uint32_t i = 0; i < numTasks; ++i) {
        // Slice of the triangle index buffer owned exclusively by this task
        uint32_t startTriIndex = i * triSpacePerThread;
        uint32_t endTriIndex = startTriIndex + triSpacePerThread;
        taskData[i].reset(triIndices, startTriIndex, endTriIndex);

        // Half-open block range [startBlock, endBlock) processed by this task
        uint32_t startBlock = min(blockCount, i * blocksPerThread);
        uint32_t endBlock = min(blockCount, (i + 1) * blocksPerThread);
        // The final task must reach the last block, otherwise some blocks were dropped
        assert(i != numTasks - 1 || endBlock == blockCount);

        taskResults[i] = _threadPool->addTask(cullThread, rayHierarchy, startBlock, endBlock, nodes, &taskData[i]);
    }

    uint64_t triIndexCount = 0;
    uint32_t maxTaskTriCount = 0;
    uint32_t tileTriOffsetsStreamed = 0;
    uint32_t* streamTileIndexRemapEmpty = streamed->tileIndexRemapEmpty.dataHost();
    uint32_t* streamTileIndexRemapOccupied = streamed->tileIndexRemapOccupied.dataHost();
    TileTriRange* streamTileTriRanges = streamed->tileTriRanges.dataHost();
    for (uint32_t taskIndex = 0; taskIndex < numTasks; taskIndex++) {
        // Block until this task is done; results are consumed strictly in task order
        taskResults[taskIndex].get();
        const TaskData& task = taskData[taskIndex];

        for (const auto& emptyTileIndex : task.tileIndexRemapEmpty) {
            streamTileIndexRemapEmpty[streamed->tileCountEmpty++] = emptyTileIndex;
        }

        for (const auto& occupiedTileIndex : task.tileIndexRemapOccupied) {
            streamTileIndexRemapOccupied[streamed->tileCountOccupied++] = occupiedTileIndex;
        }

        // tileTriRange within the task is relative to the task's smaller view of the buffer and needs to be offset
        uint32_t taskTriOffset = uint32_t(task.triIndices.data() - streamed->triIndices.dataHost());
        for (const auto& tileTriRange : task.tileTriRanges) {
            TileTriRange triRangeGlobal;
            triRangeGlobal.start = taskTriOffset + tileTriRange.start;
            triRangeGlobal.end = taskTriOffset + tileTriRange.end;
            streamTileTriRanges[tileTriOffsetsStreamed++] = triRangeGlobal;
        }

        triIndexCount += task.triIndexCount;
        maxTaskTriCount = max(maxTaskTriCount, task.triIndexCount);
    }
    // Only read by the DEBUG_STATS print below; silence unused-variable warnings otherwise
    (void)maxTaskTriCount;
#if DEBUG_STATS || TIME_BLOCK_CULL
    double deltaTime = timer.get();
    static double minDeltaTime = DBL_MAX;
    minDeltaTime = min(minDeltaTime, deltaTime);

    // Report only every profileFrameSkip-th call to keep the log readable
    static uint64_t frameIndex = 0;
    enum { profileFrameSkip = 64 };
    if (frameIndex % profileFrameSkip == 0) {
        printf("---- Block cull time: %.2fms, min %.2fms\n", deltaTime * 1000.0, minDeltaTime * 1000.0);
    }
    frameIndex++;
#endif
#if DEBUG_STATS
    // triIndexCount is 64-bit: print it with a matching conversion rather than %u
    printf("Total Triangle Idx Count %llu\n", static_cast<unsigned long long>(triIndexCount));
    printf("Max Triangle Idx Count Per Task %u\n", maxTaskTriCount);
#endif
#if VERIFY_TRAVERSAL
    printf("Traversal imp/ref blockStackSum, tileTriSum: %d/%d, %d/%d\n",
        int(impBlockStackSum), int(refBlockStackSum),
        int(impTileTriSum), int(refTileTriSum));
#endif
    return triIndexCount;
}