__device__ inline void thread_reduce_()

in kernels/fmha/softmax.h [533:542]


    // Reduce each row of this->elt_ into the matching entry of frag using op.
    //
    // frag: one accumulator per row (2 * MMAS_M entries), updated in place.
    // op:   binary combiner, invoked as op(accumulated, next_element).
    //
    // When zero_init is set, the previous contents of frag are ignored and the
    // row's first element seeds the accumulator; otherwise the first element is
    // combined with the value already stored in frag. The remaining
    // 4 * MMAS_N - 1 elements of the row are then folded in, in increasing
    // column order.
    __device__ inline void thread_reduce_(float (&frag)[2 * MMAS_M], Operator &op) {
        #pragma unroll
        for( int row = 0; row < 2 * MMAS_M; ++row ) {
            // Accumulate straight into the caller's slot for this row.
            float &acc = frag[row];

            // Column 0: either overwrite the accumulator or merge into it.
            if( zero_init ) {
                acc = this->elt_[row][0];
            } else {
                acc = op(acc, this->elt_[row][0]);
            }

            // Columns 1 .. 4 * MMAS_N - 1: fold left to right.
            #pragma unroll
            for( int col = 1; col < 4 * MMAS_N; ++col ) {
                acc = op(acc, this->elt_[row][col]);
            }
        }
    }