inline __device__ void unpack_noscale()

in kernels/fmha/softmax.h [512:530]


    // Scatter the accumulator fragments into this->elt_ by plain assignment;
    // no multiplication or other arithmetic is applied to the values here.
    //
    // Each acc[mi][ni] supplies eight values through elt(0..7), which land in a
    // 2x4 patch of elt_ whose top-left corner is (2 * mi, 4 * ni):
    //
    //   elt_ row 2 * mi + 0  <-  elt(0), elt(1), elt(4), elt(5)
    //   elt_ row 2 * mi + 1  <-  elt(2), elt(3), elt(6), elt(7)
    //
    // NOTE(review): this interleaving presumably mirrors the per-thread register
    // layout of the MMA accumulator - confirm against the Accumulator definition.
    inline __device__ void unpack_noscale(const Accumulator (&acc)[MMAS_M][MMAS_N]) {
        #pragma unroll
        for( int mi = 0; mi < MMAS_M; ++mi ) {
            // First of the two elt_ rows written for this mi.
            const int row = 2 * mi;

            #pragma unroll
            for( int ni = 0; ni < MMAS_N; ++ni ) {
                // First of the four elt_ columns written for this ni.
                const int col = 4 * ni;
                const Accumulator &frag = acc[mi][ni];

                // ri selects the destination row; the source indices for row ri
                // are {0, 1, 4, 5} shifted by 2 * ri.
                #pragma unroll
                for( int ri = 0; ri < 2; ++ri ) {
                    this->elt_[row + ri][col + 0] = frag.elt(2 * ri + 0);
                    this->elt_[row + ri][col + 1] = frag.elt(2 * ri + 1);
                    this->elt_[row + ri][col + 2] = frag.elt(2 * ri + 4);
                    this->elt_[row + ri][col + 3] = frag.elt(2 * ri + 5);
                }
            }
        }
    }