inline __device__ Smem_tile_mma_epilogue()

in candle-flash-attn-v1/kernels/fmha/smem_tile.h [1366:1375]


    // Builds the tile and computes this thread's read address, stored in smem_read_.
    //
    // smem: pointer handed to Base and to __nvvm_get_smem_pointer, whose result is kept as a
    //       32-bit base address (presumably the shared-memory window address -- TODO confirm).
    // tidx: thread index; handed to Base and split into a (row, column) pair via THREADS_PER_ROW.
    inline __device__ Smem_tile_mma_epilogue(char *smem, int tidx) : Base(smem, tidx) {
        // Only these row widths are handled by the mask selection below.
        static_assert(Base::BYTES_PER_ROW == 32 || Base::BYTES_PER_ROW == 64 ||
                      Base::BYTES_PER_ROW == 128 || Base::BYTES_PER_ROW == 256);

        // Low row bits that get XOR-ed into the column index: 1 bit for 32-byte rows,
        // 2 bits for 64-byte rows, 3 bits for the remaining widths (128 and 256 bytes).
        // NOTE(review): this looks like a swizzle meant to spread accesses across banks -- confirm
        // against the matching write-side layout in Base.
        constexpr int kRowXorMask =
            Base::BYTES_PER_ROW == 32 ? 0x01 : (Base::BYTES_PER_ROW == 64 ? 0x03 : 0x07);

        const uint32_t tile_base = __nvvm_get_smem_pointer(smem);

        // Row-major split of the thread index, then permute the column by the row's low bits.
        const int row = tidx / THREADS_PER_ROW;
        const int col = (tidx % THREADS_PER_ROW) ^ (row & kRowXorMask);

        // Byte address = base + whole rows + column slots of BYTES_PER_LDS bytes each.
        smem_read_ = tile_base + row * BYTES_PER_ROW + col * BYTES_PER_LDS;
    }