in gemmology.h [1079:1086]
void Write::operator()(
    std::tuple<xsimd::batch<float, Arch>, xsimd::batch<float, Arch>> result,
    size_t row_idx, size_t col_idx, size_t col_size) {
  // Writes the two float batches held in `result` back to back into the
  // buffer at `output_addr`: the first batch lands at element offset
  // `row_idx * col_size + col_idx`, the second one batch width
  // (`xsimd::batch<float, Arch>::size` elements) after it.
  //
  // Both writes go through `store_aligned`.
  // NOTE(review): presumably callers guarantee each destination meets the
  // alignment xsimd requires for `Arch` -- confirm against the call sites.
  using batch_f32 = xsimd::batch<float, Arch>;

  // Destination of the first batch for this (row, column) position.
  auto *const dst = output_addr + row_idx * col_size + col_idx;

  std::get<0>(result).store_aligned(dst);
  std::get<1>(result).store_aligned(dst + batch_f32::size);
}