in src/gf2x/gf2x_ksqr_avx512.c [31:72]
// Fills `map` with the permutation map
//
//     map[i] = (i * l_param) % R_BITS
//
// for every 16-bit slot in the first (R_PADDED / WORDS_IN_ZMM) vectors
// of the buffer.
//
// Computing a multiplication and a modulo per element is expensive, so the
// map is built incrementally instead. The scalar form of the recurrence is:
//
//     map[0] = 0
//     for i = 1 to map size:
//       map[i] = map[i - 1] + l_param
//       if map[i] >= r:
//         map[i] = map[i] - r
//
// The vectorized form keeps a window of NUM_OF_VALS consecutive map values
// in NUM_ZMMS vector registers and advances the whole window by
// (l_param * NUM_OF_VALS) % R_BITS per step, with the same conditional
// subtraction of R_BITS applied lane-wise.
//
// Parameters:
//   map     - output buffer of 16-bit entries, written in place.
//   l_param - multiplier that defines the permutation.
//
// NOTE(review): the lane-wise add is done on 16-bit values, so this
// presumably relies on the sum of two values below R_BITS fitting in
// 16 bits - confirm for every supported R_BITS.
// NOTE(review): each outer iteration stores NUM_ZMMS full vectors, so `map`
// presumably has room for the vector count rounded up to a multiple of
// NUM_ZMMS - confirm against the caller's buffer size.
_INLINE_ void generate_map(OUT uint16_t *map, IN const size_t l_param)
{
  __m512i window[NUM_ZMMS];

  // Seed the first NUM_OF_VALS entries with the direct (scalar) formula;
  // everything after them is derived from this window.
  for(size_t idx = 0; idx < NUM_OF_VALS; idx++) {
    map[idx] = (idx * l_param) % R_BITS;
  }

  // Adding `step` to a window of the map yields the window that lies
  // NUM_OF_VALS entries further on (before the reduction by R_BITS).
  const __m512i step    = SET1_I16((l_param * NUM_OF_VALS) % R_BITS);
  const __m512i modulus = SET1_I16(R_BITS);

  // Pull the seeded entries into the vector registers.
  for(size_t k = 0; k < NUM_ZMMS; k++) {
    window[k] = LOAD(&map[k * WORDS_IN_ZMM]);
  }

  // Number of vector-sized chunks of the map that have to be produced.
  const size_t num_vecs = (R_PADDED / WORDS_IN_ZMM);

  // The first NUM_ZMMS chunks are already in place, so start right after
  // them and emit NUM_ZMMS chunks per outer iteration.
  for(size_t base = NUM_ZMMS; base < num_vecs; base += NUM_ZMMS) {
    for(size_t k = 0; k < NUM_ZMMS; k++) {
      // Advance this register to its next window position.
      const __m512i advanced = ADD_I16(window[k], step);

      // Lanes that reached or passed R_BITS (unsigned "not less than")
      // must be wrapped back into range.
      const __mmask32 wrapped = CMPM_U16(advanced, modulus, _MM_CMPINT_NLT);

      // Subtract R_BITS in the flagged lanes only; the remaining lanes
      // keep the advanced value.
      window[k] = MSUB_I16(advanced, wrapped, advanced, modulus);

      STORE(&map[(base + k) * WORDS_IN_ZMM], window[k]);
    }
  }
}