in src/gf2x/gf2x_ksqr_avx2.c [22:77]
// Fills `map` with the permutation sequence
//   map[i] = (i * l_param) % R_BITS
// without doing a multiplication and a modulo for every element.
//
// The scalar recurrence that replaces the direct formula is:
//   1. map[0] = 0
//   2. for i = 1 to map size:
//   3.   map[i] = map[i - 1] + l_param
//   4.   if map[i] >= r:
//   5.     map[i] = map[i] - r
// It is vectorized by advancing NUM_OF_VALS elements at once: the first
// NUM_OF_VALS entries are computed directly, and every later group is the
// previous group plus (l_param * NUM_OF_VALS) % R_BITS, reduced modulo R_BITS.
//
// NOTE(review): every pass of the vector loop stores NUM_YMMS full vectors,
// so `map` must be large enough for the final pass — confirm against the
// definitions of R_PADDED, NUM_YMMS and WORDS_IN_YMM.
_INLINE_ void generate_map(OUT uint16_t *map, IN const uint16_t l_param)
{
  // Seed the map: the first NUM_OF_VALS entries use the direct formula.
  for(size_t idx = 0; idx < NUM_OF_VALS; idx++) {
    map[idx] = (idx * l_param) % R_BITS;
  }

  const __m256i r_vec = SET1_I16(R_BITS);
  const __m256i zeros = SET_ZERO;

  // AVX2 has no comparison that treats 16-bit lanes as unsigned. When
  // r > 2^14 the sum of two reduced values can exceed 2^15 and would look
  // negative as a signed 16-bit integer, so testing "value >= r" after the
  // addition would give wrong results. To avoid that, r is subtracted from
  // the increment up front and the correction is inverted:
  //   value = value + (increment - r)
  //   if value < 0:
  //     value = value + r
  const __m256i step =
    SUB_I16(SET1_I16((l_param * NUM_OF_VALS) % R_BITS), r_vec);

  // Bring the seeded entries into vector registers; they are the running
  // state that every pass of the main loop advances.
  __m256i lanes[NUM_YMMS];
  for(size_t k = 0; k < NUM_YMMS; k++) {
    lanes[k] = LOAD(&map[k * WORDS_IN_YMM]);
  }

  // Each pass produces the next NUM_YMMS vectors of the map.
  for(size_t blk = NUM_YMMS; blk < (R_PADDED / WORDS_IN_YMM); blk += NUM_YMMS) {
    for(size_t k = 0; k < NUM_YMMS; k++) {
      // Advance by the (negative) step, then add r back to exactly those
      // lanes that went below zero: the compare yields an all-ones mask for
      // negative lanes, which selects r, and zero elsewhere.
      const __m256i shifted = ADD_I16(lanes[k], step);
      const __m256i is_neg  = CMPGT_I16(zeros, shifted);

      lanes[k] = ADD_I16(shifted, is_neg & r_vec);
      STORE(&map[(blk + k) * WORDS_IN_YMM], lanes[k]);
    }
  }
}