_INLINE_ void generate_map()

in src/gf2x/gf2x_ksqr_avx512.c [31:72]


_INLINE_ void generate_map(OUT uint16_t *map, IN const size_t l_param)
{
  // Fills `map` with the permutation indices
  //     map[i] = (i * l_param) % R_BITS
  //
  // Computing a multiplication and a modulo per element is expensive, so
  // only the first NUM_OF_VALS entries are computed that way. Every later
  // entry is derived from the one NUM_OF_VALS positions before it:
  //     map[i] = map[i - NUM_OF_VALS] + (l_param * NUM_OF_VALS) % R_BITS
  //     if map[i] >= R_BITS: map[i] -= R_BITS
  // which maps directly onto a vector add, an unsigned compare and a masked
  // subtract over NUM_ZMMS registers at a time.
  //
  // NOTE(review): the add/compare appear to operate on 16-bit lanes, so this
  // presumably relies on the sum of two residues fitting in 16 bits -
  // confirm against the value of R_BITS.
  // NOTE(review): the main loop always stores NUM_ZMMS vectors per pass, so
  // `map` needs room for (R_PADDED / WORDS_IN_ZMM) vectors rounded up to a
  // multiple of NUM_ZMMS - verify against the caller's buffer size.
  __m512i lanes[NUM_ZMMS];

  // Seed the recurrence with directly computed values.
  for(size_t idx = 0; idx < NUM_OF_VALS; idx++) {
    map[idx] = (idx * l_param) % R_BITS;
  }

  // Broadcast constants: the per-pass increment (already reduced) and the
  // modulus used for the conditional correction.
  const __m512i step    = SET1_I16((l_param * NUM_OF_VALS) % R_BITS);
  const __m512i modulus = SET1_I16(R_BITS);

  // Pull the seed values back into registers; they are the running state.
  for(size_t k = 0; k < NUM_ZMMS; k++) {
    lanes[k] = LOAD(&map[k * WORDS_IN_ZMM]);
  }

  // Walk the remaining output, NUM_ZMMS vectors per pass.
  const size_t total_zmms = R_PADDED / WORDS_IN_ZMM;
  size_t       base       = NUM_ZMMS;

  while(base < total_zmms) {
    for(size_t k = 0; k < NUM_ZMMS; k++) {
      // Advance, then subtract the modulus only in lanes that reached it
      // (NLT == "not less than", i.e. sum >= modulus, unsigned).
      const __m512i   sum     = ADD_I16(lanes[k], step);
      const __mmask32 wrapped = CMPM_U16(sum, modulus, _MM_CMPINT_NLT);

      lanes[k] = MSUB_I16(sum, wrapped, sum, modulus);

      STORE(&map[(base + k) * WORDS_IN_ZMM], lanes[k]);
    }

    base += NUM_ZMMS;
  }
}