_INLINE_ void generate_map()

in src/gf2x/gf2x_ksqr_avx2.c [22:77]


// Fills `map` with the values map[i] = (i * l_param) % R_BITS.
//
// Only the first NUM_OF_VALS entries are computed with a real multiply and
// modulo. Every later entry is derived from the entry that sits NUM_YMMS
// vectors before it by adding a constant step and reducing modulo R_BITS,
// which is done on whole vectors of 16-bit lanes.
//
// NOTE(review): the main loop always stores complete groups of NUM_YMMS
// vectors while the group start index is below R_PADDED / WORDS_IN_YMM, so
// `map` must have room for the last (possibly overshooting) group - confirm
// against the caller's buffer size.
_INLINE_ void generate_map(OUT uint16_t *map, IN const uint16_t l_param)
{
  __m256i lanes[NUM_YMMS];

  // Seed values: computed directly, one scalar at a time. Note map[0] == 0.
  for(size_t idx = 0; idx < NUM_OF_VALS; idx++) {
    map[idx] = (idx * l_param) % R_BITS;
  }

  const __m256i r_vec    = SET1_I16(R_BITS);
  const __m256i zero_vec = SET_ZERO;

  // Conceptually each new element is
  //     next = prev + step;  if(next >= r) next -= r;
  // with step = (l_param * NUM_OF_VALS) % r. The vector comparison used
  // below treats lanes as signed 16-bit integers, so for large r the sum
  // prev + step could exceed the signed range and the ">= r" test would be
  // evaluated incorrectly. To stay within the signed range, r is subtracted
  // from the step up front and the correction is inverted:
  //     next = prev + (step - r);  if(next < 0) next += r;
  __m256i step = SET1_I16((l_param * NUM_OF_VALS) % R_BITS);
  step         = SUB_I16(step, r_vec);

  // Pull the seed values into the working vectors.
  for(size_t k = 0; k < NUM_YMMS; k++) {
    lanes[k] = LOAD(&map[k * WORDS_IN_YMM]);
  }

  // Produce the remaining entries, one group of NUM_YMMS vectors per pass.
  const size_t total_vecs = (R_PADDED / WORDS_IN_YMM);
  size_t       base       = NUM_YMMS;
  while(base < total_vecs) {
    for(size_t k = 0; k < NUM_YMMS; k++) {
      __m256i next = ADD_I16(lanes[k], step);

      // Lanes that went negative get an all-ones mask; AND-ing the mask
      // with r adds r back to exactly those lanes.
      const __m256i neg_mask = CMPGT_I16(zero_vec, next);
      next                   = ADD_I16(next, neg_mask & r_vec);

      lanes[k] = next;
      STORE(&map[(base + k) * WORDS_IN_YMM], next);
    }
    base += NUM_YMMS;
  }
}