void secure_set_bits_avx512()

in pq-crypto/bike_r3/sampling_avx512.c [21:93]


void secure_set_bits_avx512(OUT pad_r_t *   r,
                            IN const size_t first_pos,
                            IN const idx_t *wlist,
                            IN const size_t w_size)
{
  // The function assumes that the size of r is a multiple
  // of the cumulative size of used ZMM registers.
  assert((sizeof(*r) / sizeof(uint64_t)) % ZMMS_QWORDS == 0);

  // va vectors hold the bits of the output array "r"
  // va_pos_qw vectors hold the qw position indices of "r"
  // The algorithm works as follows:
  //   1. Initialize va_pos_qw with starting positions of qw's of "r"
  //      va_pos_qw = (7, 6, 5, 4, 3, 2, 1, 0);
  //   2. While the size of "r" is not exceeded:
  //   3.   For each w in wlist:
  //   4.     Compare the qword position of w with the positions in va_pos_qw
  //          and, for the position that matches, set the appropriate bit
  //          in the va vector.
  //   5.   Set va_pos_qw to the next qw positions of "r"
  __m512i  va[NUM_ZMMS], va_pos_qw[NUM_ZMMS];
  __m512i  w_pos_qw, w_pos_bit, one, inc;
  __mmask8 va_mask;

  uint64_t *r64 = (uint64_t *)r;

  one = SET1_I64(1);
  inc = SET1_I64(QWORDS_IN_ZMM);

  // 1. Initialize
  va_pos_qw[0] = SET_I64(7, 6, 5, 4, 3, 2, 1, 0);
  for(size_t i = 1; i < NUM_ZMMS; i++) {
    va_pos_qw[i] = ADD_I64(va_pos_qw[i - 1], inc);
  }

  // va_pos_qw vectors hold qw positions 0 .. (NUM_ZMMS * QWORDS_IN_ZMM - 1).
  // Therefore, set the increment vector inc so that adding it to the va_pos_qw
  // vectors advances them to the next ZMMS_QWORDS qw positions.
  inc = SET1_I64(ZMMS_QWORDS);

  for(size_t i = 0; i < (sizeof(*r) / sizeof(uint64_t)); i += ZMMS_QWORDS) {
    for(size_t va_iter = 0; va_iter < NUM_ZMMS; va_iter++) {
      va[va_iter] = SET_ZERO;
    }

    for(size_t w_iter = 0; w_iter < w_size; w_iter++) {
      int32_t w = wlist[w_iter] - first_pos;
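      // Split w into a qword index (w / 64) and a bit offset within that qword (w % 64)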
      w_pos_qw  = SET1_I64(w >> 6);
#if (defined(__GNUC__) && ((__GNUC__ == 6) || (__GNUC__ == 5)) && !defined(__clang__)) || (defined(__clang__) && __clang_major__ == 3 && __clang_minor__ == 9)
      // Workaround for gcc-5, gcc-6, and clang-3.9, which do not allow the
      // second argument of SLLI to be a non-immediate value.
      __m512i temp = SET1_I64(w & MASK(6));
      w_pos_bit = SLLV_I64(one, temp);
#else
      w_pos_bit = SLLI_I64(one, w & MASK(6));
#endif

      // 4. Compare the positions in va_pos_qw with w_pos_qw
      //    and set the appropriate bit in va
      for(size_t va_iter = 0; va_iter < NUM_ZMMS; va_iter++) {
        va_mask     = CMPMEQ_I64(va_pos_qw[va_iter], w_pos_qw);
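        // va_mask selects the lane whose qword position equals w's; the masked OR
        // sets w's bit in that lane and leaves all other lanes unchanged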
        va[va_iter] = MOR_I64(va[va_iter], va_mask, va[va_iter], w_pos_bit);
      }
    }

    // 5. Set the va_pos_qw to the next qw positions of r
    //    and store the previously computed data in r
    for(size_t va_iter = 0; va_iter < NUM_ZMMS; va_iter++) {
      STORE(&r64[i + (va_iter * QWORDS_IN_ZMM)], va[va_iter]);
      va_pos_qw[va_iter] = ADD_I64(va_pos_qw[va_iter], inc);
    }
  }
}
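
For orientation, here is a minimal scalar sketch of the behavior the AVX-512 routine implements: every position in wlist that falls inside the slice [first_pos, first_pos + 8 * sizeof(*r)) has its bit set in r, and every other qword of r is written with zero. The sketch is NOT constant time (it indexes r64 directly with the secret positions, which the vectorized code deliberately avoids) and assumes the pad_r_t, idx_t, OUT, and IN definitions from the surrounding bike_r3 sources; set_bits_ref is a hypothetical name used only for illustration.

#include <string.h> // memset

static void set_bits_ref(OUT pad_r_t *   r,
                         IN const size_t first_pos,
                         IN const idx_t *wlist,
                         IN const size_t w_size)
{
  uint64_t *r64 = (uint64_t *)r;

  // The vectorized code overwrites all of r (va starts at zero), so clear it first
  memset(r, 0, sizeof(*r));

  for(size_t i = 0; i < w_size; i++) {
    size_t w = (size_t)wlist[i] - first_pos;

    // Positions outside this slice of r are skipped; in the vectorized code
    // their qword index simply never matches any lane of va_pos_qw
    if(w >= 8 * sizeof(*r)) {
      continue;
    }

    // Same decomposition as above: qword index = w / 64, bit offset = w % 64
    r64[w >> 6] |= (uint64_t)1 << (w & 0x3f);
  }
}

The vectorized version, in contrast, never performs the secret-dependent access r64[w >> 6]: every candidate qword position is compared against every qword of r, so the memory access pattern is independent of the values in wlist.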