in FourQ_64bit_and_portable/table_lookup.h [25:159]
void table_lookup_1x8(point_extproj_precomp_t* table, point_extproj_precomp_t P, unsigned int digit, unsigned int sign_mask)
{ // Constant-time table lookup to extract a point represented as (X+Y,Y-X,2Z,2dT) corresponding to extended twisted Edwards coordinates (X:Y:Z:T)
// Inputs: sign_mask, digit, table containing 8 points
// Output: P = sign*table[digit], where sign=1 if sign_mask=0xFF...FF and sign=-1 if sign_mask=0
//
// In the intrinsics and generic variants below, all 8 table entries are read on every call, and both
// the entry selection and the sign selection are done with masks instead of branches on digit or sign_mask.
// Negation is performed by swapping the x+y and y-x coordinates and negating 2dt; 2z is left unchanged.
// NOTE(review): presumably digit is in [0,7] and sign_mask is either 0 or all ones -- verify against callers.
// One of three implementations is selected at compile time via SIMD_SUPPORT / ASM_SUPPORT.
#if (SIMD_SUPPORT == AVX2_SUPPORT)
#if defined(ASM_SUPPORT)
// Delegates to table_lookup_1x8_a, which is defined outside this block; digit and sign_mask are passed by address.
table_lookup_1x8_a(table, P, &digit, &sign_mask);
#else
// AVX2 intrinsics variant: each of the four coordinates is held in one 256-bit register.
__m256i point[4], temp_point[4], full_mask;
unsigned int i;
int mask;
point[0] = _mm256_loadu_si256((__m256i*)table[0]->xy); // point = table[0]
point[1] = _mm256_loadu_si256((__m256i*)table[0]->yx);
point[2] = _mm256_loadu_si256((__m256i*)table[0]->z2);
point[3] = _mm256_loadu_si256((__m256i*)table[0]->t2);
for (i = 1; i < 8; i++)
{
digit--;
// While digit>=0 mask = 0xFF...F else mask = 0x00...0
// digit is unsigned: once decremented past zero it wraps and its top bit becomes 1, so (top bit) - 1
// is all ones while the original digit >= i and zero afterwards.
mask = (int)(digit >> (8*sizeof(digit)-1)) - 1;
temp_point[0] = _mm256_loadu_si256((__m256i*)table[i]->xy); // temp_point = table[i]
temp_point[1] = _mm256_loadu_si256((__m256i*)table[i]->yx);
temp_point[2] = _mm256_loadu_si256((__m256i*)table[i]->z2);
temp_point[3] = _mm256_loadu_si256((__m256i*)table[i]->t2);
// If mask = 0x00...0 then point = point, else if mask = 0xFF...F then point = temp_point
full_mask = _mm256_set1_epi32(mask);
// Masked select: point ^= (point ^ table[i]) & full_mask
temp_point[0] = _mm256_xor_si256(point[0], temp_point[0]);
temp_point[1] = _mm256_xor_si256(point[1], temp_point[1]);
temp_point[2] = _mm256_xor_si256(point[2], temp_point[2]);
temp_point[3] = _mm256_xor_si256(point[3], temp_point[3]);
point[0] = _mm256_xor_si256(_mm256_and_si256(temp_point[0], full_mask), point[0]);
point[1] = _mm256_xor_si256(_mm256_and_si256(temp_point[1], full_mask), point[1]);
point[2] = _mm256_xor_si256(_mm256_and_si256(temp_point[2], full_mask), point[2]);
point[3] = _mm256_xor_si256(_mm256_and_si256(temp_point[3], full_mask), point[3]);
}
// Build the negated point in temp_point. The cast binds before the addition, so (__m256i*)point+k is &point[k].
temp_point[3] = _mm256_loadu_si256((__m256i*)point+3);
temp_point[0] = _mm256_loadu_si256((__m256i*)point+1); // point: x+y,y-x,2dt coordinate, temp_point: y-x,x+y,-2dt coordinate
temp_point[1] = _mm256_loadu_si256((__m256i*)point);
full_mask = _mm256_set1_epi32((int)sign_mask);
// NOTE(review): the offsets 12 and 14 are in digit_t units; they address the two halves of temp_point[3]
// only if digit_t is 64 bits wide -- confirm for this configuration.
fpneg1271((digit_t*)temp_point+12); // Negate 2dt coordinate
fpneg1271((digit_t*)temp_point+14); // If sign_mask = 0 then choose negative of the point
// Masked select: point = ((point ^ temp_point) & full_mask) ^ temp_point, i.e. point if full_mask is all ones, else temp_point
point[0] = _mm256_xor_si256(_mm256_and_si256(_mm256_xor_si256(point[0], temp_point[0]), full_mask), temp_point[0]);
point[1] = _mm256_xor_si256(_mm256_and_si256(_mm256_xor_si256(point[1], temp_point[1]), full_mask), temp_point[1]);
point[3] = _mm256_xor_si256(_mm256_and_si256(_mm256_xor_si256(point[3], temp_point[3]), full_mask), temp_point[3]);
// point[2] (2z) is stored as selected from the table; the sign selection above does not touch it.
_mm256_storeu_si256((__m256i*)P->xy, point[0]);
_mm256_storeu_si256((__m256i*)P->yx, point[1]);
_mm256_storeu_si256((__m256i*)P->z2, point[2]);
_mm256_storeu_si256((__m256i*)P->t2, point[3]);
#endif
#elif (SIMD_SUPPORT == AVX_SUPPORT)
// AVX variant: the coordinates are moved through __m256d registers purely as 256-bit containers,
// and selection is done with _mm256_blendv_pd.
__m256d point[4], temp_point[4], full_mask;
unsigned int i;
int mask;
point[0] = _mm256_loadu_pd((double const*)table[0]->xy); // point = table[0]
point[1] = _mm256_loadu_pd((double const*)table[0]->yx);
point[2] = _mm256_loadu_pd((double const*)table[0]->z2);
point[3] = _mm256_loadu_pd((double const*)table[0]->t2);
for (i = 1; i < 8; i++)
{
digit--;
// While digit>=0 mask = 0xFF...F else mask = 0x00...0
// digit is unsigned: once decremented past zero it wraps and its top bit becomes 1, so (top bit) - 1
// is -1 while the original digit >= i and 0 afterwards.
mask = (int)(digit >> (8*sizeof(digit)-1)) - 1;
// _mm256_blendv_pd selects per 64-bit lane on the sign bit of the mask operand: (double)mask is -1.0
// (sign bit set) when mask = -1 and 0.0 (sign bit clear) when mask = 0.
full_mask = _mm256_set1_pd ((double)mask);
temp_point[0] = _mm256_loadu_pd((double const*)table[i]->xy); // temp_point = table[i]
temp_point[1] = _mm256_loadu_pd((double const*)table[i]->yx);
temp_point[2] = _mm256_loadu_pd((double const*)table[i]->z2);
temp_point[3] = _mm256_loadu_pd((double const*)table[i]->t2);
// If mask = 0x00...0 then point = point, else if mask = 0xFF...F then point = temp_point
point[0] = _mm256_blendv_pd(point[0], temp_point[0], full_mask);
point[1] = _mm256_blendv_pd(point[1], temp_point[1], full_mask);
point[2] = _mm256_blendv_pd(point[2], temp_point[2], full_mask);
point[3] = _mm256_blendv_pd(point[3], temp_point[3], full_mask);
}
// Build the negated point in temp_point. The offsets are in units of double (4 per __m256d),
// so point+12 is point[3] and point+4 is point[1].
temp_point[3] = _mm256_loadu_pd((double const*)point+12);
temp_point[0] = _mm256_loadu_pd((double const*)point+4); // point: x+y,y-x,2dt coordinate, temp_point: y-x,x+y,-2dt coordinate
temp_point[1] = _mm256_loadu_pd((double const*)point);
// (int)sign_mask is converted to double so that only its sign bit drives the blend below.
full_mask = _mm256_set1_pd((double)((int)sign_mask));
// NOTE(review): the offsets 12 and 14 are in digit_t units; they address the two halves of temp_point[3]
// only if digit_t is 64 bits wide -- confirm for this configuration.
fpneg1271((digit_t*)temp_point+12); // Negate 2dt coordinate
fpneg1271((digit_t*)temp_point+14);
point[0] = _mm256_blendv_pd(temp_point[0], point[0], full_mask); // If sign_mask = 0 then choose negative of the point
point[1] = _mm256_blendv_pd(temp_point[1], point[1], full_mask);
point[3] = _mm256_blendv_pd(temp_point[3], point[3], full_mask);
// point[2] (2z) is stored as selected from the table; the sign selection above does not touch it.
_mm256_storeu_pd((double*)P->xy, point[0]);
_mm256_storeu_pd((double*)P->yx, point[1]);
_mm256_storeu_pd((double*)P->z2, point[2]);
_mm256_storeu_pd((double*)P->t2, point[3]);
#else
// Generic variant: word-by-word masked selection over NWORDS_FIELD digits of each coordinate component.
point_extproj_precomp_t point, temp_point;
unsigned int i, j;
digit_t mask;
ecccopy_precomp(table[0], point); // point = table[0]
for (i = 1; i < 8; i++)
{
digit--;
// While digit>=0 mask = 0xFF...F else mask = 0x00...0
// digit is unsigned: once decremented past zero it wraps and its top bit becomes 1, so (top bit) - 1
// is all ones (in digit_t) while the original digit >= i and zero afterwards.
mask = (digit_t)(digit >> (8*sizeof(digit)-1)) - 1;
ecccopy_precomp(table[i], temp_point); // temp_point = table[i]
// If mask = 0x00...0 then point = point, else if mask = 0xFF...F then point = temp_point
for (j = 0; j < NWORDS_FIELD; j++) {
point->xy[0][j] = (mask & (point->xy[0][j] ^ temp_point->xy[0][j])) ^ point->xy[0][j];
point->xy[1][j] = (mask & (point->xy[1][j] ^ temp_point->xy[1][j])) ^ point->xy[1][j];
point->yx[0][j] = (mask & (point->yx[0][j] ^ temp_point->yx[0][j])) ^ point->yx[0][j];
point->yx[1][j] = (mask & (point->yx[1][j] ^ temp_point->yx[1][j])) ^ point->yx[1][j];
point->z2[0][j] = (mask & (point->z2[0][j] ^ temp_point->z2[0][j])) ^ point->z2[0][j];
point->z2[1][j] = (mask & (point->z2[1][j] ^ temp_point->z2[1][j])) ^ point->z2[1][j];
point->t2[0][j] = (mask & (point->t2[0][j] ^ temp_point->t2[0][j])) ^ point->t2[0][j];
point->t2[1][j] = (mask & (point->t2[1][j] ^ temp_point->t2[1][j])) ^ point->t2[1][j];
}
}
// Build the negated point in temp_point: xy and yx swapped, t2 negated.
fp2copy1271(point->t2, temp_point->t2);
fp2copy1271(point->xy, temp_point->yx); // point: x+y,y-x,2dt coordinate, temp_point: y-x,x+y,-2dt coordinate
fp2copy1271(point->yx, temp_point->xy);
fpneg1271(temp_point->t2[0]); // Negate 2dt coordinate
fpneg1271(temp_point->t2[1]);
// The intermediate (int) cast lets an all-ones sign_mask widen to an all-ones digit_t when digit_t is
// wider than unsigned int. z2 is not part of this selection and keeps the value chosen from the table.
for (j = 0; j < NWORDS_FIELD; j++) { // If sign_mask = 0 then choose negative of the point
point->xy[0][j] = ((digit_t)((int)sign_mask) & (point->xy[0][j] ^ temp_point->xy[0][j])) ^ temp_point->xy[0][j];
point->xy[1][j] = ((digit_t)((int)sign_mask) & (point->xy[1][j] ^ temp_point->xy[1][j])) ^ temp_point->xy[1][j];
point->yx[0][j] = ((digit_t)((int)sign_mask) & (point->yx[0][j] ^ temp_point->yx[0][j])) ^ temp_point->yx[0][j];
point->yx[1][j] = ((digit_t)((int)sign_mask) & (point->yx[1][j] ^ temp_point->yx[1][j])) ^ temp_point->yx[1][j];
point->t2[0][j] = ((digit_t)((int)sign_mask) & (point->t2[0][j] ^ temp_point->t2[0][j])) ^ temp_point->t2[0][j];
point->t2[1][j] = ((digit_t)((int)sign_mask) & (point->t2[1][j] ^ temp_point->t2[1][j])) ^ temp_point->t2[1][j];
}
ecccopy_precomp(point, P);
#endif
}