void table_lookup_1x8()

in FourQ_64bit_and_portable/table_lookup.h [25:159]


void table_lookup_1x8(point_extproj_precomp_t* table, point_extproj_precomp_t P, unsigned int digit, unsigned int sign_mask)
{ // Constant-time extraction of one precomputed point, stored as (X+Y,Y-X,2Z,2dT) for extended twisted Edwards coordinates (X:Y:Z:T)
  // Inputs: table     - the 8 precomputed points
  //         digit     - index of the entry to extract (presumably in [0, 7]; larger values are not handled specially)
  //         sign_mask - 0xFF...FF to return the entry unchanged, 0 to return its negative
  // Output: P = sign*table[digit], with sign = 1 for sign_mask = 0xFF...FF and sign = -1 for sign_mask = 0
  // In the C code paths below every table entry is read, and the choice is made only with masks/blends:
  // there is no branch and no memory index that depends on digit or sign_mask.

#if (SIMD_SUPPORT == AVX2_SUPPORT)
#if defined(ASM_SUPPORT)
    // Assembly implementation; digit and sign_mask are handed over by address
    table_lookup_1x8_a(table, P, &digit, &sign_mask);
#else
    __m256i sel[4], cand[4], select_mask;
    unsigned int idx, k;
    int take;

    // sel = table[0]; one 256-bit vector per coordinate (xy, yx, z2, t2)
    sel[0] = _mm256_loadu_si256((__m256i*)table[0]->xy);
    sel[1] = _mm256_loadu_si256((__m256i*)table[0]->yx);
    sel[2] = _mm256_loadu_si256((__m256i*)table[0]->z2);
    sel[3] = _mm256_loadu_si256((__m256i*)table[0]->t2);

    for (idx = 1; idx < 8; idx++)
    {
        // cand = table[idx]
        cand[0] = _mm256_loadu_si256((__m256i*)table[idx]->xy);
        cand[1] = _mm256_loadu_si256((__m256i*)table[idx]->yx);
        cand[2] = _mm256_loadu_si256((__m256i*)table[idx]->z2);
        cand[3] = _mm256_loadu_si256((__m256i*)table[idx]->t2);

        // take = 0xFF...F while the decremented digit is still >= 0 (top bit clear), 0x00...0 afterwards
        digit--;
        take = (int)(digit >> (8*sizeof(digit)-1)) - 1;
        select_mask = _mm256_set1_epi32(take);

        // sel ^= (sel ^ cand) & mask: keeps sel when mask is zero, replaces it by cand when mask is all ones
        for (k = 0; k < 4; k++) {
            sel[k] = _mm256_xor_si256(sel[k], _mm256_and_si256(_mm256_xor_si256(sel[k], cand[k]), select_mask));
        }
    }

    // Build the negated point in cand: swap x+y with y-x and negate 2dt (z2 is not affected by negation)
    cand[0] = sel[1];
    cand[1] = sel[0];
    cand[3] = sel[3];
    // Offsets 12 and 14 (in digit_t units) address the two halves of cand[3]; this presumes 64-bit digit_t
    fpneg1271((digit_t*)cand+12);
    fpneg1271((digit_t*)cand+14);

    // sign_mask = 0xFF...FF keeps sel, sign_mask = 0 picks the negated point
    select_mask = _mm256_set1_epi32((int)sign_mask);
    sel[0] = _mm256_xor_si256(cand[0], _mm256_and_si256(select_mask, _mm256_xor_si256(sel[0], cand[0])));
    sel[1] = _mm256_xor_si256(cand[1], _mm256_and_si256(select_mask, _mm256_xor_si256(sel[1], cand[1])));
    sel[3] = _mm256_xor_si256(cand[3], _mm256_and_si256(select_mask, _mm256_xor_si256(sel[3], cand[3])));

    _mm256_storeu_si256((__m256i*)P->xy, sel[0]);
    _mm256_storeu_si256((__m256i*)P->yx, sel[1]);
    _mm256_storeu_si256((__m256i*)P->z2, sel[2]);
    _mm256_storeu_si256((__m256i*)P->t2, sel[3]);
#endif
#elif (SIMD_SUPPORT == AVX_SUPPORT)
    __m256d sel[4], cand[4], select_mask;
    unsigned int idx, k;
    int take;

    // sel = table[0]; one 256-bit vector per coordinate (xy, yx, z2, t2)
    sel[0] = _mm256_loadu_pd((double const*)table[0]->xy);
    sel[1] = _mm256_loadu_pd((double const*)table[0]->yx);
    sel[2] = _mm256_loadu_pd((double const*)table[0]->z2);
    sel[3] = _mm256_loadu_pd((double const*)table[0]->t2);

    for (idx = 1; idx < 8; idx++)
    {
        // take = -1 while the decremented digit is still >= 0 (top bit clear), 0 afterwards
        digit--;
        take = (int)(digit >> (8*sizeof(digit)-1)) - 1;
        // The blend only inspects the sign bit of each lane: -1.0 selects cand, 0.0 keeps sel
        select_mask = _mm256_set1_pd((double)take);

        // cand = table[idx]
        cand[0] = _mm256_loadu_pd((double const*)table[idx]->xy);
        cand[1] = _mm256_loadu_pd((double const*)table[idx]->yx);
        cand[2] = _mm256_loadu_pd((double const*)table[idx]->z2);
        cand[3] = _mm256_loadu_pd((double const*)table[idx]->t2);

        for (k = 0; k < 4; k++) {
            sel[k] = _mm256_blendv_pd(sel[k], cand[k], select_mask);
        }
    }

    // Build the negated point in cand: swap x+y with y-x and negate 2dt (z2 is not affected by negation)
    cand[0] = sel[1];
    cand[1] = sel[0];
    cand[3] = sel[3];
    // Offsets 12 and 14 (in digit_t units) address the two halves of cand[3]; this presumes 64-bit digit_t
    fpneg1271((digit_t*)cand+12);
    fpneg1271((digit_t*)cand+14);

    // sign_mask = 0xFF...FF (-1.0 after conversion) keeps sel, sign_mask = 0 picks the negated point
    select_mask = _mm256_set1_pd((double)((int)sign_mask));
    sel[0] = _mm256_blendv_pd(cand[0], sel[0], select_mask);
    sel[1] = _mm256_blendv_pd(cand[1], sel[1], select_mask);
    sel[3] = _mm256_blendv_pd(cand[3], sel[3], select_mask);

    _mm256_storeu_pd((double*)P->xy, sel[0]);
    _mm256_storeu_pd((double*)P->yx, sel[1]);
    _mm256_storeu_pd((double*)P->z2, sel[2]);
    _mm256_storeu_pd((double*)P->t2, sel[3]);
#else
    point_extproj_precomp_t sel, cand;
    unsigned int idx, c, w;
    digit_t take, keep;

    ecccopy_precomp(table[0], sel);                                          // sel = table[0]

    for (idx = 1; idx < 8; idx++)
    {
        // take = 0xFF...F while the decremented digit is still >= 0 (top bit clear), 0x00...0 afterwards
        digit--;
        take = (digit_t)(digit >> (8*sizeof(digit)-1)) - 1;
        ecccopy_precomp(table[idx], cand);                                   // cand = table[idx]

        // sel ^= take & (sel ^ cand): keeps sel when take is zero, replaces it by cand when take is all ones
        for (c = 0; c < 2; c++) {
            for (w = 0; w < NWORDS_FIELD; w++) {
                sel->xy[c][w] ^= take & (sel->xy[c][w] ^ cand->xy[c][w]);
                sel->yx[c][w] ^= take & (sel->yx[c][w] ^ cand->yx[c][w]);
                sel->z2[c][w] ^= take & (sel->z2[c][w] ^ cand->z2[c][w]);
                sel->t2[c][w] ^= take & (sel->t2[c][w] ^ cand->t2[c][w]);
            }
        }
    }

    // Build the negated point in cand: swap x+y with y-x and negate 2dt (z2 is not affected by negation)
    fp2copy1271(sel->t2, cand->t2);
    fp2copy1271(sel->xy, cand->yx);
    fp2copy1271(sel->yx, cand->xy);
    fpneg1271(cand->t2[0]);
    fpneg1271(cand->t2[1]);

    // keep = 0xFF...FF retains sel, keep = 0 picks the negated point
    keep = (digit_t)((int)sign_mask);
    for (c = 0; c < 2; c++) {
        for (w = 0; w < NWORDS_FIELD; w++) {
            sel->xy[c][w] = cand->xy[c][w] ^ (keep & (sel->xy[c][w] ^ cand->xy[c][w]));
            sel->yx[c][w] = cand->yx[c][w] ^ (keep & (sel->yx[c][w] ^ cand->yx[c][w]));
            sel->t2[c][w] = cand->t2[c][w] ^ (keep & (sel->t2[c][w] ^ cand->t2[c][w]));
        }
    }
    ecccopy_precomp(sel, P);
#endif
}