in pq-crypto/kyber_r3/KeccakP-1600-times4-SIMD256_avx2.c [1071:1176]
size_t KeccakF1600times4_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen)
{
if (laneCount == 21) {
#if 0
const unsigned char *dataStart = data;
const UINT64 *curData0 = (const UINT64 *)data;
const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes);
const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes);
const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes);
while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
V256 *stateAsLanes = (V256 *)states;
V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
#define Xor_In( argIndex ) \
XOReq256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
#define Xor_In4( argIndex ) \
lanes0 = LOAD256u( curData0[argIndex]),\
lanes1 = LOAD256u( curData1[argIndex]),\
lanes2 = LOAD256u( curData2[argIndex]),\
lanes3 = LOAD256u( curData3[argIndex]),\
INTLEAVE(),\
XOReq256( stateAsLanes[argIndex+0], lanes0 ),\
XOReq256( stateAsLanes[argIndex+1], lanes1 ),\
XOReq256( stateAsLanes[argIndex+2], lanes2 ),\
XOReq256( stateAsLanes[argIndex+3], lanes3 )
Xor_In4( 0 );
Xor_In4( 4 );
Xor_In4( 8 );
Xor_In4( 12 );
Xor_In4( 16 );
Xor_In( 20 );
#undef Xor_In
#undef Xor_In4
KeccakP1600times4_PermuteAll_24rounds(states);
curData0 += laneOffsetSerial;
curData1 += laneOffsetSerial;
curData2 += laneOffsetSerial;
curData3 += laneOffsetSerial;
dataByteLen -= laneOffsetSerial*8;
}
return (const unsigned char *)curData0 - dataStart;
#else
// unsigned int i;
const unsigned char *dataStart = data;
// correcting cast-align errors
// old version: const UINT64 *curData0 = (const UINT64 *)data;
const UINT64 *curData0 = (const void *)data;
// old version: const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes);
const UINT64 *curData1 = (const void *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes);
// old version: const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes);
const UINT64 *curData2 = (const void *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes);
// old version: const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes);
const UINT64 *curData3 = (const void *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes);
V256 *statesAsLanes = (V256 *)states;
declareABCDE
copyFromState(A, statesAsLanes)
while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
#define XOR_In( Xxx, argIndex ) \
XOReq256(Xxx, LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
XOR_In( Aba, 0 );
XOR_In( Abe, 1 );
XOR_In( Abi, 2 );
XOR_In( Abo, 3 );
XOR_In( Abu, 4 );
XOR_In( Aga, 5 );
XOR_In( Age, 6 );
XOR_In( Agi, 7 );
XOR_In( Ago, 8 );
XOR_In( Agu, 9 );
XOR_In( Aka, 10 );
XOR_In( Ake, 11 );
XOR_In( Aki, 12 );
XOR_In( Ako, 13 );
XOR_In( Aku, 14 );
XOR_In( Ama, 15 );
XOR_In( Ame, 16 );
XOR_In( Ami, 17 );
XOR_In( Amo, 18 );
XOR_In( Amu, 19 );
XOR_In( Asa, 20 );
#undef XOR_In
rounds24
curData0 += laneOffsetSerial;
curData1 += laneOffsetSerial;
curData2 += laneOffsetSerial;
curData3 += laneOffsetSerial;
dataByteLen -= laneOffsetSerial*8;
}
copyToState(statesAsLanes, A)
return (const unsigned char *)curData0 - dataStart;
#endif
}
else {
// unsigned int i;
const unsigned char *dataStart = data;
while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
KeccakP1600times4_AddLanesAll(states, data, laneCount, laneOffsetParallel);
KeccakP1600times4_PermuteAll_24rounds(states);
data += laneOffsetSerial*8;
dataByteLen -= laneOffsetSerial*8;
}
return data - dataStart;
}
}