in src/sha3/keccak4x/KeccakP-1600-times4-SIMD256.c [123:166]
void KeccakP1600times4_AddLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
{
V256 *stateAsLanes = (V256 *)states;
unsigned int i;
const UINT64 *curData0 = (const UINT64 *)data;
const UINT64 *curData1 = (const UINT64 *)(data+laneOffset*SnP_laneLengthInBytes);
const UINT64 *curData2 = (const UINT64 *)(data+laneOffset*2*SnP_laneLengthInBytes);
const UINT64 *curData3 = (const UINT64 *)(data+laneOffset*3*SnP_laneLengthInBytes);
V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
#define Xor_In( argIndex ) XOReq256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
#define Xor_In4( argIndex ) lanes0 = LOAD256u( curData0[argIndex]),\
lanes1 = LOAD256u( curData1[argIndex]),\
lanes2 = LOAD256u( curData2[argIndex]),\
lanes3 = LOAD256u( curData3[argIndex]),\
INTLEAVE(),\
XOReq256( stateAsLanes[argIndex+0], lanes0 ),\
XOReq256( stateAsLanes[argIndex+1], lanes1 ),\
XOReq256( stateAsLanes[argIndex+2], lanes2 ),\
XOReq256( stateAsLanes[argIndex+3], lanes3 )
if ( laneCount >= 16 ) {
Xor_In4( 0 );
Xor_In4( 4 );
Xor_In4( 8 );
Xor_In4( 12 );
if ( laneCount >= 20 ) {
Xor_In4( 16 );
for(i=20; i<laneCount; i++)
Xor_In( i );
}
else {
for(i=16; i<laneCount; i++)
Xor_In( i );
}
}
else {
for(i=0; i<laneCount; i++)
Xor_In( i );
}
#undef Xor_In
#undef Xor_In4
}