in src/sha3/keccak4x/KeccakP-1600-times4-SIMD256.c [380:442]
void KeccakP1600times4_ExtractAndAddLanesAll(const void *states, const unsigned char *input, unsigned char *output, unsigned int laneCount, unsigned int laneOffset)
{
const UINT64 *curInput0 = (UINT64 *)input;
const UINT64 *curInput1 = (UINT64 *)(input+laneOffset*1*SnP_laneLengthInBytes);
const UINT64 *curInput2 = (UINT64 *)(input+laneOffset*2*SnP_laneLengthInBytes);
const UINT64 *curInput3 = (UINT64 *)(input+laneOffset*3*SnP_laneLengthInBytes);
UINT64 *curOutput0 = (UINT64 *)output;
UINT64 *curOutput1 = (UINT64 *)(output+laneOffset*1*SnP_laneLengthInBytes);
UINT64 *curOutput2 = (UINT64 *)(output+laneOffset*2*SnP_laneLengthInBytes);
UINT64 *curOutput3 = (UINT64 *)(output+laneOffset*3*SnP_laneLengthInBytes);
const V256 *stateAsLanes = (const V256 *)states;
const UINT64 *stateAsLanes64 = (const UINT64*)states;
V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
unsigned int i;
#define ExtrXor( argIndex ) \
curOutput0[argIndex] = curInput0[argIndex] ^ stateAsLanes64[4*(argIndex)],\
curOutput1[argIndex] = curInput1[argIndex] ^ stateAsLanes64[4*(argIndex)+1],\
curOutput2[argIndex] = curInput2[argIndex] ^ stateAsLanes64[4*(argIndex)+2],\
curOutput3[argIndex] = curInput3[argIndex] ^ stateAsLanes64[4*(argIndex)+3]
#define ExtrXor4( argIndex ) \
lanes0 = LOAD256( stateAsLanes[argIndex+0] ),\
lanes1 = LOAD256( stateAsLanes[argIndex+1] ),\
lanes2 = LOAD256( stateAsLanes[argIndex+2] ),\
lanes3 = LOAD256( stateAsLanes[argIndex+3] ),\
UNINTLEAVE(),\
lanesL01 = LOAD256u( curInput0[argIndex]),\
lanesH01 = LOAD256u( curInput1[argIndex]),\
lanesL23 = LOAD256u( curInput2[argIndex]),\
lanesH23 = LOAD256u( curInput3[argIndex]),\
XOReq256( lanes0, lanesL01 ),\
XOReq256( lanes1, lanesH01 ),\
XOReq256( lanes2, lanesL23 ),\
XOReq256( lanes3, lanesH23 ),\
STORE256u( curOutput0[argIndex], lanes0 ),\
STORE256u( curOutput1[argIndex], lanes1 ),\
STORE256u( curOutput2[argIndex], lanes2 ),\
STORE256u( curOutput3[argIndex], lanes3 )
if ( laneCount >= 16 ) {
ExtrXor4( 0 );
ExtrXor4( 4 );
ExtrXor4( 8 );
ExtrXor4( 12 );
if ( laneCount >= 20 ) {
ExtrXor4( 16 );
for(i=20; i<laneCount; i++)
ExtrXor( i );
}
else {
for(i=16; i<laneCount; i++)
ExtrXor( i );
}
}
else {
for(i=0; i<laneCount; i++)
ExtrXor( i );
}
#undef ExtrXor
#undef ExtrXor4
}