void KeccakP1600times4_ExtractAndAddLanesAll()

in src/sha3/keccak4x/KeccakP-1600-times4-SIMD256.c [380:442]


void KeccakP1600times4_ExtractAndAddLanesAll(const void *states, const unsigned char *input, unsigned char *output, unsigned int laneCount, unsigned int laneOffset)
{
    const UINT64 *curInput0 = (UINT64 *)input;
    const UINT64 *curInput1 = (UINT64 *)(input+laneOffset*1*SnP_laneLengthInBytes);
    const UINT64 *curInput2 = (UINT64 *)(input+laneOffset*2*SnP_laneLengthInBytes);
    const UINT64 *curInput3 = (UINT64 *)(input+laneOffset*3*SnP_laneLengthInBytes);
    UINT64 *curOutput0 = (UINT64 *)output;
    UINT64 *curOutput1 = (UINT64 *)(output+laneOffset*1*SnP_laneLengthInBytes);
    UINT64 *curOutput2 = (UINT64 *)(output+laneOffset*2*SnP_laneLengthInBytes);
    UINT64 *curOutput3 = (UINT64 *)(output+laneOffset*3*SnP_laneLengthInBytes);

    const V256 *stateAsLanes = (const V256 *)states;
    const UINT64 *stateAsLanes64 = (const UINT64*)states;
    V256    lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
    unsigned int i;

    #define ExtrXor( argIndex ) \
                                curOutput0[argIndex] = curInput0[argIndex] ^ stateAsLanes64[4*(argIndex)],\
                                curOutput1[argIndex] = curInput1[argIndex] ^ stateAsLanes64[4*(argIndex)+1],\
                                curOutput2[argIndex] = curInput2[argIndex] ^ stateAsLanes64[4*(argIndex)+2],\
                                curOutput3[argIndex] = curInput3[argIndex] ^ stateAsLanes64[4*(argIndex)+3]

    #define ExtrXor4( argIndex ) \
                                    lanes0 = LOAD256( stateAsLanes[argIndex+0] ),\
                                    lanes1 = LOAD256( stateAsLanes[argIndex+1] ),\
                                    lanes2 = LOAD256( stateAsLanes[argIndex+2] ),\
                                    lanes3 = LOAD256( stateAsLanes[argIndex+3] ),\
                                    UNINTLEAVE(),\
                                    lanesL01 = LOAD256u( curInput0[argIndex]),\
                                    lanesH01 = LOAD256u( curInput1[argIndex]),\
                                    lanesL23 = LOAD256u( curInput2[argIndex]),\
                                    lanesH23 = LOAD256u( curInput3[argIndex]),\
                                    XOReq256( lanes0, lanesL01 ),\
                                    XOReq256( lanes1, lanesH01 ),\
                                    XOReq256( lanes2, lanesL23 ),\
                                    XOReq256( lanes3, lanesH23 ),\
                                    STORE256u( curOutput0[argIndex], lanes0 ),\
                                    STORE256u( curOutput1[argIndex], lanes1 ),\
                                    STORE256u( curOutput2[argIndex], lanes2 ),\
                                    STORE256u( curOutput3[argIndex], lanes3 )

    if ( laneCount >= 16 )  {
        ExtrXor4( 0 );
        ExtrXor4( 4 );
        ExtrXor4( 8 );
        ExtrXor4( 12 );
        if ( laneCount >= 20 )  {
            ExtrXor4( 16 );
            for(i=20; i<laneCount; i++)
                ExtrXor( i );
        }
        else {
            for(i=16; i<laneCount; i++)
                ExtrXor( i );
        }
    }
    else {
        for(i=0; i<laneCount; i++)
            ExtrXor( i );
    }
    #undef  ExtrXor
    #undef  ExtrXor4
}