size_t KeccakF1600times4_FastLoop_Absorb()

in pq-crypto/kyber_r3/KeccakP-1600-times4-SIMD256_avx2.c [1071:1176]


size_t KeccakF1600times4_FastLoop_Absorb(void *states, unsigned int laneCount, unsigned int laneOffsetParallel, unsigned int laneOffsetSerial, const unsigned char *data, size_t dataByteLen)
{
    if (laneCount == 21) {
#if 0
        const unsigned char *dataStart = data;
        const UINT64 *curData0 = (const UINT64 *)data;
        const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes);
        const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes);
        const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes);

        while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
            V256 *stateAsLanes = (V256 *)states;
            V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
            #define Xor_In( argIndex ) \
                XOReq256(stateAsLanes[argIndex], LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
            #define Xor_In4( argIndex ) \
                lanes0 = LOAD256u( curData0[argIndex]),\
                lanes1 = LOAD256u( curData1[argIndex]),\
                lanes2 = LOAD256u( curData2[argIndex]),\
                lanes3 = LOAD256u( curData3[argIndex]),\
                INTLEAVE(),\
                XOReq256( stateAsLanes[argIndex+0], lanes0 ),\
                XOReq256( stateAsLanes[argIndex+1], lanes1 ),\
                XOReq256( stateAsLanes[argIndex+2], lanes2 ),\
                XOReq256( stateAsLanes[argIndex+3], lanes3 )
            Xor_In4( 0 );
            Xor_In4( 4 );
            Xor_In4( 8 );
            Xor_In4( 12 );
            Xor_In4( 16 );
            Xor_In( 20 );
            #undef  Xor_In
            #undef  Xor_In4
            KeccakP1600times4_PermuteAll_24rounds(states);
            curData0 += laneOffsetSerial;
            curData1 += laneOffsetSerial;
            curData2 += laneOffsetSerial;
            curData3 += laneOffsetSerial;
            dataByteLen -= laneOffsetSerial*8;
        }
        return (const unsigned char *)curData0 - dataStart;
#else
//        unsigned int i;
        const unsigned char *dataStart = data;
        // correcting cast-align errors
        // old version: const UINT64 *curData0 = (const UINT64 *)data;
        const UINT64 *curData0 = (const void *)data;
        // old version: const UINT64 *curData1 = (const UINT64 *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes);
        const UINT64 *curData1 = (const void *)(data+laneOffsetParallel*1*SnP_laneLengthInBytes);
        // old version: const UINT64 *curData2 = (const UINT64 *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes);
        const UINT64 *curData2 = (const void *)(data+laneOffsetParallel*2*SnP_laneLengthInBytes);
        // old version: const UINT64 *curData3 = (const UINT64 *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes);
        const UINT64 *curData3 = (const void *)(data+laneOffsetParallel*3*SnP_laneLengthInBytes);
        V256 *statesAsLanes = (V256 *)states;
        declareABCDE

        copyFromState(A, statesAsLanes)
        while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
            #define XOR_In( Xxx, argIndex ) \
                XOReq256(Xxx, LOAD4_64(curData3[argIndex], curData2[argIndex], curData1[argIndex], curData0[argIndex]))
            XOR_In( Aba, 0 );
            XOR_In( Abe, 1 );
            XOR_In( Abi, 2 );
            XOR_In( Abo, 3 );
            XOR_In( Abu, 4 );
            XOR_In( Aga, 5 );
            XOR_In( Age, 6 );
            XOR_In( Agi, 7 );
            XOR_In( Ago, 8 );
            XOR_In( Agu, 9 );
            XOR_In( Aka, 10 );
            XOR_In( Ake, 11 );
            XOR_In( Aki, 12 );
            XOR_In( Ako, 13 );
            XOR_In( Aku, 14 );
            XOR_In( Ama, 15 );
            XOR_In( Ame, 16 );
            XOR_In( Ami, 17 );
            XOR_In( Amo, 18 );
            XOR_In( Amu, 19 );
            XOR_In( Asa, 20 );
            #undef XOR_In
            rounds24
            curData0 += laneOffsetSerial;
            curData1 += laneOffsetSerial;
            curData2 += laneOffsetSerial;
            curData3 += laneOffsetSerial;
            dataByteLen -= laneOffsetSerial*8;
        }
        copyToState(statesAsLanes, A)
        return (const unsigned char *)curData0 - dataStart;
#endif
    }
    else {
//        unsigned int i;
        const unsigned char *dataStart = data;

        while(dataByteLen >= (laneOffsetParallel*3 + laneCount)*8) {
            KeccakP1600times4_AddLanesAll(states, data, laneCount, laneOffsetParallel);
            KeccakP1600times4_PermuteAll_24rounds(states);
            data += laneOffsetSerial*8;
            dataByteLen -= laneOffsetSerial*8;
        }
        return data - dataStart;
    }
}