in umac.c [365:408]
static void nh_aux(void *kp, const void *dp, void *hp, UINT32 dlen)
/* Same as previous nh_aux, but two streams are handled in one pass,
* reading and writing 16 bytes of hash-state per call.
*/
{
UINT64 h1,h2;
UWORD c = dlen / 32;
UINT32 *k = (UINT32 *)kp;
const UINT32 *d = (const UINT32 *)dp;
UINT32 d0,d1,d2,d3,d4,d5,d6,d7;
UINT32 k0,k1,k2,k3,k4,k5,k6,k7,
k8,k9,k10,k11;
h1 = *((UINT64 *)hp);
h2 = *((UINT64 *)hp + 1);
k0 = *(k+0); k1 = *(k+1); k2 = *(k+2); k3 = *(k+3);
do {
d0 = LOAD_UINT32_LITTLE(d+0); d1 = LOAD_UINT32_LITTLE(d+1);
d2 = LOAD_UINT32_LITTLE(d+2); d3 = LOAD_UINT32_LITTLE(d+3);
d4 = LOAD_UINT32_LITTLE(d+4); d5 = LOAD_UINT32_LITTLE(d+5);
d6 = LOAD_UINT32_LITTLE(d+6); d7 = LOAD_UINT32_LITTLE(d+7);
k4 = *(k+4); k5 = *(k+5); k6 = *(k+6); k7 = *(k+7);
k8 = *(k+8); k9 = *(k+9); k10 = *(k+10); k11 = *(k+11);
h1 += MUL64((k0 + d0), (k4 + d4));
h2 += MUL64((k4 + d0), (k8 + d4));
h1 += MUL64((k1 + d1), (k5 + d5));
h2 += MUL64((k5 + d1), (k9 + d5));
h1 += MUL64((k2 + d2), (k6 + d6));
h2 += MUL64((k6 + d2), (k10 + d6));
h1 += MUL64((k3 + d3), (k7 + d7));
h2 += MUL64((k7 + d3), (k11 + d7));
k0 = k8; k1 = k9; k2 = k10; k3 = k11;
d += 8;
k += 8;
} while (--c);
((UINT64 *)hp)[0] = h1;
((UINT64 *)hp)[1] = h2;
}