in ring/crypto/cipher_extra/asm/chacha20_poly1305_x86_64.pl [1925:2157]
sub \$12*32, $itr1
and \$-16, $itr1
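# The interleaved hashing above covers only part of the remaining ciphertext.
# $itr1 now holds the byte count still to be absorbed by Poly1305 (rounded
# down to a 16-byte multiple); the loop below hashes it 16 bytes per
# iteration before the keystream is finalized.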
.Lopen_avx2_tail_512_hash:
test $itr1, $itr1
je .Lopen_avx2_tail_512_done\n";
&poly_add("0*8($itr2)");
&poly_mul_mulx(); $code.="
lea 2*8($itr2), $itr2
sub \$2*8, $itr1
jmp .Lopen_avx2_tail_512_hash
.Lopen_avx2_tail_512_done: \n";
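# Add the saved initial state back into all four block states to complete the
# keystream. $A0 is spilled to scratch space so it can be reused as the spare
# register for the first xor_stream_avx2 call; the three full 128-byte chunks
# are written out, and the final (possibly partial) chunk's keystream is left
# in $A0..$D0 for the byte tail below.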
&finalize_state_avx2(4); $code.="
vmovdqa $A0, $tmp_store\n";
&xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
vmovdqa $tmp_store, $A0\n";
&xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
&xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
&finish_stream_avx2($A0, $B0, $C0, $D0, $A3); $code.="
lea 12*32($inp), $inp
lea 12*32($oup), $oup
sub \$12*32, $inl
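# XOR out the remaining data (at most 128 bytes) 32 bytes at a time. $A0
# always holds the next 32 bytes of keystream; after each store the registers
# rotate down ($B0 -> $A0, $C0 -> $B0, $D0 -> $C0).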
.Lopen_avx2_tail_128_xor:
cmp \$32, $inl
jb .Lopen_avx2_tail_32_xor
sub \$32, $inl
vpxor ($inp), $A0, $A0
vmovdqu $A0, ($oup)
lea 1*32($inp), $inp
lea 1*32($oup), $oup
vmovdqa $B0, $A0
vmovdqa $C0, $B0
vmovdqa $D0, $C0
jmp .Lopen_avx2_tail_128_xor
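# Fewer than 32 bytes remain. If at least 16 are left, decrypt one 16-byte
# block with the low lane of $A0; either way the keystream for the final
# 1..15 bytes ends up in $A1x for the shared SSE tail at .Lopen_sse_tail_16.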
.Lopen_avx2_tail_32_xor:
cmp \$16, $inl
vmovdqa $A0x, $A1x
jb .Lopen_avx2_exit
sub \$16, $inl
# Load and decrypt the last full 16-byte block
vpxor ($inp), $A0x, $A1x
vmovdqu $A1x, ($oup)
lea 1*16($inp), $inp
lea 1*16($oup), $oup
vperm2i128 \$0x11, $A0, $A0, $A0
vmovdqa $A0x, $A1x
.Lopen_avx2_exit:
vzeroupper
jmp .Lopen_sse_tail_16
###############################################################################
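# Special path for inputs of at most 192 bytes: generate just two double-block
# states ($A0..$D0 and $A1..$D1), derive the Poly1305 key from the first 32
# bytes of keystream, and fall into the generic short-input loop below.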
.Lopen_avx2_192:
vmovdqa $A0, $A1
vmovdqa $A0, $A2
vmovdqa $B0, $B1
vmovdqa $B0, $B2
vmovdqa $C0, $C1
vmovdqa $C0, $C2
vpaddd .Lavx2_inc(%rip), $D0, $D1
vmovdqa $D0, $T2
vmovdqa $D1, $T3
mov \$10, $acc0
.Lopen_avx2_192_rounds: \n";
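# Each iteration below is one ChaCha20 double round (a column round followed
# by a diagonal round) over both states; ten iterations give the full 20 rounds.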
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
&chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
&chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); $code.="
dec $acc0
jne .Lopen_avx2_192_rounds
vpaddd $A2, $A0, $A0
vpaddd $A2, $A1, $A1
vpaddd $B2, $B0, $B0
vpaddd $B2, $B1, $B1
vpaddd $C2, $C0, $C0
vpaddd $C2, $C1, $C1
vpaddd $T2, $D0, $D0
vpaddd $T3, $D1, $D1
vperm2i128 \$0x02, $A0, $B0, $T0
# Clamp the first 32 bytes of keystream and store them as the Poly1305 key
vpand .Lclamp(%rip), $T0, $T0
vmovdqa $T0, $r_store
# Stream for up to 192 bytes
vperm2i128 \$0x13, $A0, $B0, $A0
vperm2i128 \$0x13, $C0, $D0, $B0
vperm2i128 \$0x02, $A1, $B1, $C0
vperm2i128 \$0x02, $C1, $D1, $D0
vperm2i128 \$0x13, $A1, $B1, $A1
vperm2i128 \$0x13, $C1, $D1, $B1
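# Common short-input loop (also entered from the 320-byte path below): hash
# and decrypt 32 bytes per iteration, rotating the keystream registers so
# that $A0 always holds the next 32 bytes.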
.Lopen_avx2_short:
mov $adl, $itr2
call poly_hash_ad_internal
.Lopen_avx2_short_hash_and_xor_loop:
cmp \$32, $inl
jb .Lopen_avx2_short_tail_32
sub \$32, $inl\n";
# Load + hash
&poly_add("0*8($inp)");
&poly_mul();
&poly_add("2*8($inp)");
&poly_mul(); $code.="
# Load + decrypt
vpxor ($inp), $A0, $A0
vmovdqu $A0, ($oup)
lea 1*32($inp), $inp
lea 1*32($oup), $oup
# Shift stream
vmovdqa $B0, $A0
vmovdqa $C0, $B0
vmovdqa $D0, $C0
vmovdqa $A1, $D0
vmovdqa $B1, $A1
vmovdqa $C1, $B1
vmovdqa $D1, $C1
vmovdqa $A2, $D1
vmovdqa $B2, $A2
jmp .Lopen_avx2_short_hash_and_xor_loop
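# 16..31 bytes left: hash and decrypt one more 16-byte block with the low
# lane of $A0, then pull the high lane down into $A1x; anything under 16
# bytes is finished by the SSE 16-byte tail.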
.Lopen_avx2_short_tail_32:
cmp \$16, $inl
vmovdqa $A0x, $A1x
jb .Lopen_avx2_short_tail_32_exit
sub \$16, $inl\n";
&poly_add("0*8($inp)");
&poly_mul(); $code.="
vpxor ($inp), $A0x, $A3x
vmovdqu $A3x, ($oup)
lea 1*16($inp), $inp
lea 1*16($oup), $oup
vextracti128 \$1, $A0, $A1x
.Lopen_avx2_short_tail_32_exit:
vzeroupper
jmp .Lopen_sse_tail_16
###############################################################################
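# As above, but for inputs of at most 320 bytes: three double-block states
# ($A0..$D0, $A1..$D1, $A2..$D2) provide the Poly1305 key plus enough
# keystream, and the shared short-input loop does the rest.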
.Lopen_avx2_320:
vmovdqa $A0, $A1
vmovdqa $A0, $A2
vmovdqa $B0, $B1
vmovdqa $B0, $B2
vmovdqa $C0, $C1
vmovdqa $C0, $C2
vpaddd .Lavx2_inc(%rip), $D0, $D1
vpaddd .Lavx2_inc(%rip), $D1, $D2
vmovdqa $B0, $T1
vmovdqa $C0, $T2
vmovdqa $D0, $ctr0_store
vmovdqa $D1, $ctr1_store
vmovdqa $D2, $ctr2_store
mov \$10, $acc0
.Lopen_avx2_320_rounds: \n";
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
&chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
&chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
&chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
&chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
dec $acc0
jne .Lopen_avx2_320_rounds
vpaddd .Lchacha20_consts(%rip), $A0, $A0
vpaddd .Lchacha20_consts(%rip), $A1, $A1
vpaddd .Lchacha20_consts(%rip), $A2, $A2
vpaddd $T1, $B0, $B0
vpaddd $T1, $B1, $B1
vpaddd $T1, $B2, $B2
vpaddd $T2, $C0, $C0
vpaddd $T2, $C1, $C1
vpaddd $T2, $C2, $C2
vpaddd $ctr0_store, $D0, $D0
vpaddd $ctr1_store, $D1, $D1
vpaddd $ctr2_store, $D2, $D2
vperm2i128 \$0x02, $A0, $B0, $T0
# Clamp the first 32 bytes of keystream and store them as the Poly1305 key
vpand .Lclamp(%rip), $T0, $T0
vmovdqa $T0, $r_store
# Stream for up to 320 bytes
vperm2i128 \$0x13, $A0, $B0, $A0
vperm2i128 \$0x13, $C0, $D0, $B0
vperm2i128 \$0x02, $A1, $B1, $C0
vperm2i128 \$0x02, $C1, $D1, $D0
vperm2i128 \$0x13, $A1, $B1, $A1
vperm2i128 \$0x13, $C1, $D1, $B1
vperm2i128 \$0x02, $A2, $B2, $C1
vperm2i128 \$0x02, $C2, $D2, $D1
vperm2i128 \$0x13, $A2, $B2, $A2
vperm2i128 \$0x13, $C2, $D2, $B2
jmp .Lopen_avx2_short
.size chacha20_poly1305_open_avx2, .-chacha20_poly1305_open_avx2
.cfi_endproc
###############################################################################
###############################################################################
.type chacha20_poly1305_seal_avx2,\@abi-omnipotent
.align 64
chacha20_poly1305_seal_avx2:
.cfi_startproc
# The AVX2 code runs in the stack frame set up by the SSE function, so just replicate that frame's CFI state here
.cfi_push %rbp
.cfi_push %rbx
.cfi_push %r12
.cfi_push %r13
.cfi_push %r14
.cfi_push %r15
.cfi_push $keyp
.cfi_adjust_cfa_offset 288 + 32
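# 288 + 32 appears to mirror the stack reservation made by the SSE prologue
# (scratch area plus slack for 32-byte alignment of the AVX2 spill slots).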
vzeroupper
vmovdqa .Lchacha20_consts(%rip), $A0
vbroadcasti128 0*16($keyp), $B0
vbroadcasti128 1*16($keyp), $C0
vbroadcasti128 2*16($keyp), $D0
vpaddd .Lavx2_init(%rip), $D0, $D0
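# Build the initial ChaCha20 state, replicated across both 128-bit lanes:
# constants in $A0, the two key halves in $B0/$C0, and the counter/nonce
# block in $D0 (.Lavx2_init offsets the per-lane block counters so that each
# ymm register pair covers two consecutive blocks).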
cmp \$6*32, $inl
jbe .Lseal_avx2_192
cmp \$10*32, $inl
jbe .Lseal_avx2_320
vmovdqa $A0, $A1
vmovdqa $A0, $A2
vmovdqa $A0, $A3
vmovdqa $B0, $B1
vmovdqa $B0, $B2
vmovdqa $B0, $B3
vmovdqa $B0, $state1_store
vmovdqa $C0, $C1
vmovdqa $C0, $C2
vmovdqa $C0, $C3
vmovdqa $C0, $state2_store
vmovdqa $D0, $D3
vpaddd .Lavx2_inc(%rip), $D3, $D2
vpaddd .Lavx2_inc(%rip), $D2, $D1
vpaddd .Lavx2_inc(%rip), $D1, $D0
vmovdqa $D0, $ctr0_store
vmovdqa $D1, $ctr1_store
vmovdqa $D2, $ctr2_store
vmovdqa $D3, $ctr3_store
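# The counters are assigned in reverse: $D3 keeps the lowest block counter and
# $D0 the highest, so the block pair generated from $A3/$B3/$C3/$D3 comes
# first in the stream; its leading 32 bytes are what later get clamped into
# the Poly1305 key.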
mov \$10, $acc0
.Lseal_avx2_init_rounds: \n";
foreach $l (@loop_body) {$code.=$l."\n";}