# Excerpt from ring/crypto/cipher_extra/asm/chacha20_poly1305_x86_64.pl [1925:2157]

    sub \$12*32, $itr1
    and \$-16, $itr1
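    # Hash length for the catch-up loop below, rounded down to whole 16-byte Poly1305 blocks.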
.Lopen_avx2_tail_512_hash:
        test $itr1, $itr1
        je .Lopen_avx2_tail_512_done\n";
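        # Catch-up loop: absorb one 16-byte Poly1305 block per iteration using the MULX-based multiply.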
        &poly_add("0*8($itr2)");
        &poly_mul_mulx(); $code.="
        lea 2*8($itr2), $itr2
        sub \$2*8, $itr1
    jmp .Lopen_avx2_tail_512_hash
.Lopen_avx2_tail_512_done: \n";
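    # Add the saved initial state back into all four register sets, completing the ChaCha20 blocks.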
    &finalize_state_avx2(4); $code.="
    vmovdqa $A0, $tmp_store\n";
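    # $A0 was spilled so xor_stream_avx2 can use it as scratch; it is restored from $tmp_store afterwards.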
    &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
    vmovdqa $tmp_store, $A0\n";
    &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
    &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
    &finish_stream_avx2($A0, $B0, $C0, $D0, $A3); $code.="
    lea 12*32($inp), $inp
    lea 12*32($oup), $oup
    sub \$12*32, $inl
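    # Up to 4*32 bytes of key stream remain in registers; consume 32 bytes per iteration, shifting the stream registers down after each block.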
.Lopen_avx2_tail_128_xor:
    cmp \$32, $inl
    jb .Lopen_avx2_tail_32_xor
        sub \$32, $inl
        vpxor ($inp), $A0, $A0
        vmovdqu $A0, ($oup)
        lea 1*32($inp), $inp
        lea 1*32($oup), $oup
        vmovdqa $B0, $A0
        vmovdqa $C0, $B0
        vmovdqa $D0, $C0
    jmp .Lopen_avx2_tail_128_xor
.Lopen_avx2_tail_32_xor:
    cmp \$16, $inl
    vmovdqa $A0x, $A1x
    jb .Lopen_avx2_exit
    sub \$16, $inl
    # Load + decrypt the final full 16-byte block
    vpxor ($inp), $A0x, $A1x
    vmovdqu $A1x, ($oup)
    lea 1*16($inp), $inp
    lea 1*16($oup), $oup
    vperm2i128 \$0x11, $A0, $A0, $A0
    vmovdqa $A0x, $A1x
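    # The old high lane now sits in the low lane, giving the tail code its next 16 bytes of key stream.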
.Lopen_avx2_exit:
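    # Fewer than 16 bytes remain; they are handled by the shared SSE tail path.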
    vzeroupper
    jmp .Lopen_sse_tail_16
###############################################################################
.Lopen_avx2_192:
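    # Special path for at most 192 bytes: four ChaCha20 blocks across two register sets; 32 bytes are clamped into the Poly1305 key, the rest is key stream.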
    vmovdqa $A0, $A1
    vmovdqa $A0, $A2
    vmovdqa $B0, $B1
    vmovdqa $B0, $B2
    vmovdqa $C0, $C1
    vmovdqa $C0, $C2
    vpaddd .Lavx2_inc(%rip), $D0, $D1
    vmovdqa $D0, $T2
    vmovdqa $D1, $T3
    mov \$10, $acc0
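    # 10 iterations of column plus diagonal quarter-rounds give the full 20 ChaCha20 rounds.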
.Lopen_avx2_192_rounds: \n";
        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); $code.="
        dec $acc0
    jne .Lopen_avx2_192_rounds
    vpaddd $A2, $A0, $A0
    vpaddd $A2, $A1, $A1
    vpaddd $B2, $B0, $B0
    vpaddd $B2, $B1, $B1
    vpaddd $C2, $C0, $C0
    vpaddd $C2, $C1, $C1
    vpaddd $T2, $D0, $D0
    vpaddd $T3, $D1, $D1
    vperm2i128 \$0x02, $A0, $B0, $T0
    # Clamp and store the key
    vpand .Lclamp(%rip), $T0, $T0
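    # .Lclamp masks the low 16 bytes (r) as Poly1305 requires and leaves the high 16 bytes (s) unmodified.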
    vmovdqa $T0, $r_store
    # Stream for up to 192 bytes
    vperm2i128 \$0x13, $A0, $B0, $A0
    vperm2i128 \$0x13, $C0, $D0, $B0
    vperm2i128 \$0x02, $A1, $B1, $C0
    vperm2i128 \$0x02, $C1, $D1, $D0
    vperm2i128 \$0x13, $A1, $B1, $A1
    vperm2i128 \$0x13, $C1, $D1, $B1
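    # The vperm2i128 \$0x02/\$0x13 pairs interleave the 128-bit lanes, turning the two-blocks-per-register layout into sequential 64-byte stream blocks.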
.Lopen_avx2_short:
    mov $adl, $itr2
    call poly_hash_ad_internal
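    # Hash two 16-byte Poly1305 blocks and decrypt 32 bytes per iteration while a full 32 bytes remain.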
.Lopen_avx2_short_hash_and_xor_loop:
        cmp \$32, $inl
        jb .Lopen_avx2_short_tail_32
        sub \$32, $inl\n";
        # Load + hash
        &poly_add("0*8($inp)");
        &poly_mul();
        &poly_add("2*8($inp)");
        &poly_mul(); $code.="
        # Load + decrypt
        vpxor ($inp), $A0, $A0
        vmovdqu $A0, ($oup)
        lea 1*32($inp), $inp
        lea 1*32($oup), $oup
        # Shift stream
        vmovdqa $B0, $A0
        vmovdqa $C0, $B0
        vmovdqa $D0, $C0
        vmovdqa $A1, $D0
        vmovdqa $B1, $A1
        vmovdqa $C1, $B1
        vmovdqa $D1, $C1
        vmovdqa $A2, $D1
        vmovdqa $B2, $A2
    jmp .Lopen_avx2_short_hash_and_xor_loop
.Lopen_avx2_short_tail_32:
    cmp \$16, $inl
    vmovdqa $A0x, $A1x
    jb .Lopen_avx2_short_tail_32_exit
    sub \$16, $inl\n";
    &poly_add("0*8($inp)");
    &poly_mul(); $code.="
    vpxor ($inp), $A0x, $A3x
    vmovdqu $A3x, ($oup)
    lea 1*16($inp), $inp
    lea 1*16($oup), $oup
    vextracti128 \$1, $A0, $A1x
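    # Expose the next 16 bytes of key stream in the low lane for the final partial block.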
.Lopen_avx2_short_tail_32_exit:
    vzeroupper
    jmp .Lopen_sse_tail_16
###############################################################################
.Lopen_avx2_320:
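    # Special path for at most 320 bytes: six ChaCha20 blocks across three register sets; 32 bytes become the Poly1305 key, leaving up to 320 bytes of key stream.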
    vmovdqa $A0, $A1
    vmovdqa $A0, $A2
    vmovdqa $B0, $B1
    vmovdqa $B0, $B2
    vmovdqa $C0, $C1
    vmovdqa $C0, $C2
    vpaddd .Lavx2_inc(%rip), $D0, $D1
    vpaddd .Lavx2_inc(%rip), $D1, $D2
    vmovdqa $B0, $T1
    vmovdqa $C0, $T2
    vmovdqa $D0, $ctr0_store
    vmovdqa $D1, $ctr1_store
    vmovdqa $D2, $ctr2_store
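    # Save the three counter vectors; they are re-added to the D registers once the rounds finish.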
    mov \$10, $acc0
.Lopen_avx2_320_rounds:  \n";
        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
        &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
        &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
        dec $acc0
    jne .Lopen_avx2_320_rounds
    vpaddd .Lchacha20_consts(%rip), $A0, $A0
    vpaddd .Lchacha20_consts(%rip), $A1, $A1
    vpaddd .Lchacha20_consts(%rip), $A2, $A2
    vpaddd $T1, $B0, $B0
    vpaddd $T1, $B1, $B1
    vpaddd $T1, $B2, $B2
    vpaddd $T2, $C0, $C0
    vpaddd $T2, $C1, $C1
    vpaddd $T2, $C2, $C2
    vpaddd $ctr0_store, $D0, $D0
    vpaddd $ctr1_store, $D1, $D1
    vpaddd $ctr2_store, $D2, $D2
    vperm2i128 \$0x02, $A0, $B0, $T0
    # Clamp and store the key
    vpand .Lclamp(%rip), $T0, $T0
    vmovdqa $T0, $r_store
    # Stream for up to 320 bytes
    vperm2i128 \$0x13, $A0, $B0, $A0
    vperm2i128 \$0x13, $C0, $D0, $B0
    vperm2i128 \$0x02, $A1, $B1, $C0
    vperm2i128 \$0x02, $C1, $D1, $D0
    vperm2i128 \$0x13, $A1, $B1, $A1
    vperm2i128 \$0x13, $C1, $D1, $B1
    vperm2i128 \$0x02, $A2, $B2, $C1
    vperm2i128 \$0x02, $C2, $D2, $D1
    vperm2i128 \$0x13, $A2, $B2, $A2
    vperm2i128 \$0x13, $C2, $D2, $B2
    jmp .Lopen_avx2_short
.size chacha20_poly1305_open_avx2, .-chacha20_poly1305_open_avx2
.cfi_endproc
###############################################################################
###############################################################################
.type chacha20_poly1305_seal_avx2,\@abi-omnipotent
.align 64
chacha20_poly1305_seal_avx2:
.cfi_startproc

# The AVX2 code runs inside the stack frame already established by the SSE function, so we simply mirror that frame's CFI state here.
.cfi_push %rbp
.cfi_push %rbx
.cfi_push %r12
.cfi_push %r13
.cfi_push %r14
.cfi_push %r15
.cfi_push $keyp
.cfi_adjust_cfa_offset 288 + 32

    vzeroupper
    vmovdqa .Lchacha20_consts(%rip), $A0
    vbroadcasti128 0*16($keyp), $B0
    vbroadcasti128 1*16($keyp), $C0
    vbroadcasti128 2*16($keyp), $D0
    vpaddd .Lavx2_init(%rip), $D0, $D0
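    # .Lavx2_init staggers the block counter across the two 128-bit lanes, so each ymm register set covers two consecutive ChaCha20 blocks.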
    cmp \$6*32, $inl
    jbe .Lseal_avx2_192
    cmp \$10*32, $inl
    jbe .Lseal_avx2_320
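    # Size-based dispatch: at most 6*32 bytes takes the two-set path, at most 10*32 bytes the three-set path; larger inputs use the full four-set setup below.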
    vmovdqa $A0, $A1
    vmovdqa $A0, $A2
    vmovdqa $A0, $A3
    vmovdqa $B0, $B1
    vmovdqa $B0, $B2
    vmovdqa $B0, $B3
    vmovdqa $B0, $state1_store
    vmovdqa $C0, $C1
    vmovdqa $C0, $C2
    vmovdqa $C0, $C3
    vmovdqa $C0, $state2_store
    vmovdqa $D0, $D3
    vpaddd .Lavx2_inc(%rip), $D3, $D2
    vpaddd .Lavx2_inc(%rip), $D2, $D1
    vpaddd .Lavx2_inc(%rip), $D1, $D0
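    # Counters are assigned in descending order: register set 3 gets the lowest block counters (the first bytes of stream), set 0 the highest.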
    vmovdqa $D0, $ctr0_store
    vmovdqa $D1, $ctr1_store
    vmovdqa $D2, $ctr2_store
    vmovdqa $D3, $ctr3_store
    mov \$10, $acc0
.Lseal_avx2_init_rounds: \n";
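        # @loop_body, built earlier in the file, holds the unrolled double-round body covering all four register sets.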
        foreach $l (@loop_body) {$code.=$l."\n";}