# ring/crypto/cipher_extra/asm/chacha20_poly1305_x86_64.pl [1016:1373]

        sub \$12*16, $inl
        lea 12*16($inp), $inp
        jmp .Lseal_sse_128_tail_hash
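# XOR out the last 64-byte chunk of this iteration's keystream (the
# earlier chunks were handled above), advance past the full 256 bytes,
# and loop again while more than 192 bytes of input remain.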
.Lseal_sse_main_loop_xor: \n";
        &xor_stream($A0,$B0,$C0,$D0,"12*16"); $code.="
        lea 16*16($inp), $inp
        sub \$16*16, $inl
        mov \$6, $itr1
        mov \$4, $itr2
        cmp \$12*16, $inl
    jg .Lseal_sse_main_loop
    mov $inl, $itr1
    test $inl, $inl
    je .Lseal_sse_128_tail_hash
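    # 1 to 192 bytes remain, and the previous iteration's 256 bytes of
    # ciphertext are still unhashed. Each tail below interleaves those
    # 16 Poly1305 blocks with the 10 ChaCha20 double rounds: $itr1 = 6
    # rounds hash two blocks each, the remaining four hash one each.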
    mov \$6, $itr1
    cmp \$8*16, $inl
    ja .Lseal_sse_tail_192
    cmp \$4*16, $inl
    ja .Lseal_sse_tail_128
###############################################################################
.Lseal_sse_tail_64: \n";
    &prep_state(1); $code.="
.Lseal_sse_tail_64_rounds_and_x2hash: \n";
        &poly_add("0($oup)");
        &poly_mul(); $code.="
        lea 16($oup), $oup
.Lseal_sse_tail_64_rounds_and_x1hash: \n";
        &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
        &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
        &poly_add("0($oup)");
        &poly_mul(); $code.="
        lea 16($oup), $oup
    dec $itr1
    jg .Lseal_sse_tail_64_rounds_and_x2hash
    dec $itr2
    jge .Lseal_sse_tail_64_rounds_and_x1hash\n";
    &finalize_state(1); $code.="
    jmp .Lseal_sse_128_tail_xor
###############################################################################
.Lseal_sse_tail_128:\n";
    &prep_state(2); $code.="
.Lseal_sse_tail_128_rounds_and_x2hash: \n";
        &poly_add("0($oup)");
        &poly_mul(); $code.="
        lea 16($oup), $oup
.Lseal_sse_tail_128_rounds_and_x1hash: \n";
        &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
        &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
        &poly_add("0($oup)");
        &poly_mul();
        &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
        &chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); $code.="
        lea 16($oup), $oup
    dec $itr1
    jg .Lseal_sse_tail_128_rounds_and_x2hash
    dec $itr2
    jge .Lseal_sse_tail_128_rounds_and_x1hash\n";
    &finalize_state(2);
    &xor_stream($A1,$B1,$C1,$D1,0*16); $code.="
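    # State 1 covered the first 64 bytes; state 0 stays in registers as
    # keystream for .Lseal_sse_128_tail_xor. Queue the 64 bytes just
    # written for hashing.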
    mov \$4*16, $itr1
    sub \$4*16, $inl
    lea 4*16($inp), $inp
    jmp .Lseal_sse_128_tail_hash
###############################################################################
.Lseal_sse_tail_192:\n";
    &prep_state(3); $code.="
.Lseal_sse_tail_192_rounds_and_x2hash: \n";
        &poly_add("0($oup)");
        &poly_mul(); $code.="
        lea 16($oup), $oup
.Lseal_sse_tail_192_rounds_and_x1hash: \n";
        &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
        &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
        &chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
        &poly_add("0($oup)");
        &poly_mul();
        &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
        &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
        &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
        lea 16($oup), $oup
    dec $itr1
    jg .Lseal_sse_tail_192_rounds_and_x2hash
    dec $itr2
    jge .Lseal_sse_tail_192_rounds_and_x1hash\n";
    &finalize_state(3);
    &xor_stream($A2,$B2,$C2,$D2,0*16);
    &xor_stream($A1,$B1,$C1,$D1,4*16); $code.="
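    # States 2 and 1 covered the first 128 bytes; state 0 stays in
    # registers for the final chunk. Queue the 128 bytes just written
    # for hashing, then fall through.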
    mov \$8*16, $itr1
    sub \$8*16, $inl
    lea 8*16($inp), $inp
###############################################################################
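# Hash $itr1 bytes of pending ciphertext, one 16-byte block at a time.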
.Lseal_sse_128_tail_hash:
        cmp \$16, $itr1
        jb .Lseal_sse_128_tail_xor\n";
        &poly_add("0($oup)");
        &poly_mul(); $code.="
        sub \$16, $itr1
        lea 16($oup), $oup
    jmp .Lseal_sse_128_tail_hash

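# Encrypt the remaining whole 16-byte chunks with the keystream queued
# in $A0..$D1 (up to 128 bytes), hashing each ciphertext block right
# after it is written; the registers are rotated down after each chunk.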
.Lseal_sse_128_tail_xor:
        cmp \$16, $inl
        jb .Lseal_sse_tail_16
        sub \$16, $inl
        # Load the plaintext block and encrypt it
        movdqu 0*16($inp), $T0
        pxor $T0, $A0
        movdqu $A0, 0*16($oup)
        # Then hash the new ciphertext block; the adc \$1 adds the
        # Poly1305 pad bit (2^128) above the 16 message bytes.
        add 0*8($oup), $acc0
        adc 1*8($oup), $acc1
        adc \$1, $acc2
        lea 1*16($inp), $inp
        lea 1*16($oup), $oup\n";
        &poly_mul(); $code.="
        # Shift the stream left
        movdqa $B0, $A0
        movdqa $C0, $B0
        movdqa $D0, $C0
        movdqa $A1, $D0
        movdqa $B1, $A1
        movdqa $C1, $B1
        movdqa $D1, $C1
    jmp .Lseal_sse_128_tail_xor

.Lseal_sse_tail_16:
    test $inl, $inl
    jz .Lprocess_blocks_of_extra_in
    # We can only load the plaintext one byte at a time, to avoid
    # reading past the end of the buffer; walk backwards from the last
    # byte, shifting each one in at position zero.
    mov $inl, $itr2
    mov $inl, $itr1
    lea -1($inp,$inl), $inp
    pxor $T3, $T3
.Lseal_sse_tail_16_compose:
        pslldq \$1, $T3
        pinsrb \$0, ($inp), $T3
        lea -1($inp), $inp
        dec $itr1
        jne .Lseal_sse_tail_16_compose

    # XOR the keystream with the plaintext.
    pxor $A0, $T3

    # Write ciphertext out, byte-by-byte.
    movq $inl, $itr1
    movdqu $T3, $A0
.Lseal_sse_tail_16_extract:
        pextrb \$0, $A0, ($oup)
        psrldq \$1, $A0
        add \$1, $oup
        sub \$1, $itr1
        jnz .Lseal_sse_tail_16_extract

    # $T3 contains the final (partial, non-empty) block of ciphertext which
    # needs to be fed into the Poly1305 state. The right-most $inl bytes of it
    # are valid. We need to fill it with extra_in bytes until full, or until we
    # run out of bytes.
    #
    # $keyp points to the tag output, which is actually a struct with the
    # extra_in pointer at offset 48 and the extra_in length at offset 56.
    movq 288 + $xmm_storage + 32(%rsp), $keyp
    movq 56($keyp), $t1  # extra_in_len
    movq 48($keyp), $t0  # extra_in
    test $t1, $t1
    jz .Lprocess_partial_block  # Common case: no bytes of extra_in

    movq \$16, $t2
    subq $inl, $t2  # 16-$inl is the number of bytes that fit into $T3.
    cmpq $t2, $t1   # if extra_in_len < 16-$inl, only copy extra_in_len
                    # (note that AT&T syntax reverses the arguments)
    jge .Lload_extra_in
    movq $t1, $t2

.Lload_extra_in:
    # $t2 contains the number of bytes of extra_in (pointed to by $t0) to load
    # into $T3. They are loaded in reverse order.
    leaq -1($t0,$t2), $inp
    # Update extra_in and extra_in_len to reflect the bytes that are about to
    # be read.
    addq $t2, $t0
    subq $t2, $t1
    movq $t0, 48($keyp)
    movq $t1, 56($keyp)

    # Update $itr2, which is used to select the mask later on, to reflect the
    # extra bytes about to be added.
    addq $t2, $itr2

    # Load $t2 bytes of extra_in into $T2.
    pxor $T2, $T2
.Lload_extra_load_loop:
        pslldq \$1, $T2
        pinsrb \$0, ($inp), $T2
        lea -1($inp), $inp
        sub \$1, $t2
        jnz .Lload_extra_load_loop

    # Shift $T2 up the length of the remainder from the main encryption. Sadly,
    # the shift for an XMM register has to be a constant, thus we loop to do
    # this.
    movq $inl, $t2

.Lload_extra_shift_loop:
        pslldq \$1, $T2
        sub \$1, $t2
        jnz .Lload_extra_shift_loop

    # Mask $T3 (the remainder from the main encryption) so that superfluous
    # bytes are zero. This means that the non-zero bytes in $T2 and $T3 are
    # disjoint and so we can merge them with an OR.
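    # .Land_masks holds 16-byte entries; entry $inl-1 keeps the low
    # $inl bytes, hence the index ($inl * 16) - 16 below.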
    lea .Land_masks(%rip), $t2
    shl \$4, $inl
    pand -16($t2,$inl), $T3

    # Merge $T2 into $T3, forming the remainder block.
    por $T2, $T3

    # The block of ciphertext + extra_in is ready to be included in the
    # Poly1305 state.
    movq $T3, $t0
    pextrq \$1, $T3, $t1
    add $t0, $acc0
    adc $t1, $acc1
    adc \$1, $acc2\n";
    &poly_mul(); $code.="

.Lprocess_blocks_of_extra_in:
    # There may be additional bytes of extra_in to process.
    movq 288 + 32 + $xmm_storage(%rsp), $keyp
    movq 48($keyp), $inp   # extra_in
    movq 56($keyp), $itr2  # extra_in_len
    movq $itr2, $itr1
    shr \$4, $itr2         # number of blocks

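    # The jz below sees ZF from the shr above on the first pass, and
    # from the subq at the bottom of the loop on later passes.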
.Lprocess_extra_hash_loop:
        jz .Lprocess_extra_in_trailer\n";
        &poly_add("0($inp)");
        &poly_mul(); $code.="
        leaq 16($inp), $inp
        subq \$1, $itr2
        jmp .Lprocess_extra_hash_loop
.Lprocess_extra_in_trailer:
    andq \$15, $itr1       # remaining num bytes (<16) of extra_in
    movq $itr1, $inl
    jz .Ldo_length_block
    leaq -1($inp,$itr1), $inp

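    # Load the trailing (fewer than 16) extra_in bytes into $T3, last
    # byte first.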
.Lprocess_extra_in_trailer_load:
        pslldq \$1, $T3
        pinsrb \$0, ($inp), $T3
        lea -1($inp), $inp
        sub \$1, $itr1
        jnz .Lprocess_extra_in_trailer_load

.Lprocess_partial_block:
    # $T3 contains $inl bytes of data to be fed into Poly1305. $inl != 0
    lea .Land_masks(%rip), $t2
    shl \$4, $inl
    pand -16($t2,$inl), $T3
    movq $T3, $t0
    pextrq \$1, $T3, $t1
    add $t0, $acc0
    adc $t1, $acc1
    adc \$1, $acc2\n";
    &poly_mul(); $code.="

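# Absorb the final Poly1305 block: the AAD and ciphertext lengths.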
.Ldo_length_block:\n";
    &poly_add($len_store);
    &poly_mul(); $code.="
    # Final reduction: subtract p = 2^130 - 5 (64-bit limbs -5, -1, 3).
    # If the subtraction borrows, acc was already < p and cmovc keeps
    # the original value.
    mov $acc0, $t0
    mov $acc1, $t1
    mov $acc2, $t2
    sub \$-5, $acc0
    sbb \$-1, $acc1
    sbb \$3, $acc2
    cmovc $t0, $acc0
    cmovc $t1, $acc1
    cmovc $t2, $acc2
    # Add in s part of the key
    add 0+$s_store, $acc0
    adc 8+$s_store, $acc1\n";

$code.="
    movaps 16*0+$xmm_store, %xmm6
    movaps 16*1+$xmm_store, %xmm7
    movaps 16*2+$xmm_store, %xmm8
    movaps 16*3+$xmm_store, %xmm9
    movaps 16*4+$xmm_store, %xmm10
    movaps 16*5+$xmm_store, %xmm11
    movaps 16*6+$xmm_store, %xmm12
    movaps 16*7+$xmm_store, %xmm13
    movaps 16*8+$xmm_store, %xmm14
    movaps 16*9+$xmm_store, %xmm15\n" if ($win64);
$code.="
.cfi_remember_state
    add \$288 + $xmm_storage + 32, %rsp
.cfi_adjust_cfa_offset -(288 + $xmm_storage + 32)
    # The tag replaces the key on return
    pop $keyp
.cfi_pop $keyp
    mov $acc0, ($keyp)
    mov $acc1, 8($keyp)
    pop %r15
.cfi_pop %r15
    pop %r14
.cfi_pop %r14
    pop %r13
.cfi_pop %r13
    pop %r12
.cfi_pop %r12
    pop %rbx
.cfi_pop %rbx
    pop %rbp
.cfi_pop %rbp
    ret
################################################################################
.Lseal_sse_128:
.cfi_restore_state
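    # Short-input path: three ChaCha20 blocks suffice. States 0 and 1
    # supply up to 128 bytes of keystream; state 2 (lowest counter)
    # yields the clamped Poly1305 key.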
    movdqu .Lchacha20_consts(%rip), $A0\nmovdqa $A0, $A1\nmovdqa $A0, $A2
    movdqu 0*16($keyp), $B0\nmovdqa $B0, $B1\nmovdqa $B0, $B2
    movdqu 1*16($keyp), $C0\nmovdqa $C0, $C1\nmovdqa $C0, $C2
    movdqu 2*16($keyp), $D2
    movdqa $D2, $D0\npaddd .Lsse_inc(%rip), $D0
    movdqa $D0, $D1\npaddd .Lsse_inc(%rip), $D1
    movdqa $B0, $T1\nmovdqa $C0, $T2\nmovdqa $D0, $T3
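    # $acc0 doubles as the round counter here; Poly1305 has not started
    # yet. Ten double rounds follow.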
    mov \$10, $acc0

.Lseal_sse_128_rounds:\n";
        &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
        &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
        &chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
        &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
        &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
        &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
        dec $acc0
    jnz .Lseal_sse_128_rounds
    paddd .Lchacha20_consts(%rip), $A0
    paddd .Lchacha20_consts(%rip), $A1
    paddd .Lchacha20_consts(%rip), $A2
    paddd $T1, $B0\npaddd $T1, $B1\npaddd $T1, $B2
    paddd $T2, $C0\npaddd $T2, $C1
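    # $T3 holds the pre-rounds $D0; bumping it by .Lsse_inc recreates
    # the original $D1.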
    paddd $T3, $D0
    paddd .Lsse_inc(%rip), $T3
    paddd $T3, $D1
    # Clamp and store the key
    pand .Lclamp(%rip), $A2
    movdqa $A2, $r_store
    movdqa $B2, $s_store
    # Hash the additional data
    mov %r8, $itr2
    call poly_hash_ad_internal
    jmp .Lseal_sse_128_tail_xor
.size GFp_chacha20_poly1305_seal, .-GFp_chacha20_poly1305_seal
.cfi_endproc\n";
}

if ($avx>1) {