sub \$16*16, $inl()

in ring/crypto/cipher_extra/asm/chacha20_poly1305_x86_64.pl [558:946]


        sub \$16*16, $inl
    jmp .Lopen_sse_main_loop
.Lopen_sse_tail:
    # Handle the various tail sizes efficiently
    test $inl, $inl
    jz .Lopen_sse_finalize
    cmp \$12*16, $inl
    ja .Lopen_sse_tail_256
    cmp \$8*16, $inl
    ja .Lopen_sse_tail_192
    cmp \$4*16, $inl
    ja .Lopen_sse_tail_128\n";
###############################################################################
    # At most 64 bytes are left
    # Single-block tail: run one ChaCha20 block (state in $A0..$D0) while
    # absorbing one 16-byte ciphertext block into the Poly1305 accumulator
    # per double-round. $itr1 = bytes still to hash; $itr2 advances by 16
    # per double-round, so \$10*16 marks 10 double-rounds (20 ChaCha rounds).
    &prep_state(1); $code.="
    xor $itr2, $itr2
    mov $inl, $itr1
    cmp \$16, $itr1
    jb .Lopen_sse_tail_64_rounds
.Lopen_sse_tail_64_rounds_and_x1hash: \n";
        # Absorb the next 16 ciphertext bytes into the Poly1305 state.
        &poly_add("0($inp,$itr2)");
        &poly_mul(); $code.="
        sub \$16, $itr1
.Lopen_sse_tail_64_rounds:
        add \$16, $itr2\n";
        # One ChaCha double-round: column pass ("left") + diagonal pass ("right").
        &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
        &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); $code.="
        cmp \$16, $itr1
    jae .Lopen_sse_tail_64_rounds_and_x1hash
        cmp \$10*16, $itr2
    jne .Lopen_sse_tail_64_rounds\n";
    # Add the saved initial state back in to turn $A0..$D0 into keystream.
    &finalize_state(1); $code.="
    jmp .Lopen_sse_tail_64_dec_loop
###############################################################################
.Lopen_sse_tail_128:\n";
    # 65 - 128 bytes are left
    # Two-block tail. $itr1 = $inl rounded down to a multiple of 16 (the
    # number of ciphertext bytes hashed during the rounds); one 16-byte
    # block is absorbed per double-round while $itr2 < $itr1, then the
    # remaining double-rounds run without hashing (10*16 total).
    &prep_state(2); $code.="
    mov $inl, $itr1
    and \$-16, $itr1
    xor $itr2, $itr2
.Lopen_sse_tail_128_rounds_and_x1hash: \n";
        # Absorb the next 16 ciphertext bytes into the Poly1305 state.
        &poly_add("0($inp,$itr2)");
        &poly_mul(); $code.="
.Lopen_sse_tail_128_rounds:
        add \$16, $itr2\n";
        # One double-round over both blocks.
        &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
        &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
        &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
        &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");$code.="
        cmp $itr1, $itr2
    jb .Lopen_sse_tail_128_rounds_and_x1hash
        cmp \$10*16, $itr2
    jne .Lopen_sse_tail_128_rounds\n";
    &finalize_state(2);
    # Decrypt the first 64 bytes with block 1; block 0 ($A0..$D0) feeds the
    # 16-byte-at-a-time loop at .Lopen_sse_tail_64_dec_loop.
    &xor_stream($A1, $B1, $C1, $D1, "0*16"); $code.="
    sub \$4*16, $inl
    lea 4*16($inp), $inp
    lea 4*16($oup), $oup
    jmp .Lopen_sse_tail_64_dec_loop
###############################################################################
.Lopen_sse_tail_192:\n";
    # 129 - 192 bytes are left
    # Three-block tail. $itr1 = min($inl, 10*16) rounded down to 16, so at
    # most one 16-byte block is hashed per double-round; ciphertext bytes
    # beyond 10*16 (if any) are hashed separately after the rounds finish.
    &prep_state(3); $code.="
    mov $inl, $itr1
    mov \$10*16, $itr2
    cmp \$10*16, $itr1
    cmovg $itr2, $itr1
    and \$-16, $itr1
    xor $itr2, $itr2
.Lopen_sse_tail_192_rounds_and_x1hash: \n";
        # Absorb the next 16 ciphertext bytes into the Poly1305 state.
        &poly_add("0($inp,$itr2)");
        &poly_mul(); $code.="
.Lopen_sse_tail_192_rounds:
        add \$16, $itr2\n";
        # One double-round over all three blocks.
        &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
        &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
        &chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
        &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
        &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
        &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
        cmp $itr1, $itr2
    jb .Lopen_sse_tail_192_rounds_and_x1hash
        cmp \$10*16, $itr2
    jne .Lopen_sse_tail_192_rounds
    cmp \$11*16, $inl
    jb .Lopen_sse_tail_192_finish\n";
    # The rounds hashed only the first 10*16 bytes; absorb bytes 160..175.
    &poly_add("10*16($inp)");
    &poly_mul(); $code.="
    cmp \$12*16, $inl
    jb .Lopen_sse_tail_192_finish\n";
    # Absorb bytes 176..191 when present.
    &poly_add("11*16($inp)");
    &poly_mul(); $code.="
.Lopen_sse_tail_192_finish: \n";
    &finalize_state(3);
    # Decrypt the first 128 bytes with blocks 2 and 1; block 0 feeds the
    # 16-byte-at-a-time loop below.
    &xor_stream($A2, $B2, $C2, $D2, "0*16");
    &xor_stream($A1, $B1, $C1, $D1, "4*16"); $code.="
    sub \$8*16, $inl
    lea 8*16($inp), $inp
    lea 8*16($oup), $oup
    jmp .Lopen_sse_tail_64_dec_loop
###############################################################################
.Lopen_sse_tail_256:\n";
    # 193 - 255 bytes are left
    # Four-block tail, fully interleaved with Poly1305: every double-round
    # also absorbs 16 ciphertext bytes, with the modular multiplication
    # split into poly_stage1/2/3 + reduce and spread between quarter-round
    # groups to hide latency. The "store"/"load" chacha_qr variants spill
    # and reload state through the stack since all xmm registers are live.
    &prep_state(4); $code.="
    xor $itr2, $itr2
.Lopen_sse_tail_256_rounds_and_x1hash: \n";
        &poly_add("0($inp,$itr2)");
        &chacha_qr($A0,$B0,$C0,$D0,$C3,"store_left");
        &chacha_qr($A1,$B1,$C1,$D1,$C3,"left");
        &chacha_qr($A2,$B2,$C2,$D2,$C3,"left_load");
        &poly_stage1();
        &chacha_qr($A3,$B3,$C3,$D3,$C1,"store_left_load");
        &poly_stage2();
        &chacha_qr($A0,$B0,$C0,$D0,$C3,"store_right");
        &chacha_qr($A1,$B1,$C1,$D1,$C3,"right");
        &poly_stage3();
        &chacha_qr($A2,$B2,$C2,$D2,$C3,"right_load");
        &poly_reduce_stage();
        &chacha_qr($A3,$B3,$C3,$D3,$C1,"store_right_load"); $code.="
        add \$16, $itr2
        cmp \$10*16, $itr2
    jb .Lopen_sse_tail_256_rounds_and_x1hash

    mov $inl, $itr1
    and \$-16, $itr1
.Lopen_sse_tail_256_hash: \n";
        # The rounds hashed exactly 10*16 bytes; catch up on the remaining
        # whole 16-byte ciphertext blocks ($itr1 = $inl rounded down to 16).
        &poly_add("0($inp,$itr2)");
        &poly_mul(); $code.="
        add \$16, $itr2
        cmp $itr1, $itr2
    jb .Lopen_sse_tail_256_hash\n";
    &finalize_state(4);
    # Decrypt the first 192 bytes with blocks 3, 2 and 1. Block 3 needs a
    # scratch register, so $D0 is parked in $tmp_store by
    # xor_stream_using_temp and restored just below; block 0 ($A0..$D0)
    # then feeds the 16-byte-at-a-time decryption loop.
    &xor_stream_using_temp($A3, $B3, $C3, $D3, "0*16", $D0);
    &xor_stream($A2, $B2, $C2, $D2, "4*16");
    &xor_stream($A1, $B1, $C1, $D1, "8*16"); $code.="
    movdqa $tmp_store, $D0
    sub \$12*16, $inl
    lea 12*16($inp), $inp
    lea 12*16($oup), $oup
###############################################################################
    # Decrypt the remaining data, 16B at a time, using existing stream
.Lopen_sse_tail_64_dec_loop:
    cmp \$16, $inl
    jb .Lopen_sse_tail_16_init
        sub \$16, $inl
        movdqu ($inp), $T0
        pxor $T0, $A0
        movdqu $A0, ($oup)
        lea 16($inp), $inp
        lea 16($oup), $oup
        movdqa $B0, $A0
        movdqa $C0, $B0
        movdqa $D0, $C0
    jmp .Lopen_sse_tail_64_dec_loop
.Lopen_sse_tail_16_init:
    movdqa $A0, $A1

    # Decrypt up to 16 bytes at the end.
.Lopen_sse_tail_16:
    test $inl, $inl
    jz .Lopen_sse_finalize

    # Read the final bytes into $T0. They need to be read in reverse order so
    # that they end up in the correct order in $T0.
    pxor $T0, $T0
    lea -1($inp,$inl), $inp
    movq $inl, $itr2
.Lopen_sse_tail_16_compose:
        pslldq \$1, $T0
        pinsrb \$0, ($inp), $T0
        sub \$1, $inp
        sub \$1, $itr2
        jnz .Lopen_sse_tail_16_compose

    movq $T0, $t0
    pextrq \$1, $T0, $t1
    # The final bytes of keystream are in $A1.
    pxor $A1, $T0

    # Copy the plaintext bytes out.
.Lopen_sse_tail_16_extract:
        pextrb \$0, $T0, ($oup)
        psrldq \$1, $T0
        add \$1, $oup
        sub \$1, $inl
    jne .Lopen_sse_tail_16_extract

    add $t0, $acc0
    adc $t1, $acc1
    adc \$1, $acc2\n";
    # The add/adc/adc above already folded the final partial ciphertext
    # block ($t0:$t1, with the 2^128 pad bit) into the accumulator; finish
    # the Poly1305 step with the multiplication by r.
    &poly_mul(); $code.="

.Lopen_sse_finalize:\n";
    # Absorb the final length block (ad_len || msg_len, saved in
    # $len_store), then reduce the accumulator fully mod 2^130-5 and add
    # the "s" half of the Poly1305 key to form the tag.
    &poly_add($len_store);
    &poly_mul(); $code.="
    # Final reduce
    mov $acc0, $t0
    mov $acc1, $t1
    mov $acc2, $t2
    sub \$-5, $acc0
    sbb \$-1, $acc1
    sbb \$3, $acc2
    cmovc $t0, $acc0
    cmovc $t1, $acc1
    cmovc $t2, $acc2
    # Add in s part of the key
    add 0+$s_store, $acc0
    adc 8+$s_store, $acc1\n";

# On Windows, restore the non-volatile xmm6-xmm15 saved in the prologue
# (Microsoft x64 ABI treats them as callee-saved).
$code.="
    movaps 16*0+$xmm_store, %xmm6
    movaps 16*1+$xmm_store, %xmm7
    movaps 16*2+$xmm_store, %xmm8
    movaps 16*3+$xmm_store, %xmm9
    movaps 16*4+$xmm_store, %xmm10
    movaps 16*5+$xmm_store, %xmm11
    movaps 16*6+$xmm_store, %xmm12
    movaps 16*7+$xmm_store, %xmm13
    movaps 16*8+$xmm_store, %xmm14
    movaps 16*9+$xmm_store, %xmm15\n" if ($win64);
# Epilogue: unwind the frame, write the 16-byte tag through the key pointer
# saved on the stack in the prologue, and restore callee-saved registers.
# .Lopen_sse_128 is the short-input path (<= 128 bytes): three ChaCha
# blocks computed inline, counters $D1/$D2 offset via .Lsse_inc.
$code.="
.cfi_remember_state
    add \$288 + $xmm_storage + 32, %rsp
.cfi_adjust_cfa_offset -(288 + 32)
    # The tag replaces the key on return
    pop $keyp
.cfi_pop $keyp
    mov $acc0, ($keyp)
    mov $acc1, 8($keyp)
    pop %r15
.cfi_pop %r15
    pop %r14
.cfi_pop %r14
    pop %r13
.cfi_pop %r13
    pop %r12
.cfi_pop %r12
    pop %rbx
.cfi_pop %rbx
    pop %rbp
.cfi_pop %rbp
    ret
###############################################################################
.Lopen_sse_128:
.cfi_restore_state
    movdqu .Lchacha20_consts(%rip), $A0\nmovdqa $A0, $A1\nmovdqa $A0, $A2
    movdqu 0*16($keyp), $B0\nmovdqa $B0, $B1\nmovdqa $B0, $B2
    movdqu 1*16($keyp), $C0\nmovdqa $C0, $C1\nmovdqa $C0, $C2
    movdqu 2*16($keyp), $D0
    movdqa $D0, $D1\npaddd .Lsse_inc(%rip), $D1
    movdqa $D1, $D2\npaddd .Lsse_inc(%rip), $D2
    movdqa $B0, $T1\nmovdqa $C0, $T2\nmovdqa $D1, $T3
    mov \$10, $acc0

.Lopen_sse_128_rounds:  \n";
        # Loop body = one double-round over the three blocks; $acc0 counts
        # 10 iterations down, giving the 20 rounds of ChaCha20.
        &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
        &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
        &chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
        &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
        &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
        &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
    dec $acc0
    jnz .Lopen_sse_128_rounds
    paddd .Lchacha20_consts(%rip), $A0
    paddd .Lchacha20_consts(%rip), $A1
    paddd .Lchacha20_consts(%rip), $A2
    paddd $T1, $B0\npaddd $T1, $B1\npaddd $T1, $B2
    paddd $T2, $C1\npaddd $T2, $C2
    paddd $T3, $D1
    paddd .Lsse_inc(%rip), $T3
    paddd $T3, $D2
    # Clamp and store the key
    pand .Lclamp(%rip), $A0
    movdqa $A0, $r_store
    movdqa $B0, $s_store
    # Hash
    mov $adl, $itr2
    call poly_hash_ad_internal
.Lopen_sse_128_xor_hash:
        cmp \$16, $inl
        jb .Lopen_sse_tail_16
        sub \$16, $inl\n";
        # Load for hashing
        # Absorb 16 ciphertext bytes into the Poly1305 state while the same
        # 16 bytes are decrypted with the current front of the keystream ($A1).
        &poly_add("0*8($inp)"); $code.="
        # Load for decryption
        movdqu 0*16($inp), $T0
        pxor $T0, $A1
        movdqu $A1, 0*16($oup)
        lea 1*16($inp), $inp
        lea 1*16($oup), $oup\n";
        # Finish the Poly1305 step (multiplication by r) for this block.
        &poly_mul(); $code.="
        # Shift the stream left
        movdqa $B1, $A1
        movdqa $C1, $B1
        movdqa $D1, $C1
        movdqa $A2, $D1
        movdqa $B2, $A2
        movdqa $C2, $B2
        movdqa $D2, $C2
    jmp .Lopen_sse_128_xor_hash
.size GFp_chacha20_poly1305_open, .-GFp_chacha20_poly1305_open
.cfi_endproc

################################################################################
# void GFp_chacha20_poly1305_seal(uint8_t *out_ciphertext, const uint8_t *plaintext,
#                                 size_t plaintext_len, const uint8_t *ad,
#                                 size_t ad_len,
#                                 union chacha20_poly1305_seal_data *data);
.globl  GFp_chacha20_poly1305_seal
.type GFp_chacha20_poly1305_seal,\@function,6
.align 64
GFp_chacha20_poly1305_seal:
.cfi_startproc
    push %rbp
.cfi_push %rbp
    push %rbx
.cfi_push %rbx
    push %r12
.cfi_push %r12
    push %r13
.cfi_push %r13
    push %r14
.cfi_push %r14
    push %r15
.cfi_push %r15
# We write the calculated authenticator back to keyp at the end, so save
# the pointer on the stack too.
    push $keyp
.cfi_push $keyp
    sub \$288 + $xmm_storage + 32, %rsp
.cfi_adjust_cfa_offset 288 + 32
    lea 32(%rsp), %rbp
    and \$-32, %rbp\n";
# On Windows, spill callee-saved xmm6-xmm15 into the frame.
$code.="
    movaps %xmm6,16*0+$xmm_store
    movaps %xmm7,16*1+$xmm_store
    movaps %xmm8,16*2+$xmm_store
    movaps %xmm9,16*3+$xmm_store
    movaps %xmm10,16*4+$xmm_store
    movaps %xmm11,16*5+$xmm_store
    movaps %xmm12,16*6+$xmm_store
    movaps %xmm13,16*7+$xmm_store
    movaps %xmm14,16*8+$xmm_store
    movaps %xmm15,16*9+$xmm_store\n" if ($win64);
# Record the final length block now: ad_len and plaintext_len+extra_in_len
# go to $len_store; $inl keeps plaintext_len (%rdx) for the main loop.
$code.="
    mov 56($keyp), $inl  # extra_in_len
    addq %rdx, $inl
    mov $adl, 0+$len_store
    mov $inl, 8+$len_store
    mov %rdx, $inl\n";
# Runtime dispatch: jump to the AVX2 implementation when both BMI2 (bit 8)
# and AVX2 (bit 5) are reported in GFp_ia32cap_P.
$code.="
    mov GFp_ia32cap_P+8(%rip), %eax
    and \$`(1<<5) + (1<<8)`, %eax # Check both BMI2 and AVX2 are present
    xor \$`(1<<5) + (1<<8)`, %eax
    jz chacha20_poly1305_seal_avx2\n" if ($avx>1);
# SSE path: short inputs (<= 128 bytes) go to .Lseal_sse_128; otherwise set
# up four ChaCha blocks ($D3 = poly-key block, $D0..$D2 counters +1..+3)
# and stash the invariant halves of the state on the stack.
$code.="
    cmp \$128, $inl
    jbe .Lseal_sse_128
    # For longer buffers, prepare the poly key + some stream
    movdqa .Lchacha20_consts(%rip), $A0
    movdqu 0*16($keyp), $B0
    movdqu 1*16($keyp), $C0
    movdqu 2*16($keyp), $D0

    movdqa $A0, $A1
    movdqa $A0, $A2
    movdqa $A0, $A3
    movdqa $B0, $B1
    movdqa $B0, $B2
    movdqa $B0, $B3
    movdqa $C0, $C1
    movdqa $C0, $C2
    movdqa $C0, $C3
    movdqa $D0, $D3
    paddd .Lsse_inc(%rip), $D0
    movdqa $D0, $D2
    paddd .Lsse_inc(%rip), $D0
    movdqa $D0, $D1
    paddd .Lsse_inc(%rip), $D0
    # Store on stack
    movdqa $B0, $state1_store
    movdqa $C0, $state2_store
    movdqa $D0, $ctr0_store
    movdqa $D1, $ctr1_store
    movdqa $D2, $ctr2_store
    movdqa $D3, $ctr3_store
    mov \$10, $acc0
.Lseal_sse_init_rounds:  \n";
        # @loop_body is built earlier in the file; presumably it emits one
        # interleaved four-block double-round — TODO confirm against its
        # definition (outside this chunk).
        foreach $l (@loop_body) {$code.=$l."\n";}