in ring/crypto/cipher_extra/asm/chacha20_poly1305_x86_64.pl [558:946]
sub \$16*16, $inl
jmp .Lopen_sse_main_loop
.Lopen_sse_tail:
# Fewer than 256 bytes remain; dispatch on the tail size so only the
# necessary keystream blocks are generated:
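#   0 bytes:       .Lopen_sse_finalize
#   1-64 bytes:    fall through below
#   65-128 bytes:  .Lopen_sse_tail_128
#   129-192 bytes: .Lopen_sse_tail_192
#   193-255 bytes: .Lopen_sse_tail_256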
test $inl, $inl
jz .Lopen_sse_finalize
cmp \$12*16, $inl
ja .Lopen_sse_tail_256
cmp \$8*16, $inl
ja .Lopen_sse_tail_192
cmp \$4*16, $inl
ja .Lopen_sse_tail_128\n";
###############################################################################
# 1 - 64 bytes are left
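# A single 64-byte keystream block suffices. While the 10 ChaCha20
# double-rounds run, hash one 16-byte ciphertext block per iteration until
# every full block of the remaining input has been absorbed.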
&prep_state(1); $code.="
xor $itr2, $itr2
mov $inl, $itr1
cmp \$16, $itr1
jb .Lopen_sse_tail_64_rounds
.Lopen_sse_tail_64_rounds_and_x1hash: \n";
&poly_add("0($inp,$itr2)");
&poly_mul(); $code.="
sub \$16, $itr1
.Lopen_sse_tail_64_rounds:
add \$16, $itr2\n";
&chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
&chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); $code.="
cmp \$16, $itr1
jae .Lopen_sse_tail_64_rounds_and_x1hash
cmp \$10*16, $itr2
jne .Lopen_sse_tail_64_rounds\n";
&finalize_state(1); $code.="
jmp .Lopen_sse_tail_64_dec_loop
###############################################################################
.Lopen_sse_tail_128:\n";
# 65 - 128 bytes are left
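# Two keystream blocks are needed. $itr1 is the remaining length rounded
# down to a multiple of 16, so exactly the full ciphertext blocks are
# hashed while the double-rounds of both ChaCha20 states run.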
&prep_state(2); $code.="
mov $inl, $itr1
and \$-16, $itr1
xor $itr2, $itr2
.Lopen_sse_tail_128_rounds_and_x1hash: \n";
&poly_add("0($inp,$itr2)");
&poly_mul(); $code.="
.Lopen_sse_tail_128_rounds:
add \$16, $itr2\n";
&chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
&chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
&chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
&chacha_qr($A1,$B1,$C1,$D1,$T0,"right");$code.="
cmp $itr1, $itr2
jb .Lopen_sse_tail_128_rounds_and_x1hash
cmp \$10*16, $itr2
jne .Lopen_sse_tail_128_rounds\n";
&finalize_state(2);
&xor_stream($A1, $B1, $C1, $D1, "0*16"); $code.="
sub \$4*16, $inl
lea 4*16($inp), $inp
lea 4*16($oup), $oup
jmp .Lopen_sse_tail_64_dec_loop
###############################################################################
.Lopen_sse_tail_192:\n";
# 129 - 192 bytes are left
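# Three keystream blocks are needed. At most 10 blocks can be hashed
# inside the 10 double-round iterations, so $itr1 = min($inl, 10*16)
# rounded down to 16; blocks 10 and 11, when present, are hashed right
# after the rounds.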
&prep_state(3); $code.="
mov $inl, $itr1
mov \$10*16, $itr2
cmp \$10*16, $itr1
cmovg $itr2, $itr1
and \$-16, $itr1
xor $itr2, $itr2
.Lopen_sse_tail_192_rounds_and_x1hash: \n";
&poly_add("0($inp,$itr2)");
&poly_mul(); $code.="
.Lopen_sse_tail_192_rounds:
add \$16, $itr2\n";
&chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
&chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
&chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
&chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
&chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
&chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
cmp $itr1, $itr2
jb .Lopen_sse_tail_192_rounds_and_x1hash
cmp \$10*16, $itr2
jne .Lopen_sse_tail_192_rounds
cmp \$11*16, $inl
jb .Lopen_sse_tail_192_finish\n";
&poly_add("10*16($inp)");
&poly_mul(); $code.="
cmp \$12*16, $inl
jb .Lopen_sse_tail_192_finish\n";
&poly_add("11*16($inp)");
&poly_mul(); $code.="
.Lopen_sse_tail_192_finish: \n";
&finalize_state(3);
&xor_stream($A2, $B2, $C2, $D2, "0*16");
&xor_stream($A1, $B1, $C1, $D1, "4*16"); $code.="
sub \$8*16, $inl
lea 8*16($inp), $inp
lea 8*16($oup), $oup
jmp .Lopen_sse_tail_64_dec_loop
###############################################################################
.Lopen_sse_tail_256:\n";
# 193 - 255 bytes are left
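# Four keystream blocks are needed and all sixteen XMM registers hold
# ChaCha state, so the quarter-rounds borrow $C3/$C1 as scratch (the
# store/load variants spill them to $tmp_store) and the Poly1305 block
# multiply is split into stages interleaved between the quarter-rounds.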
&prep_state(4); $code.="
xor $itr2, $itr2
.Lopen_sse_tail_256_rounds_and_x1hash: \n";
&poly_add("0($inp,$itr2)");
&chacha_qr($A0,$B0,$C0,$D0,$C3,"store_left");
&chacha_qr($A1,$B1,$C1,$D1,$C3,"left");
&chacha_qr($A2,$B2,$C2,$D2,$C3,"left_load");
&poly_stage1();
&chacha_qr($A3,$B3,$C3,$D3,$C1,"store_left_load");
&poly_stage2();
&chacha_qr($A0,$B0,$C0,$D0,$C3,"store_right");
&chacha_qr($A1,$B1,$C1,$D1,$C3,"right");
&poly_stage3();
&chacha_qr($A2,$B2,$C2,$D2,$C3,"right_load");
&poly_reduce_stage();
&chacha_qr($A3,$B3,$C3,$D3,$C1,"store_right_load"); $code.="
add \$16, $itr2
cmp \$10*16, $itr2
jb .Lopen_sse_tail_256_rounds_and_x1hash
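# The 10 double-round iterations above hashed the first 10 ciphertext
# blocks; absorb any remaining full blocks (at most 5) before finalizing.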
mov $inl, $itr1
and \$-16, $itr1
.Lopen_sse_tail_256_hash: \n";
&poly_add("0($inp,$itr2)");
&poly_mul(); $code.="
add \$16, $itr2
cmp $itr1, $itr2
jb .Lopen_sse_tail_256_hash\n";
&finalize_state(4);
&xor_stream_using_temp($A3, $B3, $C3, $D3, "0*16", $D0);
&xor_stream($A2, $B2, $C2, $D2, "4*16");
&xor_stream($A1, $B1, $C1, $D1, "8*16"); $code.="
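# xor_stream_using_temp spilled $D0 to $tmp_store while using it as
# scratch; restore it so the loop below has the full 64-byte keystream block.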
movdqa $tmp_store, $D0
sub \$12*16, $inl
lea 12*16($inp), $inp
lea 12*16($oup), $oup
###############################################################################
# Decrypt the remaining data, 16 bytes at a time, using the 64 bytes of
# keystream already finalized in $A0,$B0,$C0,$D0
.Lopen_sse_tail_64_dec_loop:
cmp \$16, $inl
jb .Lopen_sse_tail_16_init
sub \$16, $inl
movdqu ($inp), $T0
pxor $T0, $A0
movdqu $A0, ($oup)
lea 16($inp), $inp
lea 16($oup), $oup
movdqa $B0, $A0
movdqa $C0, $B0
movdqa $D0, $C0
jmp .Lopen_sse_tail_64_dec_loop
.Lopen_sse_tail_16_init:
movdqa $A0, $A1
# Decrypt the last partial block (at most 15 bytes) at the end.
.Lopen_sse_tail_16:
test $inl, $inl
jz .Lopen_sse_finalize
# Read the final bytes into $T0. They need to be read in reverse order so
# that they end up in the correct order in $T0.
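# Loading a byte at a time also ensures we never read past the end of the
# input buffer.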
pxor $T0, $T0
lea -1($inp,$inl), $inp
movq $inl, $itr2
.Lopen_sse_tail_16_compose:
pslldq \$1, $T0
pinsrb \$0, ($inp), $T0
sub \$1, $inp
sub \$1, $itr2
jnz .Lopen_sse_tail_16_compose
movq $T0, $t0
pextrq \$1, $T0, $t1
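# $t0/$t1 hold the zero-padded final ciphertext block; open authenticates
# ciphertext, so it is absorbed into the accumulator below, after the
# plaintext has been extracted.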
# The final bytes of keystream are in $A1.
pxor $A1, $T0
# Copy the plaintext bytes out.
.Lopen_sse_tail_16_extract:
pextrb \$0, $T0, ($oup)
psrldq \$1, $T0
add \$1, $oup
sub \$1, $inl
jne .Lopen_sse_tail_16_extract
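# Absorb the saved ciphertext block. The AEAD construction zero-pads the
# last block, so the usual 2^128 pad bit is added just as for a full block.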
add $t0, $acc0
adc $t1, $acc1
adc \$1, $acc2\n";
&poly_mul(); $code.="
.Lopen_sse_finalize:\n";
&poly_add($len_store);
&poly_mul(); $code.="
# Final reduce: compute acc - (2^130 - 5) by adding 5 with carry across
# the limbs and subtracting 4 (i.e. 2^130) from the top limb. If that
# subtraction borrows, acc was already below 2^130 - 5, so the cmovc
# instructions keep the unreduced value instead.
mov $acc0, $t0
mov $acc1, $t1
mov $acc2, $t2
sub \$-5, $acc0
sbb \$-1, $acc1
sbb \$3, $acc2
cmovc $t0, $acc0
cmovc $t1, $acc1
cmovc $t2, $acc2
# Add the s half of the key; only the low 128 bits are kept, so the tag
# is (acc + s) mod 2^128
add 0+$s_store, $acc0
adc 8+$s_store, $acc1\n";
$code.="
movaps 16*0+$xmm_store, %xmm6
movaps 16*1+$xmm_store, %xmm7
movaps 16*2+$xmm_store, %xmm8
movaps 16*3+$xmm_store, %xmm9
movaps 16*4+$xmm_store, %xmm10
movaps 16*5+$xmm_store, %xmm11
movaps 16*6+$xmm_store, %xmm12
movaps 16*7+$xmm_store, %xmm13
movaps 16*8+$xmm_store, %xmm14
movaps 16*9+$xmm_store, %xmm15\n" if ($win64);
$code.="
.cfi_remember_state
add \$288 + $xmm_storage + 32, %rsp
.cfi_adjust_cfa_offset -(288 + 32)
# The tag replaces the key on return
pop $keyp
.cfi_pop $keyp
mov $acc0, ($keyp)
mov $acc1, 8($keyp)
pop %r15
.cfi_pop %r15
pop %r14
.cfi_pop %r14
pop %r13
.cfi_pop %r13
pop %r12
.cfi_pop %r12
pop %rbx
.cfi_pop %rbx
pop %rbp
.cfi_pop %rbp
ret
###############################################################################
.Lopen_sse_128:
.cfi_restore_state
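# 128 bytes or fewer remain: three ChaCha20 blocks cover everything.
# The first block (lowest counter) becomes the Poly1305 key; the other
# two supply keystream.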
movdqu .Lchacha20_consts(%rip), $A0\nmovdqa $A0, $A1\nmovdqa $A0, $A2
movdqu 0*16($keyp), $B0\nmovdqa $B0, $B1\nmovdqa $B0, $B2
movdqu 1*16($keyp), $C0\nmovdqa $C0, $C1\nmovdqa $C0, $C2
movdqu 2*16($keyp), $D0
movdqa $D0, $D1\npaddd .Lsse_inc(%rip), $D1
movdqa $D1, $D2\npaddd .Lsse_inc(%rip), $D2
movdqa $B0, $T1\nmovdqa $C0, $T2\nmovdqa $D1, $T3
mov \$10, $acc0
.Lopen_sse_128_rounds: \n";
&chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
&chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
&chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
&chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
&chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
&chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
dec $acc0
jnz .Lopen_sse_128_rounds
paddd .Lchacha20_consts(%rip), $A0
paddd .Lchacha20_consts(%rip), $A1
paddd .Lchacha20_consts(%rip), $A2
paddd $T1, $B0\npaddd $T1, $B1\npaddd $T1, $B2
paddd $T2, $C1\npaddd $T2, $C2
paddd $T3, $D1
paddd .Lsse_inc(%rip), $T3
paddd $T3, $D2
# Clamp r and store the Poly1305 key pair (r, s) on the stack
pand .Lclamp(%rip), $A0
movdqa $A0, $r_store
movdqa $B0, $s_store
# Hash the additional data
mov $adl, $itr2
call poly_hash_ad_internal
.Lopen_sse_128_xor_hash:
cmp \$16, $inl
jb .Lopen_sse_tail_16
sub \$16, $inl\n";
# Load for hashing
&poly_add("0*8($inp)"); $code.="
# Load for decryption
movdqu 0*16($inp), $T0
pxor $T0, $A1
movdqu $A1, 0*16($oup)
lea 1*16($inp), $inp
lea 1*16($oup), $oup\n";
&poly_mul(); $code.="
# Shift the stream left: after this, $A1 again holds the next 16 bytes
# of keystream
movdqa $B1, $A1
movdqa $C1, $B1
movdqa $D1, $C1
movdqa $A2, $D1
movdqa $B2, $A2
movdqa $C2, $B2
movdqa $D2, $C2
jmp .Lopen_sse_128_xor_hash
.size GFp_chacha20_poly1305_open, .-GFp_chacha20_poly1305_open
.cfi_endproc
################################################################################
# void GFp_chacha20_poly1305_seal(uint8_t *out_ciphertext, const uint8_t *plaintext,
# size_t plaintext_len, const uint8_t *ad,
# size_t ad_len,
# union chacha20_poly1305_seal_data *data);
.globl GFp_chacha20_poly1305_seal
.type GFp_chacha20_poly1305_seal,\@function,6
.align 64
GFp_chacha20_poly1305_seal:
.cfi_startproc
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
# We write the calculated authenticator back to keyp at the end, so save
# the pointer on the stack too.
push $keyp
.cfi_push $keyp
sub \$288 + $xmm_storage + 32, %rsp
.cfi_adjust_cfa_offset 288 + 32
lea 32(%rsp), %rbp
and \$-32, %rbp\n";
$code.="
movaps %xmm6,16*0+$xmm_store
movaps %xmm7,16*1+$xmm_store
movaps %xmm8,16*2+$xmm_store
movaps %xmm9,16*3+$xmm_store
movaps %xmm10,16*4+$xmm_store
movaps %xmm11,16*5+$xmm_store
movaps %xmm12,16*6+$xmm_store
movaps %xmm13,16*7+$xmm_store
movaps %xmm14,16*8+$xmm_store
movaps %xmm15,16*9+$xmm_store\n" if ($win64);
$code.="
mov 56($keyp), $inl # extra_in_len
addq %rdx, $inl
mov $adl, 0+$len_store
mov $inl, 8+$len_store
mov %rdx, $inl\n";
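# $len_store now holds the final length block, ad_len || (plaintext_len +
# extra_in_len), which is absorbed when the tag is computed.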
$code.="
mov GFp_ia32cap_P+8(%rip), %eax
and \$`(1<<5) + (1<<8)`, %eax # Check both BMI2 and AVX2 are present
xor \$`(1<<5) + (1<<8)`, %eax
jz chacha20_poly1305_seal_avx2\n" if ($avx>1);
$code.="
cmp \$128, $inl
jbe .Lseal_sse_128
# For longer buffers, prepare the Poly1305 key block plus the first
# three blocks of keystream
movdqa .Lchacha20_consts(%rip), $A0
movdqu 0*16($keyp), $B0
movdqu 1*16($keyp), $C0
movdqu 2*16($keyp), $D0
movdqa $A0, $A1
movdqa $A0, $A2
movdqa $A0, $A3
movdqa $B0, $B1
movdqa $B0, $B2
movdqa $B0, $B3
movdqa $C0, $C1
movdqa $C0, $C2
movdqa $C0, $C3
movdqa $D0, $D3
paddd .Lsse_inc(%rip), $D0
movdqa $D0, $D2
paddd .Lsse_inc(%rip), $D0
movdqa $D0, $D1
paddd .Lsse_inc(%rip), $D0
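# $D3 keeps the lowest counter: its block becomes the Poly1305 key.
# $D2, $D1 and $D0 carry the next three counters for keystream.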
# Store on stack
movdqa $B0, $state1_store
movdqa $C0, $state2_store
movdqa $D0, $ctr0_store
movdqa $D1, $ctr1_store
movdqa $D2, $ctr2_store
movdqa $D3, $ctr3_store
mov \$10, $acc0
.Lseal_sse_init_rounds: \n";
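# @loop_body (captured earlier in this file) emits the interleaved
# ChaCha20 round sequence for all four blocks.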
foreach $l (@loop_body) {$code.=$l."\n";}