in ring/crypto/cipher_extra/asm/chacha20_poly1305_x86_64.pl [1016:1373]
sub \$12*16, $inl
lea 12*16($inp), $inp
jmp .Lseal_sse_128_tail_hash
.Lseal_sse_main_loop_xor: \n";
&xor_stream($A0,$B0,$C0,$D0,"12*16"); $code.="
lea 16*16($inp), $inp
sub \$16*16, $inl
mov \$6, $itr1
mov \$4, $itr2
cmp \$12*16, $inl
jg .Lseal_sse_main_loop
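# At most 12*16 bytes of plaintext remain. Pick the tail path that
# generates just enough keystream: one block for <=4*16 bytes, two for
# <=8*16, three otherwise. If $inl is zero we go straight to the tail
# hash with $itr1 = 0, which falls through to the xor stage at once.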
mov $inl, $itr1
test $inl, $inl
je .Lseal_sse_128_tail_hash
mov \$6, $itr1
cmp \$8*16, $inl
ja .Lseal_sse_tail_192
cmp \$4*16, $inl
ja .Lseal_sse_tail_128
###############################################################################
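# <=4*16 bytes of plaintext remain: a single keystream block suffices.
# The 10 ChaCha double rounds are interleaved with Poly1305 absorption
# of ciphertext the main loop already wrote: the first $itr1 iterations
# hash two 16-byte blocks each, the remaining ones hash a single block.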
.Lseal_sse_tail_64: \n";
&prep_state(1); $code.="
.Lseal_sse_tail_64_rounds_and_x2hash: \n";
&poly_add("0($oup)");
&poly_mul(); $code.="
lea 16($oup), $oup
.Lseal_sse_tail_64_rounds_and_x1hash: \n";
&chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
&chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
&poly_add("0($oup)");
&poly_mul(); $code.="
lea 16($oup), $oup
dec $itr1
jg .Lseal_sse_tail_64_rounds_and_x2hash
dec $itr2
jge .Lseal_sse_tail_64_rounds_and_x1hash\n";
&finalize_state(1); $code.="
jmp .Lseal_sse_128_tail_xor
###############################################################################
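# <=8*16 bytes remain: same interleaving as the 64-byte tail, but two
# keystream blocks are computed.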
.Lseal_sse_tail_128:\n";
&prep_state(2); $code.="
.Lseal_sse_tail_128_rounds_and_x2hash: \n";
&poly_add("0($oup)");
&poly_mul(); $code.="
lea 16($oup), $oup
.Lseal_sse_tail_128_rounds_and_x1hash: \n";
&chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
&chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
&poly_add("0($oup)");
&poly_mul();
&chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
&chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); $code.="
lea 16($oup), $oup
dec $itr1
jg .Lseal_sse_tail_128_rounds_and_x2hash
dec $itr2
jge .Lseal_sse_tail_128_rounds_and_x1hash\n";
&finalize_state(2);
&xor_stream($A1,$B1,$C1,$D1,0*16); $code.="
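# $A1..$D1 encrypts the first 4*16 of the remaining bytes; $A0..$D0 is
# kept in registers for the final (possibly partial) 4*16 bytes, and
# $itr1 is set to the amount of fresh ciphertext still to be hashed.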
mov \$4*16, $itr1
sub \$4*16, $inl
lea 4*16($inp), $inp
jmp .Lseal_sse_128_tail_hash
###############################################################################
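# <=12*16 bytes remain: three keystream blocks are computed.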
.Lseal_sse_tail_192:\n";
&prep_state(3); $code.="
.Lseal_sse_tail_192_rounds_and_x2hash: \n";
&poly_add("0($oup)");
&poly_mul(); $code.="
lea 16($oup), $oup
.Lseal_sse_tail_192_rounds_and_x1hash: \n";
&chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
&chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
&chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
&poly_add("0($oup)");
&poly_mul();
&chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
&chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
&chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
lea 16($oup), $oup
dec $itr1
jg .Lseal_sse_tail_192_rounds_and_x2hash
dec $itr2
jge .Lseal_sse_tail_192_rounds_and_x1hash\n";
&finalize_state(3);
&xor_stream($A2,$B2,$C2,$D2,0*16);
&xor_stream($A1,$B1,$C1,$D1,4*16); $code.="
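# $A2..$D2 and $A1..$D1 encrypt the first 8*16 of the remaining bytes;
# $A0..$D0 again stays in registers for the final <=4*16 bytes. Fall
# through to hash the 8*16 bytes of fresh ciphertext.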
mov \$8*16, $itr1
sub \$8*16, $inl
lea 8*16($inp), $inp
###############################################################################
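# Absorb any ciphertext that has been written but not yet hashed;
# $itr1 holds the number of such bytes.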
.Lseal_sse_128_tail_hash:
cmp \$16, $itr1
jb .Lseal_sse_128_tail_xor\n";
&poly_add("0($oup)");
&poly_mul(); $code.="
sub \$16, $itr1
lea 16($oup), $oup
jmp .Lseal_sse_128_tail_hash
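# Encrypt and hash the remaining data 16 bytes at a time, using the
# keystream that is still held in registers.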
.Lseal_sse_128_tail_xor:
cmp \$16, $inl
jb .Lseal_sse_tail_16
sub \$16, $inl
# Encrypt: load a block of plaintext and XOR it with the keystream
movdqu 0*16($inp), $T0
pxor $T0, $A0
movdqu $A0, 0*16($oup)
# Then hash the ciphertext block just written
add 0*8($oup), $acc0
adc 1*8($oup), $acc1
adc \$1, $acc2
lea 1*16($inp), $inp
lea 1*16($oup), $oup\n";
&poly_mul(); $code.="
# Rotate the keystream registers down so that $A0 holds the next unused block
movdqa $B0, $A0
movdqa $C0, $B0
movdqa $D0, $C0
movdqa $A1, $D0
movdqa $B1, $A1
movdqa $C1, $B1
movdqa $D1, $C1
jmp .Lseal_sse_128_tail_xor
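# Fewer than 16 bytes of plaintext remain. Build the final block in
# $T3 by reading it backwards one byte at a time: $inp is pointed at
# the last byte, and every pslldq/pinsrb pair prepends one byte, so
# the first plaintext byte ends up in the lowest byte of $T3.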
.Lseal_sse_tail_16:
test $inl, $inl
jz .Lprocess_blocks_of_extra_in
# We can only load the plaintext one byte at a time, to avoid reading
# past the end of the buffer
mov $inl, $itr2
mov $inl, $itr1
lea -1($inp,$inl), $inp
pxor $T3, $T3
.Lseal_sse_tail_16_compose:
pslldq \$1, $T3
pinsrb \$0, ($inp), $T3
lea -1($inp), $inp
dec $itr1
jne .Lseal_sse_tail_16_compose
# XOR the keystream with the plaintext.
pxor $A0, $T3
# Write ciphertext out, byte-by-byte.
movq $inl, $itr1
movdqu $T3, $A0
.Lseal_sse_tail_16_extract:
pextrb \$0, $A0, ($oup)
psrldq \$1, $A0
add \$1, $oup
sub \$1, $itr1
jnz .Lseal_sse_tail_16_extract
# $T3 contains the final (partial, non-empty) block of ciphertext which
# needs to be fed into the Poly1305 state. The right-most $inl bytes of it
# are valid. We need to fill it with extra_in bytes until full, or until we
# run out of bytes.
#
# $keyp points to the tag output, which is actually a struct with the
# extra_in pointer at offset 48 and the extra_in length at offset 56.
movq 288 + $xmm_storage + 32(%rsp), $keyp
movq 56($keyp), $t1 # extra_in_len
movq 48($keyp), $t0 # extra_in
test $t1, $t1
jz .Lprocess_partial_block # Common case: no bytes of extra_in
movq \$16, $t2
subq $inl, $t2 # 16-$inl is the number of bytes that fit into $T3.
cmpq $t2, $t1 # if extra_in_len < 16-$inl, only copy extra_in_len
# (note that AT&T syntax reverses the arguments)
jge .Lload_extra_in
movq $t1, $t2
.Lload_extra_in:
# $t2 contains the number of bytes of extra_in (pointed to by $t0) to load
# into $T3. They are loaded in reverse order.
leaq -1($t0,$t2), $inp
# Update extra_in and extra_in_len to reflect the bytes that are about to
# be read.
addq $t2, $t0
subq $t2, $t1
movq $t0, 48($keyp)
movq $t1, 56($keyp)
# Update $itr2, which is used to select the mask later on, to reflect the
# extra bytes about to be added.
addq $t2, $itr2
# Load $t2 bytes of extra_in into $T2.
pxor $T2, $T2
.Lload_extra_load_loop:
pslldq \$1, $T2
pinsrb \$0, ($inp), $T2
lea -1($inp), $inp
sub \$1, $t2
jnz .Lload_extra_load_loop
# Shift $T2 up by the length of the remainder from the main encryption.
# Sadly, the byte-shift of an XMM register has to be an immediate, so
# we loop to do this.
movq $inl, $t2
.Lload_extra_shift_loop:
pslldq \$1, $T2
sub \$1, $t2
jnz .Lload_extra_shift_loop
# Mask $T3 (the remainder from the main encryption) so that superfluous
# bytes are zero. This means that the non-zero bytes in $T2 and $T3 are
# disjoint and so we can merge them with an OR.
lea .Land_masks(%rip), $t2
shl \$4, $inl
pand -16($t2,$inl), $T3
# Merge $T2 into $T3, forming the remainder block.
por $T2, $T3
# The block of ciphertext + extra_in is ready to be included in the
# Poly1305 state.
movq $T3, $t0
pextrq \$1, $T3, $t1
add $t0, $acc0
adc $t1, $acc1
adc \$1, $acc2\n";
&poly_mul(); $code.="
.Lprocess_blocks_of_extra_in:
# There may be additional bytes of extra_in to process.
movq 288 + $xmm_storage + 32(%rsp), $keyp
movq 48($keyp), $inp # extra_in
movq 56($keyp), $itr2 # extra_in_len
movq $itr2, $itr1
shr \$4, $itr2 # number of blocks
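# Absorb every whole 16-byte block of extra_in first; the trailing
# <16 bytes are gathered into $T3 and absorbed as a padded block below.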
.Lprocess_extra_hash_loop:
jz .Lprocess_extra_in_trailer\n";
&poly_add("0($inp)");
&poly_mul(); $code.="
leaq 16($inp), $inp
subq \$1, $itr2
jmp .Lprocess_extra_hash_loop
.Lprocess_extra_in_trailer:
andq \$15, $itr1 # remaining num bytes (<16) of extra_in
movq $itr1, $inl
jz .Ldo_length_block
leaq -1($inp,$itr1), $inp
.Lprocess_extra_in_trailer_load:
pslldq \$1, $T3
pinsrb \$0, ($inp), $T3
lea -1($inp), $inp
sub \$1, $itr1
jnz .Lprocess_extra_in_trailer_load
.Lprocess_partial_block:
# $T3 contains $inl bytes of data to be fed into Poly1305. $inl != 0
lea .Land_masks(%rip), $t2
shl \$4, $inl
pand -16($t2,$inl), $T3
movq $T3, $t0
pextrq \$1, $T3, $t1
add $t0, $acc0
adc $t1, $acc1
adc \$1, $acc2\n";
&poly_mul(); $code.="
.Ldo_length_block:\n";
&poly_add($len_store);
&poly_mul(); $code.="
# Final reduce
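# In 64-bit limbs, p = 2^130 - 5 is (2^64 - 5, 2^64 - 1, 3), i.e.
# (-5, -1, 3) as immediates, so the sub/sbb chain computes acc - p.
# A final borrow (CF set) means acc < p, and the cmovs then restore
# the original, already-reduced value.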
mov $acc0, $t0
mov $acc1, $t1
mov $acc2, $t2
sub \$-5, $acc0
sbb \$-1, $acc1
sbb \$3, $acc2
cmovc $t0, $acc0
cmovc $t1, $acc1
cmovc $t2, $acc2
# Add in the s part of the key: tag = (acc mod 2^130-5 + s) mod 2^128,
# so the carry out of the high limb is simply discarded.
add 0+$s_store, $acc0
adc 8+$s_store, $acc1\n";
$code.="
movaps 16*0+$xmm_store, %xmm6
movaps 16*1+$xmm_store, %xmm7
movaps 16*2+$xmm_store, %xmm8
movaps 16*3+$xmm_store, %xmm9
movaps 16*4+$xmm_store, %xmm10
movaps 16*5+$xmm_store, %xmm11
movaps 16*6+$xmm_store, %xmm12
movaps 16*7+$xmm_store, %xmm13
movaps 16*8+$xmm_store, %xmm14
movaps 16*9+$xmm_store, %xmm15\n" if ($win64);
$code.="
.cfi_remember_state
add \$288 + $xmm_storage + 32, %rsp
.cfi_adjust_cfa_offset -(288 + $xmm_storage + 32)
# The tag replaces the key on return: pop the key pointer that was
# saved in the prologue and write the tag through it.
pop $keyp
.cfi_pop $keyp
mov $acc0, ($keyp)
mov $acc1, 8($keyp)
pop %r15
.cfi_pop %r15
pop %r14
.cfi_pop %r14
pop %r13
.cfi_pop %r13
pop %r12
.cfi_pop %r12
pop %rbx
.cfi_pop %rbx
pop %rbp
.cfi_pop %rbp
ret
################################################################################
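# Special path for messages of 128 bytes or fewer: generate three
# ChaCha20 blocks up front. The block at counter 0 ($A2..$D2) supplies
# the one-time Poly1305 key (r clamped from $A2, s from $B2); the
# blocks at counters 1 and 2 become keystream for the payload.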
.Lseal_sse_128:
.cfi_restore_state
movdqu .Lchacha20_consts(%rip), $A0\nmovdqa $A0, $A1\nmovdqa $A0, $A2
movdqu 0*16($keyp), $B0\nmovdqa $B0, $B1\nmovdqa $B0, $B2
movdqu 1*16($keyp), $C0\nmovdqa $C0, $C1\nmovdqa $C0, $C2
movdqu 2*16($keyp), $D2
movdqa $D2, $D0\npaddd .Lsse_inc(%rip), $D0
movdqa $D0, $D1\npaddd .Lsse_inc(%rip), $D1
movdqa $B0, $T1\nmovdqa $C0, $T2\nmovdqa $D0, $T3
mov \$10, $acc0
.Lseal_sse_128_rounds:\n";
&chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
&chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
&chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
&chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
&chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
&chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
dec $acc0
jnz .Lseal_sse_128_rounds
paddd .Lchacha20_consts(%rip), $A0
paddd .Lchacha20_consts(%rip), $A1
paddd .Lchacha20_consts(%rip), $A2
paddd $T1, $B0\npaddd $T1, $B1\npaddd $T1, $B2
paddd $T2, $C0\npaddd $T2, $C1
paddd $T3, $D0
paddd .Lsse_inc(%rip), $T3
paddd $T3, $D1
# Clamp and store the key
pand .Lclamp(%rip), $A2
movdqa $A2, $r_store
movdqa $B2, $s_store
# Hash the additional data
mov %r8, $itr2
call poly_hash_ad_internal
jmp .Lseal_sse_128_tail_xor
.size GFp_chacha20_poly1305_seal, .-GFp_chacha20_poly1305_seal
.cfi_endproc\n";
}
if ($avx>1) {