In crypto/fipsmodule/modes/asm/aesni-gcm-x86_64.pl [535:793]:

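	# $in0 and $end0 are prepared just above this excerpt (both are masked
	# views of the key pointer and %rsp). The check below estimates how far
	# the expanded key sits above the stack scratch area; if it is within
	# 768 bytes, %rsp is lowered so the two cannot alias in the cache.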
	sub		$in0,$end0
	jc		.Ldec_no_key_aliasing
	cmp		\$768,$end0
	jnc		.Ldec_no_key_aliasing
	sub		$end0,%rsp		# avoid aliasing with key
.Ldec_no_key_aliasing:

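	# Load the first six ciphertext blocks and byte-swap them for GHASH.
	# Five are parked at 0x30-0x70(%rsp); the block in $Z3 is handed to
	# |_aesni_ctr32_ghash_6x| in a register.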
	vmovdqu		0x50($inp),$Z3		# I[5]
	mov		$inp,$in0
	vmovdqu		0x40($inp),$Z0

	# |_aesni_ctr32_ghash_6x| requires |$end0| to point to 2*96 (0xc0)
	# bytes before the end of the input. Note, in particular, that this is
	# correct even if |$len| is not an even multiple of 96 or 16. XXX: This
	# seems to require that |$inp| + |$len| >= 2*96 (0xc0); i.e. |$inp| must
	# not be near the very beginning of the address space when |$len| < 2*96
	# (0xc0).
	lea		-0xc0($inp,$len),$end0

	vmovdqu		0x30($inp),$Z1
	shr		\$4,$len
	xor		%rax,%rax
	vmovdqu		0x20($inp),$Z2
	 vpshufb	$Ii,$Z3,$Z3		# passed to _aesni_ctr32_ghash_6x
	vmovdqu		0x10($inp),$T2
	 vpshufb	$Ii,$Z0,$Z0
	vmovdqu		($inp),$Hkey
	 vpshufb	$Ii,$Z1,$Z1
	vmovdqu		$Z0,0x30(%rsp)
	 vpshufb	$Ii,$Z2,$Z2
	vmovdqu		$Z1,0x40(%rsp)
	 vpshufb	$Ii,$T2,$T2
	vmovdqu		$Z2,0x50(%rsp)
	 vpshufb	$Ii,$Hkey,$Hkey
	vmovdqu		$T2,0x60(%rsp)
	vmovdqu		$Hkey,0x70(%rsp)

	call		_aesni_ctr32_ghash_6x

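	# |_aesni_ctr32_ghash_6x| leaves the last six decrypted blocks in
	# $inout0..$inout5 with $out already advanced past them, hence the
	# negative offsets below.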
	mov		$Xip_offset(%rbp), %r12
	vmovups		$inout0,-0x60($out)	# save output
	vmovups		$inout1,-0x50($out)
	vmovups		$inout2,-0x40($out)
	vmovups		$inout3,-0x30($out)
	vmovups		$inout4,-0x20($out)
	vmovups		$inout5,-0x10($out)

	vpshufb		($const),$Xi,$Xi	# .Lbswap_mask
	vmovdqu		$Xi,(%r12)		# output Xi

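	# Zero the upper YMM state so SSE code in the caller does not pay
	# AVX-to-SSE transition penalties.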
	vzeroupper
___
$code.=<<___ if ($win64);
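	# Restore the non-volatile XMM registers and the %rdi/%rsi values
	# saved by the prologue (above this excerpt).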
	movaps	-0xd0(%rbp),%xmm6
	movaps	-0xc0(%rbp),%xmm7
	movaps	-0xb0(%rbp),%xmm8
	movaps	-0xa0(%rbp),%xmm9
	movaps	-0x90(%rbp),%xmm10
	movaps	-0x80(%rbp),%xmm11
	movaps	-0x70(%rbp),%xmm12
	movaps	-0x60(%rbp),%xmm13
	movaps	-0x50(%rbp),%xmm14
	movaps	-0x40(%rbp),%xmm15
	mov	0x10(%rbp),%rdi
	mov	0x18(%rbp),%rsi
___
$code.=<<___;
	lea	-0x28(%rbp), %rsp	# restore %rsp to fixed allocation
.cfi_def_cfa	%rsp, 0x38
	pop	%r15
.cfi_pop	%r15
	pop	%r14
.cfi_pop	%r14
	pop	%r13
.cfi_pop	%r13
	pop	%r12
.cfi_pop	%r12
	pop	%rbx
.cfi_pop	%rbx
	pop	%rbp
.cfi_pop	%rbp
.Lgcm_dec_abort:
	ret
.seh_endproc
.cfi_endproc
.size	aesni_gcm_decrypt,.-aesni_gcm_decrypt
___

$code.=<<___;
.type	_aesni_ctr32_6x,\@abi-omnipotent
.align	32
_aesni_ctr32_6x:
.cfi_startproc
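	# Encrypt six consecutive counter blocks and XOR them with 96 bytes of
	# input, writing 96 bytes of output and advancing $inp and $out. $T1
	# carries the current counter block; $counter mirrors its low 32 bits
	# so carries can be caught with integer arithmetic.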
	vmovdqu		0x00-0x80($key),$Z0	# borrow $Z0 for $rndkey
	vmovdqu		0x20($const),$T2	# borrow $T2, .Lone_msb
	lea		-1($rounds),%r13
	vmovups		0x10-0x80($key),$rndkey
	lea		0x20-0x80($key),%r12
	vpxor		$Z0,$T1,$inout0
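	# $counter holds the 32-bit counter word byte-swapped, so the
	# counter's low byte lives in bits 24-31. Adding 6<<24 carries out
	# exactly when one of the six byte-wide increments below would wrap,
	# in which case the dword-wide path at .Lhandle_ctr32_2 is taken.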
	add		\$`6<<24`,$counter
	jc		.Lhandle_ctr32_2
	vpaddb		$T2,$T1,$inout1
	vpaddb		$T2,$inout1,$inout2
	vpxor		$Z0,$inout1,$inout1
	vpaddb		$T2,$inout2,$inout3
	vpxor		$Z0,$inout2,$inout2
	vpaddb		$T2,$inout3,$inout4
	vpxor		$Z0,$inout3,$inout3
	vpaddb		$T2,$inout4,$inout5
	vpxor		$Z0,$inout4,$inout4
	vpaddb		$T2,$inout5,$T1
	vpxor		$Z0,$inout5,$inout5
	jmp		.Loop_ctr32

.align	16
.Loop_ctr32:
	vaesenc		$rndkey,$inout0,$inout0
	vaesenc		$rndkey,$inout1,$inout1
	vaesenc		$rndkey,$inout2,$inout2
	vaesenc		$rndkey,$inout3,$inout3
	vaesenc		$rndkey,$inout4,$inout4
	vaesenc		$rndkey,$inout5,$inout5
	vmovups		(%r12),$rndkey
	lea		0x10(%r12),%r12
	dec		%r13d
	jnz		.Loop_ctr32

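	# Fold the last round key into the input blocks ahead of time, so that
	# vaesenclast performs the final AddRoundKey and the CTR XOR at once.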
	vmovdqu		(%r12),$Hkey		# last round key
	vaesenc		$rndkey,$inout0,$inout0
	vpxor		0x00($inp),$Hkey,$Z0
	vaesenc		$rndkey,$inout1,$inout1
	vpxor		0x10($inp),$Hkey,$Z1
	vaesenc		$rndkey,$inout2,$inout2
	vpxor		0x20($inp),$Hkey,$Z2
	vaesenc		$rndkey,$inout3,$inout3
	vpxor		0x30($inp),$Hkey,$Xi
	vaesenc		$rndkey,$inout4,$inout4
	vpxor		0x40($inp),$Hkey,$T2
	vaesenc		$rndkey,$inout5,$inout5
	vpxor		0x50($inp),$Hkey,$Hkey
	lea		0x60($inp),$inp

	vaesenclast	$Z0,$inout0,$inout0
	vaesenclast	$Z1,$inout1,$inout1
	vaesenclast	$Z2,$inout2,$inout2
	vaesenclast	$Xi,$inout3,$inout3
	vaesenclast	$T2,$inout4,$inout4
	vaesenclast	$Hkey,$inout5,$inout5
	vmovups		$inout0,0x00($out)
	vmovups		$inout1,0x10($out)
	vmovups		$inout2,0x20($out)
	vmovups		$inout3,0x30($out)
	vmovups		$inout4,0x40($out)
	vmovups		$inout5,0x50($out)
	lea		0x60($out),$out

	ret
.align	32
.Lhandle_ctr32_2:
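	# The counter's low byte is about to wrap. Byte-swap to little-endian,
	# increment with full 32-bit adds instead, and swap each block back.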
	vpshufb		$Ii,$T1,$Z2		# byte-swap counter
	vmovdqu		0x30($const),$Z1	# borrow $Z1, .Ltwo_lsb
	vpaddd		0x40($const),$Z2,$inout1	# .Lone_lsb
	vpaddd		$Z1,$Z2,$inout2
	vpaddd		$Z1,$inout1,$inout3
	vpshufb		$Ii,$inout1,$inout1
	vpaddd		$Z1,$inout2,$inout4
	vpshufb		$Ii,$inout2,$inout2
	vpxor		$Z0,$inout1,$inout1
	vpaddd		$Z1,$inout3,$inout5
	vpshufb		$Ii,$inout3,$inout3
	vpxor		$Z0,$inout2,$inout2
	vpaddd		$Z1,$inout4,$T1		# byte-swapped next counter value
	vpshufb		$Ii,$inout4,$inout4
	vpxor		$Z0,$inout3,$inout3
	vpshufb		$Ii,$inout5,$inout5
	vpxor		$Z0,$inout4,$inout4
	vpshufb		$Ii,$T1,$T1		# next counter value
	vpxor		$Z0,$inout5,$inout5
	jmp	.Loop_ctr32
.cfi_endproc
.size	_aesni_ctr32_6x,.-_aesni_ctr32_6x

.globl	aesni_gcm_encrypt
.type	aesni_gcm_encrypt,\@abi-omnipotent
.align	32
aesni_gcm_encrypt:
.cfi_startproc
.seh_startproc
	_CET_ENDBR
#ifdef BORINGSSL_DISPATCH_TEST
.extern	BORINGSSL_function_hit
	movb \$1,BORINGSSL_function_hit+2(%rip)
#endif
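	# Zero the return value up front so the early abort below reports that
	# nothing was processed.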
	xor	%rax,%rax

	# We call |_aesni_ctr32_6x| twice, each call consuming 96 bytes of
	# input. Then we call |_aesni_ctr32_ghash_6x|, which requires at
	# least 96 more bytes of input.
	cmp	\$0x60*3,$len			# minimal accepted length
	jb	.Lgcm_enc_abort

	push	%rbp
.cfi_push	%rbp
.seh_pushreg	%rbp
	mov	%rsp, %rbp			# save stack pointer
.cfi_def_cfa_register	%rbp
	push	%rbx
.cfi_push	%rbx
.seh_pushreg	%rbx
	push	%r12
.cfi_push	%r12
.seh_pushreg	%r12
	push	%r13
.cfi_push	%r13
.seh_pushreg	%r13
	push	%r14
.cfi_push	%r14
.seh_pushreg	%r14
	push	%r15
.cfi_push	%r15
.seh_pushreg	%r15
___
if ($win64) {
$code.=<<___
	lea	-0xa8(%rsp),%rsp		# 8 extra bytes to align the stack
.seh_allocstack	0xa8
.seh_setframe	%rbp, 0xa8+5*8
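	# %rbp now sits 0xa8+5*8 bytes above %rsp: 0xa8 bytes of locals plus
	# the five registers pushed after %rbp became the frame pointer.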
	# Load the last two parameters. These go into %rdi and %rsi, which are
	# non-volatile on Windows, so stash them in the parameter stack area
	# first.
	mov	%rdi, 0x10(%rbp)
.seh_savereg	%rdi, 0xa8+5*8+0x10
	mov	%rsi, 0x18(%rbp)
.seh_savereg	%rsi, 0xa8+5*8+0x18
	mov	0x30(%rbp), $ivp
	mov	0x38(%rbp), $Htable
	# Save non-volatile XMM registers.
	movaps	%xmm6,-0xd0(%rbp)
.seh_savexmm128	%xmm6, 0xa8+5*8-0xd0
	movaps	%xmm7,-0xc0(%rbp)
.seh_savexmm128	%xmm7, 0xa8+5*8-0xc0
	movaps	%xmm8,-0xb0(%rbp)
.seh_savexmm128	%xmm8, 0xa8+5*8-0xb0
	movaps	%xmm9,-0xa0(%rbp)
.seh_savexmm128	%xmm9, 0xa8+5*8-0xa0
	movaps	%xmm10,-0x90(%rbp)
.seh_savexmm128	%xmm10, 0xa8+5*8-0x90
	movaps	%xmm11,-0x80(%rbp)
.seh_savexmm128	%xmm11, 0xa8+5*8-0x80
	movaps	%xmm12,-0x70(%rbp)
.seh_savexmm128	%xmm12, 0xa8+5*8-0x70
	movaps	%xmm13,-0x60(%rbp)
.seh_savexmm128	%xmm13, 0xa8+5*8-0x60
	movaps	%xmm14,-0x50(%rbp)
.seh_savexmm128	%xmm14, 0xa8+5*8-0x50
	movaps	%xmm15,-0x40(%rbp)
.seh_savexmm128	%xmm15, 0xa8+5*8-0x40
___
}