sub \$0x80,$len()

in crypto/fipsmodule/modes/asm/ghash-x86_64.pl [915:1296]


	sub		\$0x80,$len

	vmovdqu		0x70($inp),$Ii		# I[7]
	vmovdqu		0x00-0x40($Htbl),$Hkey	# $Hkey^1
	vpshufb		$bswap,$Ii,$Ii
	vmovdqu		0x20-0x40($Htbl),$HK

	vpunpckhqdq	$Ii,$Ii,$T2
	 vmovdqu	0x60($inp),$Ij		# I[6]
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor		$Ii,$T2,$T2
	 vpshufb	$bswap,$Ij,$Ij
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	 vmovdqu	0x10-0x40($Htbl),$Hkey	# $Hkey^2
	 vpunpckhqdq	$Ij,$Ij,$T1
	 vmovdqu	0x50($inp),$Ii		# I[5]
	vpclmulqdq	\$0x00,$HK,$T2,$Xmi
	 vpxor		$Ij,$T1,$T1

	 vpshufb	$bswap,$Ii,$Ii
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	 vpunpckhqdq	$Ii,$Ii,$T2
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	 vmovdqu	0x30-0x40($Htbl),$Hkey	# $Hkey^3
	 vpxor		$Ii,$T2,$T2
	 vmovdqu	0x40($inp),$Ij		# I[4]
	vpclmulqdq	\$0x10,$HK,$T1,$Zmi
	 vmovdqu	0x50-0x40($Htbl),$HK

	 vpshufb	$bswap,$Ij,$Ij
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor		$Xhi,$Zhi,$Zhi
	 vpunpckhqdq	$Ij,$Ij,$T1
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	 vmovdqu	0x40-0x40($Htbl),$Hkey	# $Hkey^4
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T2,$Xmi
	 vpxor		$Ij,$T1,$T1

	 vmovdqu	0x30($inp),$Ii		# I[3]
	vpxor		$Zlo,$Xlo,$Xlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpxor		$Zhi,$Xhi,$Xhi
	 vpshufb	$bswap,$Ii,$Ii
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	 vmovdqu	0x60-0x40($Htbl),$Hkey	# $Hkey^5
	vpxor		$Zmi,$Xmi,$Xmi
	 vpunpckhqdq	$Ii,$Ii,$T2
	vpclmulqdq	\$0x10,$HK,$T1,$Zmi
	 vmovdqu	0x80-0x40($Htbl),$HK
	 vpxor		$Ii,$T2,$T2

	 vmovdqu	0x20($inp),$Ij		# I[2]
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor		$Xhi,$Zhi,$Zhi
	 vpshufb	$bswap,$Ij,$Ij
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	 vmovdqu	0x70-0x40($Htbl),$Hkey	# $Hkey^6
	vpxor		$Xmi,$Zmi,$Zmi
	 vpunpckhqdq	$Ij,$Ij,$T1
	vpclmulqdq	\$0x00,$HK,$T2,$Xmi
	 vpxor		$Ij,$T1,$T1

	 vmovdqu	0x10($inp),$Ii		# I[1]
	vpxor		$Zlo,$Xlo,$Xlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpxor		$Zhi,$Xhi,$Xhi
	 vpshufb	$bswap,$Ii,$Ii
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	 vmovdqu	0x90-0x40($Htbl),$Hkey	# $Hkey^7
	vpxor		$Zmi,$Xmi,$Xmi
	 vpunpckhqdq	$Ii,$Ii,$T2
	vpclmulqdq	\$0x10,$HK,$T1,$Zmi
	 vmovdqu	0xb0-0x40($Htbl),$HK
	 vpxor		$Ii,$T2,$T2

	 vmovdqu	($inp),$Ij		# I[0]
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor		$Xhi,$Zhi,$Zhi
	 vpshufb	$bswap,$Ij,$Ij
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	 vmovdqu	0xa0-0x40($Htbl),$Hkey	# $Hkey^8
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x10,$HK,$T2,$Xmi

	lea		0x80($inp),$inp
	cmp		\$0x80,$len
	jb		.Ltail_avx

	vpxor		$Xi,$Ij,$Ij		# accumulate $Xi
	sub		\$0x80,$len
	jmp		.Loop8x_avx

.align	32
.Loop8x_avx:
	vpunpckhqdq	$Ij,$Ij,$T1
	 vmovdqu	0x70($inp),$Ii		# I[7]
	vpxor		$Xlo,$Zlo,$Zlo
	vpxor		$Ij,$T1,$T1
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xi
	 vpshufb	$bswap,$Ii,$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xo
	 vmovdqu	0x00-0x40($Htbl),$Hkey	# $Hkey^1
	 vpunpckhqdq	$Ii,$Ii,$T2
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Tred
	 vmovdqu	0x20-0x40($Htbl),$HK
	 vpxor		$Ii,$T2,$T2

	  vmovdqu	0x60($inp),$Ij		# I[6]
	 vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor		$Zlo,$Xi,$Xi		# collect result
	  vpshufb	$bswap,$Ij,$Ij
	 vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vxorps		$Zhi,$Xo,$Xo
	  vmovdqu	0x10-0x40($Htbl),$Hkey	# $Hkey^2
	 vpunpckhqdq	$Ij,$Ij,$T1
	 vpclmulqdq	\$0x00,$HK,  $T2,$Xmi
	vpxor		$Zmi,$Tred,$Tred
	 vxorps		$Ij,$T1,$T1

	  vmovdqu	0x50($inp),$Ii		# I[5]
	vpxor		$Xi,$Tred,$Tred		# aggregated Karatsuba post-processing
	 vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpxor		$Xo,$Tred,$Tred
	vpslldq		\$8,$Tred,$T2
	 vpxor		$Xlo,$Zlo,$Zlo
	 vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	vpsrldq		\$8,$Tred,$Tred
	vpxor		$T2, $Xi, $Xi
	  vmovdqu	0x30-0x40($Htbl),$Hkey	# $Hkey^3
	  vpshufb	$bswap,$Ii,$Ii
	vxorps		$Tred,$Xo, $Xo
	 vpxor		$Xhi,$Zhi,$Zhi
	 vpunpckhqdq	$Ii,$Ii,$T2
	 vpclmulqdq	\$0x10,$HK,  $T1,$Zmi
	  vmovdqu	0x50-0x40($Htbl),$HK
	 vpxor		$Ii,$T2,$T2
	 vpxor		$Xmi,$Zmi,$Zmi

	  vmovdqu	0x40($inp),$Ij		# I[4]
	vpalignr	\$8,$Xi,$Xi,$Tred	# 1st phase
	 vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	  vpshufb	$bswap,$Ij,$Ij
	 vpxor		$Zlo,$Xlo,$Xlo
	 vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	  vmovdqu	0x40-0x40($Htbl),$Hkey	# $Hkey^4
	 vpunpckhqdq	$Ij,$Ij,$T1
	 vpxor		$Zhi,$Xhi,$Xhi
	 vpclmulqdq	\$0x00,$HK,  $T2,$Xmi
	 vxorps		$Ij,$T1,$T1
	 vpxor		$Zmi,$Xmi,$Xmi

	  vmovdqu	0x30($inp),$Ii		# I[3]
	vpclmulqdq	\$0x10,(%r10),$Xi,$Xi
	 vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	  vpshufb	$bswap,$Ii,$Ii
	 vpxor		$Xlo,$Zlo,$Zlo
	 vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	  vmovdqu	0x60-0x40($Htbl),$Hkey	# $Hkey^5
	 vpunpckhqdq	$Ii,$Ii,$T2
	 vpxor		$Xhi,$Zhi,$Zhi
	 vpclmulqdq	\$0x10,$HK,  $T1,$Zmi
	  vmovdqu	0x80-0x40($Htbl),$HK
	 vpxor		$Ii,$T2,$T2
	 vpxor		$Xmi,$Zmi,$Zmi

	  vmovdqu	0x20($inp),$Ij		# I[2]
	 vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	  vpshufb	$bswap,$Ij,$Ij
	 vpxor		$Zlo,$Xlo,$Xlo
	 vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	  vmovdqu	0x70-0x40($Htbl),$Hkey	# $Hkey^6
	 vpunpckhqdq	$Ij,$Ij,$T1
	 vpxor		$Zhi,$Xhi,$Xhi
	 vpclmulqdq	\$0x00,$HK,  $T2,$Xmi
	 vpxor		$Ij,$T1,$T1
	 vpxor		$Zmi,$Xmi,$Xmi
	vxorps		$Tred,$Xi,$Xi

	  vmovdqu	0x10($inp),$Ii		# I[1]
	vpalignr	\$8,$Xi,$Xi,$Tred	# 2nd phase
	 vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	  vpshufb	$bswap,$Ii,$Ii
	 vpxor		$Xlo,$Zlo,$Zlo
	 vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	  vmovdqu	0x90-0x40($Htbl),$Hkey	# $Hkey^7
	vpclmulqdq	\$0x10,(%r10),$Xi,$Xi
	vxorps		$Xo,$Tred,$Tred
	 vpunpckhqdq	$Ii,$Ii,$T2
	 vpxor		$Xhi,$Zhi,$Zhi
	 vpclmulqdq	\$0x10,$HK,  $T1,$Zmi
	  vmovdqu	0xb0-0x40($Htbl),$HK
	 vpxor		$Ii,$T2,$T2
	 vpxor		$Xmi,$Zmi,$Zmi

	  vmovdqu	($inp),$Ij		# I[0]
	 vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	  vpshufb	$bswap,$Ij,$Ij
	 vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	  vmovdqu	0xa0-0x40($Htbl),$Hkey	# $Hkey^8
	vpxor		$Tred,$Ij,$Ij
	 vpclmulqdq	\$0x10,$HK,  $T2,$Xmi
	vpxor		$Xi,$Ij,$Ij		# accumulate $Xi

	lea		0x80($inp),$inp
	sub		\$0x80,$len
	jnc		.Loop8x_avx

	add		\$0x80,$len
	jmp		.Ltail_no_xor_avx

.align	32
.Lshort_avx:
	vmovdqu		-0x10($inp,$len),$Ii	# very last word
	lea		($inp,$len),$inp
	vmovdqu		0x00-0x40($Htbl),$Hkey	# $Hkey^1
	vmovdqu		0x20-0x40($Htbl),$HK
	vpshufb		$bswap,$Ii,$Ij

	vmovdqa		$Xlo,$Zlo		# subtle way to zero $Zlo,
	vmovdqa		$Xhi,$Zhi		# $Zhi and
	vmovdqa		$Xmi,$Zmi		# $Zmi
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	 vmovdqu	-0x20($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x10-0x40($Htbl),$Hkey	# $Hkey^2
	 vpshufb	$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vpsrldq		\$8,$HK,$HK
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	 vmovdqu	-0x30($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x30-0x40($Htbl),$Hkey	# $Hkey^3
	 vpshufb	$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vmovdqu		0x50-0x40($Htbl),$HK
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	 vmovdqu	-0x40($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x40-0x40($Htbl),$Hkey	# $Hkey^4
	 vpshufb	$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vpsrldq		\$8,$HK,$HK
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	 vmovdqu	-0x50($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x60-0x40($Htbl),$Hkey	# $Hkey^5
	 vpshufb	$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vmovdqu		0x80-0x40($Htbl),$HK
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	 vmovdqu	-0x60($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x70-0x40($Htbl),$Hkey	# $Hkey^6
	 vpshufb	$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vpsrldq		\$8,$HK,$HK
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	 vmovdqu	-0x70($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x90-0x40($Htbl),$Hkey	# $Hkey^7
	 vpshufb	$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vmovq		0xb8-0x40($Htbl),$HK
	sub		\$0x10,$len
	jmp		.Ltail_avx

.align	32
.Ltail_avx:
	vpxor		$Xi,$Ij,$Ij		# accumulate $Xi
.Ltail_no_xor_avx:
	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi

	vmovdqu		(%r10),$Tred

	vpxor		$Xlo,$Zlo,$Xi
	vpxor		$Xhi,$Zhi,$Xo
	vpxor		$Xmi,$Zmi,$Zmi

	vpxor		$Xi, $Zmi,$Zmi		# aggregated Karatsuba post-processing
	vpxor		$Xo, $Zmi,$Zmi
	vpslldq		\$8, $Zmi,$T2
	vpsrldq		\$8, $Zmi,$Zmi
	vpxor		$T2, $Xi, $Xi
	vpxor		$Zmi,$Xo, $Xo

	vpclmulqdq	\$0x10,$Tred,$Xi,$T2	# 1st phase
	vpalignr	\$8,$Xi,$Xi,$Xi
	vpxor		$T2,$Xi,$Xi

	vpclmulqdq	\$0x10,$Tred,$Xi,$T2	# 2nd phase
	vpalignr	\$8,$Xi,$Xi,$Xi
	vpxor		$Xo,$Xi,$Xi
	vpxor		$T2,$Xi,$Xi

	cmp		\$0,$len
	jne		.Lshort_avx

	vpshufb		$bswap,$Xi,$Xi
	vmovdqu		$Xi,($Xip)
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	0x10(%rsp),%xmm7
	movaps	0x20(%rsp),%xmm8
	movaps	0x30(%rsp),%xmm9
	movaps	0x40(%rsp),%xmm10
	movaps	0x50(%rsp),%xmm11
	movaps	0x60(%rsp),%xmm12
	movaps	0x70(%rsp),%xmm13
	movaps	0x80(%rsp),%xmm14
	movaps	0x90(%rsp),%xmm15
	lea	0xa8(%rsp),%rsp
___
$code.=<<___;
	ret
.cfi_endproc
.seh_endproc
.size	gcm_ghash_avx,.-gcm_ghash_avx
___
} else {