sub \$16*6,$len()

in src/crypto/fipsmodule/aes/asm/aesni-x86_64.pl [2334:2765]


	# Bulk path: taken only while at least 6*16 bytes remain. If the
	# subtraction below borrows, fewer than six blocks are left and
	# .Lxts_dec_short handles them.
	sub	\$16*6,$len
	jc	.Lxts_dec_short			# if $len-=6*16 borrowed

	# NOTE(review): %rax and %r10 look like the registers behind $rounds
	# and $rnds_ (the sequence after the loop rebuilds $rounds from the
	# same constant) -- confirm against the register map earlier in the file.
	mov	\$16+96,$rounds
	lea	32($key_,$rnds_),$key		# end of key schedule
	sub	%r10,%rax			# twisted $rounds
	$movkey	16($key_),$rndkey1
	mov	%rax,%r10			# backup twisted $rounds
	lea	.Lxts_magic(%rip),%r8		# constant loaded into $twmask after .Lxts_dec_loop6
	jmp	.Lxts_dec_grandloop

.align	32
	# Six blocks per pass. Inputs 0..4 are xored with @tweak[0..4], which
	# already carry round[0] (see the comment on the first pxor); for the
	# sixth block round[0] is folded in through $twmask instead. The first
	# two aesdec rounds are issued while tweak^round[last] values are
	# derived and parked on the stack for the closing aesdeclast. The slot
	# at 16*3(%rsp) is filled later, in the tail after .Lxts_dec_loop6.
.Lxts_dec_grandloop:
	movdqu	`16*0`($inp),$inout0		# load input
	movdqa	$rndkey0,$twmask
	movdqu	`16*1`($inp),$inout1
	pxor	@tweak[0],$inout0		# input^=tweak^round[0]
	movdqu	`16*2`($inp),$inout2
	pxor	@tweak[1],$inout1
	 aesdec		$rndkey1,$inout0
	movdqu	`16*3`($inp),$inout3
	pxor	@tweak[2],$inout2
	 aesdec		$rndkey1,$inout1
	movdqu	`16*4`($inp),$inout4
	pxor	@tweak[3],$inout3
	 aesdec		$rndkey1,$inout2
	movdqu	`16*5`($inp),$inout5
	pxor	@tweak[5],$twmask		# round[0]^=tweak[5]
	 movdqa	0x60(%rsp),$twres		# load round[0]^round[last]
	pxor	@tweak[4],$inout4
	 aesdec		$rndkey1,$inout3
	$movkey	32($key_),$rndkey0
	lea	`16*6`($inp),$inp
	pxor	$twmask,$inout5

	 pxor	$twres,@tweak[0]		# calculate tweaks^round[last]
	aesdec		$rndkey1,$inout4
	 pxor	$twres,@tweak[1]
	 movdqa	@tweak[0],`16*0`(%rsp)		# put aside tweaks^last round key
	aesdec		$rndkey1,$inout5
	$movkey		48($key_),$rndkey1
	 pxor	$twres,@tweak[2]

	aesdec		$rndkey0,$inout0
	 pxor	$twres,@tweak[3]
	 movdqa	@tweak[1],`16*1`(%rsp)
	aesdec		$rndkey0,$inout1
	 pxor	$twres,@tweak[4]
	 movdqa	@tweak[2],`16*2`(%rsp)
	aesdec		$rndkey0,$inout2
	aesdec		$rndkey0,$inout3
	 pxor	$twres,$twmask
	 movdqa	@tweak[4],`16*4`(%rsp)
	aesdec		$rndkey0,$inout4
	aesdec		$rndkey0,$inout5
	$movkey		64($key_),$rndkey0
	 movdqa	$twmask,`16*5`(%rsp)
	# $twres = dwords {3,3,1,1} of @tweak[5]; its sign bits drive the
	# carry masks in the tweak update after .Lxts_dec_loop6
	pshufd	\$0x5f,@tweak[5],$twres
	jmp	.Lxts_dec_loop6
.align	32
	# Middle rounds: two aesdec rounds per iteration across all six
	# blocks, alternating $rndkey1 and $rndkey0. Round keys are fetched
	# relative to $key with %rax as index; the loop ends when the add
	# brings %rax to zero (the key loads in between leave flags intact).
.Lxts_dec_loop6:
	aesdec		$rndkey1,$inout0
	aesdec		$rndkey1,$inout1
	aesdec		$rndkey1,$inout2
	aesdec		$rndkey1,$inout3
	aesdec		$rndkey1,$inout4
	aesdec		$rndkey1,$inout5
	$movkey		-64($key,%rax),$rndkey1
	add		\$32,%rax

	aesdec		$rndkey0,$inout0
	aesdec		$rndkey0,$inout1
	aesdec		$rndkey0,$inout2
	aesdec		$rndkey0,$inout3
	aesdec		$rndkey0,$inout4
	aesdec		$rndkey0,$inout5
	$movkey		-80($key,%rax),$rndkey0
	jnz		.Lxts_dec_loop6

	# Loop exit: finish the remaining aesdec rounds while computing the
	# next six tweaks. Each step doubles @tweak[5] (paddq) and xors in a
	# correction built from the sign bits tracked in $twres, masked with
	# the constant at (%r8). New @tweak[0..4] are formed as round[0]
	# xored with successive values of @tweak[5].
	# NOTE(review): presumably the XTS multiply-by-x in GF(2^128);
	# confirm against the definition of .Lxts_magic.
	movdqa	(%r8),$twmask			# start calculating next tweak
	movdqa	$twres,$twtmp
	paddd	$twres,$twres
	 aesdec		$rndkey1,$inout0
	paddq	@tweak[5],@tweak[5]
	psrad	\$31,$twtmp
	 aesdec		$rndkey1,$inout1
	pand	$twmask,$twtmp
	$movkey	($key_),@tweak[0]		# load round[0]
	 aesdec		$rndkey1,$inout2
	 aesdec		$rndkey1,$inout3
	 aesdec		$rndkey1,$inout4
	pxor	$twtmp,@tweak[5]
	movaps	@tweak[0],@tweak[1]		# copy round[0]
	 aesdec		$rndkey1,$inout5
	 $movkey	-64($key),$rndkey1

	movdqa	$twres,$twtmp
	 aesdec		$rndkey0,$inout0
	paddd	$twres,$twres
	pxor	@tweak[5],@tweak[0]
	 aesdec		$rndkey0,$inout1
	psrad	\$31,$twtmp
	paddq	@tweak[5],@tweak[5]
	 aesdec		$rndkey0,$inout2
	 aesdec		$rndkey0,$inout3
	pand	$twmask,$twtmp
	movaps	@tweak[1],@tweak[2]
	 aesdec		$rndkey0,$inout4
	pxor	$twtmp,@tweak[5]
	movdqa	$twres,$twtmp
	 aesdec		$rndkey0,$inout5
	 $movkey	-48($key),$rndkey0

	paddd	$twres,$twres
	 aesdec		$rndkey1,$inout0
	pxor	@tweak[5],@tweak[1]
	psrad	\$31,$twtmp
	 aesdec		$rndkey1,$inout1
	paddq	@tweak[5],@tweak[5]
	pand	$twmask,$twtmp
	 aesdec		$rndkey1,$inout2
	 aesdec		$rndkey1,$inout3
	# old @tweak[3] (already xored with round[0]^round[last]) goes to its
	# stack slot before the register is reused for the next tweak
	 movdqa	@tweak[3],`16*3`(%rsp)
	pxor	$twtmp,@tweak[5]
	 aesdec		$rndkey1,$inout4
	movaps	@tweak[2],@tweak[3]
	movdqa	$twres,$twtmp
	 aesdec		$rndkey1,$inout5
	 $movkey	-32($key),$rndkey1

	paddd	$twres,$twres
	 aesdec		$rndkey0,$inout0
	pxor	@tweak[5],@tweak[2]
	psrad	\$31,$twtmp
	 aesdec		$rndkey0,$inout1
	paddq	@tweak[5],@tweak[5]
	pand	$twmask,$twtmp
	 aesdec		$rndkey0,$inout2
	 aesdec		$rndkey0,$inout3
	 aesdec		$rndkey0,$inout4
	pxor	$twtmp,@tweak[5]
	movaps	@tweak[3],@tweak[4]
	 aesdec		$rndkey0,$inout5

	# $rndkey0 is free as scratch from here on; it is reloaded from
	# ($key_) a few lines below
	movdqa	$twres,$rndkey0
	paddd	$twres,$twres
	 aesdec		$rndkey1,$inout0
	pxor	@tweak[5],@tweak[3]
	psrad	\$31,$rndkey0
	 aesdec		$rndkey1,$inout1
	paddq	@tweak[5],@tweak[5]
	pand	$twmask,$rndkey0
	 aesdec		$rndkey1,$inout2
	 aesdec		$rndkey1,$inout3
	pxor	$rndkey0,@tweak[5]
	$movkey		($key_),$rndkey0
	 aesdec		$rndkey1,$inout4
	 aesdec		$rndkey1,$inout5
	$movkey		16($key_),$rndkey1

	# final round: the stack slots hold the tweak^round[last] values
	# put aside at the top of the pass
	pxor	@tweak[5],@tweak[4]
	 aesdeclast	`16*0`(%rsp),$inout0
	psrad	\$31,$twres
	paddq	@tweak[5],@tweak[5]
	 aesdeclast	`16*1`(%rsp),$inout1
	 aesdeclast	`16*2`(%rsp),$inout2
	pand	$twmask,$twres
	mov	%r10,%rax			# restore $rounds
	 aesdeclast	`16*3`(%rsp),$inout3
	 aesdeclast	`16*4`(%rsp),$inout4
	 aesdeclast	`16*5`(%rsp),$inout5
	pxor	$twres,@tweak[5]

	lea	`16*6`($out),$out		# $out+=6*16
	movups	$inout0,`-16*6`($out)		# store 6 output blocks
	movups	$inout1,`-16*5`($out)
	movups	$inout2,`-16*4`($out)
	movups	$inout3,`-16*3`($out)
	movups	$inout4,`-16*2`($out)
	movups	$inout5,`-16*1`($out)
	sub	\$16*6,$len
	jnc	.Lxts_dec_grandloop		# loop if $len-=6*16 didn't borrow

	# leaving the bulk loop: undo the twisted $rounds and $key setup
	# before falling into the short path
	mov	\$16+96,$rounds
	sub	$rnds_,$rounds
	mov	$key_,$key			# restore $key
	shr	\$4,$rounds			# restore original value

.Lxts_dec_short:
	# at this point @tweak[0..5] are populated with tweak values
	#
	# Dispatch on the number of whole blocks left (0..5). The pxor of
	# $rndkey0 into @tweak[0..4] is spread across the compares.
	# NOTE(review): this appears to cancel the round[0] folded into the
	# tweaks by the bulk loop -- confirm for the direct-entry path, whose
	# setup is outside this excerpt.
	mov	$rounds,$rnds_			# backup $rounds
	pxor	$rndkey0,@tweak[0]
	pxor	$rndkey0,@tweak[1]
	add	\$16*6,$len			# restore real remaining $len
	jz	.Lxts_dec_done			# done if ($len==0)

	pxor	$rndkey0,@tweak[2]
	cmp	\$0x20,$len
	jb	.Lxts_dec_one			# $len is 1*16
	pxor	$rndkey0,@tweak[3]
	je	.Lxts_dec_two			# $len is 2*16

	pxor	$rndkey0,@tweak[4]
	cmp	\$0x40,$len
	jb	.Lxts_dec_three			# $len is 3*16
	je	.Lxts_dec_four			# $len is 4*16

	# Five blocks: the six-block primitive is reused; $inout5 is not
	# loaded here and its result is never stored.
	movdqu	($inp),$inout0			# $len is 5*16
	movdqu	16*1($inp),$inout1
	movdqu	16*2($inp),$inout2
	pxor	@tweak[0],$inout0
	movdqu	16*3($inp),$inout3
	pxor	@tweak[1],$inout1
	movdqu	16*4($inp),$inout4
	lea	16*5($inp),$inp			# $inp+=5*16
	pxor	@tweak[2],$inout2
	pxor	@tweak[3],$inout3
	pxor	@tweak[4],$inout4

	call	_aesni_decrypt6

	# undo the tweak on the outputs; interleaved with the stores, a
	# per-dword sign mask of @tweak[5] is built in $twtmp and shuffled
	# into @tweak[1] for the tweak update below
	xorps	@tweak[0],$inout0
	xorps	@tweak[1],$inout1
	xorps	@tweak[2],$inout2
	movdqu	$inout0,($out)			# store 5 output blocks
	xorps	@tweak[3],$inout3
	movdqu	$inout1,16*1($out)
	xorps	@tweak[4],$inout4
	movdqu	$inout2,16*2($out)
	 pxor		$twtmp,$twtmp
	movdqu	$inout3,16*3($out)
	 pcmpgtd	@tweak[5],$twtmp
	movdqu	$inout4,16*4($out)
	lea	16*5($out),$out			# $out+=5*16
	 pshufd		\$0x13,$twtmp,@tweak[1]	# $twres
	and	\$15,$len_
	jz	.Lxts_dec_ret

	# a byte tail remains: @tweak[0] keeps the current tweak (used last,
	# after the steal loop) and @tweak[1] becomes the following tweak
	# (used first, at .Lxts_dec_done2)
	movdqa	@tweak[5],@tweak[0]
	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
	pand	$twmask,@tweak[1]		# isolate carry and residue
	pxor	@tweak[5],@tweak[1]
	jmp	.Lxts_dec_done2

.align	16
	# One whole block left: decrypt it with @tweak[0], then shift the
	# next two tweaks down into @tweak[0] and @tweak[1], which is where
	# the tail handling after .Lxts_dec_done reads them.
.Lxts_dec_one:
	movups	($inp),$inout0
	lea	16*1($inp),$inp			# $inp+=1*16
	xorps	@tweak[0],$inout0
___
	&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	xorps	@tweak[0],$inout0
	movdqa	@tweak[1],@tweak[0]
	movups	$inout0,($out)			# store one output block
	movdqa	@tweak[2],@tweak[1]
	lea	16*1($out),$out			# $out+=1*16
	jmp	.Lxts_dec_done

.align	16
	# Two whole blocks left: decrypt with @tweak[0] and @tweak[1], then
	# move @tweak[2] and @tweak[3] down into @tweak[0] and @tweak[1] for
	# the tail handling after .Lxts_dec_done.
.Lxts_dec_two:
	movups	($inp),$inout0
	movups	16($inp),$inout1
	lea	32($inp),$inp			# $inp+=2*16
	xorps	@tweak[0],$inout0
	xorps	@tweak[1],$inout1

	call	_aesni_decrypt2

	xorps	@tweak[0],$inout0
	movdqa	@tweak[2],@tweak[0]
	xorps	@tweak[1],$inout1
	movdqa	@tweak[3],@tweak[1]
	movups	$inout0,($out)			# store 2 output blocks
	movups	$inout1,16*1($out)
	lea	16*2($out),$out			# $out+=2*16
	jmp	.Lxts_dec_done

.align	16
	# Three whole blocks left: decrypt with @tweak[0..2], then move
	# @tweak[3] and @tweak[4] down into @tweak[0] and @tweak[1] for the
	# tail handling after .Lxts_dec_done.
.Lxts_dec_three:
	movups	($inp),$inout0
	movups	16*1($inp),$inout1
	movups	16*2($inp),$inout2
	lea	16*3($inp),$inp			# $inp+=3*16
	xorps	@tweak[0],$inout0
	xorps	@tweak[1],$inout1
	xorps	@tweak[2],$inout2

	call	_aesni_decrypt3

	xorps	@tweak[0],$inout0
	movdqa	@tweak[3],@tweak[0]
	xorps	@tweak[1],$inout1
	movdqa	@tweak[4],@tweak[1]
	xorps	@tweak[2],$inout2
	movups	$inout0,($out)			# store 3 output blocks
	movups	$inout1,16*1($out)
	movups	$inout2,16*2($out)
	lea	16*3($out),$out			# $out+=3*16
	jmp	.Lxts_dec_done

.align	16
	# Four whole blocks left: decrypt with @tweak[0..3], then move
	# @tweak[4] and @tweak[5] down into @tweak[0] and @tweak[1] for the
	# tail handling after .Lxts_dec_done.
.Lxts_dec_four:
	movups	($inp),$inout0
	movups	16*1($inp),$inout1
	movups	16*2($inp),$inout2
	xorps	@tweak[0],$inout0
	movups	16*3($inp),$inout3
	lea	16*4($inp),$inp			# $inp+=4*16
	xorps	@tweak[1],$inout1
	xorps	@tweak[2],$inout2
	xorps	@tweak[3],$inout3

	call	_aesni_decrypt4

	pxor	@tweak[0],$inout0
	movdqa	@tweak[4],@tweak[0]
	pxor	@tweak[1],$inout1
	movdqa	@tweak[5],@tweak[1]
	pxor	@tweak[2],$inout2
	movdqu	$inout0,($out)			# store 4 output blocks
	pxor	@tweak[3],$inout3
	movdqu	$inout1,16*1($out)
	movdqu	$inout2,16*2($out)
	movdqu	$inout3,16*3($out)
	lea	16*4($out),$out			# $out+=4*16
	jmp	.Lxts_dec_done

.align	16
	# Common exit of the whole-block paths: return unless a byte tail
	# (low four bits of $len_) remains.
.Lxts_dec_done:
	and	\$15,$len_			# see if $len%16 is 0
	jz	.Lxts_dec_ret
	# Tail present: decrypt the next whole input block with @tweak[1]
	# (the later of the two remaining tweaks) and store it at ($out);
	# the steal loop that follows splices the tail bytes into it.
.Lxts_dec_done2:
	mov	$len_,$len
	mov	$key_,$key			# restore $key
	mov	$rnds_,$rounds			# restore $rounds

	movups	($inp),$inout0
	xorps	@tweak[1],$inout0
___
	&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	xorps	@tweak[1],$inout0
	movups	$inout0,($out)

	# Byte loop, run once per tail byte ($len counts down to zero): the
	# input byte at 16($inp) replaces the byte at ($out), and the
	# displaced output byte is moved to 16($out). %eax and %ecx are used
	# as scratch, which is why $key and $rounds are restored afterwards.
.Lxts_dec_steal:
	movzb	16($inp),%eax			# borrow $rounds ...
	movzb	($out),%ecx			# ... and $key
	lea	1($inp),$inp
	mov	%al,($out)
	mov	%cl,16($out)
	lea	1($out),$out
	sub	\$1,$len
	jnz	.Lxts_dec_steal

	sub	$len_,$out			# rewind $out
	mov	$key_,$key			# restore $key
	mov	$rnds_,$rounds			# restore $rounds

	# decrypt the reassembled block in place with @tweak[0]
	movups	($out),$inout0
	xorps	@tweak[0],$inout0
___
	&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	xorps	@tweak[0],$inout0
	movups	$inout0,($out)

	# Epilogue: zero the xmm registers used and the 0x70 bytes of stack
	# scratch at 0x00..0x6f(%rsp), then restore %rbp and %rsp via %r11.
	# NOTE(review): %r11 presumably holds the stack pointer saved by the
	# prologue, which is outside this excerpt -- confirm.
.Lxts_dec_ret:
	xorps	%xmm0,%xmm0			# clear register bank
	pxor	%xmm1,%xmm1
	pxor	%xmm2,%xmm2
	pxor	%xmm3,%xmm3
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
___
$code.=<<___ if (!$win64);
	# non-Win64 build: xmm6-xmm15 are cleared like the others
	pxor	%xmm6,%xmm6
	pxor	%xmm7,%xmm7
	movaps	%xmm0,0x00(%rsp)		# clear stack
	pxor	%xmm8,%xmm8
	movaps	%xmm0,0x10(%rsp)
	pxor	%xmm9,%xmm9
	movaps	%xmm0,0x20(%rsp)
	pxor	%xmm10,%xmm10
	movaps	%xmm0,0x30(%rsp)
	pxor	%xmm11,%xmm11
	movaps	%xmm0,0x40(%rsp)
	pxor	%xmm12,%xmm12
	movaps	%xmm0,0x50(%rsp)
	pxor	%xmm13,%xmm13
	movaps	%xmm0,0x60(%rsp)
	pxor	%xmm14,%xmm14
	pxor	%xmm15,%xmm15
___
$code.=<<___ if ($win64);
	# Win64 build: xmm6-xmm15 are reloaded from the save area addressed
	# off %r11, and each save slot is overwritten with zero (from %xmm0)
	# right after it is read
	movaps	-0xa8(%r11),%xmm6
	movaps	%xmm0,-0xa8(%r11)		# clear stack
	movaps	-0x98(%r11),%xmm7
	movaps	%xmm0,-0x98(%r11)
	movaps	-0x88(%r11),%xmm8
	movaps	%xmm0,-0x88(%r11)
	movaps	-0x78(%r11),%xmm9
	movaps	%xmm0,-0x78(%r11)
	movaps	-0x68(%r11),%xmm10
	movaps	%xmm0,-0x68(%r11)
	movaps	-0x58(%r11),%xmm11
	movaps	%xmm0,-0x58(%r11)
	movaps	-0x48(%r11),%xmm12
	movaps	%xmm0,-0x48(%r11)
	movaps	-0x38(%r11),%xmm13
	movaps	%xmm0,-0x38(%r11)
	movaps	-0x28(%r11),%xmm14
	movaps	%xmm0,-0x28(%r11)
	movaps	-0x18(%r11),%xmm15
	movaps	%xmm0,-0x18(%r11)
	movaps	%xmm0,0x00(%rsp)
	movaps	%xmm0,0x10(%rsp)
	movaps	%xmm0,0x20(%rsp)
	movaps	%xmm0,0x30(%rsp)
	movaps	%xmm0,0x40(%rsp)
	movaps	%xmm0,0x50(%rsp)
	movaps	%xmm0,0x60(%rsp)
___
$code.=<<___;
	mov	-8(%r11),%rbp
.cfi_restore	%rbp
	lea	(%r11),%rsp
.cfi_def_cfa_register	%rsp
.Lxts_dec_epilogue:
	ret
.cfi_endproc
.size	${PREFIX}_xts_decrypt,.-${PREFIX}_xts_decrypt