sub $rptr,%r11()

in ring/crypto/fipsmodule/bn/asm/x86_64-mont5.pl [2140:2439]


	sub	$rptr,%r11
	and	\$4095,%r11
	cmp	%r11,%r10
	jb	.Lfrom_sp_alt
	sub	%r11,%rbp		# align with $aptr
	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*$num*8+256)
	jmp	.Lfrom_sp_done

.align	32
.Lfrom_sp_alt:
	lea	4096-320(,$num,2),%r10
	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*$num*8+256)
	sub	%r10,%r11
	mov	\$0,%r10
	cmovc	%r10,%r11
	sub	%r11,%rbp
.Lfrom_sp_done:
	and	\$-64,%rbp
	mov	%rsp,%r11
	sub	%rbp,%r11
	and	\$-4096,%r11
	lea	(%rbp,%r11),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lfrom_page_walk
	jmp	.Lfrom_page_walk_done

.Lfrom_page_walk:
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lfrom_page_walk
.Lfrom_page_walk_done:

	mov	$num,%r10
	neg	$num

	##############################################################
	# Stack layout
	#
	# +0	saved $num, used in reduction section
	# +8	&t[2*$num], used in reduction section
	# +32	saved *n0
	# +40	saved %rsp
	# +48	t[2*$num]
	#
	mov	$n0,  32(%rsp)
	mov	%rax, 40(%rsp)		# save original %rsp
.cfi_cfa_expression	%rsp+40,deref,+8
.Lfrom_body:
	mov	$num,%r11
	lea	48(%rsp),%rax
	pxor	%xmm0,%xmm0
	jmp	.Lmul_by_1

.align	32
.Lmul_by_1:
	movdqu	($aptr),%xmm1
	movdqu	16($aptr),%xmm2
	movdqu	32($aptr),%xmm3
	movdqa	%xmm0,(%rax,$num)
	movdqu	48($aptr),%xmm4
	movdqa	%xmm0,16(%rax,$num)
	.byte	0x48,0x8d,0xb6,0x40,0x00,0x00,0x00	# lea	64($aptr),$aptr
	movdqa	%xmm1,(%rax)
	movdqa	%xmm0,32(%rax,$num)
	movdqa	%xmm2,16(%rax)
	movdqa	%xmm0,48(%rax,$num)
	movdqa	%xmm3,32(%rax)
	movdqa	%xmm4,48(%rax)
	lea	64(%rax),%rax
	sub	\$64,%r11
	jnz	.Lmul_by_1

	movq	$rptr,%xmm1
	movq	$nptr,%xmm2
	.byte	0x67
	mov	$nptr,%rbp
	movq	%r10, %xmm3		# -num
___
$code.=<<___ if ($addx);
	leaq	GFp_ia32cap_P(%rip),%r11
	mov	8(%r11),%r11d
	and	\$0x80108,%r11d
	cmp	\$0x80108,%r11d		# check for AD*X+BMI2+BMI1
	jne	.Lfrom_mont_nox

	lea	(%rax,$num),$rptr
	call	__bn_sqrx8x_reduction
	call	__bn_postx4x_internal

	pxor	%xmm0,%xmm0
	lea	48(%rsp),%rax
	jmp	.Lfrom_mont_zero

.align	32
.Lfrom_mont_nox:
___
$code.=<<___;
	call	__bn_sqr8x_reduction
	call	__bn_post4x_internal

	pxor	%xmm0,%xmm0
	lea	48(%rsp),%rax
	jmp	.Lfrom_mont_zero

.align	32
.Lfrom_mont_zero:
	mov	40(%rsp),%rsi		# restore %rsp
.cfi_def_cfa	%rsi,8
	movdqa	%xmm0,16*0(%rax)
	movdqa	%xmm0,16*1(%rax)
	movdqa	%xmm0,16*2(%rax)
	movdqa	%xmm0,16*3(%rax)
	lea	16*4(%rax),%rax
	sub	\$32,$num
	jnz	.Lfrom_mont_zero

	mov	\$1,%rax
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lfrom_epilogue:
	ret
.cfi_endproc
.size	bn_from_mont8x,.-bn_from_mont8x
___
}
}}}

if ($addx) {{{
my $bp="%rdx";	# restore original value

$code.=<<___;
.type	bn_mulx4x_mont_gather5,\@function,6
.align	32
bn_mulx4x_mont_gather5:
.cfi_startproc
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lmulx4x_enter:
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Lmulx4x_prologue:

	shl	\$3,${num}d		# convert $num to bytes
	lea	($num,$num,2),%r10	# 3*$num in bytes
	neg	$num			# -$num
	mov	($n0),$n0		# *n0

	##############################################################
	# Ensure that stack frame doesn't alias with $rptr+3*$num
	# modulo 4096, which covers ret[num], am[num] and n[num]
	# (see bn_exp.c). This is done to allow memory disambiguation
	# logic do its magic. [Extra [num] is allocated in order
	# to align with GFp_bn_power5's frame, which is cleansed after
	# completing exponentiation. Extra 256 bytes is for power mask
	# calculated from 7th argument, the index.]
	#
	lea	-320(%rsp,$num,2),%r11
	mov	%rsp,%rbp
	sub	$rp,%r11
	and	\$4095,%r11
	cmp	%r11,%r10
	jb	.Lmulx4xsp_alt
	sub	%r11,%rbp		# align with $aptr
	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*$num*8+256)
	jmp	.Lmulx4xsp_done

.Lmulx4xsp_alt:
	lea	4096-320(,$num,2),%r10
	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*$num*8+256)
	sub	%r10,%r11
	mov	\$0,%r10
	cmovc	%r10,%r11
	sub	%r11,%rbp
.Lmulx4xsp_done:
	and	\$-64,%rbp		# ensure alignment
	mov	%rsp,%r11
	sub	%rbp,%r11
	and	\$-4096,%r11
	lea	(%rbp,%r11),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lmulx4x_page_walk
	jmp	.Lmulx4x_page_walk_done

.Lmulx4x_page_walk:
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lmulx4x_page_walk
.Lmulx4x_page_walk_done:

	##############################################################
	# Stack layout
	# +0	-num
	# +8	off-loaded &b[i]
	# +16	end of b[num]
	# +24	inner counter
	# +32	saved n0
	# +40	saved %rsp
	# +48
	# +56	saved rp
	# +64	tmp[num+1]
	#
	mov	$n0, 32(%rsp)		# save *n0
	mov	%rax,40(%rsp)		# save original %rsp
.cfi_cfa_expression	%rsp+40,deref,+8
.Lmulx4x_body:
	call	mulx4x_internal

	mov	40(%rsp),%rsi		# restore %rsp
.cfi_def_cfa	%rsi,8
	mov	\$1,%rax

	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmulx4x_epilogue:
	ret
.cfi_endproc
.size	bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5

.type	mulx4x_internal,\@abi-omnipotent
.align	32
mulx4x_internal:
.cfi_startproc
	mov	$num,8(%rsp)		# save -$num (it was in bytes)
	mov	$num,%r10
	neg	$num			# restore $num
	shl	\$5,$num
	neg	%r10			# restore $num
	lea	128($bp,$num),%r13	# end of powers table (+size optimization)
	shr	\$5+5,$num
	movd	`($win64?56:8)`(%rax),%xmm5	# load 7th argument
	sub	\$1,$num
	lea	.Linc(%rip),%rax
	mov	%r13,16+8(%rsp)		# end of b[num]
	mov	$num,24+8(%rsp)		# inner counter
	mov	$rp, 56+8(%rsp)		# save $rp
___
my ($aptr, $bptr, $nptr, $tptr, $mi,  $bi,  $zero, $num)=
   ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax");
my $rptr=$bptr;
my $STRIDE=2**5*8;		# 5 is "window size"
my $N=$STRIDE/4;		# should match cache line size
$code.=<<___;
	movdqa	0(%rax),%xmm0		# 00000001000000010000000000000000
	movdqa	16(%rax),%xmm1		# 00000002000000020000000200000002
	lea	88-112(%rsp,%r10),%r10	# place the mask after tp[num+1] (+ICache optimization)
	lea	128($bp),$bptr		# size optimization

	pshufd	\$0,%xmm5,%xmm5		# broadcast index
	movdqa	%xmm1,%xmm4
	.byte	0x67
	movdqa	%xmm1,%xmm2
___
########################################################################
# calculate mask by comparing 0..31 to index and save result to stack
#
$code.=<<___;
	.byte	0x67
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
	movdqa	%xmm4,%xmm3
___
for($i=0;$i<$STRIDE/16-4;$i+=4) {