GFp_p256_scalar_mul_mont (tail) and the remaining Montgomery routines

in ring/crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl [277:2129]


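	# The reduction steps in this routine exploit the shape of the
	# group order n (.Lord): n[2] = 2**64-1, so adding m*n[2] is
	# (m<<64) - m, done with sub/sbb pairs, and n[3] = (2**32-1)<<32,
	# so adding m*n[3] is (m<<64) - (m<<32), done with the shl/shr
	# pairs that split m<<32 into two 64-bit halves.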
	sub	$acc0, $acc2
	sbb	\$0, $acc0		# can't borrow

	mulq	8*1(%r14)
	add	$t0, $acc1
	adc	\$0, %rdx
	add	%rax, $acc1
	mov	$t1, %rax
	adc	%rdx, $acc2
	mov	$t1, %rdx
	adc	\$0, $acc0		# can't overflow

	shl	\$32, %rax
	shr	\$32, %rdx
	sub	%rax, $acc3
	 mov	8*1($b_ptr), %rax
	sbb	%rdx, $t1		# can't borrow

	add	$acc0, $acc3
	adc	$t1, $acc4
	adc	\$0, $acc5

	################################# * b[1]
	mov	%rax, $t0
	mulq	8*0($a_ptr)
	add	%rax, $acc1
	mov	$t0, %rax
	adc	\$0, %rdx
	mov	%rdx, $t1

	mulq	8*1($a_ptr)
	add	$t1, $acc2
	adc	\$0, %rdx
	add	%rax, $acc2
	mov	$t0, %rax
	adc	\$0, %rdx
	mov	%rdx, $t1

	mulq	8*2($a_ptr)
	add	$t1, $acc3
	adc	\$0, %rdx
	add	%rax, $acc3
	mov	$t0, %rax
	adc	\$0, %rdx

	 mov	$acc1, $t0
	 imulq	%r15, $acc1

	mov	%rdx, $t1
	mulq	8*3($a_ptr)
	add	$t1, $acc4
	adc	\$0, %rdx
	xor	$acc0, $acc0
	add	%rax, $acc4
	 mov	$acc1, %rax
	adc	%rdx, $acc5
	adc	\$0, $acc0

	################################# Second reduction step
	mulq	8*0(%r14)
	mov	$acc1, $t1
	add	%rax, $t0		# guaranteed to be zero
	mov	$acc1, %rax
	adc	%rdx, $t0

	sub	$acc1, $acc3
	sbb	\$0, $acc1		# can't borrow

	mulq	8*1(%r14)
	add	$t0, $acc2
	adc	\$0, %rdx
	add	%rax, $acc2
	mov	$t1, %rax
	adc	%rdx, $acc3
	mov	$t1, %rdx
	adc	\$0, $acc1		# can't overflow

	shl	\$32, %rax
	shr	\$32, %rdx
	sub	%rax, $acc4
	 mov	8*2($b_ptr), %rax
	sbb	%rdx, $t1		# can't borrow

	add	$acc1, $acc4
	adc	$t1, $acc5
	adc	\$0, $acc0

	################################## * b[2]
	mov	%rax, $t0
	mulq	8*0($a_ptr)
	add	%rax, $acc2
	mov	$t0, %rax
	adc	\$0, %rdx
	mov	%rdx, $t1

	mulq	8*1($a_ptr)
	add	$t1, $acc3
	adc	\$0, %rdx
	add	%rax, $acc3
	mov	$t0, %rax
	adc	\$0, %rdx
	mov	%rdx, $t1

	mulq	8*2($a_ptr)
	add	$t1, $acc4
	adc	\$0, %rdx
	add	%rax, $acc4
	mov	$t0, %rax
	adc	\$0, %rdx

	 mov	$acc2, $t0
	 imulq	%r15, $acc2

	mov	%rdx, $t1
	mulq	8*3($a_ptr)
	add	$t1, $acc5
	adc	\$0, %rdx
	xor	$acc1, $acc1
	add	%rax, $acc5
	 mov	$acc2, %rax
	adc	%rdx, $acc0
	adc	\$0, $acc1

	################################# Third reduction step
	mulq	8*0(%r14)
	mov	$acc2, $t1
	add	%rax, $t0		# guaranteed to be zero
	mov	$acc2, %rax
	adc	%rdx, $t0

	sub	$acc2, $acc4
	sbb	\$0, $acc2		# can't borrow

	mulq	8*1(%r14)
	add	$t0, $acc3
	adc	\$0, %rdx
	add	%rax, $acc3
	mov	$t1, %rax
	adc	%rdx, $acc4
	mov	$t1, %rdx
	adc	\$0, $acc2		# can't overflow

	shl	\$32, %rax
	shr	\$32, %rdx
	sub	%rax, $acc5
	 mov	8*3($b_ptr), %rax
	sbb	%rdx, $t1		# can't borrow

	add	$acc2, $acc5
	adc	$t1, $acc0
	adc	\$0, $acc1

	################################# * b[3]
	mov	%rax, $t0
	mulq	8*0($a_ptr)
	add	%rax, $acc3
	mov	$t0, %rax
	adc	\$0, %rdx
	mov	%rdx, $t1

	mulq	8*1($a_ptr)
	add	$t1, $acc4
	adc	\$0, %rdx
	add	%rax, $acc4
	mov	$t0, %rax
	adc	\$0, %rdx
	mov	%rdx, $t1

	mulq	8*2($a_ptr)
	add	$t1, $acc5
	adc	\$0, %rdx
	add	%rax, $acc5
	mov	$t0, %rax
	adc	\$0, %rdx

	 mov	$acc3, $t0
	 imulq	%r15, $acc3

	mov	%rdx, $t1
	mulq	8*3($a_ptr)
	add	$t1, $acc0
	adc	\$0, %rdx
	xor	$acc2, $acc2
	add	%rax, $acc0
	 mov	$acc3, %rax
	adc	%rdx, $acc1
	adc	\$0, $acc2

	################################# Last reduction step
	mulq	8*0(%r14)
	mov	$acc3, $t1
	add	%rax, $t0		# guaranteed to be zero
	mov	$acc3, %rax
	adc	%rdx, $t0

	sub	$acc3, $acc5
	sbb	\$0, $acc3		# can't borrow

	mulq	8*1(%r14)
	add	$t0, $acc4
	adc	\$0, %rdx
	add	%rax, $acc4
	mov	$t1, %rax
	adc	%rdx, $acc5
	mov	$t1, %rdx
	adc	\$0, $acc3		# can't overflow

	shl	\$32, %rax
	shr	\$32, %rdx
	sub	%rax, $acc0
	sbb	%rdx, $t1		# can't borrow

	add	$acc3, $acc0
	adc	$t1, $acc1
	adc	\$0, $acc2

	################################# Subtract ord
	 mov	$acc4, $a_ptr
	sub	8*0(%r14), $acc4
	 mov	$acc5, $acc3
	sbb	8*1(%r14), $acc5
	 mov	$acc0, $t0
	sbb	8*2(%r14), $acc0
	 mov	$acc1, $t1
	sbb	8*3(%r14), $acc1
	sbb	\$0, $acc2

	cmovc	$a_ptr, $acc4
	cmovc	$acc3, $acc5
	cmovc	$t0, $acc0
	cmovc	$t1, $acc1

	mov	$acc4, 8*0($r_ptr)
	mov	$acc5, 8*1($r_ptr)
	mov	$acc0, 8*2($r_ptr)
	mov	$acc1, 8*3($r_ptr)

	mov	0(%rsp),%r15
.cfi_restore	%r15
	mov	8(%rsp),%r14
.cfi_restore	%r14
	mov	16(%rsp),%r13
.cfi_restore	%r13
	mov	24(%rsp),%r12
.cfi_restore	%r12
	mov	32(%rsp),%rbx
.cfi_restore	%rbx
	mov	40(%rsp),%rbp
.cfi_restore	%rbp
	lea	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lord_mul_epilogue:
	ret
.cfi_endproc
.size	GFp_p256_scalar_mul_mont,.-GFp_p256_scalar_mul_mont

################################################################################
# void GFp_p256_scalar_sqr_rep_mont(
#   uint64_t res[4],
#   uint64_t a[4],
#   uint64_t rep);
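#
# (squares |a| |rep| times in the Montgomery domain, i.e. computes
#  a^(2^rep) in Montgomery form modulo the P-256 group order)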

.globl	GFp_p256_scalar_sqr_rep_mont
.type	GFp_p256_scalar_sqr_rep_mont,\@function,3
.align	32
GFp_p256_scalar_sqr_rep_mont:
.cfi_startproc
___
$code.=<<___	if ($addx);
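	# take the mulx/adcx/adox path only when both BMI2 (bit 8) and
	# ADX (bit 19) of CPUID.(EAX=7):EBX are set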
	leaq	GFp_ia32cap_P(%rip), %rcx
	mov	8(%rcx), %rcx
	and	\$0x80100, %ecx
	cmp	\$0x80100, %ecx
	je	.Lecp_nistz256_ord_sqr_montx
___
$code.=<<___;
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Lord_sqr_body:

	mov	8*0($a_ptr), $acc0
	mov	8*1($a_ptr), %rax
	mov	8*2($a_ptr), $acc6
	mov	8*3($a_ptr), $acc7
	lea	.Lord(%rip), $a_ptr	# pointer to modulus
	mov	$b_org, $b_ptr
	jmp	.Loop_ord_sqr

.align	32
.Loop_ord_sqr:
	################################# a[1:] * a[0]
	mov	%rax, $t1		# put aside a[1]
	mul	$acc0			# a[1] * a[0]
	mov	%rax, $acc1
	movq	$t1, %xmm1		# offload a[1]
	mov	$acc6, %rax
	mov	%rdx, $acc2

	mul	$acc0			# a[2] * a[0]
	add	%rax, $acc2
	mov	$acc7, %rax
	movq	$acc6, %xmm2		# offload a[2]
	adc	\$0, %rdx
	mov	%rdx, $acc3

	mul	$acc0			# a[3] * a[0]
	add	%rax, $acc3
	mov	$acc7, %rax
	movq	$acc7, %xmm3		# offload a[3]
	adc	\$0, %rdx
	mov	%rdx, $acc4

	################################# a[3] * a[2]
	mul	$acc6			# a[3] * a[2]
	mov	%rax, $acc5
	mov	$acc6, %rax
	mov	%rdx, $acc6

	################################# a[2:] * a[1]
	mul	$t1			# a[2] * a[1]
	add	%rax, $acc3
	mov	$acc7, %rax
	adc	\$0, %rdx
	mov	%rdx, $acc7

	mul	$t1			# a[3] * a[1]
	add	%rax, $acc4
	adc	\$0, %rdx

	add	$acc7, $acc4
	adc	%rdx, $acc5
	adc	\$0, $acc6		# can't overflow

	################################# *2
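	# Each cross product a[i]*a[j] (i<j) was computed only once above,
	# so the whole chain is doubled to cover both (i,j) and (j,i)
	# before the diagonal a[i]*a[i] terms are added in.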
	xor	$acc7, $acc7
	mov	$acc0, %rax
	add	$acc1, $acc1
	adc	$acc2, $acc2
	adc	$acc3, $acc3
	adc	$acc4, $acc4
	adc	$acc5, $acc5
	adc	$acc6, $acc6
	adc	\$0, $acc7

	################################# Missing products
	mul	%rax			# a[0] * a[0]
	mov	%rax, $acc0
	movq	%xmm1, %rax
	mov	%rdx, $t1

	mul	%rax			# a[1] * a[1]
	add	$t1, $acc1
	adc	%rax, $acc2
	movq	%xmm2, %rax
	adc	\$0, %rdx
	mov	%rdx, $t1

	mul	%rax			# a[2] * a[2]
	add	$t1, $acc3
	adc	%rax, $acc4
	movq	%xmm3, %rax
	adc	\$0, %rdx
	mov	%rdx, $t1

	 mov	$acc0, $t0
	 imulq	8*4($a_ptr), $acc0	# *= .LordK

	mul	%rax			# a[3] * a[3]
	add	$t1, $acc5
	adc	%rax, $acc6
	 mov	8*0($a_ptr), %rax	# modulus[0]
	adc	%rdx, $acc7		# can't overflow

	################################# First reduction step
	mul	$acc0
	mov	$acc0, $t1
	add	%rax, $t0		# guaranteed to be zero
	mov	8*1($a_ptr), %rax	# modulus[1]
	adc	%rdx, $t0

	sub	$acc0, $acc2
	sbb	\$0, $t1		# can't borrow

	mul	$acc0
	add	$t0, $acc1
	adc	\$0, %rdx
	add	%rax, $acc1
	mov	$acc0, %rax
	adc	%rdx, $acc2
	mov	$acc0, %rdx
	adc	\$0, $t1		# can't overflow

	 mov	$acc1, $t0
	 imulq	8*4($a_ptr), $acc1	# *= .LordK

	shl	\$32, %rax
	shr	\$32, %rdx
	sub	%rax, $acc3
	 mov	8*0($a_ptr), %rax
	sbb	%rdx, $acc0		# can't borrow

	add	$t1, $acc3
	adc	\$0, $acc0		# can't overflow

	################################# Second reduction step
	mul	$acc1
	mov	$acc1, $t1
	add	%rax, $t0		# guaranteed to be zero
	mov	8*1($a_ptr), %rax
	adc	%rdx, $t0

	sub	$acc1, $acc3
	sbb	\$0, $t1		# can't borrow

	mul	$acc1
	add	$t0, $acc2
	adc	\$0, %rdx
	add	%rax, $acc2
	mov	$acc1, %rax
	adc	%rdx, $acc3
	mov	$acc1, %rdx
	adc	\$0, $t1		# can't overflow

	 mov	$acc2, $t0
	 imulq	8*4($a_ptr), $acc2	# *= .LordK

	shl	\$32, %rax
	shr	\$32, %rdx
	sub	%rax, $acc0
	 mov	8*0($a_ptr), %rax
	sbb	%rdx, $acc1		# can't borrow

	add	$t1, $acc0
	adc	\$0, $acc1		# can't overflow

	################################# Third reduction step
	mul	$acc2
	mov	$acc2, $t1
	add	%rax, $t0		# guaranteed to be zero
	mov	8*1($a_ptr), %rax
	adc	%rdx, $t0

	sub	$acc2, $acc0
	sbb	\$0, $t1		# can't borrow

	mul	$acc2
	add	$t0, $acc3
	adc	\$0, %rdx
	add	%rax, $acc3
	mov	$acc2, %rax
	adc	%rdx, $acc0
	mov	$acc2, %rdx
	adc	\$0, $t1		# can't overflow

	 mov	$acc3, $t0
	 imulq	8*4($a_ptr), $acc3	# *= .LordK

	shl	\$32, %rax
	shr	\$32, %rdx
	sub	%rax, $acc1
	 mov	8*0($a_ptr), %rax
	sbb	%rdx, $acc2		# can't borrow

	add	$t1, $acc1
	adc	\$0, $acc2		# can't overflow

	################################# Last reduction step
	mul	$acc3
	mov	$acc3, $t1
	add	%rax, $t0		# guaranteed to be zero
	mov	8*1($a_ptr), %rax
	adc	%rdx, $t0

	sub	$acc3, $acc1
	sbb	\$0, $t1		# can't borrow

	mul	$acc3
	add	$t0, $acc0
	adc	\$0, %rdx
	add	%rax, $acc0
	mov	$acc3, %rax
	adc	%rdx, $acc1
	mov	$acc3, %rdx
	adc	\$0, $t1		# can't overflow

	shl	\$32, %rax
	shr	\$32, %rdx
	sub	%rax, $acc2
	sbb	%rdx, $acc3		# can't borrow

	add	$t1, $acc2
	adc	\$0, $acc3		# can't overflow

	################################# Add bits [511:256] of the sqr result
	xor	%rdx, %rdx
	add	$acc4, $acc0
	adc	$acc5, $acc1
	 mov	$acc0, $acc4
	adc	$acc6, $acc2
	adc	$acc7, $acc3
	 mov	$acc1, %rax
	adc	\$0, %rdx

	################################# Compare to modulus
	sub	8*0($a_ptr), $acc0
	 mov	$acc2, $acc6
	sbb	8*1($a_ptr), $acc1
	sbb	8*2($a_ptr), $acc2
	 mov	$acc3, $acc7
	sbb	8*3($a_ptr), $acc3
	sbb	\$0, %rdx

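	# on borrow keep the pre-subtraction copies; either way the
	# selected limbs land in $acc0/%rax/$acc6/$acc7, where the top of
	# .Loop_ord_sqr expects a[0..3]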
	cmovc	$acc4, $acc0
	cmovnc	$acc1, %rax
	cmovnc	$acc2, $acc6
	cmovnc	$acc3, $acc7

	dec	$b_ptr
	jnz	.Loop_ord_sqr

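	# (the pxor instructions below scrub the xmm copies of the input)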
	mov	$acc0, 8*0($r_ptr)
	mov	%rax,  8*1($r_ptr)
	pxor	%xmm1, %xmm1
	mov	$acc6, 8*2($r_ptr)
	pxor	%xmm2, %xmm2
	mov	$acc7, 8*3($r_ptr)
	pxor	%xmm3, %xmm3

	mov	0(%rsp),%r15
.cfi_restore	%r15
	mov	8(%rsp),%r14
.cfi_restore	%r14
	mov	16(%rsp),%r13
.cfi_restore	%r13
	mov	24(%rsp),%r12
.cfi_restore	%r12
	mov	32(%rsp),%rbx
.cfi_restore	%rbx
	mov	40(%rsp),%rbp
.cfi_restore	%rbp
	lea	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lord_sqr_epilogue:
	ret
.cfi_endproc
.size	GFp_p256_scalar_sqr_rep_mont,.-GFp_p256_scalar_sqr_rep_mont
___

$code.=<<___	if ($addx);
################################################################################
.type	ecp_nistz256_ord_mul_montx,\@function,3
.align	32
ecp_nistz256_ord_mul_montx:
.cfi_startproc
.Lecp_nistz256_ord_mul_montx:
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Lord_mulx_body:

	mov	$b_org, $b_ptr
	mov	8*0($b_org), %rdx
	mov	8*0($a_ptr), $acc1
	mov	8*1($a_ptr), $acc2
	mov	8*2($a_ptr), $acc3
	mov	8*3($a_ptr), $acc4
	lea	-128($a_ptr), $a_ptr	# control u-op density
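	# (the -128 bias makes the 8*k+128 displacements below 4 bytes
	#  wide, deliberately lengthening the encodings)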
	lea	.Lord-128(%rip), %r14
	mov	.LordK(%rip), %r15

	################################# Multiply by b[0]
	mulx	$acc1, $acc0, $acc1
	mulx	$acc2, $t0, $acc2
	mulx	$acc3, $t1, $acc3
	add	$t0, $acc1
	mulx	$acc4, $t0, $acc4
	 mov	$acc0, %rdx
	 mulx	%r15, %rdx, %rax
	adc	$t1, $acc2
	adc	$t0, $acc3
	adc	\$0, $acc4

	################################# reduction
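	# adcx and adox update only CF and OF respectively, giving two
	# independent carry chains so this row can be folded in without
	# saving flags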
	xor	$acc5, $acc5		# $acc5=0, cf=0, of=0
	mulx	8*0+128(%r14), $t0, $t1
	adcx	$t0, $acc0		# guaranteed to be zero
	adox	$t1, $acc1

	mulx	8*1+128(%r14), $t0, $t1
	adcx	$t0, $acc1
	adox	$t1, $acc2

	mulx	8*2+128(%r14), $t0, $t1
	adcx	$t0, $acc2
	adox	$t1, $acc3

	mulx	8*3+128(%r14), $t0, $t1
	 mov	8*1($b_ptr), %rdx
	adcx	$t0, $acc3
	adox	$t1, $acc4
	adcx	$acc0, $acc4
	adox	$acc0, $acc5
	adc	\$0, $acc5		# cf=0, of=0

	################################# Multiply by b[1]
	mulx	8*0+128($a_ptr), $t0, $t1
	adcx	$t0, $acc1
	adox	$t1, $acc2

	mulx	8*1+128($a_ptr), $t0, $t1
	adcx	$t0, $acc2
	adox	$t1, $acc3

	mulx	8*2+128($a_ptr), $t0, $t1
	adcx	$t0, $acc3
	adox	$t1, $acc4

	mulx	8*3+128($a_ptr), $t0, $t1
	 mov	$acc1, %rdx
	 mulx	%r15, %rdx, %rax
	adcx	$t0, $acc4
	adox	$t1, $acc5

	adcx	$acc0, $acc5
	adox	$acc0, $acc0
	adc	\$0, $acc0		# cf=0, of=0

	################################# reduction
	mulx	8*0+128(%r14), $t0, $t1
	adcx	$t0, $acc1		# guaranteed to be zero
	adox	$t1, $acc2

	mulx	8*1+128(%r14), $t0, $t1
	adcx	$t0, $acc2
	adox	$t1, $acc3

	mulx	8*2+128(%r14), $t0, $t1
	adcx	$t0, $acc3
	adox	$t1, $acc4

	mulx	8*3+128(%r14), $t0, $t1
	 mov	8*2($b_ptr), %rdx
	adcx	$t0, $acc4
	adox	$t1, $acc5
	adcx	$acc1, $acc5
	adox	$acc1, $acc0
	adc	\$0, $acc0		# cf=0, of=0

	################################# Multiply by b[2]
	mulx	8*0+128($a_ptr), $t0, $t1
	adcx	$t0, $acc2
	adox	$t1, $acc3

	mulx	8*1+128($a_ptr), $t0, $t1
	adcx	$t0, $acc3
	adox	$t1, $acc4

	mulx	8*2+128($a_ptr), $t0, $t1
	adcx	$t0, $acc4
	adox	$t1, $acc5

	mulx	8*3+128($a_ptr), $t0, $t1
	 mov	$acc2, %rdx
	 mulx	%r15, %rdx, %rax
	adcx	$t0, $acc5
	adox	$t1, $acc0

	adcx	$acc1, $acc0
	adox	$acc1, $acc1
	adc	\$0, $acc1		# cf=0, of=0

	################################# reduction
	mulx	8*0+128(%r14), $t0, $t1
	adcx	$t0, $acc2		# guaranteed to be zero
	adox	$t1, $acc3

	mulx	8*1+128(%r14), $t0, $t1
	adcx	$t0, $acc3
	adox	$t1, $acc4

	mulx	8*2+128(%r14), $t0, $t1
	adcx	$t0, $acc4
	adox	$t1, $acc5

	mulx	8*3+128(%r14), $t0, $t1
	 mov	8*3($b_ptr), %rdx
	adcx	$t0, $acc5
	adox	$t1, $acc0
	adcx	$acc2, $acc0
	adox	$acc2, $acc1
	adc	\$0, $acc1		# cf=0, of=0

	################################# Multiply by b[3]
	mulx	8*0+128($a_ptr), $t0, $t1
	adcx	$t0, $acc3
	adox	$t1, $acc4

	mulx	8*1+128($a_ptr), $t0, $t1
	adcx	$t0, $acc4
	adox	$t1, $acc5

	mulx	8*2+128($a_ptr), $t0, $t1
	adcx	$t0, $acc5
	adox	$t1, $acc0

	mulx	8*3+128($a_ptr), $t0, $t1
	 mov	$acc3, %rdx
	 mulx	%r15, %rdx, %rax
	adcx	$t0, $acc0
	adox	$t1, $acc1

	adcx	$acc2, $acc1
	adox	$acc2, $acc2
	adc	\$0, $acc2		# cf=0, of=0

	################################# reduction
	mulx	8*0+128(%r14), $t0, $t1
	adcx	$t0, $acc3		# guaranteed to be zero
	adox	$t1, $acc4

	mulx	8*1+128(%r14), $t0, $t1
	adcx	$t0, $acc4
	adox	$t1, $acc5

	mulx	8*2+128(%r14), $t0, $t1
	adcx	$t0, $acc5
	adox	$t1, $acc0

	mulx	8*3+128(%r14), $t0, $t1
	lea	128(%r14),%r14
	 mov	$acc4, $t2
	adcx	$t0, $acc0
	adox	$t1, $acc1
	 mov	$acc5, $t3
	adcx	$acc3, $acc1
	adox	$acc3, $acc2
	adc	\$0, $acc2

	#################################
	# Branch-less conditional subtraction of the group order (%r14 -> .Lord)
	 mov	$acc0, $t0
	sub	8*0(%r14), $acc4
	sbb	8*1(%r14), $acc5
	sbb	8*2(%r14), $acc0
	 mov	$acc1, $t1
	sbb	8*3(%r14), $acc1
	sbb	\$0, $acc2

	cmovc	$t2, $acc4
	cmovc	$t3, $acc5
	cmovc	$t0, $acc0
	cmovc	$t1, $acc1

	mov	$acc4, 8*0($r_ptr)
	mov	$acc5, 8*1($r_ptr)
	mov	$acc0, 8*2($r_ptr)
	mov	$acc1, 8*3($r_ptr)

	mov	0(%rsp),%r15
.cfi_restore	%r15
	mov	8(%rsp),%r14
.cfi_restore	%r14
	mov	16(%rsp),%r13
.cfi_restore	%r13
	mov	24(%rsp),%r12
.cfi_restore	%r12
	mov	32(%rsp),%rbx
.cfi_restore	%rbx
	mov	40(%rsp),%rbp
.cfi_restore	%rbp
	lea	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lord_mulx_epilogue:
	ret
.cfi_endproc
.size	ecp_nistz256_ord_mul_montx,.-ecp_nistz256_ord_mul_montx

.type	ecp_nistz256_ord_sqr_montx,\@function,3
.align	32
ecp_nistz256_ord_sqr_montx:
.cfi_startproc
.Lecp_nistz256_ord_sqr_montx:
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Lord_sqrx_body:

	mov	$b_org, $b_ptr
	mov	8*0($a_ptr), %rdx
	mov	8*1($a_ptr), $acc6
	mov	8*2($a_ptr), $acc7
	mov	8*3($a_ptr), $acc0
	lea	.Lord(%rip), $a_ptr
	jmp	.Loop_ord_sqrx

.align	32
.Loop_ord_sqrx:
	mulx	$acc6, $acc1, $acc2	# a[0]*a[1]
	mulx	$acc7, $t0, $acc3	# a[0]*a[2]
	 mov	%rdx, %rax		# offload a[0]
	 movq	$acc6, %xmm1		# offload a[1]
	mulx	$acc0, $t1, $acc4	# a[0]*a[3]
	 mov	$acc6, %rdx
	add	$t0, $acc2
	 movq	$acc7, %xmm2		# offload a[2]
	adc	$t1, $acc3
	adc	\$0, $acc4
	xor	$acc5, $acc5		# $acc5=0,cf=0,of=0
	#################################
	mulx	$acc7, $t0, $t1		# a[1]*a[2]
	adcx	$t0, $acc3
	adox	$t1, $acc4

	mulx	$acc0, $t0, $t1		# a[1]*a[3]
	 mov	$acc7, %rdx
	adcx	$t0, $acc4
	adox	$t1, $acc5
	adc	\$0, $acc5
	#################################
	mulx	$acc0, $t0, $acc6	# a[2]*a[3]
	mov	%rax, %rdx
	 movq	$acc0, %xmm3		# offload a[3]
	xor	$acc7, $acc7		# $acc7=0,cf=0,of=0
	 adcx	$acc1, $acc1		# acc1:6<<1
	adox	$t0, $acc5
	 adcx	$acc2, $acc2
	adox	$acc7, $acc6		# of=0

	################################# a[i]*a[i]
	mulx	%rdx, $acc0, $t1
	movq	%xmm1, %rdx
	 adcx	$acc3, $acc3
	adox	$t1, $acc1
	 adcx	$acc4, $acc4
	mulx	%rdx, $t0, $t4
	movq	%xmm2, %rdx
	 adcx	$acc5, $acc5
	adox	$t0, $acc2
	 adcx	$acc6, $acc6
	mulx	%rdx, $t0, $t1
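	# the 0x67 below is a harmless address-size prefix on the next
	# instruction, apparently used to pad the encoding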
	.byte	0x67
	movq	%xmm3, %rdx
	adox	$t4, $acc3
	 adcx	$acc7, $acc7
	adox	$t0, $acc4
	adox	$t1, $acc5
	mulx	%rdx, $t0, $t4
	adox	$t0, $acc6
	adox	$t4, $acc7

	################################# reduction
	mov	$acc0, %rdx
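	# %rdx = acc0 * .LordK mod 2**64, the next Montgomery multiplier
	# (8*4($a_ptr) holds .LordK)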
	mulx	8*4($a_ptr), %rdx, $t0

	xor	%rax, %rax		# cf=0, of=0
	mulx	8*0($a_ptr), $t0, $t1
	adcx	$t0, $acc0		# guaranteed to be zero
	adox	$t1, $acc1
	mulx	8*1($a_ptr), $t0, $t1
	adcx	$t0, $acc1
	adox	$t1, $acc2
	mulx	8*2($a_ptr), $t0, $t1
	adcx	$t0, $acc2
	adox	$t1, $acc3
	mulx	8*3($a_ptr), $t0, $t1
	adcx	$t0, $acc3
	adox	$t1, $acc0		# of=0
	adcx	%rax, $acc0		# cf=0

	#################################
	mov	$acc1, %rdx
	mulx	8*4($a_ptr), %rdx, $t0

	mulx	8*0($a_ptr), $t0, $t1
	adox	$t0, $acc1		# guaranteed to be zero
	adcx	$t1, $acc2
	mulx	8*1($a_ptr), $t0, $t1
	adox	$t0, $acc2
	adcx	$t1, $acc3
	mulx	8*2($a_ptr), $t0, $t1
	adox	$t0, $acc3
	adcx	$t1, $acc0
	mulx	8*3($a_ptr), $t0, $t1
	adox	$t0, $acc0
	adcx	$t1, $acc1		# cf=0
	adox	%rax, $acc1		# of=0

	#################################
	mov	$acc2, %rdx
	mulx	8*4($a_ptr), %rdx, $t0

	mulx	8*0($a_ptr), $t0, $t1
	adcx	$t0, $acc2		# guaranteed to be zero
	adox	$t1, $acc3
	mulx	8*1($a_ptr), $t0, $t1
	adcx	$t0, $acc3
	adox	$t1, $acc0
	mulx	8*2($a_ptr), $t0, $t1
	adcx	$t0, $acc0
	adox	$t1, $acc1
	mulx	8*3($a_ptr), $t0, $t1
	adcx	$t0, $acc1
	adox	$t1, $acc2		# of=0
	adcx	%rax, $acc2		# cf=0

	#################################
	mov	$acc3, %rdx
	mulx	8*4($a_ptr), %rdx, $t0

	mulx	8*0($a_ptr), $t0, $t1
	adox	$t0, $acc3		# guaranteed to be zero
	adcx	$t1, $acc0
	mulx	8*1($a_ptr), $t0, $t1
	adox	$t0, $acc0
	adcx	$t1, $acc1
	mulx	8*2($a_ptr), $t0, $t1
	adox	$t0, $acc1
	adcx	$t1, $acc2
	mulx	8*3($a_ptr), $t0, $t1
	adox	$t0, $acc2
	adcx	$t1, $acc3
	adox	%rax, $acc3

	################################# accumulate upper half
	add	$acc0, $acc4
	adc	$acc5, $acc1
	 mov	$acc4, %rdx
	adc	$acc6, $acc2
	adc	$acc7, $acc3
	 mov	$acc1, $acc6
	adc	\$0, %rax

	################################# compare to modulus
	sub	8*0($a_ptr), $acc4
	 mov	$acc2, $acc7
	sbb	8*1($a_ptr), $acc1
	sbb	8*2($a_ptr), $acc2
	 mov	$acc3, $acc0
	sbb	8*3($a_ptr), $acc3
	sbb	\$0, %rax

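	# no borrow selects the reduced limbs; they land in
	# %rdx/$acc6/$acc7/$acc0, where .Loop_ord_sqrx expects a[0..3]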
	cmovnc	$acc4, %rdx
	cmovnc	$acc1, $acc6
	cmovnc	$acc2, $acc7
	cmovnc	$acc3, $acc0

	dec	$b_ptr
	jnz	.Loop_ord_sqrx

	mov	%rdx, 8*0($r_ptr)
	mov	$acc6, 8*1($r_ptr)
	pxor	%xmm1, %xmm1
	mov	$acc7, 8*2($r_ptr)
	pxor	%xmm2, %xmm2
	mov	$acc0, 8*3($r_ptr)
	pxor	%xmm3, %xmm3

	mov	0(%rsp),%r15
.cfi_restore	%r15
	mov	8(%rsp),%r14
.cfi_restore	%r14
	mov	16(%rsp),%r13
.cfi_restore	%r13
	mov	24(%rsp),%r12
.cfi_restore	%r12
	mov	32(%rsp),%rbx
.cfi_restore	%rbx
	mov	40(%rsp),%rbp
.cfi_restore	%rbp
	lea	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lord_sqrx_epilogue:
	ret
.cfi_endproc
.size	ecp_nistz256_ord_sqr_montx,.-ecp_nistz256_ord_sqr_montx
___

$code.=<<___;
################################################################################
# void GFp_nistz256_mul_mont(
#   uint64_t res[4],
#   uint64_t a[4],
#   uint64_t b[4]);
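#
# (Montgomery product: res = a * b * 2^-256 mod p256)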

.globl	GFp_nistz256_mul_mont
.type	GFp_nistz256_mul_mont,\@function,3
.align	32
GFp_nistz256_mul_mont:
.cfi_startproc
___
$code.=<<___	if ($addx);
	leaq	GFp_ia32cap_P(%rip), %rcx
	mov	8(%rcx), %rcx
	and	\$0x80100, %ecx
___
$code.=<<___;
.Lmul_mont:
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Lmul_body:
___
$code.=<<___	if ($addx);
	cmp	\$0x80100, %ecx
	je	.Lmul_montx
___
$code.=<<___;
	mov	$b_org, $b_ptr
	mov	8*0($b_org), %rax
	mov	8*0($a_ptr), $acc1
	mov	8*1($a_ptr), $acc2
	mov	8*2($a_ptr), $acc3
	mov	8*3($a_ptr), $acc4

	call	__ecp_nistz256_mul_montq
___
$code.=<<___	if ($addx);
	jmp	.Lmul_mont_done

.align	32
.Lmul_montx:
	mov	$b_org, $b_ptr
	mov	8*0($b_org), %rdx
	mov	8*0($a_ptr), $acc1
	mov	8*1($a_ptr), $acc2
	mov	8*2($a_ptr), $acc3
	mov	8*3($a_ptr), $acc4
	lea	-128($a_ptr), $a_ptr	# control u-op density

	call	__ecp_nistz256_mul_montx
___
$code.=<<___;
.Lmul_mont_done:
	mov	0(%rsp),%r15
.cfi_restore	%r15
	mov	8(%rsp),%r14
.cfi_restore	%r14
	mov	16(%rsp),%r13
.cfi_restore	%r13
	mov	24(%rsp),%r12
.cfi_restore	%r12
	mov	32(%rsp),%rbx
.cfi_restore	%rbx
	mov	40(%rsp),%rbp
.cfi_restore	%rbp
	lea	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lmul_epilogue:
	ret
.cfi_endproc
.size	GFp_nistz256_mul_mont,.-GFp_nistz256_mul_mont

.type	__ecp_nistz256_mul_montq,\@abi-omnipotent
.align	32
__ecp_nistz256_mul_montq:
.cfi_startproc
	########################################################################
	# Multiply a by b[0]
	mov	%rax, $t1
	mulq	$acc1
	mov	.Lpoly+8*1(%rip),$poly1
	mov	%rax, $acc0
	mov	$t1, %rax
	mov	%rdx, $acc1

	mulq	$acc2
	mov	.Lpoly+8*3(%rip),$poly3
	add	%rax, $acc1
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $acc2

	mulq	$acc3
	add	%rax, $acc2
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $acc3

	mulq	$acc4
	add	%rax, $acc3
	 mov	$acc0, %rax
	adc	\$0, %rdx
	xor	$acc5, $acc5
	mov	%rdx, $acc4

	########################################################################
	# First reduction step
	# Basically now we want to multiply acc[0] by p256,
	# and add the result to the acc.
	# Due to the special form of p256 we do some optimizations
	#
	# acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0]
	# then we add acc[0] and get acc[0] x 2^96
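	#
	# (.Lpoly limbs: p[0] = 2**64-1, p[1] = 2**32-1, p[2] = 0,
	#  p[3] = 2**64 - 2**32 + 1; because p[0] = -1 mod 2**64 the
	#  Montgomery multiplier is acc[0] itself)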

	mov	$acc0, $t1
	shl	\$32, $acc0
	mulq	$poly3
	shr	\$32, $t1
	add	$acc0, $acc1		# +=acc[0]<<96
	adc	$t1, $acc2
	adc	%rax, $acc3
	 mov	8*1($b_ptr), %rax
	adc	%rdx, $acc4
	adc	\$0, $acc5
	xor	$acc0, $acc0

	########################################################################
	# Multiply by b[1]
	mov	%rax, $t1
	mulq	8*0($a_ptr)
	add	%rax, $acc1
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	8*1($a_ptr)
	add	$t0, $acc2
	adc	\$0, %rdx
	add	%rax, $acc2
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	8*2($a_ptr)
	add	$t0, $acc3
	adc	\$0, %rdx
	add	%rax, $acc3
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	8*3($a_ptr)
	add	$t0, $acc4
	adc	\$0, %rdx
	add	%rax, $acc4
	 mov	$acc1, %rax
	adc	%rdx, $acc5
	adc	\$0, $acc0

	########################################################################
	# Second reduction step
	mov	$acc1, $t1
	shl	\$32, $acc1
	mulq	$poly3
	shr	\$32, $t1
	add	$acc1, $acc2
	adc	$t1, $acc3
	adc	%rax, $acc4
	 mov	8*2($b_ptr), %rax
	adc	%rdx, $acc5
	adc	\$0, $acc0
	xor	$acc1, $acc1

	########################################################################
	# Multiply by b[2]
	mov	%rax, $t1
	mulq	8*0($a_ptr)
	add	%rax, $acc2
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	8*1($a_ptr)
	add	$t0, $acc3
	adc	\$0, %rdx
	add	%rax, $acc3
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	8*2($a_ptr)
	add	$t0, $acc4
	adc	\$0, %rdx
	add	%rax, $acc4
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	8*3($a_ptr)
	add	$t0, $acc5
	adc	\$0, %rdx
	add	%rax, $acc5
	 mov	$acc2, %rax
	adc	%rdx, $acc0
	adc	\$0, $acc1

	########################################################################
	# Third reduction step
	mov	$acc2, $t1
	shl	\$32, $acc2
	mulq	$poly3
	shr	\$32, $t1
	add	$acc2, $acc3
	adc	$t1, $acc4
	adc	%rax, $acc5
	 mov	8*3($b_ptr), %rax
	adc	%rdx, $acc0
	adc	\$0, $acc1
	xor	$acc2, $acc2

	########################################################################
	# Multiply by b[3]
	mov	%rax, $t1
	mulq	8*0($a_ptr)
	add	%rax, $acc3
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	8*1($a_ptr)
	add	$t0, $acc4
	adc	\$0, %rdx
	add	%rax, $acc4
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	8*2($a_ptr)
	add	$t0, $acc5
	adc	\$0, %rdx
	add	%rax, $acc5
	mov	$t1, %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	8*3($a_ptr)
	add	$t0, $acc0
	adc	\$0, %rdx
	add	%rax, $acc0
	 mov	$acc3, %rax
	adc	%rdx, $acc1
	adc	\$0, $acc2

	########################################################################
	# Final reduction step
	mov	$acc3, $t1
	shl	\$32, $acc3
	mulq	$poly3
	shr	\$32, $t1
	add	$acc3, $acc4
	adc	$t1, $acc5
	 mov	$acc4, $t0
	adc	%rax, $acc0
	adc	%rdx, $acc1
	 mov	$acc5, $t1
	adc	\$0, $acc2

	########################################################################
	# Branch-less conditional subtraction of P
	sub	\$-1, $acc4		# .Lpoly[0]
	 mov	$acc0, $t2
	sbb	$poly1, $acc5		# .Lpoly[1]
	sbb	\$0, $acc0		# .Lpoly[2]
	 mov	$acc1, $t3
	sbb	$poly3, $acc1		# .Lpoly[3]
	sbb	\$0, $acc2

	cmovc	$t0, $acc4
	cmovc	$t1, $acc5
	mov	$acc4, 8*0($r_ptr)
	cmovc	$t2, $acc0
	mov	$acc5, 8*1($r_ptr)
	cmovc	$t3, $acc1
	mov	$acc0, 8*2($r_ptr)
	mov	$acc1, 8*3($r_ptr)

	ret
.cfi_endproc
.size	__ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq

################################################################################
# void GFp_nistz256_sqr_mont(
#   uint64_t res[4],
#   uint64_t a[4]);
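#
# (Montgomery square: res = a^2 * 2^-256 mod p256)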

# The squaring is optimized following S. Gueron and V. Krasnov,
# "Speeding up Big-Number Squaring"
.globl	GFp_nistz256_sqr_mont
.type	GFp_nistz256_sqr_mont,\@function,2
.align	32
GFp_nistz256_sqr_mont:
.cfi_startproc
___
$code.=<<___	if ($addx);
	leaq	GFp_ia32cap_P(%rip), %rcx
	mov	8(%rcx), %rcx
	and	\$0x80100, %ecx
___
$code.=<<___;
	push	%rbp
.cfi_push	%rbp
	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Lsqr_body:
___
$code.=<<___	if ($addx);
	cmp	\$0x80100, %ecx
	je	.Lsqr_montx
___
$code.=<<___;
	mov	8*0($a_ptr), %rax
	mov	8*1($a_ptr), $acc6
	mov	8*2($a_ptr), $acc7
	mov	8*3($a_ptr), $acc0

	call	__ecp_nistz256_sqr_montq
___
$code.=<<___	if ($addx);
	jmp	.Lsqr_mont_done

.align	32
.Lsqr_montx:
	mov	8*0($a_ptr), %rdx
	mov	8*1($a_ptr), $acc6
	mov	8*2($a_ptr), $acc7
	mov	8*3($a_ptr), $acc0
	lea	-128($a_ptr), $a_ptr	# control u-op density

	call	__ecp_nistz256_sqr_montx
___
$code.=<<___;
.Lsqr_mont_done:
	mov	0(%rsp),%r15
.cfi_restore	%r15
	mov	8(%rsp),%r14
.cfi_restore	%r14
	mov	16(%rsp),%r13
.cfi_restore	%r13
	mov	24(%rsp),%r12
.cfi_restore	%r12
	mov	32(%rsp),%rbx
.cfi_restore	%rbx
	mov	40(%rsp),%rbp
.cfi_restore	%rbp
	lea	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lsqr_epilogue:
	ret
.cfi_endproc
.size	GFp_nistz256_sqr_mont,.-GFp_nistz256_sqr_mont

.type	__ecp_nistz256_sqr_montq,\@abi-omnipotent
.align	32
__ecp_nistz256_sqr_montq:
.cfi_startproc
	mov	%rax, $acc5
	mulq	$acc6			# a[1]*a[0]
	mov	%rax, $acc1
	mov	$acc7, %rax
	mov	%rdx, $acc2

	mulq	$acc5			# a[0]*a[2]
	add	%rax, $acc2
	mov	$acc0, %rax
	adc	\$0, %rdx
	mov	%rdx, $acc3

	mulq	$acc5			# a[0]*a[3]
	add	%rax, $acc3
	 mov	$acc7, %rax
	adc	\$0, %rdx
	mov	%rdx, $acc4

	#################################
	mulq	$acc6			# a[1]*a[2]
	add	%rax, $acc3
	mov	$acc0, %rax
	adc	\$0, %rdx
	mov	%rdx, $t1

	mulq	$acc6			# a[1]*a[3]
	add	%rax, $acc4
	 mov	$acc0, %rax
	adc	\$0, %rdx
	add	$t1, $acc4
	mov	%rdx, $acc5
	adc	\$0, $acc5

	#################################
	mulq	$acc7			# a[2]*a[3]
	xor	$acc7, $acc7
	add	%rax, $acc5
	 mov	8*0($a_ptr), %rax
	mov	%rdx, $acc6
	adc	\$0, $acc6

	add	$acc1, $acc1		# acc1:6<<1
	adc	$acc2, $acc2
	adc	$acc3, $acc3
	adc	$acc4, $acc4
	adc	$acc5, $acc5
	adc	$acc6, $acc6
	adc	\$0, $acc7

	mulq	%rax
	mov	%rax, $acc0
	mov	8*1($a_ptr), %rax
	mov	%rdx, $t0

	mulq	%rax
	add	$t0, $acc1
	adc	%rax, $acc2
	mov	8*2($a_ptr), %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	%rax
	add	$t0, $acc3
	adc	%rax, $acc4
	mov	8*3($a_ptr), %rax
	adc	\$0, %rdx
	mov	%rdx, $t0

	mulq	%rax
	add	$t0, $acc5
	adc	%rax, $acc6
	 mov	$acc0, %rax
	adc	%rdx, $acc7

	mov	.Lpoly+8*1(%rip), $a_ptr
	mov	.Lpoly+8*3(%rip), $t1

	##########################################
	# Now the reduction
	# First iteration
	mov	$acc0, $t0
	shl	\$32, $acc0
	mulq	$t1
	shr	\$32, $t0
	add	$acc0, $acc1		# +=acc[0]<<96
	adc	$t0, $acc2
	adc	%rax, $acc3
	 mov	$acc1, %rax
	adc	\$0, %rdx

	##########################################
	# Second iteration
	mov	$acc1, $t0
	shl	\$32, $acc1
	mov	%rdx, $acc0
	mulq	$t1
	shr	\$32, $t0
	add	$acc1, $acc2
	adc	$t0, $acc3
	adc	%rax, $acc0
	 mov	$acc2, %rax
	adc	\$0, %rdx

	##########################################
	# Third iteration
	mov	$acc2, $t0
	shl	\$32, $acc2
	mov	%rdx, $acc1
	mulq	$t1
	shr	\$32, $t0
	add	$acc2, $acc3
	adc	$t0, $acc0
	adc	%rax, $acc1
	 mov	$acc3, %rax
	adc	\$0, %rdx

	###########################################
	# Last iteration
	mov	$acc3, $t0
	shl	\$32, $acc3
	mov	%rdx, $acc2
	mulq	$t1
	shr	\$32, $t0
	add	$acc3, $acc0
	adc	$t0, $acc1
	adc	%rax, $acc2
	adc	\$0, %rdx
	xor	$acc3, $acc3

	############################################
	# Add the rest of the acc
	add	$acc0, $acc4
	adc	$acc1, $acc5
	 mov	$acc4, $acc0
	adc	$acc2, $acc6
	adc	%rdx, $acc7
	 mov	$acc5, $acc1
	adc	\$0, $acc3

	sub	\$-1, $acc4		# .Lpoly[0]
	 mov	$acc6, $acc2
	sbb	$a_ptr, $acc5		# .Lpoly[1]
	sbb	\$0, $acc6		# .Lpoly[2]
	 mov	$acc7, $t0
	sbb	$t1, $acc7		# .Lpoly[3]
	sbb	\$0, $acc3

	cmovc	$acc0, $acc4
	cmovc	$acc1, $acc5
	mov	$acc4, 8*0($r_ptr)
	cmovc	$acc2, $acc6
	mov	$acc5, 8*1($r_ptr)
	cmovc	$t0, $acc7
	mov	$acc6, 8*2($r_ptr)
	mov	$acc7, 8*3($r_ptr)

	ret
.cfi_endproc
.size	__ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq
___

if ($addx) {
$code.=<<___;
.type	__ecp_nistz256_mul_montx,\@abi-omnipotent
.align	32
__ecp_nistz256_mul_montx:
.cfi_startproc
	########################################################################
	# Multiply by b[0]
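	# ($poly1 serves as the constant 32 for shlx/shrx below; it is
	#  reloaded with .Lpoly[1] just before the final subtraction)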
	mulx	$acc1, $acc0, $acc1
	mulx	$acc2, $t0, $acc2
	mov	\$32, $poly1
	xor	$acc5, $acc5		# cf=0
	mulx	$acc3, $t1, $acc3
	mov	.Lpoly+8*3(%rip), $poly3
	adc	$t0, $acc1
	mulx	$acc4, $t0, $acc4
	 mov	$acc0, %rdx
	adc	$t1, $acc2
	 shlx	$poly1,$acc0,$t1
	adc	$t0, $acc3
	 shrx	$poly1,$acc0,$t0
	adc	\$0, $acc4

	########################################################################
	# First reduction step
	add	$t1, $acc1
	adc	$t0, $acc2

	mulx	$poly3, $t0, $t1
	 mov	8*1($b_ptr), %rdx
	adc	$t0, $acc3
	adc	$t1, $acc4
	adc	\$0, $acc5
	xor	$acc0, $acc0		# $acc0=0,cf=0,of=0

	########################################################################
	# Multiply by b[1]
	mulx	8*0+128($a_ptr), $t0, $t1
	adcx	$t0, $acc1
	adox	$t1, $acc2

	mulx	8*1+128($a_ptr), $t0, $t1
	adcx	$t0, $acc2
	adox	$t1, $acc3

	mulx	8*2+128($a_ptr), $t0, $t1
	adcx	$t0, $acc3
	adox	$t1, $acc4

	mulx	8*3+128($a_ptr), $t0, $t1
	 mov	$acc1, %rdx
	adcx	$t0, $acc4
	 shlx	$poly1, $acc1, $t0
	adox	$t1, $acc5
	 shrx	$poly1, $acc1, $t1

	adcx	$acc0, $acc5
	adox	$acc0, $acc0
	adc	\$0, $acc0

	########################################################################
	# Second reduction step
	add	$t0, $acc2
	adc	$t1, $acc3

	mulx	$poly3, $t0, $t1
	 mov	8*2($b_ptr), %rdx
	adc	$t0, $acc4
	adc	$t1, $acc5
	adc	\$0, $acc0
	xor	$acc1, $acc1		# $acc1=0,cf=0,of=0

	########################################################################
	# Multiply by b[2]
	mulx	8*0+128($a_ptr), $t0, $t1
	adcx	$t0, $acc2
	adox	$t1, $acc3

	mulx	8*1+128($a_ptr), $t0, $t1
	adcx	$t0, $acc3
	adox	$t1, $acc4

	mulx	8*2+128($a_ptr), $t0, $t1
	adcx	$t0, $acc4
	adox	$t1, $acc5

	mulx	8*3+128($a_ptr), $t0, $t1
	 mov	$acc2, %rdx
	adcx	$t0, $acc5
	 shlx	$poly1, $acc2, $t0
	adox	$t1, $acc0
	 shrx	$poly1, $acc2, $t1

	adcx	$acc1, $acc0
	adox	$acc1, $acc1
	adc	\$0, $acc1

	########################################################################
	# Third reduction step
	add	$t0, $acc3
	adc	$t1, $acc4

	mulx	$poly3, $t0, $t1
	 mov	8*3($b_ptr), %rdx
	adc	$t0, $acc5
	adc	$t1, $acc0
	adc	\$0, $acc1
	xor	$acc2, $acc2		# $acc2=0,cf=0,of=0

	########################################################################
	# Multiply by b[3]
	mulx	8*0+128($a_ptr), $t0, $t1
	adcx	$t0, $acc3
	adox	$t1, $acc4

	mulx	8*1+128($a_ptr), $t0, $t1
	adcx	$t0, $acc4
	adox	$t1, $acc5

	mulx	8*2+128($a_ptr), $t0, $t1
	adcx	$t0, $acc5
	adox	$t1, $acc0

	mulx	8*3+128($a_ptr), $t0, $t1
	 mov	$acc3, %rdx
	adcx	$t0, $acc0
	 shlx	$poly1, $acc3, $t0
	adox	$t1, $acc1
	 shrx	$poly1, $acc3, $t1

	adcx	$acc2, $acc1
	adox	$acc2, $acc2
	adc	\$0, $acc2

	########################################################################
	# Fourth reduction step
	add	$t0, $acc4
	adc	$t1, $acc5

	mulx	$poly3, $t0, $t1
	 mov	$acc4, $t2
	mov	.Lpoly+8*1(%rip), $poly1
	adc	$t0, $acc0
	 mov	$acc5, $t3
	adc	$t1, $acc1
	adc	\$0, $acc2

	########################################################################
	# Branch-less conditional subtraction of P
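	# the leading xor clears CF, so the first sbb acts as a plain sub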
	xor	%eax, %eax
	 mov	$acc0, $t0
	sbb	\$-1, $acc4		# .Lpoly[0]
	sbb	$poly1, $acc5		# .Lpoly[1]
	sbb	\$0, $acc0		# .Lpoly[2]
	 mov	$acc1, $t1
	sbb	$poly3, $acc1		# .Lpoly[3]
	sbb	\$0, $acc2

	cmovc	$t2, $acc4
	cmovc	$t3, $acc5
	mov	$acc4, 8*0($r_ptr)
	cmovc	$t0, $acc0
	mov	$acc5, 8*1($r_ptr)
	cmovc	$t1, $acc1
	mov	$acc0, 8*2($r_ptr)
	mov	$acc1, 8*3($r_ptr)

	ret
.cfi_endproc
.size	__ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx

.type	__ecp_nistz256_sqr_montx,\@abi-omnipotent
.align	32
__ecp_nistz256_sqr_montx:
.cfi_startproc
	mulx	$acc6, $acc1, $acc2	# a[0]*a[1]
	mulx	$acc7, $t0, $acc3	# a[0]*a[2]
	xor	%eax, %eax
	adc	$t0, $acc2
	mulx	$acc0, $t1, $acc4	# a[0]*a[3]
	 mov	$acc6, %rdx
	adc	$t1, $acc3
	adc	\$0, $acc4
	xor	$acc5, $acc5		# $acc5=0,cf=0,of=0

	#################################
	mulx	$acc7, $t0, $t1		# a[1]*a[2]
	adcx	$t0, $acc3
	adox	$t1, $acc4

	mulx	$acc0, $t0, $t1		# a[1]*a[3]
	 mov	$acc7, %rdx
	adcx	$t0, $acc4
	adox	$t1, $acc5
	adc	\$0, $acc5

	#################################
	mulx	$acc0, $t0, $acc6	# a[2]*a[3]
	 mov	8*0+128($a_ptr), %rdx
	xor	$acc7, $acc7		# $acc7=0,cf=0,of=0
	 adcx	$acc1, $acc1		# acc1:6<<1
	adox	$t0, $acc5
	 adcx	$acc2, $acc2
	adox	$acc7, $acc6		# of=0

	mulx	%rdx, $acc0, $t1
	mov	8*1+128($a_ptr), %rdx
	 adcx	$acc3, $acc3
	adox	$t1, $acc1
	 adcx	$acc4, $acc4
	mulx	%rdx, $t0, $t4
	mov	8*2+128($a_ptr), %rdx
	 adcx	$acc5, $acc5
	adox	$t0, $acc2
	 adcx	$acc6, $acc6
	.byte	0x67
	mulx	%rdx, $t0, $t1
	mov	8*3+128($a_ptr), %rdx
	adox	$t4, $acc3
	 adcx	$acc7, $acc7
	adox	$t0, $acc4
	 mov	\$32, $a_ptr
	adox	$t1, $acc5
	.byte	0x67,0x67
	mulx	%rdx, $t0, $t4
	 mov	.Lpoly+8*3(%rip), %rdx
	adox	$t0, $acc6
	 shlx	$a_ptr, $acc0, $t0
	adox	$t4, $acc7
	 shrx	$a_ptr, $acc0, $t4
	mov	%rdx,$t1

	# reduction step 1
	add	$t0, $acc1
	adc	$t4, $acc2

	mulx	$acc0, $t0, $acc0
	adc	$t0, $acc3
	 shlx	$a_ptr, $acc1, $t0
	adc	\$0, $acc0
	 shrx	$a_ptr, $acc1, $t4

	# reduction step 2
	add	$t0, $acc2
	adc	$t4, $acc3

	mulx	$acc1, $t0, $acc1
	adc	$t0, $acc0
	 shlx	$a_ptr, $acc2, $t0
	adc	\$0, $acc1
	 shrx	$a_ptr, $acc2, $t4

	# reduction step 3
	add	$t0, $acc3
	adc	$t4, $acc0

	mulx	$acc2, $t0, $acc2
	adc	$t0, $acc1
	 shlx	$a_ptr, $acc3, $t0
	adc	\$0, $acc2
	 shrx	$a_ptr, $acc3, $t4

	# reduction step 4
	add	$t0, $acc0
	adc	$t4, $acc1

	mulx	$acc3, $t0, $acc3
	adc	$t0, $acc2
	adc	\$0, $acc3

	xor	$t3, $t3
	add	$acc0, $acc4		# accumulate upper half
	 mov	.Lpoly+8*1(%rip), $a_ptr
	adc	$acc1, $acc5
	 mov	$acc4, $acc0
	adc	$acc2, $acc6
	adc	$acc3, $acc7
	 mov	$acc5, $acc1
	adc	\$0, $t3

	sub	\$-1, $acc4		# .Lpoly[0]
	 mov	$acc6, $acc2
	sbb	$a_ptr, $acc5		# .Lpoly[1]
	sbb	\$0, $acc6		# .Lpoly[2]
	 mov	$acc7, $acc3
	sbb	$t1, $acc7		# .Lpoly[3]
	sbb	\$0, $t3

	cmovc	$acc0, $acc4
	cmovc	$acc1, $acc5
	mov	$acc4, 8*0($r_ptr)
	cmovc	$acc2, $acc6
	mov	$acc5, 8*1($r_ptr)
	cmovc	$acc3, $acc7
	mov	$acc6, 8*2($r_ptr)
	mov	$acc7, 8*3($r_ptr)

	ret
.cfi_endproc
.size	__ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx
___
}