in lib/openssl/crypto/ec/asm/ecp_nistz256-x86_64.pl [569:2438]
sub $acc0, $acc2
sbb \$0, $acc0 # can't borrow
mulq 8*1(%r14)
add $t0, $acc1
adc \$0, %rdx
add %rax, $acc1
mov $t1, %rax
adc %rdx, $acc2
mov $t1, %rdx
adc \$0, $acc0 # can't overflow
shl \$32, %rax
shr \$32, %rdx
sub %rax, $acc3
mov 8*1($b_ptr), %rax
sbb %rdx, $t1 # can't borrow
add $acc0, $acc3
adc $t1, $acc4
adc \$0, $acc5
################################# * b[1]
mov %rax, $t0
mulq 8*0($a_ptr)
add %rax, $acc1
mov $t0, %rax
adc \$0, %rdx
mov %rdx, $t1
mulq 8*1($a_ptr)
add $t1, $acc2
adc \$0, %rdx
add %rax, $acc2
mov $t0, %rax
adc \$0, %rdx
mov %rdx, $t1
mulq 8*2($a_ptr)
add $t1, $acc3
adc \$0, %rdx
add %rax, $acc3
mov $t0, %rax
adc \$0, %rdx
mov $acc1, $t0
imulq %r15, $acc1
mov %rdx, $t1
mulq 8*3($a_ptr)
add $t1, $acc4
adc \$0, %rdx
xor $acc0, $acc0
add %rax, $acc4
mov $acc1, %rax
adc %rdx, $acc5
adc \$0, $acc0
################################# Second reduction step
mulq 8*0(%r14)
mov $acc1, $t1
add %rax, $t0 # guaranteed to be zero
mov $acc1, %rax
adc %rdx, $t0
sub $acc1, $acc3
sbb \$0, $acc1 # can't borrow
mulq 8*1(%r14)
add $t0, $acc2
adc \$0, %rdx
add %rax, $acc2
mov $t1, %rax
adc %rdx, $acc3
mov $t1, %rdx
adc \$0, $acc1 # can't overflow
shl \$32, %rax
shr \$32, %rdx
sub %rax, $acc4
mov 8*2($b_ptr), %rax
sbb %rdx, $t1 # can't borrow
add $acc1, $acc4
adc $t1, $acc5
adc \$0, $acc0
################################## * b[2]
mov %rax, $t0
mulq 8*0($a_ptr)
add %rax, $acc2
mov $t0, %rax
adc \$0, %rdx
mov %rdx, $t1
mulq 8*1($a_ptr)
add $t1, $acc3
adc \$0, %rdx
add %rax, $acc3
mov $t0, %rax
adc \$0, %rdx
mov %rdx, $t1
mulq 8*2($a_ptr)
add $t1, $acc4
adc \$0, %rdx
add %rax, $acc4
mov $t0, %rax
adc \$0, %rdx
mov $acc2, $t0
imulq %r15, $acc2
mov %rdx, $t1
mulq 8*3($a_ptr)
add $t1, $acc5
adc \$0, %rdx
xor $acc1, $acc1
add %rax, $acc5
mov $acc2, %rax
adc %rdx, $acc0
adc \$0, $acc1
################################# Third reduction step
mulq 8*0(%r14)
mov $acc2, $t1
add %rax, $t0 # guaranteed to be zero
mov $acc2, %rax
adc %rdx, $t0
sub $acc2, $acc4
sbb \$0, $acc2 # can't borrow
mulq 8*1(%r14)
add $t0, $acc3
adc \$0, %rdx
add %rax, $acc3
mov $t1, %rax
adc %rdx, $acc4
mov $t1, %rdx
adc \$0, $acc2 # can't overflow
shl \$32, %rax
shr \$32, %rdx
sub %rax, $acc5
mov 8*3($b_ptr), %rax
sbb %rdx, $t1 # can't borrow
add $acc2, $acc5
adc $t1, $acc0
adc \$0, $acc1
################################# * b[3]
mov %rax, $t0
mulq 8*0($a_ptr)
add %rax, $acc3
mov $t0, %rax
adc \$0, %rdx
mov %rdx, $t1
mulq 8*1($a_ptr)
add $t1, $acc4
adc \$0, %rdx
add %rax, $acc4
mov $t0, %rax
adc \$0, %rdx
mov %rdx, $t1
mulq 8*2($a_ptr)
add $t1, $acc5
adc \$0, %rdx
add %rax, $acc5
mov $t0, %rax
adc \$0, %rdx
mov $acc3, $t0
imulq %r15, $acc3
mov %rdx, $t1
mulq 8*3($a_ptr)
add $t1, $acc0
adc \$0, %rdx
xor $acc2, $acc2
add %rax, $acc0
mov $acc3, %rax
adc %rdx, $acc1
adc \$0, $acc2
################################# Last reduction step
mulq 8*0(%r14)
mov $acc3, $t1
add %rax, $t0 # guaranteed to be zero
mov $acc3, %rax
adc %rdx, $t0
sub $acc3, $acc5
sbb \$0, $acc3 # can't borrow
mulq 8*1(%r14)
add $t0, $acc4
adc \$0, %rdx
add %rax, $acc4
mov $t1, %rax
adc %rdx, $acc5
mov $t1, %rdx
adc \$0, $acc3 # can't overflow
shl \$32, %rax
shr \$32, %rdx
sub %rax, $acc0
sbb %rdx, $t1 # can't borrow
add $acc3, $acc0
adc $t1, $acc1
adc \$0, $acc2
################################# Subtract ord
mov $acc4, $a_ptr
sub 8*0(%r14), $acc4
mov $acc5, $acc3
sbb 8*1(%r14), $acc5
mov $acc0, $t0
sbb 8*2(%r14), $acc0
mov $acc1, $t1
sbb 8*3(%r14), $acc1
sbb \$0, $acc2
cmovc $a_ptr, $acc4
cmovc $acc3, $acc5
cmovc $t0, $acc0
cmovc $t1, $acc1
mov $acc4, 8*0($r_ptr)
mov $acc5, 8*1($r_ptr)
mov $acc0, 8*2($r_ptr)
mov $acc1, 8*3($r_ptr)
mov 0(%rsp),%r15
.cfi_restore %r15
mov 8(%rsp),%r14
.cfi_restore %r14
mov 16(%rsp),%r13
.cfi_restore %r13
mov 24(%rsp),%r12
.cfi_restore %r12
mov 32(%rsp),%rbx
.cfi_restore %rbx
mov 40(%rsp),%rbp
.cfi_restore %rbp
lea 48(%rsp),%rsp
.cfi_adjust_cfa_offset -48
.Lord_mul_epilogue:
ret
.cfi_endproc
.size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont
################################################################################
# void ecp_nistz256_ord_sqr_mont(
# uint64_t res[4],
# uint64_t a[4],
# int rep);
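# Performs "rep" consecutive Montgomery squarings of a[] modulo the P-256
# group order; the repeat count lands in $b_ptr and drives the
# .Loop_ord_sqr loop below.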
.globl ecp_nistz256_ord_sqr_mont
.type ecp_nistz256_ord_sqr_mont,\@function,3
.align 32
ecp_nistz256_ord_sqr_mont:
.cfi_startproc
___
$code.=<<___ if ($addx);
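# 0x80100 selects bit 8 (BMI2) and bit 19 (ADX) of OPENSSL_ia32cap_P[2]
# (CPUID.7:0.EBX); the mulx/adcx path is taken only if both are set.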
mov \$0x80100, %ecx
and OPENSSL_ia32cap_P+8(%rip), %ecx
cmp \$0x80100, %ecx
je .Lecp_nistz256_ord_sqr_montx
___
$code.=<<___;
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
.Lord_sqr_body:
mov 8*0($a_ptr), $acc0
mov 8*1($a_ptr), %rax
mov 8*2($a_ptr), $acc6
mov 8*3($a_ptr), $acc7
lea .Lord(%rip), $a_ptr # pointer to modulus
mov $b_org, $b_ptr
jmp .Loop_ord_sqr
.align 32
.Loop_ord_sqr:
################################# a[1:] * a[0]
mov %rax, $t1 # put aside a[1]
mul $acc0 # a[1] * a[0]
mov %rax, $acc1
movq $t1, %xmm1 # offload a[1]
mov $acc6, %rax
mov %rdx, $acc2
mul $acc0 # a[2] * a[0]
add %rax, $acc2
mov $acc7, %rax
movq $acc6, %xmm2 # offload a[2]
adc \$0, %rdx
mov %rdx, $acc3
mul $acc0 # a[3] * a[0]
add %rax, $acc3
mov $acc7, %rax
movq $acc7, %xmm3 # offload a[3]
adc \$0, %rdx
mov %rdx, $acc4
################################# a[3] * a[2]
mul $acc6 # a[3] * a[2]
mov %rax, $acc5
mov $acc6, %rax
mov %rdx, $acc6
################################# a[2:] * a[1]
mul $t1 # a[2] * a[1]
add %rax, $acc3
mov $acc7, %rax
adc \$0, %rdx
mov %rdx, $acc7
mul $t1 # a[3] * a[1]
add %rax, $acc4
adc \$0, %rdx
add $acc7, $acc4
adc %rdx, $acc5
adc \$0, $acc6 # can't overflow
################################# *2
xor $acc7, $acc7
mov $acc0, %rax
add $acc1, $acc1
adc $acc2, $acc2
adc $acc3, $acc3
adc $acc4, $acc4
adc $acc5, $acc5
adc $acc6, $acc6
adc \$0, $acc7
################################# Missing products
mul %rax # a[0] * a[0]
mov %rax, $acc0
movq %xmm1, %rax
mov %rdx, $t1
mul %rax # a[1] * a[1]
add $t1, $acc1
adc %rax, $acc2
movq %xmm2, %rax
adc \$0, %rdx
mov %rdx, $t1
mul %rax # a[2] * a[2]
add $t1, $acc3
adc %rax, $acc4
movq %xmm3, %rax
adc \$0, %rdx
mov %rdx, $t1
mov $acc0, $t0
imulq 8*4($a_ptr), $acc0 # *= .LordK
mul %rax # a[3] * a[3]
add $t1, $acc5
adc %rax, $acc6
mov 8*0($a_ptr), %rax # modulus[0]
adc %rdx, $acc7 # can't overflow
################################# First reduction step
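# $acc0 was just multiplied by .LordK (the usual Montgomery constant
# -ord^-1 mod 2^64, stored right after the modulus limbs), so the low limb
# of acc + $acc0*ord cancels; that is the "guaranteed to be zero" below.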
mul $acc0
mov $acc0, $t1
add %rax, $t0 # guaranteed to be zero
mov 8*1($a_ptr), %rax # modulus[1]
adc %rdx, $t0
sub $acc0, $acc2
sbb \$0, $t1 # can't borrow
mul $acc0
add $t0, $acc1
adc \$0, %rdx
add %rax, $acc1
mov $acc0, %rax
adc %rdx, $acc2
mov $acc0, %rdx
adc \$0, $t1 # can't overflow
mov $acc1, $t0
imulq 8*4($a_ptr), $acc1 # *= .LordK
shl \$32, %rax
shr \$32, %rdx
sub %rax, $acc3
mov 8*0($a_ptr), %rax
sbb %rdx, $acc0 # can't borrow
add $t1, $acc3
adc \$0, $acc0 # can't overflow
################################# Second reduction step
mul $acc1
mov $acc1, $t1
add %rax, $t0 # guaranteed to be zero
mov 8*1($a_ptr), %rax
adc %rdx, $t0
sub $acc1, $acc3
sbb \$0, $t1 # can't borrow
mul $acc1
add $t0, $acc2
adc \$0, %rdx
add %rax, $acc2
mov $acc1, %rax
adc %rdx, $acc3
mov $acc1, %rdx
adc \$0, $t1 # can't overflow
mov $acc2, $t0
imulq 8*4($a_ptr), $acc2 # *= .LordK
shl \$32, %rax
shr \$32, %rdx
sub %rax, $acc0
mov 8*0($a_ptr), %rax
sbb %rdx, $acc1 # can't borrow
add $t1, $acc0
adc \$0, $acc1 # can't overflow
################################# Third reduction step
mul $acc2
mov $acc2, $t1
add %rax, $t0 # guaranteed to be zero
mov 8*1($a_ptr), %rax
adc %rdx, $t0
sub $acc2, $acc0
sbb \$0, $t1 # can't borrow
mul $acc2
add $t0, $acc3
adc \$0, %rdx
add %rax, $acc3
mov $acc2, %rax
adc %rdx, $acc0
mov $acc2, %rdx
adc \$0, $t1 # can't overflow
mov $acc3, $t0
imulq 8*4($a_ptr), $acc3 # *= .LordK
shl \$32, %rax
shr \$32, %rdx
sub %rax, $acc1
mov 8*0($a_ptr), %rax
sbb %rdx, $acc2 # can't borrow
add $t1, $acc1
adc \$0, $acc2 # can't overflow
################################# Last reduction step
mul $acc3
mov $acc3, $t1
add %rax, $t0 # guaranteed to be zero
mov 8*1($a_ptr), %rax
adc %rdx, $t0
sub $acc3, $acc1
sbb \$0, $t1 # can't borrow
mul $acc3
add $t0, $acc0
adc \$0, %rdx
add %rax, $acc0
mov $acc3, %rax
adc %rdx, $acc1
mov $acc3, %rdx
adc \$0, $t1 # can't overflow
shl \$32, %rax
shr \$32, %rdx
sub %rax, $acc2
sbb %rdx, $acc3 # can't borrow
add $t1, $acc2
adc \$0, $acc3 # can't overflow
################################# Add bits [511:256] of the sqr result
xor %rdx, %rdx
add $acc4, $acc0
adc $acc5, $acc1
mov $acc0, $acc4
adc $acc6, $acc2
adc $acc7, $acc3
mov $acc1, %rax
adc \$0, %rdx
################################# Compare to modulus
sub 8*0($a_ptr), $acc0
mov $acc2, $acc6
sbb 8*1($a_ptr), $acc1
sbb 8*2($a_ptr), $acc2
mov $acc3, $acc7
sbb 8*3($a_ptr), $acc3
sbb \$0, %rdx
cmovc $acc4, $acc0
cmovnc $acc1, %rax
cmovnc $acc2, $acc6
cmovnc $acc3, $acc7
dec $b_ptr
jnz .Loop_ord_sqr
mov $acc0, 8*0($r_ptr)
mov %rax, 8*1($r_ptr)
pxor %xmm1, %xmm1
mov $acc6, 8*2($r_ptr)
pxor %xmm2, %xmm2
mov $acc7, 8*3($r_ptr)
pxor %xmm3, %xmm3
mov 0(%rsp),%r15
.cfi_restore %r15
mov 8(%rsp),%r14
.cfi_restore %r14
mov 16(%rsp),%r13
.cfi_restore %r13
mov 24(%rsp),%r12
.cfi_restore %r12
mov 32(%rsp),%rbx
.cfi_restore %rbx
mov 40(%rsp),%rbp
.cfi_restore %rbp
lea 48(%rsp),%rsp
.cfi_adjust_cfa_offset -48
.Lord_sqr_epilogue:
ret
.cfi_endproc
.size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
___
$code.=<<___ if ($addx);
################################################################################
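# BMI2/ADX (mulx/adcx/adox) variant of ecp_nistz256_ord_mul_mont.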
.type ecp_nistz256_ord_mul_montx,\@function,3
.align 32
ecp_nistz256_ord_mul_montx:
.cfi_startproc
.Lecp_nistz256_ord_mul_montx:
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
.Lord_mulx_body:
mov $b_org, $b_ptr
mov 8*0($b_org), %rdx
mov 8*0($a_ptr), $acc1
mov 8*1($a_ptr), $acc2
mov 8*2($a_ptr), $acc3
mov 8*3($a_ptr), $acc4
lea -128($a_ptr), $a_ptr # control u-op density
lea .Lord-128(%rip), %r14
mov .LordK(%rip), %r15
################################# Multiply by b[0]
mulx $acc1, $acc0, $acc1
mulx $acc2, $t0, $acc2
mulx $acc3, $t1, $acc3
add $t0, $acc1
mulx $acc4, $t0, $acc4
mov $acc0, %rdx
mulx %r15, %rdx, %rax
adc $t1, $acc2
adc $t0, $acc3
adc \$0, $acc4
################################# reduction
xor $acc5, $acc5 # $acc5=0, cf=0, of=0
mulx 8*0+128(%r14), $t0, $t1
adcx $t0, $acc0 # guaranteed to be zero
adox $t1, $acc1
mulx 8*1+128(%r14), $t0, $t1
adcx $t0, $acc1
adox $t1, $acc2
mulx 8*2+128(%r14), $t0, $t1
adcx $t0, $acc2
adox $t1, $acc3
mulx 8*3+128(%r14), $t0, $t1
mov 8*1($b_ptr), %rdx
adcx $t0, $acc3
adox $t1, $acc4
adcx $acc0, $acc4
adox $acc0, $acc5
adc \$0, $acc5 # cf=0, of=0
################################# Multiply by b[1]
mulx 8*0+128($a_ptr), $t0, $t1
adcx $t0, $acc1
adox $t1, $acc2
mulx 8*1+128($a_ptr), $t0, $t1
adcx $t0, $acc2
adox $t1, $acc3
mulx 8*2+128($a_ptr), $t0, $t1
adcx $t0, $acc3
adox $t1, $acc4
mulx 8*3+128($a_ptr), $t0, $t1
mov $acc1, %rdx
mulx %r15, %rdx, %rax
adcx $t0, $acc4
adox $t1, $acc5
adcx $acc0, $acc5
adox $acc0, $acc0
adc \$0, $acc0 # cf=0, of=0
################################# reduction
mulx 8*0+128(%r14), $t0, $t1
adcx $t0, $acc1 # guaranteed to be zero
adox $t1, $acc2
mulx 8*1+128(%r14), $t0, $t1
adcx $t0, $acc2
adox $t1, $acc3
mulx 8*2+128(%r14), $t0, $t1
adcx $t0, $acc3
adox $t1, $acc4
mulx 8*3+128(%r14), $t0, $t1
mov 8*2($b_ptr), %rdx
adcx $t0, $acc4
adox $t1, $acc5
adcx $acc1, $acc5
adox $acc1, $acc0
adc \$0, $acc0 # cf=0, of=0
################################# Multiply by b[2]
mulx 8*0+128($a_ptr), $t0, $t1
adcx $t0, $acc2
adox $t1, $acc3
mulx 8*1+128($a_ptr), $t0, $t1
adcx $t0, $acc3
adox $t1, $acc4
mulx 8*2+128($a_ptr), $t0, $t1
adcx $t0, $acc4
adox $t1, $acc5
mulx 8*3+128($a_ptr), $t0, $t1
mov $acc2, %rdx
mulx %r15, %rdx, %rax
adcx $t0, $acc5
adox $t1, $acc0
adcx $acc1, $acc0
adox $acc1, $acc1
adc \$0, $acc1 # cf=0, of=0
################################# reduction
mulx 8*0+128(%r14), $t0, $t1
adcx $t0, $acc2 # guaranteed to be zero
adox $t1, $acc3
mulx 8*1+128(%r14), $t0, $t1
adcx $t0, $acc3
adox $t1, $acc4
mulx 8*2+128(%r14), $t0, $t1
adcx $t0, $acc4
adox $t1, $acc5
mulx 8*3+128(%r14), $t0, $t1
mov 8*3($b_ptr), %rdx
adcx $t0, $acc5
adox $t1, $acc0
adcx $acc2, $acc0
adox $acc2, $acc1
adc \$0, $acc1 # cf=0, of=0
################################# Multiply by b[3]
mulx 8*0+128($a_ptr), $t0, $t1
adcx $t0, $acc3
adox $t1, $acc4
mulx 8*1+128($a_ptr), $t0, $t1
adcx $t0, $acc4
adox $t1, $acc5
mulx 8*2+128($a_ptr), $t0, $t1
adcx $t0, $acc5
adox $t1, $acc0
mulx 8*3+128($a_ptr), $t0, $t1
mov $acc3, %rdx
mulx %r15, %rdx, %rax
adcx $t0, $acc0
adox $t1, $acc1
adcx $acc2, $acc1
adox $acc2, $acc2
adc \$0, $acc2 # cf=0, of=0
################################# reduction
mulx 8*0+128(%r14), $t0, $t1
adcx $t0, $acc3 # guaranteed to be zero
adox $t1, $acc4
mulx 8*1+128(%r14), $t0, $t1
adcx $t0, $acc4
adox $t1, $acc5
mulx 8*2+128(%r14), $t0, $t1
adcx $t0, $acc5
adox $t1, $acc0
mulx 8*3+128(%r14), $t0, $t1
lea 128(%r14),%r14
mov $acc4, $t2
adcx $t0, $acc0
adox $t1, $acc1
mov $acc5, $t3
adcx $acc3, $acc1
adox $acc3, $acc2
adc \$0, $acc2
#################################
# Branch-less conditional subtraction of the group order (.Lord)
mov $acc0, $t0
sub 8*0(%r14), $acc4
sbb 8*1(%r14), $acc5
sbb 8*2(%r14), $acc0
mov $acc1, $t1
sbb 8*3(%r14), $acc1
sbb \$0, $acc2
cmovc $t2, $acc4
cmovc $t3, $acc5
cmovc $t0, $acc0
cmovc $t1, $acc1
mov $acc4, 8*0($r_ptr)
mov $acc5, 8*1($r_ptr)
mov $acc0, 8*2($r_ptr)
mov $acc1, 8*3($r_ptr)
mov 0(%rsp),%r15
.cfi_restore %r15
mov 8(%rsp),%r14
.cfi_restore %r14
mov 16(%rsp),%r13
.cfi_restore %r13
mov 24(%rsp),%r12
.cfi_restore %r12
mov 32(%rsp),%rbx
.cfi_restore %rbx
mov 40(%rsp),%rbp
.cfi_restore %rbp
lea 48(%rsp),%rsp
.cfi_adjust_cfa_offset -48
.Lord_mulx_epilogue:
ret
.cfi_endproc
.size ecp_nistz256_ord_mul_montx,.-ecp_nistz256_ord_mul_montx
.type ecp_nistz256_ord_sqr_montx,\@function,3
.align 32
ecp_nistz256_ord_sqr_montx:
.cfi_startproc
.Lecp_nistz256_ord_sqr_montx:
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
.Lord_sqrx_body:
mov $b_org, $b_ptr
mov 8*0($a_ptr), %rdx
mov 8*1($a_ptr), $acc6
mov 8*2($a_ptr), $acc7
mov 8*3($a_ptr), $acc0
lea .Lord(%rip), $a_ptr
jmp .Loop_ord_sqrx
.align 32
.Loop_ord_sqrx:
mulx $acc6, $acc1, $acc2 # a[0]*a[1]
mulx $acc7, $t0, $acc3 # a[0]*a[2]
mov %rdx, %rax # offload a[0]
movq $acc6, %xmm1 # offload a[1]
mulx $acc0, $t1, $acc4 # a[0]*a[3]
mov $acc6, %rdx
add $t0, $acc2
movq $acc7, %xmm2 # offload a[2]
adc $t1, $acc3
adc \$0, $acc4
xor $acc5, $acc5 # $acc5=0,cf=0,of=0
#################################
mulx $acc7, $t0, $t1 # a[1]*a[2]
adcx $t0, $acc3
adox $t1, $acc4
mulx $acc0, $t0, $t1 # a[1]*a[3]
mov $acc7, %rdx
adcx $t0, $acc4
adox $t1, $acc5
adc \$0, $acc5
#################################
mulx $acc0, $t0, $acc6 # a[2]*a[3]
mov %rax, %rdx
movq $acc0, %xmm3 # offload a[3]
xor $acc7, $acc7 # $acc7=0,cf=0,of=0
adcx $acc1, $acc1 # acc1:6<<1
adox $t0, $acc5
adcx $acc2, $acc2
adox $acc7, $acc6 # of=0
################################# a[i]*a[i]
mulx %rdx, $acc0, $t1
movq %xmm1, %rdx
adcx $acc3, $acc3
adox $t1, $acc1
adcx $acc4, $acc4
mulx %rdx, $t0, $t4
movq %xmm2, %rdx
adcx $acc5, $acc5
adox $t0, $acc2
adcx $acc6, $acc6
mulx %rdx, $t0, $t1
.byte 0x67
movq %xmm3, %rdx
adox $t4, $acc3
adcx $acc7, $acc7
adox $t0, $acc4
adox $t1, $acc5
mulx %rdx, $t0, $t4
adox $t0, $acc6
adox $t4, $acc7
################################# reduction
mov $acc0, %rdx
mulx 8*4($a_ptr), %rdx, $t0
xor %rax, %rax # cf=0, of=0
mulx 8*0($a_ptr), $t0, $t1
adcx $t0, $acc0 # guaranteed to be zero
adox $t1, $acc1
mulx 8*1($a_ptr), $t0, $t1
adcx $t0, $acc1
adox $t1, $acc2
mulx 8*2($a_ptr), $t0, $t1
adcx $t0, $acc2
adox $t1, $acc3
mulx 8*3($a_ptr), $t0, $t1
adcx $t0, $acc3
adox $t1, $acc0 # of=0
adcx %rax, $acc0 # cf=0
#################################
mov $acc1, %rdx
mulx 8*4($a_ptr), %rdx, $t0
mulx 8*0($a_ptr), $t0, $t1
adox $t0, $acc1 # guaranteed to be zero
adcx $t1, $acc2
mulx 8*1($a_ptr), $t0, $t1
adox $t0, $acc2
adcx $t1, $acc3
mulx 8*2($a_ptr), $t0, $t1
adox $t0, $acc3
adcx $t1, $acc0
mulx 8*3($a_ptr), $t0, $t1
adox $t0, $acc0
adcx $t1, $acc1 # cf=0
adox %rax, $acc1 # of=0
#################################
mov $acc2, %rdx
mulx 8*4($a_ptr), %rdx, $t0
mulx 8*0($a_ptr), $t0, $t1
adcx $t0, $acc2 # guaranteed to be zero
adox $t1, $acc3
mulx 8*1($a_ptr), $t0, $t1
adcx $t0, $acc3
adox $t1, $acc0
mulx 8*2($a_ptr), $t0, $t1
adcx $t0, $acc0
adox $t1, $acc1
mulx 8*3($a_ptr), $t0, $t1
adcx $t0, $acc1
adox $t1, $acc2 # of=0
adcx %rax, $acc2 # cf=0
#################################
mov $acc3, %rdx
mulx 8*4($a_ptr), %rdx, $t0
mulx 8*0($a_ptr), $t0, $t1
adox $t0, $acc3 # guaranteed to be zero
adcx $t1, $acc0
mulx 8*1($a_ptr), $t0, $t1
adox $t0, $acc0
adcx $t1, $acc1
mulx 8*2($a_ptr), $t0, $t1
adox $t0, $acc1
adcx $t1, $acc2
mulx 8*3($a_ptr), $t0, $t1
adox $t0, $acc2
adcx $t1, $acc3
adox %rax, $acc3
################################# accumulate upper half
add $acc0, $acc4
adc $acc5, $acc1
mov $acc4, %rdx
adc $acc6, $acc2
adc $acc7, $acc3
mov $acc1, $acc6
adc \$0, %rax
################################# compare to modulus
sub 8*0($a_ptr), $acc4
mov $acc2, $acc7
sbb 8*1($a_ptr), $acc1
sbb 8*2($a_ptr), $acc2
mov $acc3, $acc0
sbb 8*3($a_ptr), $acc3
sbb \$0, %rax
cmovnc $acc4, %rdx
cmovnc $acc1, $acc6
cmovnc $acc2, $acc7
cmovnc $acc3, $acc0
dec $b_ptr
jnz .Loop_ord_sqrx
mov %rdx, 8*0($r_ptr)
mov $acc6, 8*1($r_ptr)
pxor %xmm1, %xmm1
mov $acc7, 8*2($r_ptr)
pxor %xmm2, %xmm2
mov $acc0, 8*3($r_ptr)
pxor %xmm3, %xmm3
mov 0(%rsp),%r15
.cfi_restore %r15
mov 8(%rsp),%r14
.cfi_restore %r14
mov 16(%rsp),%r13
.cfi_restore %r13
mov 24(%rsp),%r12
.cfi_restore %r12
mov 32(%rsp),%rbx
.cfi_restore %rbx
mov 40(%rsp),%rbp
.cfi_restore %rbp
lea 48(%rsp),%rsp
.cfi_adjust_cfa_offset -48
.Lord_sqrx_epilogue:
ret
.cfi_endproc
.size ecp_nistz256_ord_sqr_montx,.-ecp_nistz256_ord_sqr_montx
___
$code.=<<___;
################################################################################
# void ecp_nistz256_to_mont(
# uint64_t res[4],
# uint64_t in[4]);
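# Conversion to Montgomery form: tail-calls ecp_nistz256_mul_mont with
# b = .LRR (R^2 mod p256, R = 2^256), so res = in*R mod p256.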
.globl ecp_nistz256_to_mont
.type ecp_nistz256_to_mont,\@function,2
.align 32
ecp_nistz256_to_mont:
.cfi_startproc
___
$code.=<<___ if ($addx);
mov \$0x80100, %ecx
and OPENSSL_ia32cap_P+8(%rip), %ecx
___
$code.=<<___;
lea .LRR(%rip), $b_org
jmp .Lmul_mont
.cfi_endproc
.size ecp_nistz256_to_mont,.-ecp_nistz256_to_mont
################################################################################
# void ecp_nistz256_mul_mont(
# uint64_t res[4],
# uint64_t a[4],
# uint64_t b[4]);
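# Montgomery multiplication: res = a*b*R^-1 mod p256 with R = 2^256.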
.globl ecp_nistz256_mul_mont
.type ecp_nistz256_mul_mont,\@function,3
.align 32
ecp_nistz256_mul_mont:
.cfi_startproc
___
$code.=<<___ if ($addx);
mov \$0x80100, %ecx
and OPENSSL_ia32cap_P+8(%rip), %ecx
___
$code.=<<___;
.Lmul_mont:
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
.Lmul_body:
___
$code.=<<___ if ($addx);
cmp \$0x80100, %ecx
je .Lmul_montx
___
$code.=<<___;
mov $b_org, $b_ptr
mov 8*0($b_org), %rax
mov 8*0($a_ptr), $acc1
mov 8*1($a_ptr), $acc2
mov 8*2($a_ptr), $acc3
mov 8*3($a_ptr), $acc4
call __ecp_nistz256_mul_montq
___
$code.=<<___ if ($addx);
jmp .Lmul_mont_done
.align 32
.Lmul_montx:
mov $b_org, $b_ptr
mov 8*0($b_org), %rdx
mov 8*0($a_ptr), $acc1
mov 8*1($a_ptr), $acc2
mov 8*2($a_ptr), $acc3
mov 8*3($a_ptr), $acc4
lea -128($a_ptr), $a_ptr # control u-op density
call __ecp_nistz256_mul_montx
___
$code.=<<___;
.Lmul_mont_done:
mov 0(%rsp),%r15
.cfi_restore %r15
mov 8(%rsp),%r14
.cfi_restore %r14
mov 16(%rsp),%r13
.cfi_restore %r13
mov 24(%rsp),%r12
.cfi_restore %r12
mov 32(%rsp),%rbx
.cfi_restore %rbx
mov 40(%rsp),%rbp
.cfi_restore %rbp
lea 48(%rsp),%rsp
.cfi_adjust_cfa_offset -48
.Lmul_epilogue:
ret
.cfi_endproc
.size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
.type __ecp_nistz256_mul_montq,\@abi-omnipotent
.align 32
__ecp_nistz256_mul_montq:
.cfi_startproc
########################################################################
# Multiply a by b[0]
mov %rax, $t1
mulq $acc1
mov .Lpoly+8*1(%rip),$poly1
mov %rax, $acc0
mov $t1, %rax
mov %rdx, $acc1
mulq $acc2
mov .Lpoly+8*3(%rip),$poly3
add %rax, $acc1
mov $t1, %rax
adc \$0, %rdx
mov %rdx, $acc2
mulq $acc3
add %rax, $acc2
mov $t1, %rax
adc \$0, %rdx
mov %rdx, $acc3
mulq $acc4
add %rax, $acc3
mov $acc0, %rax
adc \$0, %rdx
xor $acc5, $acc5
mov %rdx, $acc4
########################################################################
# First reduction step
# Now we want to multiply acc[0] by p256 and add the result to acc.
# The special form of p256 allows some shortcuts:
#
# acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0]
# then we add acc[0] and get acc[0] x 2^96
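# In limbs .Lpoly = { 2^64-1, 2^32-1, 0, 0xffffffff00000001 }, so
# p256[0..1] = 2^96-1 and p256[2] = 0.  Because p256[0] == -1 mod 2^64,
# the Montgomery multiplier for this step is acc[0] itself: the shl/shr
# pair below adds acc[0]<<96 at limbs 1..2, and mulq $poly3 adds
# acc[0]*p256[3] at limbs 3..4.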
mov $acc0, $t1
shl \$32, $acc0
mulq $poly3
shr \$32, $t1
add $acc0, $acc1 # +=acc[0]<<96
adc $t1, $acc2
adc %rax, $acc3
mov 8*1($b_ptr), %rax
adc %rdx, $acc4
adc \$0, $acc5
xor $acc0, $acc0
########################################################################
# Multiply by b[1]
mov %rax, $t1
mulq 8*0($a_ptr)
add %rax, $acc1
mov $t1, %rax
adc \$0, %rdx
mov %rdx, $t0
mulq 8*1($a_ptr)
add $t0, $acc2
adc \$0, %rdx
add %rax, $acc2
mov $t1, %rax
adc \$0, %rdx
mov %rdx, $t0
mulq 8*2($a_ptr)
add $t0, $acc3
adc \$0, %rdx
add %rax, $acc3
mov $t1, %rax
adc \$0, %rdx
mov %rdx, $t0
mulq 8*3($a_ptr)
add $t0, $acc4
adc \$0, %rdx
add %rax, $acc4
mov $acc1, %rax
adc %rdx, $acc5
adc \$0, $acc0
########################################################################
# Second reduction step
mov $acc1, $t1
shl \$32, $acc1
mulq $poly3
shr \$32, $t1
add $acc1, $acc2
adc $t1, $acc3
adc %rax, $acc4
mov 8*2($b_ptr), %rax
adc %rdx, $acc5
adc \$0, $acc0
xor $acc1, $acc1
########################################################################
# Multiply by b[2]
mov %rax, $t1
mulq 8*0($a_ptr)
add %rax, $acc2
mov $t1, %rax
adc \$0, %rdx
mov %rdx, $t0
mulq 8*1($a_ptr)
add $t0, $acc3
adc \$0, %rdx
add %rax, $acc3
mov $t1, %rax
adc \$0, %rdx
mov %rdx, $t0
mulq 8*2($a_ptr)
add $t0, $acc4
adc \$0, %rdx
add %rax, $acc4
mov $t1, %rax
adc \$0, %rdx
mov %rdx, $t0
mulq 8*3($a_ptr)
add $t0, $acc5
adc \$0, %rdx
add %rax, $acc5
mov $acc2, %rax
adc %rdx, $acc0
adc \$0, $acc1
########################################################################
# Third reduction step
mov $acc2, $t1
shl \$32, $acc2
mulq $poly3
shr \$32, $t1
add $acc2, $acc3
adc $t1, $acc4
adc %rax, $acc5
mov 8*3($b_ptr), %rax
adc %rdx, $acc0
adc \$0, $acc1
xor $acc2, $acc2
########################################################################
# Multiply by b[3]
mov %rax, $t1
mulq 8*0($a_ptr)
add %rax, $acc3
mov $t1, %rax
adc \$0, %rdx
mov %rdx, $t0
mulq 8*1($a_ptr)
add $t0, $acc4
adc \$0, %rdx
add %rax, $acc4
mov $t1, %rax
adc \$0, %rdx
mov %rdx, $t0
mulq 8*2($a_ptr)
add $t0, $acc5
adc \$0, %rdx
add %rax, $acc5
mov $t1, %rax
adc \$0, %rdx
mov %rdx, $t0
mulq 8*3($a_ptr)
add $t0, $acc0
adc \$0, %rdx
add %rax, $acc0
mov $acc3, %rax
adc %rdx, $acc1
adc \$0, $acc2
########################################################################
# Final reduction step
mov $acc3, $t1
shl \$32, $acc3
mulq $poly3
shr \$32, $t1
add $acc3, $acc4
adc $t1, $acc5
mov $acc4, $t0
adc %rax, $acc0
adc %rdx, $acc1
mov $acc5, $t1
adc \$0, $acc2
########################################################################
# Branch-less conditional subtraction of P
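# Copies of the unreduced limbs are kept in $t0..$t3; if the subtraction
# of .Lpoly borrows, the cmovc instructions restore them, selecting the
# correct result without a data-dependent branch.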
sub \$-1, $acc4 # .Lpoly[0]
mov $acc0, $t2
sbb $poly1, $acc5 # .Lpoly[1]
sbb \$0, $acc0 # .Lpoly[2]
mov $acc1, $t3
sbb $poly3, $acc1 # .Lpoly[3]
sbb \$0, $acc2
cmovc $t0, $acc4
cmovc $t1, $acc5
mov $acc4, 8*0($r_ptr)
cmovc $t2, $acc0
mov $acc5, 8*1($r_ptr)
cmovc $t3, $acc1
mov $acc0, 8*2($r_ptr)
mov $acc1, 8*3($r_ptr)
ret
.cfi_endproc
.size __ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq
################################################################################
# void ecp_nistz256_sqr_mont(
# uint64_t res[4],
# uint64_t a[4]);
# We optimize the square according to S. Gueron and V. Krasnov,
# "Speeding up Big-Number Squaring"
.globl ecp_nistz256_sqr_mont
.type ecp_nistz256_sqr_mont,\@function,2
.align 32
ecp_nistz256_sqr_mont:
.cfi_startproc
___
$code.=<<___ if ($addx);
mov \$0x80100, %ecx
and OPENSSL_ia32cap_P+8(%rip), %ecx
___
$code.=<<___;
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
.Lsqr_body:
___
$code.=<<___ if ($addx);
cmp \$0x80100, %ecx
je .Lsqr_montx
___
$code.=<<___;
mov 8*0($a_ptr), %rax
mov 8*1($a_ptr), $acc6
mov 8*2($a_ptr), $acc7
mov 8*3($a_ptr), $acc0
call __ecp_nistz256_sqr_montq
___
$code.=<<___ if ($addx);
jmp .Lsqr_mont_done
.align 32
.Lsqr_montx:
mov 8*0($a_ptr), %rdx
mov 8*1($a_ptr), $acc6
mov 8*2($a_ptr), $acc7
mov 8*3($a_ptr), $acc0
lea -128($a_ptr), $a_ptr # control u-op density
call __ecp_nistz256_sqr_montx
___
$code.=<<___;
.Lsqr_mont_done:
mov 0(%rsp),%r15
.cfi_restore %r15
mov 8(%rsp),%r14
.cfi_restore %r14
mov 16(%rsp),%r13
.cfi_restore %r13
mov 24(%rsp),%r12
.cfi_restore %r12
mov 32(%rsp),%rbx
.cfi_restore %rbx
mov 40(%rsp),%rbp
.cfi_restore %rbp
lea 48(%rsp),%rsp
.cfi_adjust_cfa_offset -48
.Lsqr_epilogue:
ret
.cfi_endproc
.size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
.type __ecp_nistz256_sqr_montq,\@abi-omnipotent
.align 32
__ecp_nistz256_sqr_montq:
.cfi_startproc
mov %rax, $acc5
mulq $acc6 # a[1]*a[0]
mov %rax, $acc1
mov $acc7, %rax
mov %rdx, $acc2
mulq $acc5 # a[0]*a[2]
add %rax, $acc2
mov $acc0, %rax
adc \$0, %rdx
mov %rdx, $acc3
mulq $acc5 # a[0]*a[3]
add %rax, $acc3
mov $acc7, %rax
adc \$0, %rdx
mov %rdx, $acc4
#################################
mulq $acc6 # a[1]*a[2]
add %rax, $acc3
mov $acc0, %rax
adc \$0, %rdx
mov %rdx, $t1
mulq $acc6 # a[1]*a[3]
add %rax, $acc4
mov $acc0, %rax
adc \$0, %rdx
add $t1, $acc4
mov %rdx, $acc5
adc \$0, $acc5
#################################
mulq $acc7 # a[2]*a[3]
xor $acc7, $acc7
add %rax, $acc5
mov 8*0($a_ptr), %rax
mov %rdx, $acc6
adc \$0, $acc6
add $acc1, $acc1 # acc1:6<<1
adc $acc2, $acc2
adc $acc3, $acc3
adc $acc4, $acc4
adc $acc5, $acc5
adc $acc6, $acc6
adc \$0, $acc7
mulq %rax
mov %rax, $acc0
mov 8*1($a_ptr), %rax
mov %rdx, $t0
mulq %rax
add $t0, $acc1
adc %rax, $acc2
mov 8*2($a_ptr), %rax
adc \$0, %rdx
mov %rdx, $t0
mulq %rax
add $t0, $acc3
adc %rax, $acc4
mov 8*3($a_ptr), %rax
adc \$0, %rdx
mov %rdx, $t0
mulq %rax
add $t0, $acc5
adc %rax, $acc6
mov $acc0, %rax
adc %rdx, $acc7
mov .Lpoly+8*1(%rip), $a_ptr
mov .Lpoly+8*3(%rip), $t1
##########################################
# Now the reduction
# First iteration
mov $acc0, $t0
shl \$32, $acc0
mulq $t1
shr \$32, $t0
add $acc0, $acc1 # +=acc[0]<<96
adc $t0, $acc2
adc %rax, $acc3
mov $acc1, %rax
adc \$0, %rdx
##########################################
# Second iteration
mov $acc1, $t0
shl \$32, $acc1
mov %rdx, $acc0
mulq $t1
shr \$32, $t0
add $acc1, $acc2
adc $t0, $acc3
adc %rax, $acc0
mov $acc2, %rax
adc \$0, %rdx
##########################################
# Third iteration
mov $acc2, $t0
shl \$32, $acc2
mov %rdx, $acc1
mulq $t1
shr \$32, $t0
add $acc2, $acc3
adc $t0, $acc0
adc %rax, $acc1
mov $acc3, %rax
adc \$0, %rdx
###########################################
# Last iteration
mov $acc3, $t0
shl \$32, $acc3
mov %rdx, $acc2
mulq $t1
shr \$32, $t0
add $acc3, $acc0
adc $t0, $acc1
adc %rax, $acc2
adc \$0, %rdx
xor $acc3, $acc3
############################################
# Add the rest of the acc
add $acc0, $acc4
adc $acc1, $acc5
mov $acc4, $acc0
adc $acc2, $acc6
adc %rdx, $acc7
mov $acc5, $acc1
adc \$0, $acc3
sub \$-1, $acc4 # .Lpoly[0]
mov $acc6, $acc2
sbb $a_ptr, $acc5 # .Lpoly[1]
sbb \$0, $acc6 # .Lpoly[2]
mov $acc7, $t0
sbb $t1, $acc7 # .Lpoly[3]
sbb \$0, $acc3
cmovc $acc0, $acc4
cmovc $acc1, $acc5
mov $acc4, 8*0($r_ptr)
cmovc $acc2, $acc6
mov $acc5, 8*1($r_ptr)
cmovc $t0, $acc7
mov $acc6, 8*2($r_ptr)
mov $acc7, 8*3($r_ptr)
ret
.cfi_endproc
.size __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq
___
if ($addx) {
$code.=<<___;
.type __ecp_nistz256_mul_montx,\@abi-omnipotent
.align 32
__ecp_nistz256_mul_montx:
.cfi_startproc
########################################################################
# Multiply by b[0]
mulx $acc1, $acc0, $acc1
mulx $acc2, $t0, $acc2
mov \$32, $poly1
xor $acc5, $acc5 # cf=0
mulx $acc3, $t1, $acc3
mov .Lpoly+8*3(%rip), $poly3
adc $t0, $acc1
mulx $acc4, $t0, $acc4
mov $acc0, %rdx
adc $t1, $acc2
shlx $poly1,$acc0,$t1
adc $t0, $acc3
shrx $poly1,$acc0,$t0
adc \$0, $acc4
########################################################################
# First reduction step
add $t1, $acc1
adc $t0, $acc2
mulx $poly3, $t0, $t1
mov 8*1($b_ptr), %rdx
adc $t0, $acc3
adc $t1, $acc4
adc \$0, $acc5
xor $acc0, $acc0 # $acc0=0,cf=0,of=0
########################################################################
# Multiply by b[1]
mulx 8*0+128($a_ptr), $t0, $t1
adcx $t0, $acc1
adox $t1, $acc2
mulx 8*1+128($a_ptr), $t0, $t1
adcx $t0, $acc2
adox $t1, $acc3
mulx 8*2+128($a_ptr), $t0, $t1
adcx $t0, $acc3
adox $t1, $acc4
mulx 8*3+128($a_ptr), $t0, $t1
mov $acc1, %rdx
adcx $t0, $acc4
shlx $poly1, $acc1, $t0
adox $t1, $acc5
shrx $poly1, $acc1, $t1
adcx $acc0, $acc5
adox $acc0, $acc0
adc \$0, $acc0
########################################################################
# Second reduction step
add $t0, $acc2
adc $t1, $acc3
mulx $poly3, $t0, $t1
mov 8*2($b_ptr), %rdx
adc $t0, $acc4
adc $t1, $acc5
adc \$0, $acc0
xor $acc1, $acc1 # $acc1=0,cf=0,of=0
########################################################################
# Multiply by b[2]
mulx 8*0+128($a_ptr), $t0, $t1
adcx $t0, $acc2
adox $t1, $acc3
mulx 8*1+128($a_ptr), $t0, $t1
adcx $t0, $acc3
adox $t1, $acc4
mulx 8*2+128($a_ptr), $t0, $t1
adcx $t0, $acc4
adox $t1, $acc5
mulx 8*3+128($a_ptr), $t0, $t1
mov $acc2, %rdx
adcx $t0, $acc5
shlx $poly1, $acc2, $t0
adox $t1, $acc0
shrx $poly1, $acc2, $t1
adcx $acc1, $acc0
adox $acc1, $acc1
adc \$0, $acc1
########################################################################
# Third reduction step
add $t0, $acc3
adc $t1, $acc4
mulx $poly3, $t0, $t1
mov 8*3($b_ptr), %rdx
adc $t0, $acc5
adc $t1, $acc0
adc \$0, $acc1
xor $acc2, $acc2 # $acc2=0,cf=0,of=0
########################################################################
# Multiply by b[3]
mulx 8*0+128($a_ptr), $t0, $t1
adcx $t0, $acc3
adox $t1, $acc4
mulx 8*1+128($a_ptr), $t0, $t1
adcx $t0, $acc4
adox $t1, $acc5
mulx 8*2+128($a_ptr), $t0, $t1
adcx $t0, $acc5
adox $t1, $acc0
mulx 8*3+128($a_ptr), $t0, $t1
mov $acc3, %rdx
adcx $t0, $acc0
shlx $poly1, $acc3, $t0
adox $t1, $acc1
shrx $poly1, $acc3, $t1
adcx $acc2, $acc1
adox $acc2, $acc2
adc \$0, $acc2
########################################################################
# Fourth reduction step
add $t0, $acc4
adc $t1, $acc5
mulx $poly3, $t0, $t1
mov $acc4, $t2
mov .Lpoly+8*1(%rip), $poly1
adc $t0, $acc0
mov $acc5, $t3
adc $t1, $acc1
adc \$0, $acc2
########################################################################
# Branch-less conditional subtraction of P
xor %eax, %eax
mov $acc0, $t0
sbb \$-1, $acc4 # .Lpoly[0]
sbb $poly1, $acc5 # .Lpoly[1]
sbb \$0, $acc0 # .Lpoly[2]
mov $acc1, $t1
sbb $poly3, $acc1 # .Lpoly[3]
sbb \$0, $acc2
cmovc $t2, $acc4
cmovc $t3, $acc5
mov $acc4, 8*0($r_ptr)
cmovc $t0, $acc0
mov $acc5, 8*1($r_ptr)
cmovc $t1, $acc1
mov $acc0, 8*2($r_ptr)
mov $acc1, 8*3($r_ptr)
ret
.cfi_endproc
.size __ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx
.type __ecp_nistz256_sqr_montx,\@abi-omnipotent
.align 32
__ecp_nistz256_sqr_montx:
.cfi_startproc
mulx $acc6, $acc1, $acc2 # a[0]*a[1]
mulx $acc7, $t0, $acc3 # a[0]*a[2]
xor %eax, %eax
adc $t0, $acc2
mulx $acc0, $t1, $acc4 # a[0]*a[3]
mov $acc6, %rdx
adc $t1, $acc3
adc \$0, $acc4
xor $acc5, $acc5 # $acc5=0,cf=0,of=0
#################################
mulx $acc7, $t0, $t1 # a[1]*a[2]
adcx $t0, $acc3
adox $t1, $acc4
mulx $acc0, $t0, $t1 # a[1]*a[3]
mov $acc7, %rdx
adcx $t0, $acc4
adox $t1, $acc5
adc \$0, $acc5
#################################
mulx $acc0, $t0, $acc6 # a[2]*a[3]
mov 8*0+128($a_ptr), %rdx
xor $acc7, $acc7 # $acc7=0,cf=0,of=0
adcx $acc1, $acc1 # acc1:6<<1
adox $t0, $acc5
adcx $acc2, $acc2
adox $acc7, $acc6 # of=0
mulx %rdx, $acc0, $t1
mov 8*1+128($a_ptr), %rdx
adcx $acc3, $acc3
adox $t1, $acc1
adcx $acc4, $acc4
mulx %rdx, $t0, $t4
mov 8*2+128($a_ptr), %rdx
adcx $acc5, $acc5
adox $t0, $acc2
adcx $acc6, $acc6
.byte 0x67
mulx %rdx, $t0, $t1
mov 8*3+128($a_ptr), %rdx
adox $t4, $acc3
adcx $acc7, $acc7
adox $t0, $acc4
mov \$32, $a_ptr
adox $t1, $acc5
.byte 0x67,0x67
mulx %rdx, $t0, $t4
mov .Lpoly+8*3(%rip), %rdx
adox $t0, $acc6
shlx $a_ptr, $acc0, $t0
adox $t4, $acc7
shrx $a_ptr, $acc0, $t4
mov %rdx,$t1
# reduction step 1
add $t0, $acc1
adc $t4, $acc2
mulx $acc0, $t0, $acc0
adc $t0, $acc3
shlx $a_ptr, $acc1, $t0
adc \$0, $acc0
shrx $a_ptr, $acc1, $t4
# reduction step 2
add $t0, $acc2
adc $t4, $acc3
mulx $acc1, $t0, $acc1
adc $t0, $acc0
shlx $a_ptr, $acc2, $t0
adc \$0, $acc1
shrx $a_ptr, $acc2, $t4
# reduction step 3
add $t0, $acc3
adc $t4, $acc0
mulx $acc2, $t0, $acc2
adc $t0, $acc1
shlx $a_ptr, $acc3, $t0
adc \$0, $acc2
shrx $a_ptr, $acc3, $t4
# reduction step 4
add $t0, $acc0
adc $t4, $acc1
mulx $acc3, $t0, $acc3
adc $t0, $acc2
adc \$0, $acc3
xor $t3, $t3
add $acc0, $acc4 # accumulate upper half
mov .Lpoly+8*1(%rip), $a_ptr
adc $acc1, $acc5
mov $acc4, $acc0
adc $acc2, $acc6
adc $acc3, $acc7
mov $acc5, $acc1
adc \$0, $t3
sub \$-1, $acc4 # .Lpoly[0]
mov $acc6, $acc2
sbb $a_ptr, $acc5 # .Lpoly[1]
sbb \$0, $acc6 # .Lpoly[2]
mov $acc7, $acc3
sbb $t1, $acc7 # .Lpoly[3]
sbb \$0, $t3
cmovc $acc0, $acc4
cmovc $acc1, $acc5
mov $acc4, 8*0($r_ptr)
cmovc $acc2, $acc6
mov $acc5, 8*1($r_ptr)
cmovc $acc3, $acc7
mov $acc6, 8*2($r_ptr)
mov $acc7, 8*3($r_ptr)
ret
.cfi_endproc
.size __ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx
___
}