in src/crypto/fipsmodule/bn/asm/x86_64-mont.pl [143:805]
sub %r10,%r11
and \$-4096,%r11
lea (%r10,%r11),%rsp
mov (%rsp),%r11
cmp %r10,%rsp
ja .Lmul_page_walk
jmp .Lmul_page_walk_done
.align 16
.Lmul_page_walk:
lea -4096(%rsp),%rsp
mov (%rsp),%r11
cmp %r10,%rsp
ja .Lmul_page_walk
.Lmul_page_walk_done:
mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
.cfi_cfa_expression %rsp+8,$num,8,mul,plus,deref,+8
.Lmul_body:
mov $bp,%r12 # reassign $bp
___
$bp="%r12";
$code.=<<___;
mov ($n0),$n0 # pull n0[0] value
mov ($bp),$m0 # m0=bp[0]
mov ($ap),%rax
xor $i,$i # i=0
xor $j,$j # j=0
mov $n0,$m1
mulq $m0 # ap[0]*bp[0]
mov %rax,$lo0
mov ($np),%rax
imulq $lo0,$m1 # "tp[0]"*n0
mov %rdx,$hi0
mulq $m1 # np[0]*m1
add %rax,$lo0 # discarded
mov 8($ap),%rax
adc \$0,%rdx
mov %rdx,$hi1
lea 1($j),$j # j++
jmp .L1st_enter
.align 16
.L1st:
add %rax,$hi1
mov ($ap,$j,8),%rax
adc \$0,%rdx
add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
mov $lo0,$hi0
adc \$0,%rdx
mov $hi1,-16(%rsp,$j,8) # tp[j-1]
mov %rdx,$hi1
.L1st_enter:
mulq $m0 # ap[j]*bp[0]
add %rax,$hi0
mov ($np,$j,8),%rax
adc \$0,%rdx
lea 1($j),$j # j++
mov %rdx,$lo0
mulq $m1 # np[j]*m1
cmp $num,$j
jne .L1st
add %rax,$hi1
mov ($ap),%rax # ap[0]
adc \$0,%rdx
add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
adc \$0,%rdx
mov $hi1,-16(%rsp,$j,8) # tp[j-1]
mov %rdx,$hi1
mov $lo0,$hi0
xor %rdx,%rdx
add $hi0,$hi1
adc \$0,%rdx
mov $hi1,-8(%rsp,$num,8)
mov %rdx,(%rsp,$num,8) # store upmost overflow bit
lea 1($i),$i # i++
jmp .Louter
.align 16
.Louter:
mov ($bp,$i,8),$m0 # m0=bp[i]
xor $j,$j # j=0
mov $n0,$m1
mov (%rsp),$lo0
mulq $m0 # ap[0]*bp[i]
add %rax,$lo0 # ap[0]*bp[i]+tp[0]
mov ($np),%rax
adc \$0,%rdx
imulq $lo0,$m1 # tp[0]*n0
mov %rdx,$hi0
mulq $m1 # np[0]*m1
add %rax,$lo0 # discarded
mov 8($ap),%rax
adc \$0,%rdx
mov 8(%rsp),$lo0 # tp[1]
mov %rdx,$hi1
lea 1($j),$j # j++
jmp .Linner_enter
.align 16
.Linner:
add %rax,$hi1
mov ($ap,$j,8),%rax
adc \$0,%rdx
add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
mov (%rsp,$j,8),$lo0
adc \$0,%rdx
mov $hi1,-16(%rsp,$j,8) # tp[j-1]
mov %rdx,$hi1
.Linner_enter:
mulq $m0 # ap[j]*bp[i]
add %rax,$hi0
mov ($np,$j,8),%rax
adc \$0,%rdx
add $hi0,$lo0 # ap[j]*bp[i]+tp[j]
mov %rdx,$hi0
adc \$0,$hi0
lea 1($j),$j # j++
mulq $m1 # np[j]*m1
cmp $num,$j
jne .Linner
add %rax,$hi1
mov ($ap),%rax # ap[0]
adc \$0,%rdx
add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
mov (%rsp,$j,8),$lo0
adc \$0,%rdx
mov $hi1,-16(%rsp,$j,8) # tp[j-1]
mov %rdx,$hi1
xor %rdx,%rdx
add $hi0,$hi1
adc \$0,%rdx
add $lo0,$hi1 # pull upmost overflow bit
adc \$0,%rdx
mov $hi1,-8(%rsp,$num,8)
mov %rdx,(%rsp,$num,8) # store upmost overflow bit
lea 1($i),$i # i++
cmp $num,$i
jb .Louter
xor $i,$i # i=0 and clear CF!
mov (%rsp),%rax # tp[0]
mov $num,$j # j=num
.align 16
.Lsub: sbb ($np,$i,8),%rax
mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i]
mov 8(%rsp,$i,8),%rax # tp[i+1]
lea 1($i),$i # i++
dec $j # doesn't affect CF!
jnz .Lsub
sbb \$0,%rax # handle upmost overflow bit
mov \$-1,%rbx
xor %rax,%rbx # not %rax
xor $i,$i
mov $num,$j # j=num
.Lcopy: # conditional copy
mov ($rp,$i,8),%rcx
mov (%rsp,$i,8),%rdx
and %rbx,%rcx
and %rax,%rdx
mov $num,(%rsp,$i,8) # zap temporary vector
or %rcx,%rdx
mov %rdx,($rp,$i,8) # rp[i]=tp[i]
lea 1($i),$i
sub \$1,$j
jnz .Lcopy
mov 8(%rsp,$num,8),%rsi # restore %rsp
.cfi_def_cfa %rsi,8
mov \$1,%rax
mov -48(%rsi),%r15
.cfi_restore %r15
mov -40(%rsi),%r14
.cfi_restore %r14
mov -32(%rsi),%r13
.cfi_restore %r13
mov -24(%rsi),%r12
.cfi_restore %r12
mov -16(%rsi),%rbp
.cfi_restore %rbp
mov -8(%rsi),%rbx
.cfi_restore %rbx
lea (%rsi),%rsp
.cfi_def_cfa_register %rsp
.Lmul_epilogue:
ret
.cfi_endproc
.size bn_mul_mont,.-bn_mul_mont
___
# bn_mul4x_mont: emits the 4-way unrolled Montgomery multiplication routine.
#
# Going by the code and its inline comments, the routine builds a temporary
# vector tp[] on the stack: pass i adds ap[]*bp[i] and np[]*m1 to tp[] and
# drops the low word, where m1 = (low word of the running sum) * n0. After
# num passes it computes rp = tp - np and, if that subtraction borrowed,
# overwrites rp with tp instead. It returns 1 in %rax.
#
# The symbol is declared with six arguments (.type ...,6). The Perl variables
# naming them and the scratch registers ($rp, $ap, $bp, $np, $n0, $num, $i,
# $j, $m0, $m1) are assigned outside this block.
#
# Preconditions (not checked here): the loops handle two words up front, then
# groups of four, then two words in a tail, and the subtraction loop runs
# num/4-1 times with a dec/jnz test, so num must be a multiple of 4 and at
# least 8.
#
# Stack frame after allocation (8-byte words, base %rsp):
#   tp[0..num-1]  running result
#   tp[num]       topmost overflow word
#   tp[num+1]     stack pointer on entry (used by the epilogue and CFI)
#   tp[num+2]     saved $rp
# The allocation is 8*(num+4) bytes; the remaining word is never referenced
# in this block.
{{{
# A[] accumulates the ap[j]*bp[i] products, N[] the np[j]*m1 products.
# NOTE(review): N[1] is %rdi and $rp is spilled to the stack below, so $rp
# presumably lives in %rdi as well -- confirm against the register
# assignments at the top of the file.
my @A=("%r10","%r11");
my @N=("%r13","%rdi");
$code.=<<___;
.type bn_mul4x_mont,\@function,6
.align 16
bn_mul4x_mont:
.cfi_startproc
mov ${num}d,${num}d # 32-bit self-move clears the upper 32 bits of num
mov %rsp,%rax # keep the entry stack pointer; saved at tp[num+1] below
.cfi_def_cfa_register %rax
.Lmul4x_enter:
___
# Optional dispatch to the .Lmulx4x_enter path when both bits of 0x80100 are
# set in %r11d. %r11d is not written in this block before the test, so it has
# to be supplied by whatever jumps to .Lmul4x_enter.
# NOTE(review): 0x80100 looks like the BMI2 (bit 8) and ADX (bit 19) CPU
# feature bits -- confirm at the jump site.
$code.=<<___ if ($addx);
and \$0x80100,%r11d
cmp \$0x80100,%r11d
je .Lmulx4x_enter
___
# Prologue: save callee-saved registers, carve out the tp[] frame, and probe
# the stack one 4096-byte step at a time on the way down to it.
$code.=<<___;
push %rbx
.cfi_push %rbx
push %rbp
.cfi_push %rbp
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
neg $num
mov %rsp,%r11
lea -32(%rsp,$num,8),%r10 # future alloca(8*(num+4))
neg $num # restore
and \$-1024,%r10 # minimize TLB usage
# r10 is the final frame base (1024-byte aligned). Start at r10 plus a whole
# number of 4096-byte steps and walk down, loading one word per step.
# The loaded value is discarded; presumably the point is to touch every
# page in order (stack guard pages) -- confirm.
sub %r10,%r11
and \$-4096,%r11
lea (%r10,%r11),%rsp
mov (%rsp),%r11
cmp %r10,%rsp
ja .Lmul4x_page_walk
jmp .Lmul4x_page_walk_done
.Lmul4x_page_walk:
lea -4096(%rsp),%rsp
mov (%rsp),%r11
cmp %r10,%rsp
ja .Lmul4x_page_walk
.Lmul4x_page_walk_done:
mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
.cfi_cfa_expression %rsp+8,$num,8,mul,plus,deref,+8
.Lmul4x_body:
mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp
mov %rdx,%r12 # reassign $bp
___
# From here on $bp names %r12, which received the value of %rdx just above.
$bp="%r12";
# Main computation: pass 0 (.L1st4x), then passes 1..num-1 (.Louter4x with
# .Linner4x inside). Each mulq leaves its product in %rdx:%rax; the add/adc
# pairs fold the low half into an accumulator and the carry into %rdx, which
# becomes the next accumulator. The load of the next ap[]/np[] word into %rax
# is interleaved between them and does not touch flags.
$code.=<<___;
mov ($n0),$n0 # pull n0[0] value
mov ($bp),$m0 # m0=bp[0]
mov ($ap),%rax
xor $i,$i # i=0
xor $j,$j # j=0
# ---- pass 0, words 0 and 1 ----
mov $n0,$m1
mulq $m0 # ap[0]*bp[0]
mov %rax,$A[0]
mov ($np),%rax
imulq $A[0],$m1 # "tp[0]"*n0
mov %rdx,$A[1]
mulq $m1 # np[0]*m1
add %rax,$A[0] # discarded
mov 8($ap),%rax
adc \$0,%rdx
mov %rdx,$N[1]
mulq $m0
add %rax,$A[1]
mov 8($np),%rax
adc \$0,%rdx
mov %rdx,$A[0]
mulq $m1
add %rax,$N[1]
mov 16($ap),%rax
adc \$0,%rdx
add $A[1],$N[1]
lea 4($j),$j # j+=4
adc \$0,%rdx
mov $N[1],(%rsp)
mov %rdx,$N[0]
jmp .L1st4x
.align 16
# ---- pass 0, four words per iteration ----
# On entry rax already holds the next ap[] word. Each of the four steps
# multiplies it by m0, multiplies the matching np[] word by m1, sums both
# with the incoming carries and stores one word of tp[].
.L1st4x:
mulq $m0 # ap[j]*bp[0]
add %rax,$A[0]
mov -16($np,$j,8),%rax
adc \$0,%rdx
mov %rdx,$A[1]
mulq $m1 # np[j]*m1
add %rax,$N[0]
mov -8($ap,$j,8),%rax
adc \$0,%rdx
add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
adc \$0,%rdx
mov $N[0],-24(%rsp,$j,8) # tp[j-1]
mov %rdx,$N[1]
mulq $m0 # ap[j]*bp[0]
add %rax,$A[1]
mov -8($np,$j,8),%rax
adc \$0,%rdx
mov %rdx,$A[0]
mulq $m1 # np[j]*m1
add %rax,$N[1]
mov ($ap,$j,8),%rax
adc \$0,%rdx
add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
adc \$0,%rdx
mov $N[1],-16(%rsp,$j,8) # tp[j-1]
mov %rdx,$N[0]
mulq $m0 # ap[j]*bp[0]
add %rax,$A[0]
mov ($np,$j,8),%rax
adc \$0,%rdx
mov %rdx,$A[1]
mulq $m1 # np[j]*m1
add %rax,$N[0]
mov 8($ap,$j,8),%rax
adc \$0,%rdx
add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
adc \$0,%rdx
mov $N[0],-8(%rsp,$j,8) # tp[j-1]
mov %rdx,$N[1]
mulq $m0 # ap[j]*bp[0]
add %rax,$A[1]
mov 8($np,$j,8),%rax
adc \$0,%rdx
lea 4($j),$j # j+=4
mov %rdx,$A[0]
mulq $m1 # np[j]*m1
add %rax,$N[1]
mov -16($ap,$j,8),%rax
adc \$0,%rdx
add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
adc \$0,%rdx
mov $N[1],-32(%rsp,$j,8) # tp[j-1]
mov %rdx,$N[0]
cmp $num,$j
jb .L1st4x
# ---- pass 0, last two words and the top carry ----
mulq $m0 # ap[j]*bp[0]
add %rax,$A[0]
mov -16($np,$j,8),%rax
adc \$0,%rdx
mov %rdx,$A[1]
mulq $m1 # np[j]*m1
add %rax,$N[0]
mov -8($ap,$j,8),%rax
adc \$0,%rdx
add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
adc \$0,%rdx
mov $N[0],-24(%rsp,$j,8) # tp[j-1]
mov %rdx,$N[1]
mulq $m0 # ap[j]*bp[0]
add %rax,$A[1]
mov -8($np,$j,8),%rax
adc \$0,%rdx
mov %rdx,$A[0]
mulq $m1 # np[j]*m1
add %rax,$N[1]
mov ($ap),%rax # ap[0]
adc \$0,%rdx
add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
adc \$0,%rdx
mov $N[1],-16(%rsp,$j,8) # tp[j-1]
mov %rdx,$N[0]
xor $N[1],$N[1]
add $A[0],$N[0]
adc \$0,$N[1]
mov $N[0],-8(%rsp,$j,8)
mov $N[1],(%rsp,$j,8) # store upmost overflow bit
lea 1($i),$i # i++
.align 4
# ---- passes i=1..num-1: same shape as pass 0, plus the old tp[] words ----
# rax holds ap[0] on entry (reloaded at the end of the previous pass).
.Louter4x:
mov ($bp,$i,8),$m0 # m0=bp[i]
xor $j,$j # j=0
mov (%rsp),$A[0]
mov $n0,$m1
mulq $m0 # ap[0]*bp[i]
add %rax,$A[0] # ap[0]*bp[i]+tp[0]
mov ($np),%rax
adc \$0,%rdx
imulq $A[0],$m1 # tp[0]*n0
mov %rdx,$A[1]
mulq $m1 # np[0]*m1
add %rax,$A[0] # "$N[0]", discarded
mov 8($ap),%rax
adc \$0,%rdx
mov %rdx,$N[1]
mulq $m0 # ap[j]*bp[i]
add %rax,$A[1]
mov 8($np),%rax
adc \$0,%rdx
add 8(%rsp),$A[1] # +tp[1]
adc \$0,%rdx
mov %rdx,$A[0]
mulq $m1 # np[j]*m1
add %rax,$N[1]
mov 16($ap),%rax
adc \$0,%rdx
add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j]
lea 4($j),$j # j+=4
adc \$0,%rdx
mov $N[1],(%rsp) # tp[j-1]
mov %rdx,$N[0]
jmp .Linner4x
.align 16
# Four words per iteration. Each old tp[] word is read before the slot is
# overwritten: results are stored one word lower than the word just consumed.
.Linner4x:
mulq $m0 # ap[j]*bp[i]
add %rax,$A[0]
mov -16($np,$j,8),%rax
adc \$0,%rdx
add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
adc \$0,%rdx
mov %rdx,$A[1]
mulq $m1 # np[j]*m1
add %rax,$N[0]
mov -8($ap,$j,8),%rax
adc \$0,%rdx
add $A[0],$N[0]
adc \$0,%rdx
mov $N[0],-24(%rsp,$j,8) # tp[j-1]
mov %rdx,$N[1]
mulq $m0 # ap[j]*bp[i]
add %rax,$A[1]
mov -8($np,$j,8),%rax
adc \$0,%rdx
add -8(%rsp,$j,8),$A[1]
adc \$0,%rdx
mov %rdx,$A[0]
mulq $m1 # np[j]*m1
add %rax,$N[1]
mov ($ap,$j,8),%rax
adc \$0,%rdx
add $A[1],$N[1]
adc \$0,%rdx
mov $N[1],-16(%rsp,$j,8) # tp[j-1]
mov %rdx,$N[0]
mulq $m0 # ap[j]*bp[i]
add %rax,$A[0]
mov ($np,$j,8),%rax
adc \$0,%rdx
add (%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
adc \$0,%rdx
mov %rdx,$A[1]
mulq $m1 # np[j]*m1
add %rax,$N[0]
mov 8($ap,$j,8),%rax
adc \$0,%rdx
add $A[0],$N[0]
adc \$0,%rdx
mov $N[0],-8(%rsp,$j,8) # tp[j-1]
mov %rdx,$N[1]
mulq $m0 # ap[j]*bp[i]
add %rax,$A[1]
mov 8($np,$j,8),%rax
adc \$0,%rdx
add 8(%rsp,$j,8),$A[1]
adc \$0,%rdx
lea 4($j),$j # j+=4
mov %rdx,$A[0]
mulq $m1 # np[j]*m1
add %rax,$N[1]
mov -16($ap,$j,8),%rax
adc \$0,%rdx
add $A[1],$N[1]
adc \$0,%rdx
mov $N[1],-32(%rsp,$j,8) # tp[j-1]
mov %rdx,$N[0]
cmp $num,$j
jb .Linner4x
# Tail of pass i: last two words, then fold in the previous overflow word.
mulq $m0 # ap[j]*bp[i]
add %rax,$A[0]
mov -16($np,$j,8),%rax
adc \$0,%rdx
add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
adc \$0,%rdx
mov %rdx,$A[1]
mulq $m1 # np[j]*m1
add %rax,$N[0]
mov -8($ap,$j,8),%rax
adc \$0,%rdx
add $A[0],$N[0]
adc \$0,%rdx
mov $N[0],-24(%rsp,$j,8) # tp[j-1]
mov %rdx,$N[1]
mulq $m0 # ap[j]*bp[i]
add %rax,$A[1]
mov -8($np,$j,8),%rax
adc \$0,%rdx
add -8(%rsp,$j,8),$A[1]
adc \$0,%rdx
lea 1($i),$i # i++
mov %rdx,$A[0]
mulq $m1 # np[j]*m1
add %rax,$N[1]
mov ($ap),%rax # ap[0]
adc \$0,%rdx
add $A[1],$N[1]
adc \$0,%rdx
mov $N[1],-16(%rsp,$j,8) # tp[j-1]
mov %rdx,$N[0]
xor $N[1],$N[1]
add $A[0],$N[0]
adc \$0,$N[1]
add (%rsp,$num,8),$N[0] # pull upmost overflow bit
adc \$0,$N[1]
mov $N[0],-8(%rsp,$j,8)
mov $N[1],(%rsp,$j,8) # store upmost overflow bit
cmp $num,$i
jb .Louter4x
___
{
# Final reduction. ri[] is a four-register window used to stream tp[] through
# the subtraction; it reuses %rax, %rdx and the $m0/$m1 registers, which are
# no longer needed as multipliers.
my @ri=("%rax","%rdx",$m0,$m1);
$code.=<<___;
mov 16(%rsp,$num,8),$rp # restore $rp
lea -4($num),$j
mov 0(%rsp),@ri[0] # tp[0]
mov 8(%rsp),@ri[1] # tp[1]
shr \$2,$j # j=num/4-1
lea (%rsp),$ap # borrow ap for tp
xor $i,$i # i=0 and clear CF!
sub 0($np),@ri[0]
mov 16($ap),@ri[2] # tp[2]
mov 24($ap),@ri[3] # tp[3]
sbb 8($np),@ri[1]
# rp = tp - np, four words per iteration. The borrow is carried in CF across
# the whole loop: only sbb writes CF here, while mov and lea leave flags alone
# and dec leaves CF alone. Stores of finished words are interleaved with loads
# of the next tp[] words.
.Lsub4x:
mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
sbb 16($np,$i,8),@ri[2]
mov 32($ap,$i,8),@ri[0] # tp[i+1]
mov 40($ap,$i,8),@ri[1]
sbb 24($np,$i,8),@ri[3]
mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
sbb 32($np,$i,8),@ri[0]
mov 48($ap,$i,8),@ri[2]
mov 56($ap,$i,8),@ri[3]
sbb 40($np,$i,8),@ri[1]
lea 4($i),$i # i+=4
dec $j # doesn't affect CF!
jnz .Lsub4x
# Last four words, then subtract the final borrow from the overflow word
# tp[num]. The result is used below as a select mask.
mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
mov 32($ap,$i,8),@ri[0] # load overflow bit
sbb 16($np,$i,8),@ri[2]
mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
sbb 24($np,$i,8),@ri[3]
mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
sbb \$0,@ri[0] # handle upmost overflow bit
mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
pxor %xmm0,%xmm0 # xmm0 = 0, used to wipe tp[]
movq @ri[0],%xmm4
pcmpeqd %xmm5,%xmm5 # xmm5 = all ones
pshufd \$0,%xmm4,%xmm4 # xmm4 = low dword of the mask in all four lanes
mov $num,$j
pxor %xmm4,%xmm5 # xmm5 = complement of xmm4
shr \$2,$j # j=num/4
xor %eax,%eax # i=0
jmp .Lcopy4x
.align 16
# Branch-free select, 32 bytes (four words) per iteration, rax = byte offset:
#   rp = (tp AND xmm4) OR (rp AND xmm5)
# so rp keeps tp-np when the mask is zero and receives tp when it is all
# ones. Each tp[] chunk is overwritten with zeros once it has been read.
# The aligned movdqa accesses to the frame rely on rsp being 1024-byte
# aligned from the prologue.
.Lcopy4x: # conditional copy
movdqa (%rsp,%rax),%xmm1
movdqu ($rp,%rax),%xmm2
pand %xmm4,%xmm1
pand %xmm5,%xmm2
movdqa 16(%rsp,%rax),%xmm3
movdqa %xmm0,(%rsp,%rax)
por %xmm2,%xmm1
movdqu 16($rp,%rax),%xmm2
movdqu %xmm1,($rp,%rax)
pand %xmm4,%xmm3
pand %xmm5,%xmm2
movdqa %xmm0,16(%rsp,%rax)
por %xmm2,%xmm3
movdqu %xmm3,16($rp,%rax)
lea 32(%rax),%rax
dec $j
jnz .Lcopy4x
___
}
# Epilogue: fetch the entry stack pointer from tp[num+1], reload the six
# callee-saved registers pushed in the prologue, and return 1.
$code.=<<___;
mov 8(%rsp,$num,8),%rsi # restore %rsp
.cfi_def_cfa %rsi, 8
mov \$1,%rax
mov -48(%rsi),%r15
.cfi_restore %r15
mov -40(%rsi),%r14
.cfi_restore %r14
mov -32(%rsi),%r13
.cfi_restore %r13
mov -24(%rsi),%r12
.cfi_restore %r12
mov -16(%rsi),%rbp
.cfi_restore %rbp
mov -8(%rsi),%rbx
.cfi_restore %rbx
lea (%rsi),%rsp
.cfi_def_cfa_register %rsp
.Lmul4x_epilogue:
ret
.cfi_endproc
.size bn_mul4x_mont,.-bn_mul4x_mont
___
}}}