in src/crypto/fipsmodule/bn/asm/x86_64-mont5.pl [426:1067]
sub \$1,$j
jnz .Lcopy
mov 8(%rsp,$num,8),%rsi # restore %rsp
.cfi_def_cfa %rsi,8
mov \$1,%rax
mov -48(%rsi),%r15
.cfi_restore %r15
mov -40(%rsi),%r14
.cfi_restore %r14
mov -32(%rsi),%r13
.cfi_restore %r13
mov -24(%rsi),%r12
.cfi_restore %r12
mov -16(%rsi),%rbp
.cfi_restore %rbp
mov -8(%rsi),%rbx
.cfi_restore %rbx
lea (%rsi),%rsp
.cfi_def_cfa_register %rsp
.Lmul_epilogue:
ret
.cfi_endproc
.size bn_mul_mont_gather5,.-bn_mul_mont_gather5
___
{{{
my @A=("%r10","%r11");
my @N=("%r13","%rdi");
$code.=<<___;
.type bn_mul4x_mont_gather5,\@function,6
.align 32
bn_mul4x_mont_gather5:
.cfi_startproc
.byte 0x67
mov %rsp,%rax
.cfi_def_cfa_register %rax
.Lmul4x_enter:
___
$code.=<<___ if ($addx);
and \$0x80108,%r11d
cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1
je .Lmulx4x_enter
___
$code.=<<___;
push %rbx
.cfi_push %rbx
push %rbp
.cfi_push %rbp
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
.Lmul4x_prologue:
.byte 0x67
shl \$3,${num}d # convert $num to bytes
lea ($num,$num,2),%r10 # 3*$num in bytes
neg $num # -$num
##############################################################
# Ensure that stack frame doesn't alias with $rptr+3*$num
# modulo 4096, which covers ret[num], am[num] and n[num]
# (see bn_exp.c). This is done to allow memory disambiguation
# logic do its magic. [Extra [num] is allocated in order
# to align with bn_power5's frame, which is cleansed after
# completing exponentiation. Extra 256 bytes is for power mask
# calculated from 7th argument, the index.]
#
lea -320(%rsp,$num,2),%r11
mov %rsp,%rbp
sub $rp,%r11
and \$4095,%r11
cmp %r11,%r10
jb .Lmul4xsp_alt
sub %r11,%rbp # align with $rp
lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256)
jmp .Lmul4xsp_done
.align 32
.Lmul4xsp_alt:
lea 4096-320(,$num,2),%r10
lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256)
sub %r10,%r11
mov \$0,%r10
cmovc %r10,%r11
sub %r11,%rbp
.Lmul4xsp_done:
and \$-64,%rbp
mov %rsp,%r11
sub %rbp,%r11
and \$-4096,%r11
lea (%rbp,%r11),%rsp
mov (%rsp),%r10
cmp %rbp,%rsp
ja .Lmul4x_page_walk
jmp .Lmul4x_page_walk_done
.Lmul4x_page_walk:
lea -4096(%rsp),%rsp
mov (%rsp),%r10
cmp %rbp,%rsp
ja .Lmul4x_page_walk
.Lmul4x_page_walk_done:
neg $num
mov %rax,40(%rsp)
.cfi_cfa_expression %rsp+40,deref,+8
.Lmul4x_body:
call mul4x_internal
mov 40(%rsp),%rsi # restore %rsp
.cfi_def_cfa %rsi,8
mov \$1,%rax
mov -48(%rsi),%r15
.cfi_restore %r15
mov -40(%rsi),%r14
.cfi_restore %r14
mov -32(%rsi),%r13
.cfi_restore %r13
mov -24(%rsi),%r12
.cfi_restore %r12
mov -16(%rsi),%rbp
.cfi_restore %rbp
mov -8(%rsi),%rbx
.cfi_restore %rbx
lea (%rsi),%rsp
.cfi_def_cfa_register %rsp
.Lmul4x_epilogue:
ret
.cfi_endproc
.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
.type mul4x_internal,\@abi-omnipotent
.align 32
mul4x_internal:
.cfi_startproc
shl \$5,$num # $num was in bytes
movd `($win64?56:8)`(%rax),%xmm5 # load 7th argument, index
lea .Linc(%rip),%rax
lea 128(%rdx,$num),%r13 # end of powers table (+size optimization)
shr \$5,$num # restore $num
___
$bp="%r12";
$STRIDE=2**5*8; # 5 is "window size"
$N=$STRIDE/4; # should match cache line size
$tp=$i;
$code.=<<___;
movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000
movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002
lea 88-112(%rsp,$num),%r10 # place the mask after tp[num+1] (+ICache optimization)
lea 128(%rdx),$bp # size optimization
pshufd \$0,%xmm5,%xmm5 # broadcast index
movdqa %xmm1,%xmm4
.byte 0x67,0x67
movdqa %xmm1,%xmm2
___
########################################################################
# calculate mask by comparing 0..31 to index and save result to stack
#
$code.=<<___;
paddd %xmm0,%xmm1
pcmpeqd %xmm5,%xmm0 # compare to 1,0
.byte 0x67
movdqa %xmm4,%xmm3
___
for($i=0;$i<$STRIDE/16-4;$i+=4) {
$code.=<<___;
paddd %xmm1,%xmm2
pcmpeqd %xmm5,%xmm1 # compare to 3,2
movdqa %xmm0,`16*($i+0)+112`(%r10)
movdqa %xmm4,%xmm0
paddd %xmm2,%xmm3
pcmpeqd %xmm5,%xmm2 # compare to 5,4
movdqa %xmm1,`16*($i+1)+112`(%r10)
movdqa %xmm4,%xmm1
paddd %xmm3,%xmm0
pcmpeqd %xmm5,%xmm3 # compare to 7,6
movdqa %xmm2,`16*($i+2)+112`(%r10)
movdqa %xmm4,%xmm2
paddd %xmm0,%xmm1
pcmpeqd %xmm5,%xmm0
movdqa %xmm3,`16*($i+3)+112`(%r10)
movdqa %xmm4,%xmm3
___
}
$code.=<<___; # last iteration can be optimized
paddd %xmm1,%xmm2
pcmpeqd %xmm5,%xmm1
movdqa %xmm0,`16*($i+0)+112`(%r10)
paddd %xmm2,%xmm3
.byte 0x67
pcmpeqd %xmm5,%xmm2
movdqa %xmm1,`16*($i+1)+112`(%r10)
pcmpeqd %xmm5,%xmm3
movdqa %xmm2,`16*($i+2)+112`(%r10)
pand `16*($i+0)-128`($bp),%xmm0 # while it's still in register
pand `16*($i+1)-128`($bp),%xmm1
pand `16*($i+2)-128`($bp),%xmm2
movdqa %xmm3,`16*($i+3)+112`(%r10)
pand `16*($i+3)-128`($bp),%xmm3
por %xmm2,%xmm0
por %xmm3,%xmm1
___
for($i=0;$i<$STRIDE/16-4;$i+=4) {
$code.=<<___;
movdqa `16*($i+0)-128`($bp),%xmm4
movdqa `16*($i+1)-128`($bp),%xmm5
movdqa `16*($i+2)-128`($bp),%xmm2
pand `16*($i+0)+112`(%r10),%xmm4
movdqa `16*($i+3)-128`($bp),%xmm3
pand `16*($i+1)+112`(%r10),%xmm5
por %xmm4,%xmm0
pand `16*($i+2)+112`(%r10),%xmm2
por %xmm5,%xmm1
pand `16*($i+3)+112`(%r10),%xmm3
por %xmm2,%xmm0
por %xmm3,%xmm1
___
}
$code.=<<___;
por %xmm1,%xmm0
pshufd \$0x4e,%xmm0,%xmm1
por %xmm1,%xmm0
lea $STRIDE($bp),$bp
movq %xmm0,$m0 # m0=bp[0]
mov %r13,16+8(%rsp) # save end of b[num]
mov $rp, 56+8(%rsp) # save $rp
mov ($n0),$n0 # pull n0[0] value
mov ($ap),%rax
lea ($ap,$num),$ap # end of a[num]
neg $num
mov $n0,$m1
mulq $m0 # ap[0]*bp[0]
mov %rax,$A[0]
mov ($np),%rax
imulq $A[0],$m1 # "tp[0]"*n0
lea 64+8(%rsp),$tp
mov %rdx,$A[1]
mulq $m1 # np[0]*m1
add %rax,$A[0] # discarded
mov 8($ap,$num),%rax
adc \$0,%rdx
mov %rdx,$N[1]
mulq $m0
add %rax,$A[1]
mov 8*1($np),%rax
adc \$0,%rdx
mov %rdx,$A[0]
mulq $m1
add %rax,$N[1]
mov 16($ap,$num),%rax
adc \$0,%rdx
add $A[1],$N[1]
lea 4*8($num),$j # j=4
lea 8*4($np),$np
adc \$0,%rdx
mov $N[1],($tp)
mov %rdx,$N[0]
jmp .L1st4x
.align 32
.L1st4x:
mulq $m0 # ap[j]*bp[0]
add %rax,$A[0]
mov -8*2($np),%rax
lea 32($tp),$tp
adc \$0,%rdx
mov %rdx,$A[1]
mulq $m1 # np[j]*m1
add %rax,$N[0]
mov -8($ap,$j),%rax
adc \$0,%rdx
add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
adc \$0,%rdx
mov $N[0],-24($tp) # tp[j-1]
mov %rdx,$N[1]
mulq $m0 # ap[j]*bp[0]
add %rax,$A[1]
mov -8*1($np),%rax
adc \$0,%rdx
mov %rdx,$A[0]
mulq $m1 # np[j]*m1
add %rax,$N[1]
mov ($ap,$j),%rax
adc \$0,%rdx
add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
adc \$0,%rdx
mov $N[1],-16($tp) # tp[j-1]
mov %rdx,$N[0]
mulq $m0 # ap[j]*bp[0]
add %rax,$A[0]
mov 8*0($np),%rax
adc \$0,%rdx
mov %rdx,$A[1]
mulq $m1 # np[j]*m1
add %rax,$N[0]
mov 8($ap,$j),%rax
adc \$0,%rdx
add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
adc \$0,%rdx
mov $N[0],-8($tp) # tp[j-1]
mov %rdx,$N[1]
mulq $m0 # ap[j]*bp[0]
add %rax,$A[1]
mov 8*1($np),%rax
adc \$0,%rdx
mov %rdx,$A[0]
mulq $m1 # np[j]*m1
add %rax,$N[1]
mov 16($ap,$j),%rax
adc \$0,%rdx
add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
lea 8*4($np),$np
adc \$0,%rdx
mov $N[1],($tp) # tp[j-1]
mov %rdx,$N[0]
add \$32,$j # j+=4
jnz .L1st4x
mulq $m0 # ap[j]*bp[0]
add %rax,$A[0]
mov -8*2($np),%rax
lea 32($tp),$tp
adc \$0,%rdx
mov %rdx,$A[1]
mulq $m1 # np[j]*m1
add %rax,$N[0]
mov -8($ap),%rax
adc \$0,%rdx
add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
adc \$0,%rdx
mov $N[0],-24($tp) # tp[j-1]
mov %rdx,$N[1]
mulq $m0 # ap[j]*bp[0]
add %rax,$A[1]
mov -8*1($np),%rax
adc \$0,%rdx
mov %rdx,$A[0]
mulq $m1 # np[j]*m1
add %rax,$N[1]
mov ($ap,$num),%rax # ap[0]
adc \$0,%rdx
add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
adc \$0,%rdx
mov $N[1],-16($tp) # tp[j-1]
mov %rdx,$N[0]
lea ($np,$num),$np # rewind $np
xor $N[1],$N[1]
add $A[0],$N[0]
adc \$0,$N[1]
mov $N[0],-8($tp)
jmp .Louter4x
.align 32
.Louter4x:
lea 16+128($tp),%rdx # where 256-byte mask is (+size optimization)
pxor %xmm4,%xmm4
pxor %xmm5,%xmm5
___
for($i=0;$i<$STRIDE/16;$i+=4) {
$code.=<<___;
movdqa `16*($i+0)-128`($bp),%xmm0
movdqa `16*($i+1)-128`($bp),%xmm1
movdqa `16*($i+2)-128`($bp),%xmm2
movdqa `16*($i+3)-128`($bp),%xmm3
pand `16*($i+0)-128`(%rdx),%xmm0
pand `16*($i+1)-128`(%rdx),%xmm1
por %xmm0,%xmm4
pand `16*($i+2)-128`(%rdx),%xmm2
por %xmm1,%xmm5
pand `16*($i+3)-128`(%rdx),%xmm3
por %xmm2,%xmm4
por %xmm3,%xmm5
___
}
$code.=<<___;
por %xmm5,%xmm4
pshufd \$0x4e,%xmm4,%xmm0
por %xmm4,%xmm0
lea $STRIDE($bp),$bp
movq %xmm0,$m0 # m0=bp[i]
mov ($tp,$num),$A[0]
mov $n0,$m1
mulq $m0 # ap[0]*bp[i]
add %rax,$A[0] # ap[0]*bp[i]+tp[0]
mov ($np),%rax
adc \$0,%rdx
imulq $A[0],$m1 # tp[0]*n0
mov %rdx,$A[1]
mov $N[1],($tp) # store upmost overflow bit
lea ($tp,$num),$tp # rewind $tp
mulq $m1 # np[0]*m1
add %rax,$A[0] # "$N[0]", discarded
mov 8($ap,$num),%rax
adc \$0,%rdx
mov %rdx,$N[1]
mulq $m0 # ap[j]*bp[i]
add %rax,$A[1]
mov 8*1($np),%rax
adc \$0,%rdx
add 8($tp),$A[1] # +tp[1]
adc \$0,%rdx
mov %rdx,$A[0]
mulq $m1 # np[j]*m1
add %rax,$N[1]
mov 16($ap,$num),%rax
adc \$0,%rdx
add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j]
lea 4*8($num),$j # j=4
lea 8*4($np),$np
adc \$0,%rdx
mov %rdx,$N[0]
jmp .Linner4x
.align 32
.Linner4x:
mulq $m0 # ap[j]*bp[i]
add %rax,$A[0]
mov -8*2($np),%rax
adc \$0,%rdx
add 16($tp),$A[0] # ap[j]*bp[i]+tp[j]
lea 32($tp),$tp
adc \$0,%rdx
mov %rdx,$A[1]
mulq $m1 # np[j]*m1
add %rax,$N[0]
mov -8($ap,$j),%rax
adc \$0,%rdx
add $A[0],$N[0]
adc \$0,%rdx
mov $N[1],-32($tp) # tp[j-1]
mov %rdx,$N[1]
mulq $m0 # ap[j]*bp[i]
add %rax,$A[1]
mov -8*1($np),%rax
adc \$0,%rdx
add -8($tp),$A[1]
adc \$0,%rdx
mov %rdx,$A[0]
mulq $m1 # np[j]*m1
add %rax,$N[1]
mov ($ap,$j),%rax
adc \$0,%rdx
add $A[1],$N[1]
adc \$0,%rdx
mov $N[0],-24($tp) # tp[j-1]
mov %rdx,$N[0]
mulq $m0 # ap[j]*bp[i]
add %rax,$A[0]
mov 8*0($np),%rax
adc \$0,%rdx
add ($tp),$A[0] # ap[j]*bp[i]+tp[j]
adc \$0,%rdx
mov %rdx,$A[1]
mulq $m1 # np[j]*m1
add %rax,$N[0]
mov 8($ap,$j),%rax
adc \$0,%rdx
add $A[0],$N[0]
adc \$0,%rdx
mov $N[1],-16($tp) # tp[j-1]
mov %rdx,$N[1]
mulq $m0 # ap[j]*bp[i]
add %rax,$A[1]
mov 8*1($np),%rax
adc \$0,%rdx
add 8($tp),$A[1]
adc \$0,%rdx
mov %rdx,$A[0]
mulq $m1 # np[j]*m1
add %rax,$N[1]
mov 16($ap,$j),%rax
adc \$0,%rdx
add $A[1],$N[1]
lea 8*4($np),$np
adc \$0,%rdx
mov $N[0],-8($tp) # tp[j-1]
mov %rdx,$N[0]
add \$32,$j # j+=4
jnz .Linner4x
mulq $m0 # ap[j]*bp[i]
add %rax,$A[0]
mov -8*2($np),%rax
adc \$0,%rdx
add 16($tp),$A[0] # ap[j]*bp[i]+tp[j]
lea 32($tp),$tp
adc \$0,%rdx
mov %rdx,$A[1]
mulq $m1 # np[j]*m1
add %rax,$N[0]
mov -8($ap),%rax
adc \$0,%rdx
add $A[0],$N[0]
adc \$0,%rdx
mov $N[1],-32($tp) # tp[j-1]
mov %rdx,$N[1]
mulq $m0 # ap[j]*bp[i]
add %rax,$A[1]
mov $m1,%rax
mov -8*1($np),$m1
adc \$0,%rdx
add -8($tp),$A[1]
adc \$0,%rdx
mov %rdx,$A[0]
mulq $m1 # np[j]*m1
add %rax,$N[1]
mov ($ap,$num),%rax # ap[0]
adc \$0,%rdx
add $A[1],$N[1]
adc \$0,%rdx
mov $N[0],-24($tp) # tp[j-1]
mov %rdx,$N[0]
mov $N[1],-16($tp) # tp[j-1]
lea ($np,$num),$np # rewind $np
xor $N[1],$N[1]
add $A[0],$N[0]
adc \$0,$N[1]
add ($tp),$N[0] # pull upmost overflow bit
adc \$0,$N[1] # upmost overflow bit
mov $N[0],-8($tp)
cmp 16+8(%rsp),$bp
jb .Louter4x
___
if (1) {
$code.=<<___;
xor %rax,%rax
sub $N[0],$m1 # compare top-most words
adc $j,$j # $j is zero
or $j,$N[1]
sub $N[1],%rax # %rax=-$N[1]
lea ($tp,$num),%rbx # tptr in .sqr4x_sub
mov ($np),%r12
lea ($np),%rbp # nptr in .sqr4x_sub
mov %r9,%rcx
sar \$3+2,%rcx
mov 56+8(%rsp),%rdi # rptr in .sqr4x_sub
dec %r12 # so that after 'not' we get -n[0]
xor %r10,%r10
mov 8*1(%rbp),%r13
mov 8*2(%rbp),%r14
mov 8*3(%rbp),%r15
jmp .Lsqr4x_sub_entry
___
} else {
my @ri=("%rax",$bp,$m0,$m1);
my $rp="%rdx";
$code.=<<___
xor \$1,$N[1]
lea ($tp,$num),$tp # rewind $tp
sar \$5,$num # cf=0
lea ($np,$N[1],8),$np
mov 56+8(%rsp),$rp # restore $rp
jmp .Lsub4x
.align 32
.Lsub4x:
.byte 0x66
mov 8*0($tp),@ri[0]
mov 8*1($tp),@ri[1]
.byte 0x66
sbb 16*0($np),@ri[0]
mov 8*2($tp),@ri[2]
sbb 16*1($np),@ri[1]
mov 3*8($tp),@ri[3]
lea 4*8($tp),$tp
sbb 16*2($np),@ri[2]
mov @ri[0],8*0($rp)
sbb 16*3($np),@ri[3]
lea 16*4($np),$np
mov @ri[1],8*1($rp)
mov @ri[2],8*2($rp)
mov @ri[3],8*3($rp)
lea 8*4($rp),$rp
inc $num
jnz .Lsub4x
ret
___
}
$code.=<<___;
.cfi_endproc
.size mul4x_internal,.-mul4x_internal
___
}}}