in src/crypto/fipsmodule/bn/asm/rsaz-avx2.pl [901:1467]
sub \$64,%rsp
# An unaligned 256-bit load that crosses a page boundary can
# cause severe performance degradation here, so if $ap does
# cross a page boundary, swap it with $bp [meaning that the
# caller is advised to lay down $ap and $bp next to each other,
# so that only one of them can cross a page boundary].
.byte 0x67,0x67
mov $ap, $tmp
and \$4095, $tmp
add \$32*10, $tmp
shr \$12, $tmp
mov $ap, $tmp
cmovnz $bp, $ap
cmovnz $tmp, $bp
mov $np, $tmp
sub \$-128,$ap # size optimization
sub \$-128,$np
sub \$-128,$rp
and \$4095, $tmp # see if $np crosses page
add \$32*10, $tmp
.byte 0x67,0x67
shr \$12, $tmp
jz .Lmul_1024_no_n_copy
# An unaligned 256-bit load that crosses a page boundary can
# cause severe performance degradation here, so if $np does
# cross a page boundary, copy it to the stack and make sure the
# stack copy does not cross a page boundary itself.
sub \$32*10,%rsp
vmovdqu 32*0-128($np), $ACC0
and \$-512, %rsp
vmovdqu 32*1-128($np), $ACC1
vmovdqu 32*2-128($np), $ACC2
vmovdqu 32*3-128($np), $ACC3
vmovdqu 32*4-128($np), $ACC4
vmovdqu 32*5-128($np), $ACC5
vmovdqu 32*6-128($np), $ACC6
vmovdqu 32*7-128($np), $ACC7
vmovdqu 32*8-128($np), $ACC8
lea 64+128(%rsp),$np
vmovdqu $ACC0, 32*0-128($np)
vpxor $ACC0, $ACC0, $ACC0
vmovdqu $ACC1, 32*1-128($np)
vpxor $ACC1, $ACC1, $ACC1
vmovdqu $ACC2, 32*2-128($np)
vpxor $ACC2, $ACC2, $ACC2
vmovdqu $ACC3, 32*3-128($np)
vpxor $ACC3, $ACC3, $ACC3
vmovdqu $ACC4, 32*4-128($np)
vpxor $ACC4, $ACC4, $ACC4
vmovdqu $ACC5, 32*5-128($np)
vpxor $ACC5, $ACC5, $ACC5
vmovdqu $ACC6, 32*6-128($np)
vpxor $ACC6, $ACC6, $ACC6
vmovdqu $ACC7, 32*7-128($np)
vpxor $ACC7, $ACC7, $ACC7
vmovdqu $ACC8, 32*8-128($np)
vmovdqa $ACC0, $ACC8
vmovdqu $ACC9, 32*9-128($np) # $ACC9 is zero after vzeroall
.Lmul_1024_no_n_copy:
and \$-64,%rsp
mov ($bp), %rbx
vpbroadcastq ($bp), $Bi
vmovdqu $ACC0, (%rsp) # clear top of stack
xor $r0, $r0
.byte 0x67
xor $r1, $r1
xor $r2, $r2
xor $r3, $r3
vmovdqu .Land_mask(%rip), $AND_MASK
mov \$9, $i
vmovdqu $ACC9, 32*9-128($rp) # $ACC9 is zero after vzeroall
jmp .Loop_mul_1024
.align 32
.Loop_mul_1024:
vpsrlq \$29, $ACC3, $ACC9 # correct $ACC3(*)
mov %rbx, %rax
imulq -128($ap), %rax
add $r0, %rax
mov %rbx, $r1
imulq 8-128($ap), $r1
add 8(%rsp), $r1
mov %rax, $r0
imull $n0, %eax
and \$0x1fffffff, %eax
mov %rbx, $r2
imulq 16-128($ap), $r2
add 16(%rsp), $r2
mov %rbx, $r3
imulq 24-128($ap), $r3
add 24(%rsp), $r3
vpmuludq 32*1-128($ap),$Bi,$TEMP0
vmovd %eax, $Yi
vpaddq $TEMP0,$ACC1,$ACC1
vpmuludq 32*2-128($ap),$Bi,$TEMP1
vpbroadcastq $Yi, $Yi
vpaddq $TEMP1,$ACC2,$ACC2
vpmuludq 32*3-128($ap),$Bi,$TEMP2
vpand $AND_MASK, $ACC3, $ACC3 # correct $ACC3
vpaddq $TEMP2,$ACC3,$ACC3
vpmuludq 32*4-128($ap),$Bi,$TEMP0
vpaddq $TEMP0,$ACC4,$ACC4
vpmuludq 32*5-128($ap),$Bi,$TEMP1
vpaddq $TEMP1,$ACC5,$ACC5
vpmuludq 32*6-128($ap),$Bi,$TEMP2
vpaddq $TEMP2,$ACC6,$ACC6
vpmuludq 32*7-128($ap),$Bi,$TEMP0
vpermq \$0x93, $ACC9, $ACC9 # correct $ACC3
vpaddq $TEMP0,$ACC7,$ACC7
vpmuludq 32*8-128($ap),$Bi,$TEMP1
vpbroadcastq 8($bp), $Bi
vpaddq $TEMP1,$ACC8,$ACC8
mov %rax,%rdx
imulq -128($np),%rax
add %rax,$r0
mov %rdx,%rax
imulq 8-128($np),%rax
add %rax,$r1
mov %rdx,%rax
imulq 16-128($np),%rax
add %rax,$r2
shr \$29, $r0
imulq 24-128($np),%rdx
add %rdx,$r3
add $r0, $r1
vpmuludq 32*1-128($np),$Yi,$TEMP2
vmovq $Bi, %rbx
vpaddq $TEMP2,$ACC1,$ACC1
vpmuludq 32*2-128($np),$Yi,$TEMP0
vpaddq $TEMP0,$ACC2,$ACC2
vpmuludq 32*3-128($np),$Yi,$TEMP1
vpaddq $TEMP1,$ACC3,$ACC3
vpmuludq 32*4-128($np),$Yi,$TEMP2
vpaddq $TEMP2,$ACC4,$ACC4
vpmuludq 32*5-128($np),$Yi,$TEMP0
vpaddq $TEMP0,$ACC5,$ACC5
vpmuludq 32*6-128($np),$Yi,$TEMP1
vpaddq $TEMP1,$ACC6,$ACC6
vpmuludq 32*7-128($np),$Yi,$TEMP2
vpblendd \$3, $ZERO, $ACC9, $TEMP1 # correct $ACC3
vpaddq $TEMP2,$ACC7,$ACC7
vpmuludq 32*8-128($np),$Yi,$TEMP0
vpaddq $TEMP1, $ACC3, $ACC3 # correct $ACC3
vpaddq $TEMP0,$ACC8,$ACC8
mov %rbx, %rax
imulq -128($ap),%rax
add %rax,$r1
vmovdqu -8+32*1-128($ap),$TEMP1
mov %rbx, %rax
imulq 8-128($ap),%rax
add %rax,$r2
vmovdqu -8+32*2-128($ap),$TEMP2
mov $r1, %rax
vpblendd \$0xfc, $ZERO, $ACC9, $ACC9 # correct $ACC3
imull $n0, %eax
vpaddq $ACC9,$ACC4,$ACC4 # correct $ACC3
and \$0x1fffffff, %eax
imulq 16-128($ap),%rbx
add %rbx,$r3
vpmuludq $Bi,$TEMP1,$TEMP1
vmovd %eax, $Yi
vmovdqu -8+32*3-128($ap),$TEMP0
vpaddq $TEMP1,$ACC1,$ACC1
vpmuludq $Bi,$TEMP2,$TEMP2
vpbroadcastq $Yi, $Yi
vmovdqu -8+32*4-128($ap),$TEMP1
vpaddq $TEMP2,$ACC2,$ACC2
vpmuludq $Bi,$TEMP0,$TEMP0
vmovdqu -8+32*5-128($ap),$TEMP2
vpaddq $TEMP0,$ACC3,$ACC3
vpmuludq $Bi,$TEMP1,$TEMP1
vmovdqu -8+32*6-128($ap),$TEMP0
vpaddq $TEMP1,$ACC4,$ACC4
vpmuludq $Bi,$TEMP2,$TEMP2
vmovdqu -8+32*7-128($ap),$TEMP1
vpaddq $TEMP2,$ACC5,$ACC5
vpmuludq $Bi,$TEMP0,$TEMP0
vmovdqu -8+32*8-128($ap),$TEMP2
vpaddq $TEMP0,$ACC6,$ACC6
vpmuludq $Bi,$TEMP1,$TEMP1
vmovdqu -8+32*9-128($ap),$ACC9
vpaddq $TEMP1,$ACC7,$ACC7
vpmuludq $Bi,$TEMP2,$TEMP2
vpaddq $TEMP2,$ACC8,$ACC8
vpmuludq $Bi,$ACC9,$ACC9
vpbroadcastq 16($bp), $Bi
mov %rax,%rdx
imulq -128($np),%rax
add %rax,$r1
vmovdqu -8+32*1-128($np),$TEMP0
mov %rdx,%rax
imulq 8-128($np),%rax
add %rax,$r2
vmovdqu -8+32*2-128($np),$TEMP1
shr \$29, $r1
imulq 16-128($np),%rdx
add %rdx,$r3
add $r1, $r2
vpmuludq $Yi,$TEMP0,$TEMP0
vmovq $Bi, %rbx
vmovdqu -8+32*3-128($np),$TEMP2
vpaddq $TEMP0,$ACC1,$ACC1
vpmuludq $Yi,$TEMP1,$TEMP1
vmovdqu -8+32*4-128($np),$TEMP0
vpaddq $TEMP1,$ACC2,$ACC2
vpmuludq $Yi,$TEMP2,$TEMP2
vmovdqu -8+32*5-128($np),$TEMP1
vpaddq $TEMP2,$ACC3,$ACC3
vpmuludq $Yi,$TEMP0,$TEMP0
vmovdqu -8+32*6-128($np),$TEMP2
vpaddq $TEMP0,$ACC4,$ACC4
vpmuludq $Yi,$TEMP1,$TEMP1
vmovdqu -8+32*7-128($np),$TEMP0
vpaddq $TEMP1,$ACC5,$ACC5
vpmuludq $Yi,$TEMP2,$TEMP2
vmovdqu -8+32*8-128($np),$TEMP1
vpaddq $TEMP2,$ACC6,$ACC6
vpmuludq $Yi,$TEMP0,$TEMP0
vmovdqu -8+32*9-128($np),$TEMP2
vpaddq $TEMP0,$ACC7,$ACC7
vpmuludq $Yi,$TEMP1,$TEMP1
vpaddq $TEMP1,$ACC8,$ACC8
vpmuludq $Yi,$TEMP2,$TEMP2
vpaddq $TEMP2,$ACC9,$ACC9
vmovdqu -16+32*1-128($ap),$TEMP0
mov %rbx,%rax
imulq -128($ap),%rax
add $r2,%rax
vmovdqu -16+32*2-128($ap),$TEMP1
mov %rax,$r2
imull $n0, %eax
and \$0x1fffffff, %eax
imulq 8-128($ap),%rbx
add %rbx,$r3
vpmuludq $Bi,$TEMP0,$TEMP0
vmovd %eax, $Yi
vmovdqu -16+32*3-128($ap),$TEMP2
vpaddq $TEMP0,$ACC1,$ACC1
vpmuludq $Bi,$TEMP1,$TEMP1
vpbroadcastq $Yi, $Yi
vmovdqu -16+32*4-128($ap),$TEMP0
vpaddq $TEMP1,$ACC2,$ACC2
vpmuludq $Bi,$TEMP2,$TEMP2
vmovdqu -16+32*5-128($ap),$TEMP1
vpaddq $TEMP2,$ACC3,$ACC3
vpmuludq $Bi,$TEMP0,$TEMP0
vmovdqu -16+32*6-128($ap),$TEMP2
vpaddq $TEMP0,$ACC4,$ACC4
vpmuludq $Bi,$TEMP1,$TEMP1
vmovdqu -16+32*7-128($ap),$TEMP0
vpaddq $TEMP1,$ACC5,$ACC5
vpmuludq $Bi,$TEMP2,$TEMP2
vmovdqu -16+32*8-128($ap),$TEMP1
vpaddq $TEMP2,$ACC6,$ACC6
vpmuludq $Bi,$TEMP0,$TEMP0
vmovdqu -16+32*9-128($ap),$TEMP2
vpaddq $TEMP0,$ACC7,$ACC7
vpmuludq $Bi,$TEMP1,$TEMP1
vpaddq $TEMP1,$ACC8,$ACC8
vpmuludq $Bi,$TEMP2,$TEMP2
vpbroadcastq 24($bp), $Bi
vpaddq $TEMP2,$ACC9,$ACC9
vmovdqu -16+32*1-128($np),$TEMP0
mov %rax,%rdx
imulq -128($np),%rax
add %rax,$r2
vmovdqu -16+32*2-128($np),$TEMP1
imulq 8-128($np),%rdx
add %rdx,$r3
shr \$29, $r2
vpmuludq $Yi,$TEMP0,$TEMP0
vmovq $Bi, %rbx
vmovdqu -16+32*3-128($np),$TEMP2
vpaddq $TEMP0,$ACC1,$ACC1
vpmuludq $Yi,$TEMP1,$TEMP1
vmovdqu -16+32*4-128($np),$TEMP0
vpaddq $TEMP1,$ACC2,$ACC2
vpmuludq $Yi,$TEMP2,$TEMP2
vmovdqu -16+32*5-128($np),$TEMP1
vpaddq $TEMP2,$ACC3,$ACC3
vpmuludq $Yi,$TEMP0,$TEMP0
vmovdqu -16+32*6-128($np),$TEMP2
vpaddq $TEMP0,$ACC4,$ACC4
vpmuludq $Yi,$TEMP1,$TEMP1
vmovdqu -16+32*7-128($np),$TEMP0
vpaddq $TEMP1,$ACC5,$ACC5
vpmuludq $Yi,$TEMP2,$TEMP2
vmovdqu -16+32*8-128($np),$TEMP1
vpaddq $TEMP2,$ACC6,$ACC6
vpmuludq $Yi,$TEMP0,$TEMP0
vmovdqu -16+32*9-128($np),$TEMP2
vpaddq $TEMP0,$ACC7,$ACC7
vpmuludq $Yi,$TEMP1,$TEMP1
vmovdqu -24+32*1-128($ap),$TEMP0
vpaddq $TEMP1,$ACC8,$ACC8
vpmuludq $Yi,$TEMP2,$TEMP2
vmovdqu -24+32*2-128($ap),$TEMP1
vpaddq $TEMP2,$ACC9,$ACC9
add $r2, $r3
imulq -128($ap),%rbx
add %rbx,$r3
mov $r3, %rax
imull $n0, %eax
and \$0x1fffffff, %eax
vpmuludq $Bi,$TEMP0,$TEMP0
vmovd %eax, $Yi
vmovdqu -24+32*3-128($ap),$TEMP2
vpaddq $TEMP0,$ACC1,$ACC1
vpmuludq $Bi,$TEMP1,$TEMP1
vpbroadcastq $Yi, $Yi
vmovdqu -24+32*4-128($ap),$TEMP0
vpaddq $TEMP1,$ACC2,$ACC2
vpmuludq $Bi,$TEMP2,$TEMP2
vmovdqu -24+32*5-128($ap),$TEMP1
vpaddq $TEMP2,$ACC3,$ACC3
vpmuludq $Bi,$TEMP0,$TEMP0
vmovdqu -24+32*6-128($ap),$TEMP2
vpaddq $TEMP0,$ACC4,$ACC4
vpmuludq $Bi,$TEMP1,$TEMP1
vmovdqu -24+32*7-128($ap),$TEMP0
vpaddq $TEMP1,$ACC5,$ACC5
vpmuludq $Bi,$TEMP2,$TEMP2
vmovdqu -24+32*8-128($ap),$TEMP1
vpaddq $TEMP2,$ACC6,$ACC6
vpmuludq $Bi,$TEMP0,$TEMP0
vmovdqu -24+32*9-128($ap),$TEMP2
vpaddq $TEMP0,$ACC7,$ACC7
vpmuludq $Bi,$TEMP1,$TEMP1
vpaddq $TEMP1,$ACC8,$ACC8
vpmuludq $Bi,$TEMP2,$TEMP2
vpbroadcastq 32($bp), $Bi
vpaddq $TEMP2,$ACC9,$ACC9
add \$32, $bp # $bp++
vmovdqu -24+32*1-128($np),$TEMP0
imulq -128($np),%rax
add %rax,$r3
shr \$29, $r3
vmovdqu -24+32*2-128($np),$TEMP1
vpmuludq $Yi,$TEMP0,$TEMP0
vmovq $Bi, %rbx
vmovdqu -24+32*3-128($np),$TEMP2
vpaddq $TEMP0,$ACC1,$ACC0 # $ACC0==$TEMP0
vpmuludq $Yi,$TEMP1,$TEMP1
vmovdqu $ACC0, (%rsp) # transfer $r0-$r3
vpaddq $TEMP1,$ACC2,$ACC1
vmovdqu -24+32*4-128($np),$TEMP0
vpmuludq $Yi,$TEMP2,$TEMP2
vmovdqu -24+32*5-128($np),$TEMP1
vpaddq $TEMP2,$ACC3,$ACC2
vpmuludq $Yi,$TEMP0,$TEMP0
vmovdqu -24+32*6-128($np),$TEMP2
vpaddq $TEMP0,$ACC4,$ACC3
vpmuludq $Yi,$TEMP1,$TEMP1
vmovdqu -24+32*7-128($np),$TEMP0
vpaddq $TEMP1,$ACC5,$ACC4
vpmuludq $Yi,$TEMP2,$TEMP2
vmovdqu -24+32*8-128($np),$TEMP1
vpaddq $TEMP2,$ACC6,$ACC5
vpmuludq $Yi,$TEMP0,$TEMP0
vmovdqu -24+32*9-128($np),$TEMP2
mov $r3, $r0
vpaddq $TEMP0,$ACC7,$ACC6
vpmuludq $Yi,$TEMP1,$TEMP1
add (%rsp), $r0
vpaddq $TEMP1,$ACC8,$ACC7
vpmuludq $Yi,$TEMP2,$TEMP2
vmovq $r3, $TEMP1
vpaddq $TEMP2,$ACC9,$ACC8
dec $i
jnz .Loop_mul_1024
___
# (*) The original implementation corrected ACC1-ACC3 for overflow
# after 7 loop runs, i.e. after 28 iterations or 56 additions.
# Because execution resources are underutilized, it is possible to
# correct in every iteration with only marginal performance loss.
# And since the correction is done in every iteration, fewer digits
# need correcting, which avoids the performance penalty completely.
# Tail of the multiplication, part 1: normalize the four low accumulators
# (ACC0-ACC3) and store them to the result buffer.
# The registers behind ACC9, Bi and Yi are re-used as scratch (TEMP0,
# TEMP3, TEMP4) in the code emitted from here on; each of them is written
# before it is read below.
$TEMP0 = $ACC9;
$TEMP3 = $Bi;
$TEMP4 = $Yi;
$code.=<<___;
# Rebuild $ACC0 from the vector saved at (%rsp) at the end of the loop
# body plus $TEMP1, whose low lane was loaded from $r3 right before the
# loop branch.
vpaddq (%rsp), $TEMP1, $ACC0
# Carry propagation, pass 1. For every accumulator the bits above bit 28
# of each 64-bit lane are extracted as the carry and the lane is reduced
# with $AND_MASK (presumably 2^29-1 per lane; the constant is defined
# outside this chunk). vpermq with 0x93 rotates the carries up by one
# lane, so the carry of lane 3 wraps around into lane 0. vpblendd with
# mask 3 takes lane 0 from its first source and lanes 1-3 from its second
# source, which stitches the wrapped carry of the previous accumulator
# together with the in-register carries of the current one. $ZERO is
# assumed to be all-zero (it is set up outside this chunk).
vpsrlq \$29, $ACC0, $TEMP1
vpand $AND_MASK, $ACC0, $ACC0
vpsrlq \$29, $ACC1, $TEMP2
vpand $AND_MASK, $ACC1, $ACC1
vpsrlq \$29, $ACC2, $TEMP3
vpermq \$0x93, $TEMP1, $TEMP1
vpand $AND_MASK, $ACC2, $ACC2
vpsrlq \$29, $ACC3, $TEMP4
vpermq \$0x93, $TEMP2, $TEMP2
vpand $AND_MASK, $ACC3, $ACC3
# Lane 0 of $ACC0 is the lowest digit and receives no carry.
vpblendd \$3, $ZERO, $TEMP1, $TEMP0
vpermq \$0x93, $TEMP3, $TEMP3
vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
vpermq \$0x93, $TEMP4, $TEMP4
vpaddq $TEMP0, $ACC0, $ACC0
vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
vpaddq $TEMP1, $ACC1, $ACC1
vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
vpaddq $TEMP2, $ACC2, $ACC2
# Only the carry out of the top lane of $ACC3 moves on, into lane 0 of
# $ACC4; the other lanes of the addend are zero.
vpblendd \$3, $TEMP4, $ZERO, $TEMP4
vpaddq $TEMP3, $ACC3, $ACC3
vpaddq $TEMP4, $ACC4, $ACC4
# Pass 2: the same sequence once more, presumably to pick up carries
# produced by the additions of pass 1.
vpsrlq \$29, $ACC0, $TEMP1
vpand $AND_MASK, $ACC0, $ACC0
vpsrlq \$29, $ACC1, $TEMP2
vpand $AND_MASK, $ACC1, $ACC1
vpsrlq \$29, $ACC2, $TEMP3
vpermq \$0x93, $TEMP1, $TEMP1
vpand $AND_MASK, $ACC2, $ACC2
vpsrlq \$29, $ACC3, $TEMP4
vpermq \$0x93, $TEMP2, $TEMP2
vpand $AND_MASK, $ACC3, $ACC3
vpermq \$0x93, $TEMP3, $TEMP3
vpblendd \$3, $ZERO, $TEMP1, $TEMP0
vpermq \$0x93, $TEMP4, $TEMP4
vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
vpaddq $TEMP0, $ACC0, $ACC0
vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
vpaddq $TEMP1, $ACC1, $ACC1
vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
vpaddq $TEMP2, $ACC2, $ACC2
vpblendd \$3, $TEMP4, $ZERO, $TEMP4
vpaddq $TEMP3, $ACC3, $ACC3
vpaddq $TEMP4, $ACC4, $ACC4
# Store the four low result vectors. $rp was advanced by 128 earlier in
# this function, hence the -128 bias in every displacement.
vmovdqu $ACC0, 0-128($rp)
vmovdqu $ACC1, 32-128($rp)
vmovdqu $ACC2, 64-128($rp)
vmovdqu $ACC3, 96-128($rp)
___
# Tail of the multiplication, part 2: normalize ACC4-ACC8 with the same
# two-pass carry propagation, store them and begin the epilogue.
# The register behind ACC0 has already been stored to the result by the
# code emitted just above, so it is re-used as a fifth scratch register.
$TEMP5=$ACC0;
$code.=<<___;
# Pass 1: split every 64-bit lane into its carry (bits above bit 28) and
# the digit kept by $AND_MASK, rotate the carries up by one lane (vpermq
# with 0x93) and blend so that lane 0 of each addend comes from the
# previous accumulator.
vpsrlq \$29, $ACC4, $TEMP1
vpand $AND_MASK, $ACC4, $ACC4
vpsrlq \$29, $ACC5, $TEMP2
vpand $AND_MASK, $ACC5, $ACC5
vpsrlq \$29, $ACC6, $TEMP3
vpermq \$0x93, $TEMP1, $TEMP1
vpand $AND_MASK, $ACC6, $ACC6
vpsrlq \$29, $ACC7, $TEMP4
vpermq \$0x93, $TEMP2, $TEMP2
vpand $AND_MASK, $ACC7, $ACC7
vpsrlq \$29, $ACC8, $TEMP5
vpermq \$0x93, $TEMP3, $TEMP3
vpand $AND_MASK, $ACC8, $ACC8
vpermq \$0x93, $TEMP4, $TEMP4
# Lane 0 of $ACC4 gets no carry here: the carry out of $ACC3 was already
# added to $ACC4 by the code emitted earlier. $ZERO is assumed all-zero.
vpblendd \$3, $ZERO, $TEMP1, $TEMP0
vpermq \$0x93, $TEMP5, $TEMP5
vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
vpaddq $TEMP0, $ACC4, $ACC4
vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
vpaddq $TEMP1, $ACC5, $ACC5
vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
vpaddq $TEMP2, $ACC6, $ACC6
# Lane 0 of the rotated $TEMP5 (the carry out of the top lane of $ACC8)
# is replaced by the blend and is not propagated any further.
vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
vpaddq $TEMP3, $ACC7, $ACC7
vpaddq $TEMP4, $ACC8, $ACC8
# Pass 2: the same sequence once more, presumably to pick up carries
# produced by the additions of pass 1.
vpsrlq \$29, $ACC4, $TEMP1
vpand $AND_MASK, $ACC4, $ACC4
vpsrlq \$29, $ACC5, $TEMP2
vpand $AND_MASK, $ACC5, $ACC5
vpsrlq \$29, $ACC6, $TEMP3
vpermq \$0x93, $TEMP1, $TEMP1
vpand $AND_MASK, $ACC6, $ACC6
vpsrlq \$29, $ACC7, $TEMP4
vpermq \$0x93, $TEMP2, $TEMP2
vpand $AND_MASK, $ACC7, $ACC7
vpsrlq \$29, $ACC8, $TEMP5
vpermq \$0x93, $TEMP3, $TEMP3
vpand $AND_MASK, $ACC8, $ACC8
vpermq \$0x93, $TEMP4, $TEMP4
vpblendd \$3, $ZERO, $TEMP1, $TEMP0
vpermq \$0x93, $TEMP5, $TEMP5
vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
vpaddq $TEMP0, $ACC4, $ACC4
vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
vpaddq $TEMP1, $ACC5, $ACC5
vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
vpaddq $TEMP2, $ACC6, $ACC6
vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
vpaddq $TEMP3, $ACC7, $ACC7
vpaddq $TEMP4, $ACC8, $ACC8
# Store the five upper result vectors. $rp was advanced by 128 earlier in
# this function, hence the -128 bias in every displacement.
vmovdqu $ACC4, 128-128($rp)
vmovdqu $ACC5, 160-128($rp)
vmovdqu $ACC6, 192-128($rp)
vmovdqu $ACC7, 224-128($rp)
vmovdqu $ACC8, 256-128($rp)
# Clear the upper halves of the vector registers before returning to
# code that may use legacy SSE encodings.
vzeroupper
# All restores that follow are addressed relative to %rax. %rbp presumably
# holds the frame base captured by the prologue (outside this chunk);
# the CFI directive tells the unwinder that the CFA is now based on %rax.
mov %rbp, %rax
.cfi_def_cfa_register %rax
___
# Win64 only: reload xmm6-xmm15, which are callee-saved in the Microsoft
# x64 calling convention, from ten consecutive 16-byte slots below the
# frame base in %rax. The slots are presumably filled by the prologue,
# which is outside this chunk; the consumer of the .Lmul_1024_in_tail
# label is not visible here either.
$code.=<<___ if ($win64);
.Lmul_1024_in_tail:
# movaps requires 16-byte aligned addresses, so the save area is assumed
# to be aligned accordingly.
movaps -0xd8(%rax),%xmm6
movaps -0xc8(%rax),%xmm7
movaps -0xb8(%rax),%xmm8
movaps -0xa8(%rax),%xmm9
movaps -0x98(%rax),%xmm10
movaps -0x88(%rax),%xmm11
movaps -0x78(%rax),%xmm12
movaps -0x68(%rax),%xmm13
movaps -0x58(%rax),%xmm14
movaps -0x48(%rax),%xmm15
___
# Common epilogue: reload the six callee-saved general-purpose registers
# from the 8-byte slots just below the frame base in %rax, annotate each
# reload for the unwinder, point %rsp back at the frame base and return.
$code.=<<___;
mov -48(%rax),%r15
.cfi_restore %r15
mov -40(%rax),%r14
.cfi_restore %r14
mov -32(%rax),%r13
.cfi_restore %r13
mov -24(%rax),%r12
.cfi_restore %r12
mov -16(%rax),%rbp
.cfi_restore %rbp
mov -8(%rax),%rbx
.cfi_restore %rbx
# Drop the whole frame, including any extra stack space taken for the
# copy of the modulus and for alignment.
lea (%rax),%rsp # restore %rsp
.cfi_def_cfa_register %rsp
.Lmul_1024_epilogue:
ret
.cfi_endproc
.size rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2
___
}
{