in ring/crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl [3395:3717]
# gen_add_affine($x)
#
# Appends to the global $code buffer the assembly text for one variant of the
# nistz256 "point add affine" routine:
#   $x ne "x"  -> GFp_nistz256_point_add_affine  (exported with .globl)
#   $x eq "x"  -> GFp_nistz256_point_add_affinex (no .globl; also reachable
#                 through the local label .Lpoint_add_affinex)
# $x is also pasted verbatim after the names of the internal helpers that the
# emitted code calls (__ecp_nistz256_{sqr_mont,mul_mont,sub_from,sub}$x) and
# into the .Ladd_affine${x}_body / .Ladd_affine${x}_epilogue labels.
#
# Relies on names defined outside this sub: $code, $addx, the register-name
# variables $a_ptr, $b_ptr, $b_org, $r_ptr, $acc0..$acc7, $t0..$t2, and the
# helpers load_for_mul / load_for_sqr.
#
# NOTE(review): the sub is declared with an empty prototype yet reads one
# argument via shift; callers presumably use the &gen_add_affine(...) form,
# which bypasses prototype checking -- confirm before touching the signature.
#
# The individual formula steps are annotated inside the emitted assembly (the
# trailing "# p256_..." remarks). Those remarks live inside heredocs, so they
# are part of the generated text, not Perl comments.
sub gen_add_affine () {
# Variant selector: "x" picks the *x flavour, anything else the plain one.
my $x = shift;
# $src0: register that receives the first 8-byte word of an operand; it is
#        loaded directly below and passed to load_for_mul / load_for_sqr.
# $sfx:  suffix used only in the closing .size directive.
# $bias: constant subtracted from the addresses placed in $a_ptr.
my ($src0,$sfx,$bias);
# Byte offsets from %rsp of fifteen 32-byte scratch slots: 0, 32, ..., 448.
my ($U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr,
$res_x,$res_y,$res_z,
$in1_x,$in1_y,$in1_z,
$in2_x,$in2_y)=map(32*$_,(0..14));
# Z1^2 shares the S2 slot; that slot is later overwritten by the S2 results.
my $Z1sqr = $S2;
if ($x ne "x") {
# Plain variant: operand word goes to %rax, no suffix, unbiased pointers.
$src0 = "%rax";
$sfx = "";
$bias = 0;
$code.=<<___;
.globl GFp_nistz256_point_add_affine
.type GFp_nistz256_point_add_affine,\@function,3
.align 32
GFp_nistz256_point_add_affine:
.cfi_startproc
___
# Emitted only when $addx is true: load the word at GFp_ia32cap_P+8, and if
# both bits of mask 0x80100 are set in its low 32 bits, jump into the "x"
# variant. NOTE(review): presumably the BMI2 and ADX capability bits -- confirm.
$code.=<<___ if ($addx);
leaq GFp_ia32cap_P(%rip), %rcx
mov 8(%rcx), %rcx
and \$0x80100, %ecx
cmp \$0x80100, %ecx
je .Lpoint_add_affinex
___
} else {
# "x" variant: operand word goes to %rdx and $a_ptr addresses are lowered by
# 128. NOTE(review): presumably this matches what the *x helpers expect
# (their definitions are not visible here) -- confirm.
$src0 = "%rdx";
$sfx = "x";
$bias = 128;
$code.=<<___;
.type GFp_nistz256_point_add_affinex,\@function,3
.align 32
GFp_nistz256_point_add_affinex:
.cfi_startproc
.Lpoint_add_affinex:
___
}
# Common body for both variants.
# Prologue: push rbp, rbx, r12-r15 and reserve 32*15+8 bytes of stack.
# Then: copy the first point (0x60 bytes at $a_ptr) and the second, affine
# point (0x40 bytes at $b_ptr) into the stack slots and stash $r_ptr in %xmm0.
# Interleaved with the field arithmetic, the copied words are OR-folded and
# compared against zero to build two broadcast masks: %xmm5 from in1_z
# (in1infty) and %xmm4 from in2_x|in2_y (in2infty).
# NOTE(review): this relies on the called helpers leaving %xmm0 and
# %xmm3-%xmm5 untouched; the helpers are not visible here -- confirm.
# The backtick-quoted &load_for_mul/&load_for_sqr calls are emitted as literal
# text by this heredoc; presumably a later pass over $code evaluates them.
$code.=<<___;
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
sub \$32*15+8, %rsp
.cfi_adjust_cfa_offset 32*15+8
.Ladd_affine${x}_body:
movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr
mov $b_org, $b_ptr # reassign
movdqu 0x10($a_ptr), %xmm1
movdqu 0x20($a_ptr), %xmm2
movdqu 0x30($a_ptr), %xmm3
movdqu 0x40($a_ptr), %xmm4
movdqu 0x50($a_ptr), %xmm5
mov 0x40+8*0($a_ptr), $src0 # load original in1_z
mov 0x40+8*1($a_ptr), $acc6
mov 0x40+8*2($a_ptr), $acc7
mov 0x40+8*3($a_ptr), $acc0
movdqa %xmm0, $in1_x(%rsp)
movdqa %xmm1, $in1_x+0x10(%rsp)
movdqa %xmm2, $in1_y(%rsp)
movdqa %xmm3, $in1_y+0x10(%rsp)
movdqa %xmm4, $in1_z(%rsp)
movdqa %xmm5, $in1_z+0x10(%rsp)
por %xmm4, %xmm5
movdqu 0x00($b_ptr), %xmm0 # copy *(P256_POINT_AFFINE *)$b_ptr
pshufd \$0xb1, %xmm5, %xmm3
movdqu 0x10($b_ptr), %xmm1
movdqu 0x20($b_ptr), %xmm2
por %xmm3, %xmm5
movdqu 0x30($b_ptr), %xmm3
movdqa %xmm0, $in2_x(%rsp)
pshufd \$0x1e, %xmm5, %xmm4
movdqa %xmm1, $in2_x+0x10(%rsp)
por %xmm0, %xmm1
movq $r_ptr, %xmm0 # save $r_ptr
movdqa %xmm2, $in2_y(%rsp)
movdqa %xmm3, $in2_y+0x10(%rsp)
por %xmm2, %xmm3
por %xmm4, %xmm5
pxor %xmm4, %xmm4
por %xmm1, %xmm3
lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid
lea $Z1sqr(%rsp), $r_ptr # Z1^2
call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z1sqr, in1_z);
pcmpeqd %xmm4, %xmm5
pshufd \$0xb1, %xmm3, %xmm4
mov 0x00($b_ptr), $src0 # $b_ptr is still valid
#lea 0x00($b_ptr), $b_ptr
mov $acc4, $acc1 # harmonize sqr output and mul input
por %xmm3, %xmm4
pshufd \$0, %xmm5, %xmm5 # in1infty
pshufd \$0x1e, %xmm4, %xmm3
mov $acc5, $acc2
por %xmm3, %xmm4
pxor %xmm3, %xmm3
mov $acc6, $acc3
pcmpeqd %xmm3, %xmm4
pshufd \$0, %xmm4, %xmm4 # in2infty
lea $Z1sqr-$bias(%rsp), $a_ptr
mov $acc7, $acc4
lea $U2(%rsp), $r_ptr # U2 = X2*Z1^2
call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, Z1sqr, in2_x);
lea $in1_x(%rsp), $b_ptr
lea $H(%rsp), $r_ptr # H = U2 - U1
call __ecp_nistz256_sub_from$x # p256_sub(H, U2, in1_x);
`&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
lea $S2(%rsp), $r_ptr # S2 = Z1^3
call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Z1sqr, in1_z);
`&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2
call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, H, in1_z);
`&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
lea $S2(%rsp), $r_ptr # S2 = Y2*Z1^3
call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S2, in2_y);
lea $in1_y(%rsp), $b_ptr
lea $R(%rsp), $r_ptr # R = S2 - S1
call __ecp_nistz256_sub_from$x # p256_sub(R, S2, in1_y);
`&load_for_sqr("$H(%rsp)", "$src0")`
lea $Hsqr(%rsp), $r_ptr # H^2
call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Hsqr, H);
`&load_for_sqr("$R(%rsp)", "$src0")`
lea $Rsqr(%rsp), $r_ptr # R^2
call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Rsqr, R);
`&load_for_mul("$H(%rsp)", "$Hsqr(%rsp)", "$src0")`
lea $Hcub(%rsp), $r_ptr # H^3
call __ecp_nistz256_mul_mont$x # p256_mul_mont(Hcub, Hsqr, H);
`&load_for_mul("$Hsqr(%rsp)", "$in1_x(%rsp)", "$src0")`
lea $U2(%rsp), $r_ptr # U1*H^2
call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, in1_x, Hsqr);
___
{
#######################################################################
# operate in 4-5-0-1 "name space" that matches multiplication output
#
# The register-name variables are re-bound lexically for this block only, so
# "$acc0..$acc3" below address the registers holding the preceding
# multiplication's result, and $t3/$t4 take over two of the outer $acc names.
my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
# Registers used as the two non-constant subtrahend words in the reduction
# below. NOTE(review): presumably left holding modulus words by the helper
# that ran last -- that helper is not visible here; confirm.
my ($poly1, $poly3)=($acc6,$acc7);
# Emits, inline instead of a mul_by_2 call (see the commented-out lines kept
# in the output): double the 4-word value with the carry collected in $t4,
# subtract the words (-1, $poly1, 0, $poly3) with borrow, and on a final
# borrow restore the unsubtracted copy via cmovc. The Rsqr words are loaded
# into $t0..$t3 in between, ready for the following __ecp_nistz256_sub call.
# After the second sub call the result words are stored through $r_ptr
# explicitly, since (per the emitted remark) that helper does not store them.
$code.=<<___;
#lea $U2(%rsp), $a_ptr
#lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2
#call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2);
xor $t4, $t4
add $acc0, $acc0 # a0:a3+a0:a3
lea $Rsqr(%rsp), $a_ptr
adc $acc1, $acc1
mov $acc0, $t0
adc $acc2, $acc2
adc $acc3, $acc3
mov $acc1, $t1
adc \$0, $t4
sub \$-1, $acc0
mov $acc2, $t2
sbb $poly1, $acc1
sbb \$0, $acc2
mov $acc3, $t3
sbb $poly3, $acc3
sbb \$0, $t4
cmovc $t0, $acc0
mov 8*0($a_ptr), $t0
cmovc $t1, $acc1
mov 8*1($a_ptr), $t1
cmovc $t2, $acc2
mov 8*2($a_ptr), $t2
cmovc $t3, $acc3
mov 8*3($a_ptr), $t3
call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr);
lea $Hcub(%rsp), $b_ptr
lea $res_x(%rsp), $r_ptr
call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, Hcub);
mov $U2+8*0(%rsp), $t0
mov $U2+8*1(%rsp), $t1
mov $U2+8*2(%rsp), $t2
mov $U2+8*3(%rsp), $t3
lea $H(%rsp), $r_ptr
call __ecp_nistz256_sub$x # p256_sub(H, U2, res_x);
mov $acc0, 8*0($r_ptr) # save the result, as
mov $acc1, 8*1($r_ptr) # __ecp_nistz256_sub doesn't
mov $acc2, 8*2($r_ptr)
mov $acc3, 8*3($r_ptr)
___
}
# Final stage (outer register names are back in effect here):
#  - the last two multiplications and the subtraction that yields res_y;
#  - $r_ptr is restored from %xmm0;
#  - each output coordinate is written with two chained masked selects (the
#    emitted remarks call them copy_conditional): the in1infty mask (%xmm5)
#    picks the second input's value (.LONE_mont for z) over the computed one,
#    then the in2infty mask (%xmm4) picks the first input's value over that;
#  - epilogue: 32*15+56 is the 32*15+8 frame plus six 8-byte pushes, so %rsi
#    ends up just above the saved registers, which are reloaded from
#    -48(%rsi)..-8(%rsi) in reverse push order before %rsp is reset from %rsi.
$code.=<<___;
`&load_for_mul("$Hcub(%rsp)", "$in1_y(%rsp)", "$src0")`
lea $S2(%rsp), $r_ptr
call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Hcub, in1_y);
`&load_for_mul("$H(%rsp)", "$R(%rsp)", "$src0")`
lea $H(%rsp), $r_ptr
call __ecp_nistz256_mul_mont$x # p256_mul_mont(H, H, R);
lea $S2(%rsp), $b_ptr
lea $res_y(%rsp), $r_ptr
call __ecp_nistz256_sub_from$x # p256_sub(res_y, H, S2);
movq %xmm0, $r_ptr # restore $r_ptr
movdqa %xmm5, %xmm0 # copy_conditional(res_z, ONE, in1infty);
movdqa %xmm5, %xmm1
pandn $res_z(%rsp), %xmm0
movdqa %xmm5, %xmm2
pandn $res_z+0x10(%rsp), %xmm1
movdqa %xmm5, %xmm3
pand .LONE_mont(%rip), %xmm2
pand .LONE_mont+0x10(%rip), %xmm3
por %xmm0, %xmm2
por %xmm1, %xmm3
movdqa %xmm4, %xmm0 # copy_conditional(res_z, in1_z, in2infty);
movdqa %xmm4, %xmm1
pandn %xmm2, %xmm0
movdqa %xmm4, %xmm2
pandn %xmm3, %xmm1
movdqa %xmm4, %xmm3
pand $in1_z(%rsp), %xmm2
pand $in1_z+0x10(%rsp), %xmm3
por %xmm0, %xmm2
por %xmm1, %xmm3
movdqu %xmm2, 0x40($r_ptr)
movdqu %xmm3, 0x50($r_ptr)
movdqa %xmm5, %xmm0 # copy_conditional(res_x, in2_x, in1infty);
movdqa %xmm5, %xmm1
pandn $res_x(%rsp), %xmm0
movdqa %xmm5, %xmm2
pandn $res_x+0x10(%rsp), %xmm1
movdqa %xmm5, %xmm3
pand $in2_x(%rsp), %xmm2
pand $in2_x+0x10(%rsp), %xmm3
por %xmm0, %xmm2
por %xmm1, %xmm3
movdqa %xmm4, %xmm0 # copy_conditional(res_x, in1_x, in2infty);
movdqa %xmm4, %xmm1
pandn %xmm2, %xmm0
movdqa %xmm4, %xmm2
pandn %xmm3, %xmm1
movdqa %xmm4, %xmm3
pand $in1_x(%rsp), %xmm2
pand $in1_x+0x10(%rsp), %xmm3
por %xmm0, %xmm2
por %xmm1, %xmm3
movdqu %xmm2, 0x00($r_ptr)
movdqu %xmm3, 0x10($r_ptr)
movdqa %xmm5, %xmm0 # copy_conditional(res_y, in2_y, in1infty);
movdqa %xmm5, %xmm1
pandn $res_y(%rsp), %xmm0
movdqa %xmm5, %xmm2
pandn $res_y+0x10(%rsp), %xmm1
movdqa %xmm5, %xmm3
pand $in2_y(%rsp), %xmm2
pand $in2_y+0x10(%rsp), %xmm3
por %xmm0, %xmm2
por %xmm1, %xmm3
movdqa %xmm4, %xmm0 # copy_conditional(res_y, in1_y, in2infty);
movdqa %xmm4, %xmm1
pandn %xmm2, %xmm0
movdqa %xmm4, %xmm2
pandn %xmm3, %xmm1
movdqa %xmm4, %xmm3
pand $in1_y(%rsp), %xmm2
pand $in1_y+0x10(%rsp), %xmm3
por %xmm0, %xmm2
por %xmm1, %xmm3
movdqu %xmm2, 0x20($r_ptr)
movdqu %xmm3, 0x30($r_ptr)
lea 32*15+56(%rsp), %rsi
.cfi_def_cfa %rsi,8
mov -48(%rsi),%r15
.cfi_restore %r15
mov -40(%rsi),%r14
.cfi_restore %r14
mov -32(%rsi),%r13
.cfi_restore %r13
mov -24(%rsi),%r12
.cfi_restore %r12
mov -16(%rsi),%rbx
.cfi_restore %rbx
mov -8(%rsi),%rbp
.cfi_restore %rbp
lea (%rsi),%rsp
.cfi_def_cfa_register %rsp
.Ladd_affine${x}_epilogue:
ret
.cfi_endproc
.size GFp_nistz256_point_add_affine$sfx,.-GFp_nistz256_point_add_affine$sfx
___
}