# crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl [2995:3394]
sub gen_add () {
my $x = shift;
my ($src0,$sfx,$bias);
my ($H,$Hsqr,$R,$Rsqr,$Hcub,
$U1,$U2,$S1,$S2,
$res_x,$res_y,$res_z,
$in1_x,$in1_y,$in1_z,
$in2_x,$in2_y,$in2_z)=map(32*$_,(0..17));
my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
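# The body below works in a 32*18-byte stack frame (plus 8 bytes to keep
# %rsp 16-byte aligned) holding the temporaries named above: H, Hsqr, R,
# Rsqr, Hcub, U1, U2, S1, S2, the result and copies of both input points.
# Z1sqr and Z2sqr reuse the Hsqr and Rsqr slots, whose live ranges do not
# overlap with them.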
if ($x ne "x") {
$src0 = "%rax";
$sfx = "";
$bias = 0;
$code.=<<___;
.globl ecp_nistz256_point_add
.type ecp_nistz256_point_add,\@function,3
.align 32
ecp_nistz256_point_add:
.cfi_startproc
_CET_ENDBR
___
$code.=<<___ if ($addx);
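# Runtime dispatch: the 32-bit word at offset 8 of OPENSSL_ia32cap_P caches
# the CPUID.7:EBX feature bits; 0x80100 is ADX (bit 19) | BMI2 (bit 8).
# When both are available, use the mulx/adcx/adox path at .Lpoint_addx.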
leaq OPENSSL_ia32cap_P(%rip), %rcx
mov 8(%rcx), %rcx
and \$0x80100, %ecx
cmp \$0x80100, %ecx
je .Lpoint_addx
___
} else {
$src0 = "%rdx";
$sfx = "x";
$bias = 128;
$code.=<<___;
.type ecp_nistz256_point_addx,\@function,3
.align 32
ecp_nistz256_point_addx:
.cfi_startproc
.Lpoint_addx:
___
}
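# Both entry points share the body below.  $sfx selects between the mul/adc
# helpers ("") and the mulx/adcx/adox helpers ("x"); $src0 is %rdx on the ADX
# path because mulx takes one multiplicand implicitly in %rdx, and $bias
# (128) offsets source pointers, presumably so the displacements used inside
# the "x" helpers fit in signed 8-bit immediates.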
$code.=<<___;
push %rbp
.cfi_push %rbp
push %rbx
.cfi_push %rbx
push %r12
.cfi_push %r12
push %r13
.cfi_push %r13
push %r14
.cfi_push %r14
push %r15
.cfi_push %r15
sub \$32*18+8, %rsp
.cfi_adjust_cfa_offset 32*18+8
.Lpoint_add${x}_body:
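# Point addition in Jacobian coordinates, computed with the Montgomery
# helpers referenced in the per-call comments below:
#   Z1sqr = Z1^2,      Z2sqr = Z2^2
#   U1 = X1*Z2sqr,     U2 = X2*Z1sqr,     H = U2 - U1
#   S1 = Y1*Z2*Z2sqr,  S2 = Y2*Z1*Z1sqr,  R = S2 - S1
#   X3 = R^2 - H^3 - 2*U1*H^2
#   Y3 = R*(U1*H^2 - X3) - S1*H^3
#   Z3 = H*Z1*Z2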
movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr
movdqu 0x10($a_ptr), %xmm1
movdqu 0x20($a_ptr), %xmm2
movdqu 0x30($a_ptr), %xmm3
movdqu 0x40($a_ptr), %xmm4
movdqu 0x50($a_ptr), %xmm5
mov $a_ptr, $b_ptr # reassign
mov $b_org, $a_ptr # reassign
movdqa %xmm0, $in1_x(%rsp)
movdqa %xmm1, $in1_x+0x10(%rsp)
movdqa %xmm2, $in1_y(%rsp)
movdqa %xmm3, $in1_y+0x10(%rsp)
movdqa %xmm4, $in1_z(%rsp)
movdqa %xmm5, $in1_z+0x10(%rsp)
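# While the second point is being copied, compute the infinity masks.  A
# point is the identity iff its Z coordinate is zero, so OR all 256 bits of
# each Z together (pshufd folds the lanes), compare the folded lane against
# zero with pcmpeqd and broadcast the result: %xmm5 becomes the all-ones/
# all-zero mask in1infty, %xmm4 becomes in2infty.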
por %xmm4, %xmm5
movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$b_org
pshufd \$0xb1, %xmm5, %xmm3
movdqu 0x10($a_ptr), %xmm1
movdqu 0x20($a_ptr), %xmm2
por %xmm3, %xmm5
movdqu 0x30($a_ptr), %xmm3
mov 0x40+8*0($a_ptr), $src0 # load original in2_z
mov 0x40+8*1($a_ptr), $acc6
mov 0x40+8*2($a_ptr), $acc7
mov 0x40+8*3($a_ptr), $acc0
movdqa %xmm0, $in2_x(%rsp)
pshufd \$0x1e, %xmm5, %xmm4
movdqa %xmm1, $in2_x+0x10(%rsp)
movdqu 0x40($a_ptr),%xmm0 # in2_z again
movdqu 0x50($a_ptr),%xmm1
movdqa %xmm2, $in2_y(%rsp)
movdqa %xmm3, $in2_y+0x10(%rsp)
por %xmm4, %xmm5
pxor %xmm4, %xmm4
por %xmm0, %xmm1
movq $r_ptr, %xmm0 # save $r_ptr
lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid
mov $src0, $in2_z+8*0(%rsp) # make in2_z copy
mov $acc6, $in2_z+8*1(%rsp)
mov $acc7, $in2_z+8*2(%rsp)
mov $acc0, $in2_z+8*3(%rsp)
lea $Z2sqr(%rsp), $r_ptr # Z2^2
call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z2sqr, in2_z);
pcmpeqd %xmm4, %xmm5
pshufd \$0xb1, %xmm1, %xmm4
por %xmm1, %xmm4
pshufd \$0, %xmm5, %xmm5 # in1infty
pshufd \$0x1e, %xmm4, %xmm3
por %xmm3, %xmm4
pxor %xmm3, %xmm3
pcmpeqd %xmm3, %xmm4
pshufd \$0, %xmm4, %xmm4 # in2infty
mov 0x40+8*0($b_ptr), $src0 # load original in1_z
mov 0x40+8*1($b_ptr), $acc6
mov 0x40+8*2($b_ptr), $acc7
mov 0x40+8*3($b_ptr), $acc0
movq $b_ptr, %xmm1
lea 0x40-$bias($b_ptr), $a_ptr
lea $Z1sqr(%rsp), $r_ptr # Z1^2
call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z1sqr, in1_z);
`&load_for_mul("$Z2sqr(%rsp)", "$in2_z(%rsp)", "$src0")`
lea $S1(%rsp), $r_ptr # S1 = Z2^3
call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, Z2sqr, in2_z);
`&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
lea $S2(%rsp), $r_ptr # S2 = Z1^3
call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Z1sqr, in1_z);
`&load_for_mul("$S1(%rsp)", "$in1_y(%rsp)", "$src0")`
lea $S1(%rsp), $r_ptr # S1 = Y1*Z2^3
call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, S1, in1_y);
`&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
lea $S2(%rsp), $r_ptr # S2 = Y2*Z1^3
call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S2, in2_y);
lea $S1(%rsp), $b_ptr
lea $R(%rsp), $r_ptr # R = S2 - S1
call __ecp_nistz256_sub_from$x # p256_sub(R, S2, S1);
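# Fold the limbs of R = S2 - S1 into a single nonzero test and stash it in
# %xmm3; together with the H = U2 - U1 test below it selects between the
# generic addition path, the doubling shortcut and the A = -B case.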
or $acc5, $acc4 # see if result is zero
movdqa %xmm4, %xmm2
or $acc0, $acc4
or $acc1, $acc4
por %xmm5, %xmm2 # in1infty || in2infty
movq $acc4, %xmm3
`&load_for_mul("$Z2sqr(%rsp)", "$in1_x(%rsp)", "$src0")`
lea $U1(%rsp), $r_ptr # U1 = X1*Z2^2
call __ecp_nistz256_mul_mont$x # p256_mul_mont(U1, in1_x, Z2sqr);
`&load_for_mul("$Z1sqr(%rsp)", "$in2_x(%rsp)", "$src0")`
lea $U2(%rsp), $r_ptr # U2 = X2*Z1^2
call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, in2_x, Z1sqr);
lea $U1(%rsp), $b_ptr
lea $H(%rsp), $r_ptr # H = U2 - U1
call __ecp_nistz256_sub_from$x # p256_sub(H, U2, U1);
or $acc5, $acc4 # see if result is zero
or $acc0, $acc4
or $acc1, $acc4 # !is_equal(U1, U2)
movq %xmm2, $acc0
movq %xmm3, $acc1
or $acc0, $acc4
.byte 0x3e # predict taken
jnz .Ladd_proceed$x # !is_equal(U1, U2) || in1infty || in2infty
# We now know A = B or A = -B and neither is infinity. Compare the
# y-coordinates via S1 and S2.
test $acc1, $acc1
jz .Ladd_double$x # is_equal(S1, S2)
# A = -B, so the result is infinity.
#
# TODO(davidben): Does .Ladd_proceed handle this case? It seems to, in
# which case we should eliminate this special-case and simplify the
# timing analysis.
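# Infinity is encoded with Z = 0; write an all-zero point to the output.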
movq %xmm0, $r_ptr # restore $r_ptr
pxor %xmm0, %xmm0
movdqu %xmm0, 0x00($r_ptr)
movdqu %xmm0, 0x10($r_ptr)
movdqu %xmm0, 0x20($r_ptr)
movdqu %xmm0, 0x30($r_ptr)
movdqu %xmm0, 0x40($r_ptr)
movdqu %xmm0, 0x50($r_ptr)
jmp .Ladd_done$x
.align 32
.Ladd_double$x:
movq %xmm1, $a_ptr # restore $a_ptr
movq %xmm0, $r_ptr # restore $r_ptr
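# ecp_nistz256_point_double allocates a 32*5+8-byte frame, so release the
# difference before jumping into its body at .Lpoint_double_shortcut.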
add \$`32*(18-5)`, %rsp # difference in frame sizes
.cfi_adjust_cfa_offset `-32*(18-5)`
jmp .Lpoint_double_shortcut$x
.cfi_adjust_cfa_offset `32*(18-5)`
.align 32
.Ladd_proceed$x:
`&load_for_sqr("$R(%rsp)", "$src0")`
lea $Rsqr(%rsp), $r_ptr # R^2
call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Rsqr, R);
`&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2
call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, H, in1_z);
`&load_for_sqr("$H(%rsp)", "$src0")`
lea $Hsqr(%rsp), $r_ptr # H^2
call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Hsqr, H);
`&load_for_mul("$res_z(%rsp)", "$in2_z(%rsp)", "$src0")`
lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2
call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, res_z, in2_z);
`&load_for_mul("$Hsqr(%rsp)", "$H(%rsp)", "$src0")`
lea $Hcub(%rsp), $r_ptr # H^3
call __ecp_nistz256_mul_mont$x # p256_mul_mont(Hcub, Hsqr, H);
`&load_for_mul("$Hsqr(%rsp)", "$U1(%rsp)", "$src0")`
lea $U2(%rsp), $r_ptr # U1*H^2
call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, U1, Hsqr);
___
{
#######################################################################
# operate in 4-5-0-1 "name space" that matches multiplication output
#
my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
my ($poly1, $poly3)=($acc6,$acc7);
$code.=<<___;
#lea $U2(%rsp), $a_ptr
#lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2
#call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2);
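# The call above is inlined: double U2 (= U1*H^2) in the registers left by
# the preceding multiplication, reduce mod p by conditionally subtracting the
# modulus (sub/sbb, then cmovc to undo on borrow), and meanwhile load Rsqr so
# the following helper can form res_x = R^2 - 2*U1*H^2.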
xor $t4, $t4
add $acc0, $acc0 # a0:a3+a0:a3
lea $Rsqr(%rsp), $a_ptr
adc $acc1, $acc1
mov $acc0, $t0
adc $acc2, $acc2
adc $acc3, $acc3
mov $acc1, $t1
adc \$0, $t4
sub \$-1, $acc0
mov $acc2, $t2
sbb $poly1, $acc1
sbb \$0, $acc2
mov $acc3, $t3
sbb $poly3, $acc3
sbb \$0, $t4
cmovc $t0, $acc0
mov 8*0($a_ptr), $t0
cmovc $t1, $acc1
mov 8*1($a_ptr), $t1
cmovc $t2, $acc2
mov 8*2($a_ptr), $t2
cmovc $t3, $acc3
mov 8*3($a_ptr), $t3
call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr);
lea $Hcub(%rsp), $b_ptr
lea $res_x(%rsp), $r_ptr
call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, Hcub);
mov $U2+8*0(%rsp), $t0
mov $U2+8*1(%rsp), $t1
mov $U2+8*2(%rsp), $t2
mov $U2+8*3(%rsp), $t3
lea $res_y(%rsp), $r_ptr
call __ecp_nistz256_sub$x # p256_sub(res_y, U2, res_x);
mov $acc0, 8*0($r_ptr) # save the result, as
mov $acc1, 8*1($r_ptr) # __ecp_nistz256_sub doesn't
mov $acc2, 8*2($r_ptr)
mov $acc3, 8*3($r_ptr)
___
}
$code.=<<___;
`&load_for_mul("$S1(%rsp)", "$Hcub(%rsp)", "$src0")`
lea $S2(%rsp), $r_ptr
call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S1, Hcub);
`&load_for_mul("$R(%rsp)", "$res_y(%rsp)", "$src0")`
lea $res_y(%rsp), $r_ptr
call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_y, R, res_y);
lea $S2(%rsp), $b_ptr
lea $res_y(%rsp), $r_ptr
call __ecp_nistz256_sub_from$x # p256_sub(res_y, res_y, S2);
movq %xmm0, $r_ptr # restore $r_ptr
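# Select the output coordinates in constant time with the infinity masks:
# pandn keeps the computed result where the mask is zero, pand keeps the
# alternative where it is all-ones, and por merges them.  If in1 is infinity
# the result is in2; if in2 is infinity the result is in1; otherwise it is
# (res_x, res_y, res_z).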
movdqa %xmm5, %xmm0 # copy_conditional(res_z, in2_z, in1infty);
movdqa %xmm5, %xmm1
pandn $res_z(%rsp), %xmm0
movdqa %xmm5, %xmm2
pandn $res_z+0x10(%rsp), %xmm1
movdqa %xmm5, %xmm3
pand $in2_z(%rsp), %xmm2
pand $in2_z+0x10(%rsp), %xmm3
por %xmm0, %xmm2
por %xmm1, %xmm3
movdqa %xmm4, %xmm0 # copy_conditional(res_z, in1_z, in2infty);
movdqa %xmm4, %xmm1
pandn %xmm2, %xmm0
movdqa %xmm4, %xmm2
pandn %xmm3, %xmm1
movdqa %xmm4, %xmm3
pand $in1_z(%rsp), %xmm2
pand $in1_z+0x10(%rsp), %xmm3
por %xmm0, %xmm2
por %xmm1, %xmm3
movdqu %xmm2, 0x40($r_ptr)
movdqu %xmm3, 0x50($r_ptr)
movdqa %xmm5, %xmm0 # copy_conditional(res_x, in2_x, in1infty);
movdqa %xmm5, %xmm1
pandn $res_x(%rsp), %xmm0
movdqa %xmm5, %xmm2
pandn $res_x+0x10(%rsp), %xmm1
movdqa %xmm5, %xmm3
pand $in2_x(%rsp), %xmm2
pand $in2_x+0x10(%rsp), %xmm3
por %xmm0, %xmm2
por %xmm1, %xmm3
movdqa %xmm4, %xmm0 # copy_conditional(res_x, in1_x, in2infty);
movdqa %xmm4, %xmm1
pandn %xmm2, %xmm0
movdqa %xmm4, %xmm2
pandn %xmm3, %xmm1
movdqa %xmm4, %xmm3
pand $in1_x(%rsp), %xmm2
pand $in1_x+0x10(%rsp), %xmm3
por %xmm0, %xmm2
por %xmm1, %xmm3
movdqu %xmm2, 0x00($r_ptr)
movdqu %xmm3, 0x10($r_ptr)
movdqa %xmm5, %xmm0 # copy_conditional(res_y, in2_y, in1infty);
movdqa %xmm5, %xmm1
pandn $res_y(%rsp), %xmm0
movdqa %xmm5, %xmm2
pandn $res_y+0x10(%rsp), %xmm1
movdqa %xmm5, %xmm3
pand $in2_y(%rsp), %xmm2
pand $in2_y+0x10(%rsp), %xmm3
por %xmm0, %xmm2
por %xmm1, %xmm3
movdqa %xmm4, %xmm0 # copy_conditional(res_y, in1_y, in2infty);
movdqa %xmm4, %xmm1
pandn %xmm2, %xmm0
movdqa %xmm4, %xmm2
pandn %xmm3, %xmm1
movdqa %xmm4, %xmm3
pand $in1_y(%rsp), %xmm2
pand $in1_y+0x10(%rsp), %xmm3
por %xmm0, %xmm2
por %xmm1, %xmm3
movdqu %xmm2, 0x20($r_ptr)
movdqu %xmm3, 0x30($r_ptr)
.Ladd_done$x:
lea 32*18+56(%rsp), %rsi
.cfi_def_cfa %rsi,8
mov -48(%rsi),%r15
.cfi_restore %r15
mov -40(%rsi),%r14
.cfi_restore %r14
mov -32(%rsi),%r13
.cfi_restore %r13
mov -24(%rsi),%r12
.cfi_restore %r12
mov -16(%rsi),%rbx
.cfi_restore %rbx
mov -8(%rsi),%rbp
.cfi_restore %rbp
lea (%rsi),%rsp
.cfi_def_cfa_register %rsp
.Lpoint_add${x}_epilogue:
ret
.cfi_endproc
.size ecp_nistz256_point_add$sfx,.-ecp_nistz256_point_add$sfx
___
}