in ring/crypto/fipsmodule/ec/asm/ecp_nistz256-x86.pl [440:769]
# SSE2 tail of _ecp_nistz256_mul_mont: Montgomery multiplication modulo
# the NIST P-256 prime, eight 32-bit limbs carried in 64-bit SIMD lanes
# (one pmuludq product per lane).  Register roles, as demonstrated by
# the loads/stores below (the actual setup is in the prologue above
# this chunk — outside this view): esi = a[], ebp = b[] (advanced one
# limb at a time), edi = result r[]; edx presumably holds the caller's
# stack pointer, restored near the end — confirm against the prologue.
# Frame: 0x100 bytes, 64-byte aligned.  esp+0x00..0x70 holds the
# running accumulator t[0..7]; ebx = esp+0x80 holds a[0..7]
# pre-shuffled into 000x.000y lane form so each limb can be fed to
# pmuludq once per b[i] digit; xmm6 is a per-lane low-32-bit mask and
# xmm7 carries the current b[i] digit.
&sub ("esp",0x100);
&movd ("xmm7",&DWP(0,"ebp")); # b[0] -> 0000.00xy
&lea ("ebp",&DWP(4,"ebp"));
&pcmpeqd("xmm6","xmm6");
&psrlq ("xmm6",48); # compose 0xffff<<64|0xffff
&pshuflw("xmm7","xmm7",0b11011100); # 0000.00xy -> 0000.0x0y
&and ("esp",-64);
&pshufd ("xmm7","xmm7",0b11011100); # 0000.0x0y -> 000x.000y
&lea ("ebx",&DWP(0x80,"esp"));
# First pass: multiply all eight a[i] limbs by b[0] while converting
# and offloading each a[i] to the ebx scratch area for reuse in later
# iterations.
&movd ("xmm0",&DWP(4*0,"esi")); # a[0] -> 0000.00xy
&pshufd ("xmm0","xmm0",0b11001100); # 0000.00xy -> 00xy.00xy
&movd ("xmm1",&DWP(4*1,"esi")); # a[1] -> ...
&movdqa (&QWP(0x00,"ebx"),"xmm0"); # offload converted a[0]
&pmuludq("xmm0","xmm7"); # a[0]*b[0]
&movd ("xmm2",&DWP(4*2,"esi"));
&pshufd ("xmm1","xmm1",0b11001100);
&movdqa (&QWP(0x10,"ebx"),"xmm1");
&pmuludq("xmm1","xmm7"); # a[1]*b[0]
&movq ("xmm4","xmm0"); # clear upper 64 bits
&pslldq("xmm4",6);
&paddq ("xmm4","xmm0");
&movdqa("xmm5","xmm4");
&psrldq("xmm4",10); # upper 32 bits of a[0]*b[0]
&pand ("xmm5","xmm6"); # lower 32 bits of a[0]*b[0]
# Upper half of a[0]*b[i] is carried into next multiplication
# iteration, while lower one "participates" in actual reduction.
# Normally latter is done by accumulating result of multiplication
# of modulus by "magic" digit, but thanks to special form of modulus
# and "magic" digit it can be performed only with additions and
# subtractions (see note in IALU section below). Note that we are
# not bothered with carry bits, they are accumulated in "flatten"
# phase after all multiplications and reductions.
&movd ("xmm3",&DWP(4*3,"esi"));
&pshufd ("xmm2","xmm2",0b11001100);
&movdqa (&QWP(0x20,"ebx"),"xmm2");
&pmuludq("xmm2","xmm7"); # a[2]*b[0]
&paddq ("xmm1","xmm4"); # a[1]*b[0]+hw(a[0]*b[0]), carry
&movdqa (&QWP(0x00,"esp"),"xmm1"); # t[0]
&movd ("xmm0",&DWP(4*4,"esi"));
&pshufd ("xmm3","xmm3",0b11001100);
&movdqa (&QWP(0x30,"ebx"),"xmm3");
&pmuludq("xmm3","xmm7"); # a[3]*b[0]
&movdqa (&QWP(0x10,"esp"),"xmm2");
&movd ("xmm1",&DWP(4*5,"esi"));
&pshufd ("xmm0","xmm0",0b11001100);
&movdqa (&QWP(0x40,"ebx"),"xmm0");
&pmuludq("xmm0","xmm7"); # a[4]*b[0]
&paddq ("xmm3","xmm5"); # a[3]*b[0]+lw(a[0]*b[0]), reduction step
&movdqa (&QWP(0x20,"esp"),"xmm3");
&movd ("xmm2",&DWP(4*6,"esi"));
&pshufd ("xmm1","xmm1",0b11001100);
&movdqa (&QWP(0x50,"ebx"),"xmm1");
&pmuludq("xmm1","xmm7"); # a[5]*b[0]
&movdqa (&QWP(0x30,"esp"),"xmm0");
&pshufd("xmm4","xmm5",0b10110001); # xmm4 = xmm5<<32, reduction step
&movd ("xmm3",&DWP(4*7,"esi"));
&pshufd ("xmm2","xmm2",0b11001100);
&movdqa (&QWP(0x60,"ebx"),"xmm2");
&pmuludq("xmm2","xmm7"); # a[6]*b[0]
&movdqa (&QWP(0x40,"esp"),"xmm1");
&psubq ("xmm4","xmm5"); # xmm4 = xmm5*0xffffffff, reduction step
&movd ("xmm0",&DWP(0,"ebp")); # b[1] -> 0000.00xy
&pshufd ("xmm3","xmm3",0b11001100);
&movdqa (&QWP(0x70,"ebx"),"xmm3");
&pmuludq("xmm3","xmm7"); # a[7]*b[0]
&pshuflw("xmm7","xmm0",0b11011100); # 0000.00xy -> 0000.0x0y
&movdqa ("xmm0",&QWP(0x00,"ebx")); # pre-load converted a[0]
&pshufd ("xmm7","xmm7",0b11011100); # 0000.0x0y -> 000x.000y
# Main loop: six passes for digits b[1..6]; b[7] is handled by the
# unrolled epilogue after the loop.  The reduction of the previous
# pass's t[6]/t[7] is modulo-scheduled into the top of each pass.
&mov ("ecx",6);
&lea ("ebp",&DWP(4,"ebp"));
&jmp (&label("madd_sse2")); # falls through to the aligned label
&set_label("madd_sse2",16);
&paddq ("xmm2","xmm5"); # a[6]*b[i-1]+lw(a[0]*b[i-1]), reduction step [modulo-scheduled]
&paddq ("xmm3","xmm4"); # a[7]*b[i-1]+lw(a[0]*b[i-1])*0xffffffff, reduction step [modulo-scheduled]
&movdqa ("xmm1",&QWP(0x10,"ebx"));
&pmuludq("xmm0","xmm7"); # a[0]*b[i]
&movdqa(&QWP(0x50,"esp"),"xmm2");
&movdqa ("xmm2",&QWP(0x20,"ebx"));
&pmuludq("xmm1","xmm7"); # a[1]*b[i]
&movdqa(&QWP(0x60,"esp"),"xmm3");
&paddq ("xmm0",&QWP(0x00,"esp"));
&movdqa ("xmm3",&QWP(0x30,"ebx"));
&pmuludq("xmm2","xmm7"); # a[2]*b[i]
&movq ("xmm4","xmm0"); # clear upper 64 bits
&pslldq("xmm4",6);
&paddq ("xmm1",&QWP(0x10,"esp"));
&paddq ("xmm4","xmm0");
&movdqa("xmm5","xmm4");
&psrldq("xmm4",10); # upper 33 bits of a[0]*b[i]+t[0]
&movdqa ("xmm0",&QWP(0x40,"ebx"));
&pmuludq("xmm3","xmm7"); # a[3]*b[i]
&paddq ("xmm1","xmm4"); # a[1]*b[i]+hw(a[0]*b[i]), carry
&paddq ("xmm2",&QWP(0x20,"esp"));
&movdqa (&QWP(0x00,"esp"),"xmm1");
&movdqa ("xmm1",&QWP(0x50,"ebx"));
&pmuludq("xmm0","xmm7"); # a[4]*b[i]
&paddq ("xmm3",&QWP(0x30,"esp"));
&movdqa (&QWP(0x10,"esp"),"xmm2");
&pand ("xmm5","xmm6"); # lower 32 bits of a[0]*b[i]
&movdqa ("xmm2",&QWP(0x60,"ebx"));
&pmuludq("xmm1","xmm7"); # a[5]*b[i]
&paddq ("xmm3","xmm5"); # a[3]*b[i]+lw(a[0]*b[i]), reduction step
&paddq ("xmm0",&QWP(0x40,"esp"));
&movdqa (&QWP(0x20,"esp"),"xmm3");
&pshufd("xmm4","xmm5",0b10110001); # xmm4 = xmm5<<32, reduction step
&movdqa ("xmm3","xmm7");
&pmuludq("xmm2","xmm7"); # a[6]*b[i]
&movd ("xmm7",&DWP(0,"ebp")); # b[i++] -> 0000.00xy
&lea ("ebp",&DWP(4,"ebp"));
&paddq ("xmm1",&QWP(0x50,"esp"));
&psubq ("xmm4","xmm5"); # xmm4 = xmm5*0xffffffff, reduction step
&movdqa (&QWP(0x30,"esp"),"xmm0");
&pshuflw("xmm7","xmm7",0b11011100); # 0000.00xy -> 0000.0x0y
&pmuludq("xmm3",&QWP(0x70,"ebx")); # a[7]*b[i]
&pshufd("xmm7","xmm7",0b11011100); # 0000.0x0y -> 000x.000y
&movdqa("xmm0",&QWP(0x00,"ebx")); # pre-load converted a[0]
&movdqa (&QWP(0x40,"esp"),"xmm1");
&paddq ("xmm2",&QWP(0x60,"esp"));
&dec ("ecx");
&jnz (&label("madd_sse2"));
# Epilogue for the last digit b[7]: same schedule as the loop body,
# except there is no next b[i] to fetch, so xmm7 is freed up and
# repurposed below as a lane mask for the flatten phase.
&paddq ("xmm2","xmm5"); # a[6]*b[6]+lw(a[0]*b[6]), reduction step [modulo-scheduled]
&paddq ("xmm3","xmm4"); # a[7]*b[6]+lw(a[0]*b[6])*0xffffffff, reduction step [modulo-scheduled]
&movdqa ("xmm1",&QWP(0x10,"ebx"));
&pmuludq("xmm0","xmm7"); # a[0]*b[7]
&movdqa(&QWP(0x50,"esp"),"xmm2");
&movdqa ("xmm2",&QWP(0x20,"ebx"));
&pmuludq("xmm1","xmm7"); # a[1]*b[7]
&movdqa(&QWP(0x60,"esp"),"xmm3");
&paddq ("xmm0",&QWP(0x00,"esp"));
&movdqa ("xmm3",&QWP(0x30,"ebx"));
&pmuludq("xmm2","xmm7"); # a[2]*b[7]
&movq ("xmm4","xmm0"); # clear upper 64 bits
&pslldq("xmm4",6);
&paddq ("xmm1",&QWP(0x10,"esp"));
&paddq ("xmm4","xmm0");
&movdqa("xmm5","xmm4");
&psrldq("xmm4",10); # upper 33 bits of a[0]*b[i]+t[0]
&movdqa ("xmm0",&QWP(0x40,"ebx"));
&pmuludq("xmm3","xmm7"); # a[3]*b[7]
&paddq ("xmm1","xmm4"); # a[1]*b[7]+hw(a[0]*b[7]), carry
&paddq ("xmm2",&QWP(0x20,"esp"));
&movdqa (&QWP(0x00,"esp"),"xmm1");
&movdqa ("xmm1",&QWP(0x50,"ebx"));
&pmuludq("xmm0","xmm7"); # a[4]*b[7]
&paddq ("xmm3",&QWP(0x30,"esp"));
&movdqa (&QWP(0x10,"esp"),"xmm2");
&pand ("xmm5","xmm6"); # lower 32 bits of a[0]*b[i]
&movdqa ("xmm2",&QWP(0x60,"ebx"));
&pmuludq("xmm1","xmm7"); # a[5]*b[7]
&paddq ("xmm3","xmm5"); # a[3]*b[7]+lw(a[0]*b[7]), reduction step
&paddq ("xmm0",&QWP(0x40,"esp"));
&movdqa (&QWP(0x20,"esp"),"xmm3");
&pshufd("xmm4","xmm5",0b10110001); # xmm4 = xmm5<<32, reduction step
&movdqa ("xmm3",&QWP(0x70,"ebx"));
&pmuludq("xmm2","xmm7"); # a[6]*b[7]
&paddq ("xmm1",&QWP(0x50,"esp"));
&psubq ("xmm4","xmm5"); # xmm4 = xmm5*0xffffffff, reduction step
&movdqa (&QWP(0x30,"esp"),"xmm0");
&pmuludq("xmm3","xmm7"); # a[7]*b[7]
&pcmpeqd("xmm7","xmm7"); # xmm7 no longer a b[i] digit:
&movdqa ("xmm0",&QWP(0x00,"esp"));
&pslldq ("xmm7",8); # ... now an upper-qword-lane mask
&movdqa (&QWP(0x40,"esp"),"xmm1");
&paddq ("xmm2",&QWP(0x60,"esp"));
&paddq ("xmm2","xmm5"); # a[6]*b[7]+lw(a[0]*b[7]), reduction step
&paddq ("xmm3","xmm4"); # a[7]*b[7]+lw(a[0]*b[7])*0xffffffff, reduction step
&movdqa(&QWP(0x50,"esp"),"xmm2");
&movdqa(&QWP(0x60,"esp"),"xmm3");
# "Flatten" phase: fold each 64-bit lane accumulator's excess into the
# next limb, extracting one 32-bit result limb at a time, while
# simultaneously subtracting the modulus with a sub/sbb chain to learn
# whether the result exceeds it (see note further down).
&movdqa ("xmm1",&QWP(0x10,"esp"));
&movdqa ("xmm2",&QWP(0x20,"esp"));
&movdqa ("xmm3",&QWP(0x30,"esp"));
&movq ("xmm4","xmm0"); # "flatten"
&pand ("xmm0","xmm7");
&xor ("ebp","ebp");
&pslldq ("xmm4",6);
&movq ("xmm5","xmm1");
&paddq ("xmm0","xmm4");
&pand ("xmm1","xmm7");
&psrldq ("xmm0",6);
&movd ("eax","xmm0");
&psrldq ("xmm0",4);
&paddq ("xmm5","xmm0");
&movdqa ("xmm0",&QWP(0x40,"esp"));
&sub ("eax",-1); # start subtracting modulus,
# this is used to determine
# if result is larger/smaller
# than modulus (see below)
&pslldq ("xmm5",6);
&movq ("xmm4","xmm2");
&paddq ("xmm1","xmm5");
&pand ("xmm2","xmm7");
&psrldq ("xmm1",6);
&mov (&DWP(4*0,"edi"),"eax");
&movd ("eax","xmm1");
&psrldq ("xmm1",4);
&paddq ("xmm4","xmm1");
&movdqa ("xmm1",&QWP(0x50,"esp"));
&sbb ("eax",-1);
&pslldq ("xmm4",6);
&movq ("xmm5","xmm3");
&paddq ("xmm2","xmm4");
&pand ("xmm3","xmm7");
&psrldq ("xmm2",6);
&mov (&DWP(4*1,"edi"),"eax");
&movd ("eax","xmm2");
&psrldq ("xmm2",4);
&paddq ("xmm5","xmm2");
&movdqa ("xmm2",&QWP(0x60,"esp"));
&sbb ("eax",-1);
&pslldq ("xmm5",6);
&movq ("xmm4","xmm0");
&paddq ("xmm3","xmm5");
&pand ("xmm0","xmm7");
&psrldq ("xmm3",6);
&mov (&DWP(4*2,"edi"),"eax");
&movd ("eax","xmm3");
&psrldq ("xmm3",4);
&paddq ("xmm4","xmm3");
&sbb ("eax",0);
&pslldq ("xmm4",6);
&movq ("xmm5","xmm1");
&paddq ("xmm0","xmm4");
&pand ("xmm1","xmm7");
&psrldq ("xmm0",6);
&mov (&DWP(4*3,"edi"),"eax");
&movd ("eax","xmm0");
&psrldq ("xmm0",4);
&paddq ("xmm5","xmm0");
&sbb ("eax",0);
&pslldq ("xmm5",6);
&movq ("xmm4","xmm2");
&paddq ("xmm1","xmm5");
&pand ("xmm2","xmm7");
&psrldq ("xmm1",6);
&movd ("ebx","xmm1");
&psrldq ("xmm1",4);
&mov ("esp","edx"); # restore stack pointer (presumably saved in edx by the prologue — outside this view)
&paddq ("xmm4","xmm1");
&pslldq ("xmm4",6);
&paddq ("xmm2","xmm4");
&psrldq ("xmm2",6);
&movd ("ecx","xmm2");
&psrldq ("xmm2",4);
&sbb ("ebx",0);
&movd ("edx","xmm2");
&pextrw ("esi","xmm2",2); # top-most overflow bit
&sbb ("ecx",1); # modulus limbs subtracted so far: -1,-1,-1,0,0,0,1,...
&sbb ("edx",-1);
&sbb ("esi",0); # borrow from subtraction
# Final step is "if result > mod, subtract mod", and at this point
# we have result - mod written to output buffer, as well as borrow
# bit from this subtraction, and if borrow bit is set, we add
# modulus back.
#
# Note that because mod has special form, i.e. consists of
# 0xffffffff, 1 and 0s, we can conditionally synthesize it by
# assigning borrow bit to one register, %ebp, and its negative
# to another, %esi. But we started by calculating %esi...
&sub ("ebp","esi"); # esi = 0 or -1 (mask), ebp = 0 or 1
&add (&DWP(4*0,"edi"),"esi"); # add modulus or zero
&adc (&DWP(4*1,"edi"),"esi");
&adc (&DWP(4*2,"edi"),"esi");
&adc (&DWP(4*3,"edi"),0);
&adc ("eax",0);
&adc ("ebx",0);
&mov (&DWP(4*4,"edi"),"eax");
&adc ("ecx","ebp");
&mov (&DWP(4*5,"edi"),"ebx");
&adc ("edx","esi");
&mov (&DWP(4*6,"edi"),"ecx");
&mov (&DWP(4*7,"edi"),"edx");
&ret ();
} # Non-SSE2 code removed.
&function_end_B("_ecp_nistz256_mul_mont");
########################################################################
# following subroutines are "literal" implementation of those found in
# ecp_nistz256.c
#
########################################################################
# void GFp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
#
&static_label("point_double_shortcut");
&function_begin("GFp_nistz256_point_double");
{ my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));