_ecp_nistz256_mul_mont()

in ring/crypto/fipsmodule/ec/asm/ecp_nistz256-x86.pl [440:769]


	&sub	("esp",0x100);

	&movd	("xmm7",&DWP(0,"ebp"));		# b[0] -> 0000.00xy
	&lea	("ebp",&DWP(4,"ebp"));
	&pcmpeqd("xmm6","xmm6");
	&psrlq	("xmm6",48);			# compose 0xffff<<64|0xffff

	&pshuflw("xmm7","xmm7",0b11011100);	# 0000.00xy -> 0000.0x0y
	&and	("esp",-64);
	&pshufd	("xmm7","xmm7",0b11011100);	# 0000.0x0y -> 000x.000y
	&lea	("ebx",&DWP(0x80,"esp"));

	&movd	("xmm0",&DWP(4*0,"esi"));	# a[0] -> 0000.00xy
	&pshufd	("xmm0","xmm0",0b11001100);	# 0000.00xy -> 00xy.00xy
	&movd	("xmm1",&DWP(4*1,"esi"));	# a[1] -> ...
	&movdqa	(&QWP(0x00,"ebx"),"xmm0");	# offload converted a[0]
	&pmuludq("xmm0","xmm7");		# a[0]*b[0]

	&movd	("xmm2",&DWP(4*2,"esi"));
	&pshufd	("xmm1","xmm1",0b11001100);
	&movdqa	(&QWP(0x10,"ebx"),"xmm1");
	&pmuludq("xmm1","xmm7");		# a[1]*b[0]

	 &movq	("xmm4","xmm0");		# clear upper 64 bits
	 &pslldq("xmm4",6);
	 &paddq	("xmm4","xmm0");
	 &movdqa("xmm5","xmm4");
	 &psrldq("xmm4",10);			# upper 32 bits of a[0]*b[0]
	 &pand	("xmm5","xmm6");		# lower 32 bits of a[0]*b[0]

	# The upper half of a[0]*b[i] is carried into the next multiplication
	# iteration, while the lower half "participates" in the actual
	# reduction. Normally the latter is done by accumulating the result
	# of multiplying the modulus by the "magic" digit, but thanks to the
	# special form of the modulus and of the "magic" digit it can be
	# performed with additions and subtractions alone (see the note in
	# the IALU section below). Note that we do not bother with carry
	# bits; they are accumulated in the "flatten" phase after all
	# multiplications and reductions.
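	#
	# Concretely (a sketch of the arithmetic, using the prime
	# p = 2^256-2^224+2^192+2^96-1): since p = -1 (mod 2^32), the
	# Montgomery factor -p^(-1) mod 2^32 is 1, so the "magic" digit is
	# simply m = t[0] mod 2^32, and the step t += m*p decomposes into
	#	t[0] -= m		(cancels t[0], which is just dropped)
	#	t[3] += m		(the 2^96 term)
	#	t[6] += m		(the 2^192 term)
	#	t[7] += m*0xffffffff	(the 2^256-2^224 terms, computed as
	#				 (m<<32)-m by pshufd/psubq below)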

	&movd	("xmm3",&DWP(4*3,"esi"));
	&pshufd	("xmm2","xmm2",0b11001100);
	&movdqa	(&QWP(0x20,"ebx"),"xmm2");
	&pmuludq("xmm2","xmm7");		# a[2]*b[0]
	 &paddq	("xmm1","xmm4");		# a[1]*b[0]+hw(a[0]*b[0]), carry
	&movdqa	(&QWP(0x00,"esp"),"xmm1");	# t[0]

	&movd	("xmm0",&DWP(4*4,"esi"));
	&pshufd	("xmm3","xmm3",0b11001100);
	&movdqa	(&QWP(0x30,"ebx"),"xmm3");
	&pmuludq("xmm3","xmm7");		# a[3]*b[0]
	&movdqa	(&QWP(0x10,"esp"),"xmm2");

	&movd	("xmm1",&DWP(4*5,"esi"));
	&pshufd	("xmm0","xmm0",0b11001100);
	&movdqa	(&QWP(0x40,"ebx"),"xmm0");
	&pmuludq("xmm0","xmm7");		# a[4]*b[0]
	 &paddq	("xmm3","xmm5");		# a[3]*b[0]+lw(a[0]*b[0]), reduction step
	&movdqa	(&QWP(0x20,"esp"),"xmm3");

	&movd	("xmm2",&DWP(4*6,"esi"));
	&pshufd	("xmm1","xmm1",0b11001100);
	&movdqa	(&QWP(0x50,"ebx"),"xmm1");
	&pmuludq("xmm1","xmm7");		# a[5]*b[0]
	&movdqa	(&QWP(0x30,"esp"),"xmm0");
	 &pshufd("xmm4","xmm5",0b10110001);	# xmm4 = xmm5<<32, reduction step

	&movd	("xmm3",&DWP(4*7,"esi"));
	&pshufd	("xmm2","xmm2",0b11001100);
	&movdqa	(&QWP(0x60,"ebx"),"xmm2");
	&pmuludq("xmm2","xmm7");		# a[6]*b[0]
	&movdqa	(&QWP(0x40,"esp"),"xmm1");
	 &psubq	("xmm4","xmm5");		# xmm4 = xmm5*0xffffffff, reduction step

	&movd	("xmm0",&DWP(0,"ebp"));		# b[1] -> 0000.00xy
	&pshufd	("xmm3","xmm3",0b11001100);
	&movdqa	(&QWP(0x70,"ebx"),"xmm3");
	&pmuludq("xmm3","xmm7");		# a[7]*b[0]

	&pshuflw("xmm7","xmm0",0b11011100);	# 0000.00xy -> 0000.0x0y
	&movdqa	("xmm0",&QWP(0x00,"ebx"));	# pre-load converted a[0]
	&pshufd	("xmm7","xmm7",0b11011100);	# 0000.0x0y -> 000x.000y

	&mov	("ecx",6);
	&lea	("ebp",&DWP(4,"ebp"));
	&jmp	(&label("madd_sse2"));
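	# Main loop: six iterations cover b[1] through b[6]; b[0] was handled
	# above and b[7] is peeled into the epilogue below. The reduction
	# steps for b[i-1] are modulo-scheduled into iteration i.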

&set_label("madd_sse2",16);
	 &paddq	("xmm2","xmm5");		# a[6]*b[i-1]+lw(a[0]*b[i-1]), reduction step [modulo-scheduled]
	 &paddq	("xmm3","xmm4");		# a[7]*b[i-1]+lw(a[0]*b[i-1])*0xffffffff, reduction step [modulo-scheduled]
	&movdqa	("xmm1",&QWP(0x10,"ebx"));
	&pmuludq("xmm0","xmm7");		# a[0]*b[i]
	 &movdqa(&QWP(0x50,"esp"),"xmm2");

	&movdqa	("xmm2",&QWP(0x20,"ebx"));
	&pmuludq("xmm1","xmm7");		# a[1]*b[i]
	 &movdqa(&QWP(0x60,"esp"),"xmm3");
	&paddq	("xmm0",&QWP(0x00,"esp"));

	&movdqa	("xmm3",&QWP(0x30,"ebx"));
	&pmuludq("xmm2","xmm7");		# a[2]*b[i]
	 &movq	("xmm4","xmm0");		# clear upper 64 bits
	 &pslldq("xmm4",6);
	&paddq	("xmm1",&QWP(0x10,"esp"));
	 &paddq	("xmm4","xmm0");
	 &movdqa("xmm5","xmm4");
	 &psrldq("xmm4",10);			# upper 33 bits of a[0]*b[i]+t[0]

	&movdqa	("xmm0",&QWP(0x40,"ebx"));
	&pmuludq("xmm3","xmm7");		# a[3]*b[i]
	 &paddq	("xmm1","xmm4");		# a[1]*b[i]+hw(a[0]*b[i]), carry
	&paddq	("xmm2",&QWP(0x20,"esp"));
	&movdqa	(&QWP(0x00,"esp"),"xmm1");

	&movdqa	("xmm1",&QWP(0x50,"ebx"));
	&pmuludq("xmm0","xmm7");		# a[4]*b[i]
	&paddq	("xmm3",&QWP(0x30,"esp"));
	&movdqa	(&QWP(0x10,"esp"),"xmm2");
	 &pand	("xmm5","xmm6");		# lower 32 bits of a[0]*b[i]

	&movdqa	("xmm2",&QWP(0x60,"ebx"));
	&pmuludq("xmm1","xmm7");		# a[5]*b[i]
	 &paddq	("xmm3","xmm5");		# a[3]*b[i]+lw(a[0]*b[i]), reduction step
	&paddq	("xmm0",&QWP(0x40,"esp"));
	&movdqa	(&QWP(0x20,"esp"),"xmm3");
	 &pshufd("xmm4","xmm5",0b10110001);	# xmm4 = xmm5<<32, reduction step

	&movdqa	("xmm3","xmm7");
	&pmuludq("xmm2","xmm7");		# a[6]*b[i]
	 &movd	("xmm7",&DWP(0,"ebp"));		# b[i++] -> 0000.00xy
	 &lea	("ebp",&DWP(4,"ebp"));
	&paddq	("xmm1",&QWP(0x50,"esp"));
	 &psubq	("xmm4","xmm5");		# xmm4 = xmm5*0xffffffff, reduction step
	&movdqa	(&QWP(0x30,"esp"),"xmm0");
	 &pshuflw("xmm7","xmm7",0b11011100);	# 0000.00xy -> 0000.0x0y

	&pmuludq("xmm3",&QWP(0x70,"ebx"));	# a[7]*b[i]
	 &pshufd("xmm7","xmm7",0b11011100);	# 0000.0x0y -> 000x.000y
	 &movdqa("xmm0",&QWP(0x00,"ebx"));	# pre-load converted a[0]
	&movdqa	(&QWP(0x40,"esp"),"xmm1");
	&paddq	("xmm2",&QWP(0x60,"esp"));

	&dec	("ecx");
	&jnz	(&label("madd_sse2"));
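	# Epilogue: the peeled-out b[7] iteration. It first completes the
	# modulo-scheduled reduction of b[6], then performs the last round
	# of multiplications; xmm7 is afterwards recycled as the all-ones<<64
	# mask that selects the high lane during the "flatten" phase.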

	 &paddq	("xmm2","xmm5");		# a[6]*b[6]+lw(a[0]*b[6]), reduction step [modulo-scheduled]
	 &paddq	("xmm3","xmm4");		# a[7]*b[6]+lw(a[0]*b[6])*0xffffffff, reduction step [modulo-scheduled]
	&movdqa	("xmm1",&QWP(0x10,"ebx"));
	&pmuludq("xmm0","xmm7");		# a[0]*b[7]
	 &movdqa(&QWP(0x50,"esp"),"xmm2");

	&movdqa	("xmm2",&QWP(0x20,"ebx"));
	&pmuludq("xmm1","xmm7");		# a[1]*b[7]
	 &movdqa(&QWP(0x60,"esp"),"xmm3");
	&paddq	("xmm0",&QWP(0x00,"esp"));

	&movdqa	("xmm3",&QWP(0x30,"ebx"));
	&pmuludq("xmm2","xmm7");		# a[2]*b[7]
	 &movq	("xmm4","xmm0");		# clear upper 64 bits
	 &pslldq("xmm4",6);
	&paddq	("xmm1",&QWP(0x10,"esp"));
	 &paddq	("xmm4","xmm0");
	 &movdqa("xmm5","xmm4");
	 &psrldq("xmm4",10);			# upper 33 bits of a[0]*b[i]+t[0]

	&movdqa	("xmm0",&QWP(0x40,"ebx"));
	&pmuludq("xmm3","xmm7");		# a[3]*b[7]
	 &paddq	("xmm1","xmm4");		# a[1]*b[7]+hw(a[0]*b[7]), carry
	&paddq	("xmm2",&QWP(0x20,"esp"));
	&movdqa	(&QWP(0x00,"esp"),"xmm1");

	&movdqa	("xmm1",&QWP(0x50,"ebx"));
	&pmuludq("xmm0","xmm7");		# a[4]*b[7]
	&paddq	("xmm3",&QWP(0x30,"esp"));
	&movdqa	(&QWP(0x10,"esp"),"xmm2");
	 &pand	("xmm5","xmm6");		# lower 32 bits of a[0]*b[i]

	&movdqa	("xmm2",&QWP(0x60,"ebx"));
	&pmuludq("xmm1","xmm7");		# a[5]*b[7]
	 &paddq	("xmm3","xmm5");		# reduction step
	&paddq	("xmm0",&QWP(0x40,"esp"));
	&movdqa	(&QWP(0x20,"esp"),"xmm3");
	 &pshufd("xmm4","xmm5",0b10110001);	# xmm4 = xmm5<<32, reduction step

	&movdqa	("xmm3",&QWP(0x70,"ebx"));
	&pmuludq("xmm2","xmm7");		# a[6]*b[7]
	&paddq	("xmm1",&QWP(0x50,"esp"));
	 &psubq	("xmm4","xmm5");		# xmm4 = xmm5*0xffffffff, reduction step
	&movdqa	(&QWP(0x30,"esp"),"xmm0");

	&pmuludq("xmm3","xmm7");		# a[7]*b[7]
	&pcmpeqd("xmm7","xmm7");
	&movdqa	("xmm0",&QWP(0x00,"esp"));
	&pslldq	("xmm7",8);
	&movdqa	(&QWP(0x40,"esp"),"xmm1");
	&paddq	("xmm2",&QWP(0x60,"esp"));

	 &paddq	("xmm2","xmm5");		# a[6]*b[7]+lw(a[0]*b[7]), reduction step
	 &paddq	("xmm3","xmm4");		# a[6]*b[7]+lw(a[0]*b[7])*0xffffffff, reduction step
	 &movdqa(&QWP(0x50,"esp"),"xmm2");
	 &movdqa(&QWP(0x60,"esp"),"xmm3");

	&movdqa	("xmm1",&QWP(0x10,"esp"));
	&movdqa	("xmm2",&QWP(0x20,"esp"));
	&movdqa	("xmm3",&QWP(0x30,"esp"));

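	# "Flatten" phase: each t[i] still holds its value split across two
	# 64-bit lanes (weight 2^16 apart) with carries not yet propagated.
	# Each round below recombines the lanes, ripples the carry into the
	# next word and emits one 32-bit result word, while the interleaved
	# sub/sbb chain subtracts the modulus words (0xffffffff, 0xffffffff,
	# 0xffffffff, 0, 0, 0, 1, 0xffffffff) from the emitted words on the
	# fly.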
	&movq	("xmm4","xmm0");		# "flatten"
	&pand	("xmm0","xmm7");
	&xor	("ebp","ebp");
	&pslldq	("xmm4",6);
	 &movq	("xmm5","xmm1");
	&paddq	("xmm0","xmm4");
	 &pand	("xmm1","xmm7");
	&psrldq	("xmm0",6);
	&movd	("eax","xmm0");
	&psrldq	("xmm0",4);

	&paddq	("xmm5","xmm0");
	&movdqa	("xmm0",&QWP(0x40,"esp"));
	&sub	("eax",-1);			# start subtracting modulus,
						# this is used to determine
						# if result is larger/smaller
						# than modulus (see below)
	&pslldq	("xmm5",6);
	 &movq	("xmm4","xmm2");
	&paddq	("xmm1","xmm5");
	 &pand	("xmm2","xmm7");
	&psrldq	("xmm1",6);
	&mov	(&DWP(4*0,"edi"),"eax");
	&movd	("eax","xmm1");
	&psrldq	("xmm1",4);

	&paddq	("xmm4","xmm1");
	&movdqa	("xmm1",&QWP(0x50,"esp"));
	&sbb	("eax",-1);
	&pslldq	("xmm4",6);
	 &movq	("xmm5","xmm3");
	&paddq	("xmm2","xmm4");
	 &pand	("xmm3","xmm7");
	&psrldq	("xmm2",6);
	&mov	(&DWP(4*1,"edi"),"eax");
	&movd	("eax","xmm2");
	&psrldq	("xmm2",4);

	&paddq	("xmm5","xmm2");
	&movdqa	("xmm2",&QWP(0x60,"esp"));
	&sbb	("eax",-1);
	&pslldq	("xmm5",6);
	 &movq	("xmm4","xmm0");
	&paddq	("xmm3","xmm5");
	 &pand	("xmm0","xmm7");
	&psrldq	("xmm3",6);
	&mov	(&DWP(4*2,"edi"),"eax");
	&movd	("eax","xmm3");
	&psrldq	("xmm3",4);

	&paddq	("xmm4","xmm3");
	&sbb	("eax",0);
	&pslldq	("xmm4",6);
	 &movq	("xmm5","xmm1");
	&paddq	("xmm0","xmm4");
	 &pand	("xmm1","xmm7");
	&psrldq	("xmm0",6);
	&mov	(&DWP(4*3,"edi"),"eax");
	&movd	("eax","xmm0");
	&psrldq	("xmm0",4);

	&paddq	("xmm5","xmm0");
	&sbb	("eax",0);
	&pslldq	("xmm5",6);
	 &movq	("xmm4","xmm2");
	&paddq	("xmm1","xmm5");
	 &pand	("xmm2","xmm7");
	&psrldq	("xmm1",6);
	&movd	("ebx","xmm1");
	&psrldq	("xmm1",4);
	&mov	("esp","edx");

	&paddq	("xmm4","xmm1");
	&pslldq	("xmm4",6);
	&paddq	("xmm2","xmm4");
	&psrldq	("xmm2",6);
	&movd	("ecx","xmm2");
	&psrldq	("xmm2",4);
	&sbb	("ebx",0);
	&movd	("edx","xmm2");
	&pextrw	("esi","xmm2",2);		# top-most overflow bit
	&sbb	("ecx",1);
	&sbb	("edx",-1);
	&sbb	("esi",0);			# borrow from subtraction

	# The final step is "if result >= mod, subtract mod", and at this
	# point we have result - mod written to the output buffer, as well
	# as the borrow bit from that subtraction; if the borrow bit is
	# set, we add the modulus back.
	#
	# Note that because the modulus has a special form, i.e. consists
	# of 0xffffffff, 1 and 0s, we can conditionally synthesize it by
	# assigning the borrow bit to one register, %ebp, and its negation
	# to another, %esi. But we started by calculating %esi...
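	#
	# After the "sbb esi,0" above, %esi is 0xffffffff when the modulus
	# must be added back and 0 otherwise; "sub ebp,esi" below derives
	# its negation in %ebp. Adding %esi at words 0-2 and 7, zero at
	# words 3-5 and %ebp at word 6 then adds back exactly the
	# little-endian words of p, or nothing at all.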

	&sub	("ebp","esi");
	&add	(&DWP(4*0,"edi"),"esi");	# add modulus or zero
	&adc	(&DWP(4*1,"edi"),"esi");
	&adc	(&DWP(4*2,"edi"),"esi");
	&adc	(&DWP(4*3,"edi"),0);
	&adc	("eax",0);
	&adc	("ebx",0);
	&mov	(&DWP(4*4,"edi"),"eax");
	&adc	("ecx","ebp");
	&mov	(&DWP(4*5,"edi"),"ebx");
	&adc	("edx","esi");
	&mov	(&DWP(4*6,"edi"),"ecx");
	&mov	(&DWP(4*7,"edi"),"edx");

	&ret	();

}	# Non-SSE2 code removed.

&function_end_B("_ecp_nistz256_mul_mont");

########################################################################
# The following subroutines are "literal" implementations of those found
# in ecp_nistz256.c.
#
########################################################################
# void GFp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
#
&static_label("point_double_shortcut");
&function_begin("GFp_nistz256_point_double");
{   my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));