/*
 * fmul2() — excerpted from crypto/curve25519-x86_64.c (lines 294-539).
 */

/*
 * fmul2 - two independent multiplications in the field GF(2^255 - 19)
 *
 * Computes, in one pass:
 *	out[0..3] = f1[0..3] * f2[0..3]  mod 2^255 - 19
 *	out[4..7] = f1[4..7] * f2[4..7]  mod 2^255 - 19
 * i.e. each of @out, @f1, @f2 holds a PAIR of field elements, each element
 * being four 64-bit limbs, little-endian limb order.
 *
 * @out: receives the two reduced 256-bit results (8 u64s)
 * @f1:  first pair of operands (8 u64s)
 * @f2:  second pair of operands (8 u64s)
 * @tmp: scratch for the two raw 512-bit products (16 u64s / 128 bytes)
 *
 * Requires BMI2 (mulx) and ADX (adcx/adox): each schoolbook row runs two
 * independent carry chains, one through CF (adcx) and one through OF (adox).
 *
 * Constraint notes:
 *  - f1/f2/tmp are "+&r" (in/out, early-clobber) because the asm destroys
 *    them: after the raw multiplies, %0 is repointed at tmp and %2 at out
 *    (the "Line up pointers" movs below), and %1 is zeroed by
 *    "xor %k1, %k1" so it doubles as a zero source in the carry folds.
 *  - out is a plain "r" input (%3); it is only read, then copied into %2.
 *  - r12/r15 are never touched, hence absent from the clobber list; rbx,
 *    r13, r14 are used and therefore listed.
 *
 * Reduction uses 2^256 = 38 (mod 2^255 - 19): the high four limbs of each
 * 512-bit product are multiplied by 38 and folded into the low four limbs,
 * then the final carry bit is folded once more via cmovc.
 */
static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
{
	asm volatile(

		/*
		 * Raw 4x4-limb schoolbook multiply:
		 * tmp[0..7] <- f1[0..3] * f2[0..3] (512-bit, unreduced).
		 * Each row below: "xor %%r10d, %%r10d" clears both CF and OF
		 * to start fresh adcx/adox chains (r10 is immediately
		 * overwritten by the next mulx, so no value is lost).
		 */

		/* Compute src1[0] * src2 */
		"  movq 0(%0), %%rdx;"
		"  mulxq 0(%1), %%r8, %%r9;"
		"  xor %%r10d, %%r10d;"
		"  movq %%r8, 0(%2);"
		"  mulxq 8(%1), %%r10, %%r11;"
		"  adox %%r9, %%r10;"
		"  movq %%r10, 8(%2);"
		"  mulxq 16(%1), %%rbx, %%r13;"
		"  adox %%r11, %%rbx;"
		"  mulxq 24(%1), %%r14, %%rdx;"
		"  adox %%r13, %%r14;"
		"  mov $0, %%rax;"
		"  adox %%rdx, %%rax;"

		/* Compute src1[1] * src2 */
		"  movq 8(%0), %%rdx;"
		"  mulxq 0(%1), %%r8, %%r9;"
		"  xor %%r10d, %%r10d;"
		"  adcxq 8(%2), %%r8;"
		"  movq %%r8, 8(%2);"
		"  mulxq 8(%1), %%r10, %%r11;"
		"  adox %%r9, %%r10;"
		"  adcx %%rbx, %%r10;"
		"  movq %%r10, 16(%2);"
		"  mulxq 16(%1), %%rbx, %%r13;"
		"  adox %%r11, %%rbx;"
		"  adcx %%r14, %%rbx;"
		"  mov $0, %%r8;"
		"  mulxq 24(%1), %%r14, %%rdx;"
		"  adox %%r13, %%r14;"
		"  adcx %%rax, %%r14;"
		"  mov $0, %%rax;"
		"  adox %%rdx, %%rax;"
		"  adcx %%r8, %%rax;"

		/* Compute src1[2] * src2 */
		"  movq 16(%0), %%rdx;"
		"  mulxq 0(%1), %%r8, %%r9;"
		"  xor %%r10d, %%r10d;"
		"  adcxq 16(%2), %%r8;"
		"  movq %%r8, 16(%2);"
		"  mulxq 8(%1), %%r10, %%r11;"
		"  adox %%r9, %%r10;"
		"  adcx %%rbx, %%r10;"
		"  movq %%r10, 24(%2);"
		"  mulxq 16(%1), %%rbx, %%r13;"
		"  adox %%r11, %%rbx;"
		"  adcx %%r14, %%rbx;"
		"  mov $0, %%r8;"
		"  mulxq 24(%1), %%r14, %%rdx;"
		"  adox %%r13, %%r14;"
		"  adcx %%rax, %%r14;"
		"  mov $0, %%rax;"
		"  adox %%rdx, %%rax;"
		"  adcx %%r8, %%rax;"

		/* Compute src1[3] * src2 */
		"  movq 24(%0), %%rdx;"
		"  mulxq 0(%1), %%r8, %%r9;"
		"  xor %%r10d, %%r10d;"
		"  adcxq 24(%2), %%r8;"
		"  movq %%r8, 24(%2);"
		"  mulxq 8(%1), %%r10, %%r11;"
		"  adox %%r9, %%r10;"
		"  adcx %%rbx, %%r10;"
		"  movq %%r10, 32(%2);"
		"  mulxq 16(%1), %%rbx, %%r13;"
		"  adox %%r11, %%rbx;"
		"  adcx %%r14, %%rbx;"
		"  movq %%rbx, 40(%2);"
		"  mov $0, %%r8;"
		"  mulxq 24(%1), %%r14, %%rdx;"
		"  adox %%r13, %%r14;"
		"  adcx %%rax, %%r14;"
		"  movq %%r14, 48(%2);"
		"  mov $0, %%rax;"
		"  adox %%rdx, %%rax;"
		"  adcx %%r8, %%rax;"
		"  movq %%rax, 56(%2);"

		/*
		 * Second raw multiply, identical structure, on the second
		 * element of each pair (limbs 4..7, byte offsets 32..56):
		 * tmp[8..15] <- f1[4..7] * f2[4..7].
		 */

		/* Compute src1[0] * src2 */
		"  movq 32(%0), %%rdx;"
		"  mulxq 32(%1), %%r8, %%r9;"
		"  xor %%r10d, %%r10d;"
		"  movq %%r8, 64(%2);"
		"  mulxq 40(%1), %%r10, %%r11;"
		"  adox %%r9, %%r10;"
		"  movq %%r10, 72(%2);"
		"  mulxq 48(%1), %%rbx, %%r13;"
		"  adox %%r11, %%rbx;"
		"  mulxq 56(%1), %%r14, %%rdx;"
		"  adox %%r13, %%r14;"
		"  mov $0, %%rax;"
		"  adox %%rdx, %%rax;"

		/* Compute src1[1] * src2 */
		"  movq 40(%0), %%rdx;"
		"  mulxq 32(%1), %%r8, %%r9;"
		"  xor %%r10d, %%r10d;"
		"  adcxq 72(%2), %%r8;"
		"  movq %%r8, 72(%2);"
		"  mulxq 40(%1), %%r10, %%r11;"
		"  adox %%r9, %%r10;"
		"  adcx %%rbx, %%r10;"
		"  movq %%r10, 80(%2);"
		"  mulxq 48(%1), %%rbx, %%r13;"
		"  adox %%r11, %%rbx;"
		"  adcx %%r14, %%rbx;"
		"  mov $0, %%r8;"
		"  mulxq 56(%1), %%r14, %%rdx;"
		"  adox %%r13, %%r14;"
		"  adcx %%rax, %%r14;"
		"  mov $0, %%rax;"
		"  adox %%rdx, %%rax;"
		"  adcx %%r8, %%rax;"

		/* Compute src1[2] * src2 */
		"  movq 48(%0), %%rdx;"
		"  mulxq 32(%1), %%r8, %%r9;"
		"  xor %%r10d, %%r10d;"
		"  adcxq 80(%2), %%r8;"
		"  movq %%r8, 80(%2);"
		"  mulxq 40(%1), %%r10, %%r11;"
		"  adox %%r9, %%r10;"
		"  adcx %%rbx, %%r10;"
		"  movq %%r10, 88(%2);"
		"  mulxq 48(%1), %%rbx, %%r13;"
		"  adox %%r11, %%rbx;"
		"  adcx %%r14, %%rbx;"
		"  mov $0, %%r8;"
		"  mulxq 56(%1), %%r14, %%rdx;"
		"  adox %%r13, %%r14;"
		"  adcx %%rax, %%r14;"
		"  mov $0, %%rax;"
		"  adox %%rdx, %%rax;"
		"  adcx %%r8, %%rax;"

		/* Compute src1[3] * src2 */
		"  movq 56(%0), %%rdx;"
		"  mulxq 32(%1), %%r8, %%r9;"
		"  xor %%r10d, %%r10d;"
		"  adcxq 88(%2), %%r8;"
		"  movq %%r8, 88(%2);"
		"  mulxq 40(%1), %%r10, %%r11;"
		"  adox %%r9, %%r10;"
		"  adcx %%rbx, %%r10;"
		"  movq %%r10, 96(%2);"
		"  mulxq 48(%1), %%rbx, %%r13;"
		"  adox %%r11, %%rbx;"
		"  adcx %%r14, %%rbx;"
		"  movq %%rbx, 104(%2);"
		"  mov $0, %%r8;"
		"  mulxq 56(%1), %%r14, %%rdx;"
		"  adox %%r13, %%r14;"
		"  adcx %%rax, %%r14;"
		"  movq %%r14, 112(%2);"
		"  mov $0, %%rax;"
		"  adox %%rdx, %%rax;"
		"  adcx %%r8, %%rax;"
		"  movq %%rax, 120(%2);"

		/*
		 * Line up pointers for the reduction passes:
		 * %0 (was f1) now points at tmp, %2 (was tmp) at out.
		 * From here on, %1 is no longer f2; it is zeroed below and
		 * used purely as a zero register in the carry folds.
		 */
		"  mov %2, %0;"
		"  mov %3, %2;"

		/* Wrap the results back into the field (reduce mod 2^255-19,
		 * using 2^256 = 38 mod p) */

		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
		"  mov $38, %%rdx;"
		"  mulxq 32(%0), %%r8, %%r13;"
		/* xor %k1,%k1 zeroes %1 AND clears CF/OF for adcx/adox */
		"  xor %k1, %k1;"
		"  adoxq 0(%0), %%r8;"
		"  mulxq 40(%0), %%r9, %%rbx;"
		"  adcx %%r13, %%r9;"
		"  adoxq 8(%0), %%r9;"
		"  mulxq 48(%0), %%r10, %%r13;"
		"  adcx %%rbx, %%r10;"
		"  adoxq 16(%0), %%r10;"
		"  mulxq 56(%0), %%r11, %%rax;"
		"  adcx %%r13, %%r11;"
		"  adoxq 24(%0), %%r11;"
		/* collapse both carry chains into rax, then scale by 38 */
		"  adcx %1, %%rax;"
		"  adox %1, %%rax;"
		"  imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		"  add %%rax, %%r8;"
		"  adcx %1, %%r9;"
		"  movq %%r9, 8(%2);"
		"  adcx %1, %%r10;"
		"  movq %%r10, 16(%2);"
		"  adcx %1, %%r11;"
		"  movq %%r11, 24(%2);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %%rdx, %%rax;"
		"  add %%rax, %%r8;"
		"  movq %%r8, 0(%2);"

		/* Same reduction for the second product (tmp[8..15] -> out[4..7]) */

		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
		"  mov $38, %%rdx;"
		"  mulxq 96(%0), %%r8, %%r13;"
		"  xor %k1, %k1;"
		"  adoxq 64(%0), %%r8;"
		"  mulxq 104(%0), %%r9, %%rbx;"
		"  adcx %%r13, %%r9;"
		"  adoxq 72(%0), %%r9;"
		"  mulxq 112(%0), %%r10, %%r13;"
		"  adcx %%rbx, %%r10;"
		"  adoxq 80(%0), %%r10;"
		"  mulxq 120(%0), %%r11, %%rax;"
		"  adcx %%r13, %%r11;"
		"  adoxq 88(%0), %%r11;"
		"  adcx %1, %%rax;"
		"  adox %1, %%rax;"
		"  imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		"  add %%rax, %%r8;"
		"  adcx %1, %%r9;"
		"  movq %%r9, 40(%2);"
		"  adcx %1, %%r10;"
		"  movq %%r10, 48(%2);"
		"  adcx %1, %%r11;"
		"  movq %%r11, 56(%2);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %%rdx, %%rax;"
		"  add %%rax, %%r8;"
		"  movq %%r8, 32(%2);"
		: "+&r"(f1), "+&r"(f2), "+&r"(tmp)
		: "r"(out)
		: "%rax", "%rbx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r13",
		  "%r14", "memory", "cc");
}