in crypto/curve25519-x86_64.c [781:973]
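/*
 * fsqr2 squares two field elements at once: f[0..3] and f[4..7] each hold a
 * 256-bit value in four 64-bit limbs.  Each square is first computed as a
 * full 512-bit product in tmp, then folded back to four limbs using
 * 2^256 == 38 (mod 2^255 - 19), so out[0..3] and out[4..7] end up congruent
 * to the squares (below 2^256, not necessarily fully reduced).
 */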
static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
{
asm volatile(
/* Step 1: Compute all partial products */
" movq 0(%0), %%rdx;" /* f[0] */
" mulxq 8(%0), %%r8, %%r14;"
" xor %%r15d, %%r15d;" /* f[1]*f[0] */
" mulxq 16(%0), %%r9, %%r10;"
" adcx %%r14, %%r9;" /* f[2]*f[0] */
" mulxq 24(%0), %%rax, %%rcx;"
" adcx %%rax, %%r10;" /* f[3]*f[0] */
" movq 24(%0), %%rdx;" /* f[3] */
" mulxq 8(%0), %%r11, %%rbx;"
" adcx %%rcx, %%r11;" /* f[1]*f[3] */
" mulxq 16(%0), %%rax, %%r13;"
" adcx %%rax, %%rbx;" /* f[2]*f[3] */
" movq 8(%0), %%rdx;"
" adcx %%r15, %%r13;" /* f1 */
" mulxq 16(%0), %%rax, %%rcx;"
" mov $0, %%r14;" /* f[2]*f[1] */
/* Step 2: Compute two parallel carry chains */
" xor %%r15d, %%r15d;"
" adox %%rax, %%r10;"
" adcx %%r8, %%r8;"
" adox %%rcx, %%r11;"
" adcx %%r9, %%r9;"
" adox %%r15, %%rbx;"
" adcx %%r10, %%r10;"
" adox %%r15, %%r13;"
" adcx %%r11, %%r11;"
" adox %%r15, %%r14;"
" adcx %%rbx, %%rbx;"
" adcx %%r13, %%r13;"
" adcx %%r14, %%r14;"
/* Step 3: Compute intermediate squares */
" movq 0(%0), %%rdx;"
" mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
" movq %%rax, 0(%1);"
" add %%rcx, %%r8;"
" movq %%r8, 8(%1);"
" movq 8(%0), %%rdx;"
" mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
" adcx %%rax, %%r9;"
" movq %%r9, 16(%1);"
" adcx %%rcx, %%r10;"
" movq %%r10, 24(%1);"
" movq 16(%0), %%rdx;"
" mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
" adcx %%rax, %%r11;"
" movq %%r11, 32(%1);"
" adcx %%rcx, %%rbx;"
" movq %%rbx, 40(%1);"
" movq 24(%0), %%rdx;"
" mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
" adcx %%rax, %%r13;"
" movq %%r13, 48(%1);"
" adcx %%rcx, %%r14;"
" movq %%r14, 56(%1);"
/* Step 1: Compute all partial products of the second element (f[4..7]) */
" movq 32(%0), %%rdx;" /* rdx = f[4] */
" mulxq 40(%0), %%r8, %%r14;" /* f[5]*f[4] */
" xor %%r15d, %%r15d;" /* zero %r15 and clear CF/OF */
" mulxq 48(%0), %%r9, %%r10;" /* f[6]*f[4] */
" adcx %%r14, %%r9;"
" mulxq 56(%0), %%rax, %%rcx;" /* f[7]*f[4] */
" adcx %%rax, %%r10;"
" movq 56(%0), %%rdx;" /* rdx = f[7] */
" mulxq 40(%0), %%r11, %%rbx;" /* f[5]*f[7] */
" adcx %%rcx, %%r11;"
" mulxq 48(%0), %%rax, %%r13;" /* f[6]*f[7] */
" adcx %%rax, %%rbx;"
" movq 40(%0), %%rdx;" /* rdx = f[5] */
" adcx %%r15, %%r13;"
" mulxq 48(%0), %%rax, %%rcx;" /* f[6]*f[5] */
" mov $0, %%r14;"
/* Step 2: Compute two parallel carry chains */
" xor %%r15d, %%r15d;"
" adox %%rax, %%r10;"
" adcx %%r8, %%r8;"
" adox %%rcx, %%r11;"
" adcx %%r9, %%r9;"
" adox %%r15, %%rbx;"
" adcx %%r10, %%r10;"
" adox %%r15, %%r13;"
" adcx %%r11, %%r11;"
" adox %%r15, %%r14;"
" adcx %%rbx, %%rbx;"
" adcx %%r13, %%r13;"
" adcx %%r14, %%r14;"
/* Step 3: Compute intermediate squares */
" movq 32(%0), %%rdx;"
" mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
" movq %%rax, 64(%1);"
" add %%rcx, %%r8;"
" movq %%r8, 72(%1);"
" movq 40(%0), %%rdx;"
" mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
" adcx %%rax, %%r9;"
" movq %%r9, 80(%1);"
" adcx %%rcx, %%r10;"
" movq %%r10, 88(%1);"
" movq 48(%0), %%rdx;"
" mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
" adcx %%rax, %%r11;"
" movq %%r11, 96(%1);"
" adcx %%rcx, %%rbx;"
" movq %%rbx, 104(%1);"
" movq 56(%0), %%rdx;"
" mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
" adcx %%rax, %%r13;"
" movq %%r13, 112(%1);"
" adcx %%rcx, %%r14;"
" movq %%r14, 120(%1);"
/* Line up pointers: %0 now points at tmp (the wide products), %1 at out */
" mov %1, %0;"
" mov %2, %1;"
/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
" mov $38, %%rdx;"
" mulxq 32(%0), %%r8, %%r13;"
" xor %%ecx, %%ecx;"
" adoxq 0(%0), %%r8;"
" mulxq 40(%0), %%r9, %%rbx;"
" adcx %%r13, %%r9;"
" adoxq 8(%0), %%r9;"
" mulxq 48(%0), %%r10, %%r13;"
" adcx %%rbx, %%r10;"
" adoxq 16(%0), %%r10;"
" mulxq 56(%0), %%r11, %%rax;"
" adcx %%r13, %%r11;"
" adoxq 24(%0), %%r11;"
" adcx %%rcx, %%rax;"
" adox %%rcx, %%rax;"
" imul %%rdx, %%rax;"
/* Step 2: Fold the carry back into dst */
" add %%rax, %%r8;"
" adcx %%rcx, %%r9;"
" movq %%r9, 8(%1);"
" adcx %%rcx, %%r10;"
" movq %%r10, 16(%1);"
" adcx %%rcx, %%r11;"
" movq %%r11, 24(%1);"
/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
" mov $0, %%rax;"
" cmovc %%rdx, %%rax;"
" add %%rax, %%r8;"
" movq %%r8, 0(%1);"
/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
" mov $38, %%rdx;"
" mulxq 96(%0), %%r8, %%r13;"
" xor %%ecx, %%ecx;"
" adoxq 64(%0), %%r8;"
" mulxq 104(%0), %%r9, %%rbx;"
" adcx %%r13, %%r9;"
" adoxq 72(%0), %%r9;"
" mulxq 112(%0), %%r10, %%r13;"
" adcx %%rbx, %%r10;"
" adoxq 80(%0), %%r10;"
" mulxq 120(%0), %%r11, %%rax;"
" adcx %%r13, %%r11;"
" adoxq 88(%0), %%r11;"
" adcx %%rcx, %%rax;"
" adox %%rcx, %%rax;"
" imul %%rdx, %%rax;"
/* Step 2: Fold the carry back into dst */
" add %%rax, %%r8;"
" adcx %%rcx, %%r9;"
" movq %%r9, 40(%1);"
" adcx %%rcx, %%r10;"
" movq %%r10, 48(%1);"
" adcx %%rcx, %%r11;"
" movq %%r11, 56(%1);"
/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
" mov $0, %%rax;"
" cmovc %%rdx, %%rax;"
" add %%rax, %%r8;"
" movq %%r8, 32(%1);"
: "+&r"(f), "+&r"(tmp)
: "r"(out)
: "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11",
"%r13", "%r14", "%r15", "memory", "cc");
}
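
For reference, each of the two fold blocks above performs the reduction sketched below in portable C: the high 256 bits of a 512-bit square are worth 38 times their value modulo 2^255 - 19, so they are multiplied by 38 and added into the low 256 bits, and the remaining carry is folded in once more. This sketch is an editorial illustration, not part of the kernel file; the helper name fold_wide_sketch and the use of unsigned __int128 are assumptions made here for clarity.

#include <stdint.h>

/* Illustration only: mirrors the "fold" reduction used by fsqr2 above. */
static void fold_wide_sketch(uint64_t out[4], const uint64_t wide[8])
{
	unsigned __int128 acc = 0;
	uint64_t carry;
	int i;

	/* out = wide[0..3] + 38 * wide[4..7], carrying through acc */
	for (i = 0; i < 4; i++) {
		acc += (unsigned __int128)38 * wide[4 + i] + wide[i];
		out[i] = (uint64_t)acc;
		acc >>= 64;
	}

	/* Everything above 2^256 is again worth 38 times its value; add it back */
	carry = (uint64_t)acc * 38;
	for (i = 0; i < 4; i++) {
		unsigned __int128 t = (unsigned __int128)out[i] + carry;

		out[i] = (uint64_t)t;
		carry = (uint64_t)(t >> 64);
	}

	/* At most one carry bit remains; folding it in cannot overflow again */
	out[0] += carry * 38;
}

Like the assembly, this leaves the result below 2^256 and congruent to the input modulo 2^255 - 19 rather than fully reduced.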