in FourQ_ARM_NEON/ARM/fp2_1271_NEON.c [1530:1667]
void mul_truncate_a(uint32_t* a, uint32_t* b, uint32_t* c)
{ // 256-bit multiplication with truncation for the scalar decomposition
  // Outputs 64-bit value c = (uint64_t)((a * b) >> 256).
  //
  // Inputs:  a, read as eight 32-bit little-endian words a[0..7];
  //          b, of which only the seven words b[0..6] are read (b[7] is never loaded, so the
  //          result matches the formula above only when b < 2^224).
  // Output:  c[0], c[1] = 32-bit words 8 and 9 of the product. c is written once, after every
  //          load from a and b has been issued.
  //
  // Method: row-wise schoolbook multiplication, one row per word of b. Partial products that can
  // only affect words >= 10 are skipped. While row i is processed, the live product words are:
  //     [sp]      word i+1          r8..r12   words i+2 .. i+6
  //     [sp, #4]  word i+7          [sp, #8]  word i+8
  //
  // Instruction semantics: UMLAL lo, hi, x, y   ->  hi:lo = hi:lo + x*y
  //                        UMAAL lo, hi, x, y   ->  hi:lo = x*y + lo + hi
  //
  // The asm advances %0 and %2 through base-register writeback, so the pointers are passed as
  // read-write ("+r") operands on local copies; modifying an input-only operand is not permitted
  // by the compiler. %1 is never modified and is listed as "+r" only to keep operand numbering.
  // The stack pointer stays at the bottom of the 12-byte scratch area for as long as the area
  // holds live data, so an interrupt/signal frame pushed below sp cannot overwrite it.
    uint32_t *pa = a;
    uint32_t *pb = b;
    uint32_t *pc = c;

    asm volatile(
    "push  {r4-r12}             \n\t"
    "sub   r13, r13, #12        \n\t"  // Reserve three scratch words on the stack
    // ---- Row 0: b[0] * a[0..7] ----
    "ldm   %0!, {r3-r4}         \n\t"  // a[0], a[1]
    "ldr   r5, [%1, #0]         \n\t"  // b[0]
    "mov   r8, #0               \n\t"
    "umull r7, r6, r5, r3       \n\t"  // r7 = word 0 (not needed), r6 = carry into word 1
    "umlal r6, r8, r5, r4       \n\t"
    "ldm   %0!, {r3-r4}         \n\t"  // a[2], a[3]
    "mov   r9, #0               \n\t"
    "mov   r10, #0              \n\t"
    "str   r6, [r13]            \n\t"  // Scratch slot 0 <- word 1
    "umlal r8, r9, r5, r3       \n\t"
    "umlal r9, r10, r5, r4      \n\t"
    "ldm   %0!, {r3-r4}         \n\t"  // a[4], a[5]
    "mov   r11, #0              \n\t"
    "mov   r12, #0              \n\t"
    "umlal r10, r11, r5, r3     \n\t"
    "umlal r11, r12, r5, r4     \n\t"
    "ldm   %0!, {r3-r4}         \n\t"  // a[6], a[7]
    "mov   r6, #0               \n\t"
    "mov   r7, #0               \n\t"
    "sub   %0, %0, #32          \n\t"  // Rewind %0 to a[0]
    "umlal r12, r6, r5, r3      \n\t"
    "umlal r6, r7, r5, r4       \n\t"
    "str   r6, [r13, #4]        \n\t"  // Scratch slot 1 <- word 7
    "str   r7, [r13, #8]        \n\t"  // Scratch slot 2 <- word 8
    // ---- Row 1: b[1] * a[0..7] ----
    "ldm   %0!, {r3-r4}         \n\t"
    "ldr   r5, [%1, #4]         \n\t"  // b[1]
    "ldr   r6, [r13]            \n\t"  // word 1
    "mov   r7, #0               \n\t"
    "umlal r6, r7, r5, r3       \n\t"  // r6 = final word 1 (not needed), r7 = carry
    "umaal r7, r8, r5, r4       \n\t"
    "ldm   %0!, {r3-r4}         \n\t"
    "str   r7, [r13]            \n\t"  // Scratch slot 0 <- word 2
    "umaal r8, r9, r5, r3       \n\t"
    "umaal r9, r10, r5, r4      \n\t"
    "ldm   %0!, {r3-r4}         \n\t"
    "umaal r10, r11, r5, r3     \n\t"
    "umaal r11, r12, r5, r4     \n\t"
    "ldm   %0!, {r3-r4}         \n\t"
    "ldr   r6, [r13, #4]        \n\t"  // word 7
    "ldr   r7, [r13, #8]        \n\t"  // word 8
    "sub   %0, %0, #32          \n\t"
    "umaal r12, r6, r5, r3      \n\t"
    "umaal r6, r7, r5, r4       \n\t"
    "str   r6, [r13, #4]        \n\t"  // Scratch slot 1 <- word 8
    "str   r7, [r13, #8]        \n\t"  // Scratch slot 2 <- word 9
    // ---- Row 2: b[2] * a[0..7] ----
    "ldm   %0!, {r3-r4}         \n\t"
    "ldr   r5, [%1, #8]         \n\t"  // b[2]
    "ldr   r7, [r13]            \n\t"  // word 2
    "mov   r6, #0               \n\t"
    "umlal r7, r6, r5, r3       \n\t"
    "umaal r6, r8, r5, r4       \n\t"
    "ldm   %0!, {r3-r4}         \n\t"
    "str   r6, [r13]            \n\t"  // Scratch slot 0 <- word 3
    "umaal r8, r9, r5, r3       \n\t"
    "umaal r9, r10, r5, r4      \n\t"
    "ldm   %0!, {r3-r4}         \n\t"
    "umaal r10, r11, r5, r3     \n\t"
    "umaal r11, r12, r5, r4     \n\t"
    "ldm   %0!, {r3-r4}         \n\t"
    "ldr   r6, [r13, #4]        \n\t"  // word 8
    "ldr   r7, [r13, #8]        \n\t"  // word 9
    "sub   %0, %0, #32          \n\t"
    "umaal r12, r6, r5, r3      \n\t"
    "umaal r6, r7, r5, r4       \n\t"
    "str   r6, [r13, #4]        \n\t"  // Scratch slot 1 <- word 9
    "str   r7, [r13, #8]        \n\t"  // Scratch slot 2 <- word 10 (never read back)
    // ---- Row 3: b[3] * a[0..6]; a[7]*b[3] only reaches words >= 10 ----
    "ldm   %0!, {r3-r4}         \n\t"
    "ldr   r5, [%1, #12]        \n\t"  // b[3]
    "ldr   r6, [r13]            \n\t"  // word 3
    "mov   r7, #0               \n\t"
    "umlal r6, r7, r5, r3       \n\t"
    "umaal r7, r8, r5, r4       \n\t"
    "ldm   %0!, {r3-r4}         \n\t"
    "str   r7, [r13]            \n\t"  // Scratch slot 0 <- word 4
    "umaal r8, r9, r5, r3       \n\t"
    "umaal r9, r10, r5, r4      \n\t"
    "ldm   %0!, {r3-r4}         \n\t"
    "umaal r10, r11, r5, r3     \n\t"
    "umaal r11, r12, r5, r4     \n\t"
    "ldm   %0!, {r3-r4}         \n\t"
    "ldr   r6, [r13, #4]        \n\t"  // word 9; slot 2 is not reloaded, its value is unused
    "sub   %0, %0, #32          \n\t"
    "umaal r12, r6, r5, r3      \n\t"  // r12 = word 9, carry in r6 is dropped
    // ---- Row 4: b[4] * a[0..5] ----
    "ldm   %0!, {r3-r4}         \n\t"
    "ldr   r5, [%1, #16]        \n\t"  // b[4]
    "ldr   r7, [r13]            \n\t"  // word 4
    "mov   r6, #0               \n\t"
    "umlal r7, r6, r5, r3       \n\t"
    "umaal r6, r8, r5, r4       \n\t"
    "ldm   %0!, {r3-r4}         \n\t"
    "str   r6, [r13]            \n\t"  // Scratch slot 0 <- word 5
    "umaal r8, r9, r5, r3       \n\t"
    "umaal r9, r10, r5, r4      \n\t"
    "ldm   %0!, {r3-r4}         \n\t"
    "sub   %0, %0, #24          \n\t"  // Rewind %0 to a[0]
    "umaal r10, r11, r5, r3     \n\t"
    "umaal r11, r12, r5, r4     \n\t"  // r11 = word 9, carry in r12 is dropped
    // ---- Row 5: b[5] * a[0..4] ----
    "ldm   %0!, {r3-r4}         \n\t"
    "ldr   r5, [%1, #20]        \n\t"  // b[5]
    "ldr   r6, [r13], #12       \n\t"  // word 5; scratch area is no longer needed, release it
    "mov   r7, #0               \n\t"
    "umlal r6, r7, r5, r3       \n\t"
    "umaal r7, r8, r5, r4       \n\t"
    "ldm   %0!, {r3-r4}         \n\t"
    "umaal r8, r9, r5, r3       \n\t"
    "umaal r9, r10, r5, r4      \n\t"
    "ldm   %0!, {r3-r4}         \n\t"
    "sub   %0, %0, #24          \n\t"
    "umaal r10, r11, r5, r3     \n\t"  // r10 = word 9, carry in r11 is dropped
    // ---- Row 6: b[6] * a[0..3] ----
    "ldm   %0!, {r3-r4}         \n\t"  // a[0], a[1]
    "ldr   r5, [%1, #24]        \n\t"  // b[6]
    "ldm   %0!, {r11-r12}       \n\t"  // a[2], a[3]
    "mov   r6, #0               \n\t"
    "umlal r7, r6, r5, r3       \n\t"
    "umaal r6, r8, r5, r4       \n\t"
    "umaal r8, r9, r5, r11      \n\t"  // r8 = word 8
    "umaal r9, r10, r5, r12     \n\t"  // r9 = word 9
    "stm   %2!, {r8-r9}         \n\t"  // c[0] = word 8, c[1] = word 9
    "pop   {r4-r12}             \n\t"
    :"+r"(pa), "+r"(pb), "+r"(pc)
    :
    :"memory","r3","r4","r5","r6","r7","r8","r9","r10","r11","r12"
    );
    return;
}