static inline float inner_product_single()

in subprojects/speex/resample_neon.h [142:200]


static inline float inner_product_single(const float *a, const float *b, unsigned int len)
{
    float ret;
    uint32_t remainder = len % 16;
    len = len - remainder;

    asm volatile ("	 cmp %[len], #0\n"
		  "	 bne 1f\n"
		  "	 vld1.32 {q4}, [%[b]]!\n"
		  "	 vld1.32 {q8}, [%[a]]!\n"
		  "	 subs %[remainder], %[remainder], #4\n"
		  "	 vmul.f32 q0, q4, q8\n"
		  "      bne 4f\n"
		  "	 b 5f\n"
		  "1:"
		  "	 vld1.32 {q4, q5}, [%[b]]!\n"
		  "	 vld1.32 {q8, q9}, [%[a]]!\n"
		  "	 vld1.32 {q6, q7}, [%[b]]!\n"
		  "	 vld1.32 {q10, q11}, [%[a]]!\n"
		  "	 subs %[len], %[len], #16\n"
		  "	 vmul.f32 q0, q4, q8\n"
		  "	 vmul.f32 q1, q5, q9\n"
		  "	 vmul.f32 q2, q6, q10\n"
		  "	 vmul.f32 q3, q7, q11\n"
		  "	 beq 3f\n"
		  "2:"
		  "	 vld1.32 {q4, q5}, [%[b]]!\n"
		  "	 vld1.32 {q8, q9}, [%[a]]!\n"
		  "	 vld1.32 {q6, q7}, [%[b]]!\n"
		  "	 vld1.32 {q10, q11}, [%[a]]!\n"
		  "	 subs %[len], %[len], #16\n"
		  "	 vmla.f32 q0, q4, q8\n"
		  "	 vmla.f32 q1, q5, q9\n"
		  "	 vmla.f32 q2, q6, q10\n"
		  "	 vmla.f32 q3, q7, q11\n"
		  "	 bne 2b\n"
		  "3:"
		  "	 vadd.f32 q4, q0, q1\n"
		  "	 vadd.f32 q5, q2, q3\n"
		  "	 cmp %[remainder], #0\n"
		  "	 vadd.f32 q0, q4, q5\n"
		  "	 beq 5f\n"
		  "4:"
		  "	 vld1.32 {q6}, [%[b]]!\n"
		  "	 vld1.32 {q10}, [%[a]]!\n"
		  "	 subs %[remainder], %[remainder], #4\n"
		  "	 vmla.f32 q0, q6, q10\n"
		  "	 bne 4b\n"
		  "5:"
		  "	 vadd.f32 d0, d0, d1\n"
		  "	 vpadd.f32 d0, d0, d0\n"
		  "	 vmov.f32 %[ret], d0[0]\n"
		  : [ret] "=&r" (ret), [a] "+r" (a), [b] "+r" (b),
		    [len] "+l" (len), [remainder] "+l" (remainder)
		  :
		  : "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
                    "q9", "q10", "q11");
    return ret;
}