in libvmaf/src/feature/common/convolution_avx.c [857:997]
/*
 * Horizontal 17-tap convolution of the squared input over one scanline (AVX).
 *
 * For every j = 0, 8, 16, ... below j_end and every vector lane i in [0, 8):
 *
 *     dst[j + 8 + i] = sum over k in [0, 17) of filter[k] * src[j + i + k]^2
 *
 * The work is split into two sweeps over the scanline: the first evaluates
 * taps 0-8 and stores the partial result, the second evaluates taps 9-16 and
 * adds them onto what the first sweep stored.
 *
 * filter        - 17 coefficients; filter[0..16] are read.
 * filter_width  - unused; this specialization always uses 17 taps.
 * src           - input samples, read with unaligned loads. Each step of j
 *                 reads src[j .. j + 23].
 * dst           - output samples, accessed with aligned loads/stores at
 *                 dst + j + 8, so that address must be 32-byte aligned. Each
 *                 step of j writes dst[j + 8 .. j + 15].
 * j_end         - exclusive upper bound for j. j advances by 8, and the last
 *                 step always processes a full 8-lane vector even if j_end is
 *                 not a multiple of 8.
 */
void convolution_f32_avx_s_1d_h_sq_scanline_17(const float * RESTRICT filter, int filter_width, const float * RESTRICT src, float * RESTRICT dst, int j_end)
{
    (void) filter_width;

    /* Sweep 1: taps 0-8, result written straight to dst. */
    {
        const __m256 c0 = _mm256_broadcast_ss(filter + 0);
        const __m256 c1 = _mm256_broadcast_ss(filter + 1);
        const __m256 c2 = _mm256_broadcast_ss(filter + 2);
        const __m256 c3 = _mm256_broadcast_ss(filter + 3);
        const __m256 c4 = _mm256_broadcast_ss(filter + 4);
        const __m256 c5 = _mm256_broadcast_ss(filter + 5);
        const __m256 c6 = _mm256_broadcast_ss(filter + 6);
        const __m256 c7 = _mm256_broadcast_ss(filter + 7);
        const __m256 c8 = _mm256_broadcast_ss(filter + 8);

        for (int col = 0; col < j_end; col += 8) {
            const float *in = src + col;
            __m256 x;
            __m256 acc0, acc1, acc2, acc3; /* four independent partial sums */

            /* Taps 0-3 seed the four partial sums. */
            x = _mm256_loadu_ps(in + 0);
            acc0 = _mm256_mul_ps(c0, _mm256_mul_ps(x, x));
            x = _mm256_loadu_ps(in + 1);
            acc1 = _mm256_mul_ps(c1, _mm256_mul_ps(x, x));
            x = _mm256_loadu_ps(in + 2);
            acc2 = _mm256_mul_ps(c2, _mm256_mul_ps(x, x));
            x = _mm256_loadu_ps(in + 3);
            acc3 = _mm256_mul_ps(c3, _mm256_mul_ps(x, x));

            /* Taps 4-8 are distributed round-robin over the partial sums. */
            x = _mm256_loadu_ps(in + 4);
            acc0 = _mm256_add_ps(acc0, _mm256_mul_ps(c4, _mm256_mul_ps(x, x)));
            x = _mm256_loadu_ps(in + 5);
            acc1 = _mm256_add_ps(acc1, _mm256_mul_ps(c5, _mm256_mul_ps(x, x)));
            x = _mm256_loadu_ps(in + 6);
            acc2 = _mm256_add_ps(acc2, _mm256_mul_ps(c6, _mm256_mul_ps(x, x)));
            x = _mm256_loadu_ps(in + 7);
            acc3 = _mm256_add_ps(acc3, _mm256_mul_ps(c7, _mm256_mul_ps(x, x)));
            x = _mm256_loadu_ps(in + 8);
            acc0 = _mm256_add_ps(acc0, _mm256_mul_ps(c8, _mm256_mul_ps(x, x)));

            /* Reduce: (acc0 + acc2) + (acc1 + acc3). */
            acc0 = _mm256_add_ps(acc0, acc2);
            acc1 = _mm256_add_ps(acc1, acc3);
            acc0 = _mm256_add_ps(acc0, acc1);

            /*
             * Accumulate onto an explicit zero vector before the store. This
             * is not a pure no-op: under round-to-nearest it turns a -0.0 sum
             * into +0.0.
             */
            _mm256_store_ps(dst + col + 8, /* radius = 8 */
                            _mm256_add_ps(_mm256_setzero_ps(), acc0));
        }
    }

    /* Sweep 2: taps 9-16, accumulated onto the values stored by sweep 1. */
    {
        const __m256 c9  = _mm256_broadcast_ss(filter + 9);
        const __m256 c10 = _mm256_broadcast_ss(filter + 10);
        const __m256 c11 = _mm256_broadcast_ss(filter + 11);
        const __m256 c12 = _mm256_broadcast_ss(filter + 12);
        const __m256 c13 = _mm256_broadcast_ss(filter + 13);
        const __m256 c14 = _mm256_broadcast_ss(filter + 14);
        const __m256 c15 = _mm256_broadcast_ss(filter + 15);
        const __m256 c16 = _mm256_broadcast_ss(filter + 16);

        for (int col = 0; col < j_end; col += 8) {
            const float *in = src + col;
            float *out = dst + col + 8; /* radius = 8 */
            __m256 x;
            __m256 acc0, acc1, acc2, acc3;

            /* Taps 9-12 seed the four partial sums. */
            x = _mm256_loadu_ps(in + 9);
            acc0 = _mm256_mul_ps(c9, _mm256_mul_ps(x, x));
            x = _mm256_loadu_ps(in + 10);
            acc1 = _mm256_mul_ps(c10, _mm256_mul_ps(x, x));
            x = _mm256_loadu_ps(in + 11);
            acc2 = _mm256_mul_ps(c11, _mm256_mul_ps(x, x));
            x = _mm256_loadu_ps(in + 12);
            acc3 = _mm256_mul_ps(c12, _mm256_mul_ps(x, x));

            /* Taps 13-16 each extend one partial sum. */
            x = _mm256_loadu_ps(in + 13);
            acc0 = _mm256_add_ps(acc0, _mm256_mul_ps(c13, _mm256_mul_ps(x, x)));
            x = _mm256_loadu_ps(in + 14);
            acc1 = _mm256_add_ps(acc1, _mm256_mul_ps(c14, _mm256_mul_ps(x, x)));
            x = _mm256_loadu_ps(in + 15);
            acc2 = _mm256_add_ps(acc2, _mm256_mul_ps(c15, _mm256_mul_ps(x, x)));
            x = _mm256_loadu_ps(in + 16);
            acc3 = _mm256_add_ps(acc3, _mm256_mul_ps(c16, _mm256_mul_ps(x, x)));

            /* Reduce: (acc0 + acc2) + (acc1 + acc3). */
            acc0 = _mm256_add_ps(acc0, acc2);
            acc1 = _mm256_add_ps(acc1, acc3);
            acc0 = _mm256_add_ps(acc0, acc1);

            /* Read-modify-write: add onto the partial result from sweep 1. */
            _mm256_store_ps(out, _mm256_add_ps(_mm256_load_ps(out), acc0));
        }
    }
}