void convolution_f32_avx_s_1d_h_sq_scanline_17()

in libvmaf/src/feature/common/convolution_avx.c [857:997]
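
Single-precision AVX routine that applies a 17-tap 1D horizontal convolution to the squared source samples of one scanline. The filter length is hard-coded to 17 (filter_width is ignored), and the work is split into two passes over the row: the first pass computes and stores the contribution of taps 0-8, the second pass adds the contribution of taps 9-16. Each loop iteration produces 8 output samples, written at an offset of 8 (the filter radius). Source loads are unaligned (_mm256_loadu_ps), while destination stores and re-loads are aligned (_mm256_store_ps / _mm256_load_ps), so dst must be 32-byte aligned; both loops step j by 8, so j_end is expected to be a multiple of 8.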


void convolution_f32_avx_s_1d_h_sq_scanline_17(const float * RESTRICT filter, int filter_width, const float * RESTRICT src, float * RESTRICT dst, int j_end)
{
    (void) filter_width;

	__m256 f0, f1, f2, f3, f4, f5, f6, f7, f8;

	// First pass: evaluate filter taps 0-8
	f0 = _mm256_broadcast_ss(filter + 0);
	f1 = _mm256_broadcast_ss(filter + 1);
	f2 = _mm256_broadcast_ss(filter + 2);
	f3 = _mm256_broadcast_ss(filter + 3);
	f4 = _mm256_broadcast_ss(filter + 4);
	f5 = _mm256_broadcast_ss(filter + 5);
	f6 = _mm256_broadcast_ss(filter + 6);
	f7 = _mm256_broadcast_ss(filter + 7);
	f8 = _mm256_broadcast_ss(filter + 8);

	for (int j = 0; j < j_end; j += 8) {
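		// Each iteration computes 8 output samples; sum0-sum3 are four
		// independent accumulator chains, reduced pairwise before the store.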
		__m256 accum = _mm256_setzero_ps();
		__m256 sum0, sum1, sum2, sum3;
		__m256 g;

		g = _mm256_loadu_ps(src + j + 0);
		g = _mm256_mul_ps(g, g);
		g = _mm256_mul_ps(f0, g);
		sum0 = g;

		g = _mm256_loadu_ps(src + j + 1);
		g = _mm256_mul_ps(g, g);
		g = _mm256_mul_ps(f1, g);
		sum1 = g;

		g = _mm256_loadu_ps(src + j + 2);
		g = _mm256_mul_ps(g, g);
		g = _mm256_mul_ps(f2, g);
		sum2 = g;

		g = _mm256_loadu_ps(src + j + 3);
		g = _mm256_mul_ps(g, g);
		g = _mm256_mul_ps(f3, g);
		sum3 = g;

		g = _mm256_loadu_ps(src + j + 4);
		g = _mm256_mul_ps(g, g);
		g = _mm256_mul_ps(f4, g);
		sum0 = _mm256_add_ps(sum0, g);

		g = _mm256_loadu_ps(src + j + 5);
		g = _mm256_mul_ps(g, g);
		g = _mm256_mul_ps(f5, g);
		sum1 = _mm256_add_ps(sum1, g);

		g = _mm256_loadu_ps(src + j + 6);
		g = _mm256_mul_ps(g, g);
		g = _mm256_mul_ps(f6, g);
		sum2 = _mm256_add_ps(sum2, g);

		g = _mm256_loadu_ps(src + j + 7);
		g = _mm256_mul_ps(g, g);
		g = _mm256_mul_ps(f7, g);
		sum3 = _mm256_add_ps(sum3, g);

		g = _mm256_loadu_ps(src + j + 8);
		g = _mm256_mul_ps(g, g);
		g = _mm256_mul_ps(f8, g);
		sum0 = _mm256_add_ps(sum0, g);

		sum0 = _mm256_add_ps(sum0, sum2);
		sum1 = _mm256_add_ps(sum1, sum3);

		sum0 = _mm256_add_ps(sum0, sum1);
		accum = _mm256_add_ps(accum, sum0);

		_mm256_store_ps(dst + j + 8, accum); // store partial sums, offset by the filter radius (8)
	}

	// Second pass: evaluate filter taps 9-16 and add them to the partial sums
	f0 = _mm256_broadcast_ss(filter + 9);
	f1 = _mm256_broadcast_ss(filter + 10);
	f2 = _mm256_broadcast_ss(filter + 11);
	f3 = _mm256_broadcast_ss(filter + 12);
	f4 = _mm256_broadcast_ss(filter + 13);
	f5 = _mm256_broadcast_ss(filter + 14);
	f6 = _mm256_broadcast_ss(filter + 15);
	f7 = _mm256_broadcast_ss(filter + 16);

	for (int j = 0; j < j_end; j += 8) {
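		// Same 8-wide loop as above; this pass adds the contribution of taps 9-16
		// to the partial sums already stored in dst by the first pass.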
		__m256 sum0, sum1, sum2, sum3;
		__m256 g;

		float *dst_ptr = dst + j + 8; // partial sums written by the first pass, offset by the radius (8)

		g = _mm256_loadu_ps(src + j + 9);
		g = _mm256_mul_ps(g, g);
		g = _mm256_mul_ps(f0, g);
		sum0 = g;

		g = _mm256_loadu_ps(src + j + 10);
		g = _mm256_mul_ps(g, g);
		g = _mm256_mul_ps(f1, g);
		sum1 = g;

		g = _mm256_loadu_ps(src + j + 11);
		g = _mm256_mul_ps(g, g);
		g = _mm256_mul_ps(f2, g);
		sum2 = g;

		g = _mm256_loadu_ps(src + j + 12);
		g = _mm256_mul_ps(g, g);
		g = _mm256_mul_ps(f3, g);
		sum3 = g;

		g = _mm256_loadu_ps(src + j + 13);
		g = _mm256_mul_ps(g, g);
		g = _mm256_mul_ps(f4, g);
		sum0 = _mm256_add_ps(sum0, g);

		g = _mm256_loadu_ps(src + j + 14);
		g = _mm256_mul_ps(g, g);
		g = _mm256_mul_ps(f5, g);
		sum1 = _mm256_add_ps(sum1, g);

		g = _mm256_loadu_ps(src + j + 15);
		g = _mm256_mul_ps(g, g);
		g = _mm256_mul_ps(f6, g);
		sum2 = _mm256_add_ps(sum2, g);

		g = _mm256_loadu_ps(src + j + 16);
		g = _mm256_mul_ps(g, g);
		g = _mm256_mul_ps(f7, g);
		sum3 = _mm256_add_ps(sum3, g);

		sum0 = _mm256_add_ps(sum0, sum2);
		sum1 = _mm256_add_ps(sum1, sum3);

		sum0 = _mm256_add_ps(sum0, sum1);

		// Read back the first-pass partial sum, add this pass's contribution, and store.
		sum0 = _mm256_add_ps(_mm256_load_ps(dst_ptr), sum0);
		_mm256_store_ps(dst_ptr, sum0);
	}
}
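
For reference, here is a scalar sketch of what the two vector passes compute together. This is a paraphrase of the listing above, not code from convolution_avx.c, and it assumes j_end is a multiple of 8 so that it covers the same output range as the vector loops.

/* Illustrative scalar reference (not part of the source file): each output
 * dst[j + 8] is the 17-tap filter applied to the squared samples
 * src[j] .. src[j + 16], i.e. offset by the filter radius of 8. */
static void convolution_f32_s_1d_h_sq_scanline_17_ref(const float *filter, const float *src, float *dst, int j_end)
{
	for (int j = 0; j < j_end; j++) {
		float accum = 0.0f;
		for (int k = 0; k < 17; k++) {
			accum += filter[k] * src[j + k] * src[j + k];
		}
		dst[j + 8] = accum;
	}
}

Splitting the 17 taps into passes of 9 and 8 presumably keeps each pass's working set (the broadcast taps plus the accumulators and the load register) within the 16 ymm registers AVX provides, and the four accumulator chains per pass let the multiplies overlap instead of forming one long dependency chain.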