/*
 * convolution_f32_avx_s_1d_v_sq_scanline_17()
 * from libvmaf/src/feature/common/convolution_avx.c [1268:1405]
 */
/* Load 8 aligned floats from p, square them element-wise, and scale by the
 * broadcast filter coefficient. This is the per-tap kernel shared by both
 * passes below; keeping it in one place preserves the exact operation
 * sequence (load, mul(g,g), mul(coeff,g)) of the hand-unrolled original. */
static inline __m256 conv_sq_tap_17(const float *p, __m256 coeff)
{
	__m256 v = _mm256_load_ps(p);
	v = _mm256_mul_ps(v, v);
	return _mm256_mul_ps(coeff, v);
}

/* Vertical 1-D convolution of the element-wise *squared* source samples with
 * a fixed 17-tap (radius 8) filter, producing one output scanline.
 *
 * filter       - at least 17 coefficients; filter_width is ignored because
 *                the tap count is hard-coded (hence the _17 suffix).
 * src          - points at the center row of the window; neighbor rows are
 *                reached via src_stride. Aligned loads are used, so rows
 *                are assumed 32-byte aligned -- caller contract.
 * dst          - output scanline; also assumed 32-byte aligned.
 * src_stride   - distance in floats between consecutive source rows.
 * j_end        - number of output columns; processed 8 at a time, so the
 *                buffers are assumed padded to a multiple of 8.
 *
 * The 17 taps are split into two passes (taps 0-8, then taps 9-16) so each
 * pass fits the available ymm registers; the second pass accumulates onto
 * the partial sums already stored in dst. Four interleaved accumulators are
 * reduced pairwise in the same order as the original unrolled code, keeping
 * the floating-point summation order (and thus the results) bit-identical.
 */
void convolution_f32_avx_s_1d_v_sq_scanline_17(const float * RESTRICT filter, int filter_width, const float * RESTRICT src, float * RESTRICT dst, int src_stride, int j_end)
{
	(void) filter_width;

	src -= 8 * src_stride; /* rewind to the topmost row of the window (radius = 8) */

	/* ---- First pass: taps 0-8, overwriting dst ---- */
	{
		const __m256 c0 = _mm256_broadcast_ss(filter + 0);
		const __m256 c1 = _mm256_broadcast_ss(filter + 1);
		const __m256 c2 = _mm256_broadcast_ss(filter + 2);
		const __m256 c3 = _mm256_broadcast_ss(filter + 3);
		const __m256 c4 = _mm256_broadcast_ss(filter + 4);
		const __m256 c5 = _mm256_broadcast_ss(filter + 5);
		const __m256 c6 = _mm256_broadcast_ss(filter + 6);
		const __m256 c7 = _mm256_broadcast_ss(filter + 7);
		const __m256 c8 = _mm256_broadcast_ss(filter + 8);

		for (int j = 0; j < j_end; j += 8) {
			__m256 acc0 = conv_sq_tap_17(src + 0 * src_stride + j, c0);
			__m256 acc1 = conv_sq_tap_17(src + 1 * src_stride + j, c1);
			__m256 acc2 = conv_sq_tap_17(src + 2 * src_stride + j, c2);
			__m256 acc3 = conv_sq_tap_17(src + 3 * src_stride + j, c3);

			acc0 = _mm256_add_ps(acc0, conv_sq_tap_17(src + 4 * src_stride + j, c4));
			acc1 = _mm256_add_ps(acc1, conv_sq_tap_17(src + 5 * src_stride + j, c5));
			acc2 = _mm256_add_ps(acc2, conv_sq_tap_17(src + 6 * src_stride + j, c6));
			acc3 = _mm256_add_ps(acc3, conv_sq_tap_17(src + 7 * src_stride + j, c7));
			acc0 = _mm256_add_ps(acc0, conv_sq_tap_17(src + 8 * src_stride + j, c8));

			/* Pairwise reduction, same order as the unrolled original. */
			acc0 = _mm256_add_ps(acc0, acc2);
			acc1 = _mm256_add_ps(acc1, acc3);
			_mm256_store_ps(dst + j, _mm256_add_ps(acc0, acc1));
		}
	}

	/* ---- Second pass: taps 9-16, accumulating onto dst ---- */
	{
		const __m256 c9  = _mm256_broadcast_ss(filter + 9);
		const __m256 c10 = _mm256_broadcast_ss(filter + 10);
		const __m256 c11 = _mm256_broadcast_ss(filter + 11);
		const __m256 c12 = _mm256_broadcast_ss(filter + 12);
		const __m256 c13 = _mm256_broadcast_ss(filter + 13);
		const __m256 c14 = _mm256_broadcast_ss(filter + 14);
		const __m256 c15 = _mm256_broadcast_ss(filter + 15);
		const __m256 c16 = _mm256_broadcast_ss(filter + 16);

		for (int j = 0; j < j_end; j += 8) {
			__m256 acc0 = conv_sq_tap_17(src + 9  * src_stride + j, c9);
			__m256 acc1 = conv_sq_tap_17(src + 10 * src_stride + j, c10);
			__m256 acc2 = conv_sq_tap_17(src + 11 * src_stride + j, c11);
			__m256 acc3 = conv_sq_tap_17(src + 12 * src_stride + j, c12);

			acc0 = _mm256_add_ps(acc0, conv_sq_tap_17(src + 13 * src_stride + j, c13));
			acc1 = _mm256_add_ps(acc1, conv_sq_tap_17(src + 14 * src_stride + j, c14));
			acc2 = _mm256_add_ps(acc2, conv_sq_tap_17(src + 15 * src_stride + j, c15));
			acc3 = _mm256_add_ps(acc3, conv_sq_tap_17(src + 16 * src_stride + j, c16));

			acc0 = _mm256_add_ps(acc0, acc2);
			acc1 = _mm256_add_ps(acc1, acc3);
			acc0 = _mm256_add_ps(acc0, acc1);

			/* Fold into the first-pass partial sums already in dst. */
			acc0 = _mm256_add_ps(_mm256_load_ps(dst + j), acc0);
			_mm256_store_ps(dst + j, acc0);
		}
	}
}