in libvmaf/src/feature/x86/vif_avx512.c [759:1103]
/*
 * Vertical pass helper: load 32 bytes from each of two rows, zero-extend them
 * to 16 bits, interleave the rows pixel by pixel and multiply-accumulate with
 * `coeff_pair` (two 16-bit coefficients replicated in every 32-bit lane).
 * Each 32-bit lane therefore receives row_a[k] * c0 + row_b[k] * c1.
 * The unpack intrinsics work per 128-bit lane, so the sums land in *acc_lo and
 * *acc_hi in interleaved order; rd8_vert_finish() restores pixel order.
 */
static inline void rd8_vert_tap_pair(__m512i *acc_lo, __m512i *acc_hi,
                                     const uint8_t *row_a, const uint8_t *row_b,
                                     __m512i coeff_pair)
{
    const __m512i a = _mm512_cvtepu8_epi16(_mm256_loadu_si256((const __m256i *)row_a));
    const __m512i b = _mm512_cvtepu8_epi16(_mm256_loadu_si256((const __m256i *)row_b));

    *acc_lo = _mm512_add_epi32(*acc_lo,
                               _mm512_madd_epi16(_mm512_unpacklo_epi16(a, b), coeff_pair));
    *acc_hi = _mm512_add_epi32(*acc_hi,
                               _mm512_madd_epi16(_mm512_unpackhi_epi16(a, b), coeff_pair));
}

/*
 * Vertical pass helper: gather 16 consecutive 32-bit sums out of the
 * interleaved lo/hi accumulators (`order` holds the 64-bit element indices),
 * add the rounding `bias` and shift right by 8.
 */
static inline __m512i rd8_vert_finish(__m512i acc_lo, __m512i acc_hi,
                                      __m512i order, __m512i bias)
{
    const __m512i ordered = _mm512_permutex2var_epi64(acc_lo, order, acc_hi);

    return _mm512_srli_epi32(_mm512_add_epi32(bias, ordered), 0x08);
}

/*
 * Horizontal pass helper: load 64 bytes from `src`, multiply every 16-bit lane
 * by the broadcast coefficient `coeff`, widen the products to 32 bits by
 * pairing the low and high halves of each product, and add them to
 * *acc_lo / *acc_hi (unpacklo / unpackhi order within each 128-bit lane).
 */
static inline void rd8_hor_tap(__m512i *acc_lo, __m512i *acc_hi,
                               const void *src, __m512i coeff)
{
    const __m512i v = _mm512_loadu_si512(src);
    const __m512i prod_hi = _mm512_mulhi_epu16(v, coeff);
    const __m512i prod_lo = _mm512_mullo_epi16(v, coeff);

    *acc_lo = _mm512_add_epi32(*acc_lo, _mm512_unpacklo_epi16(prod_lo, prod_hi));
    *acc_hi = _mm512_add_epi32(*acc_hi, _mm512_unpackhi_epi16(prod_lo, prod_hi));
}

/*
 * Horizontal pass helper: add the rounding `bias`, shift right by 16 and use
 * `pick_words` to collect one 16-bit word per output sample from the two
 * accumulators. Only the low 256 bits (16 samples) are returned.
 */
static inline __m256i rd8_hor_finish(__m512i acc_lo, __m512i acc_hi,
                                     __m512i bias, __m512i pick_words)
{
    const __m512i lo = _mm512_srli_epi32(_mm512_add_epi32(acc_lo, bias), 0x10);
    const __m512i hi = _mm512_srli_epi32(_mm512_add_epi32(acc_hi, bias), 0x10);

    return _mm512_castsi512_si256(_mm512_permutex2var_epi16(lo, pick_words, hi));
}

/*
 * Filter the 8-bit ref and dis planes of `buf` with the separable kernel
 * vif_filter1d_table[1] (width vif_filter1d_width[1]), then call
 * decimate_and_pad(buf, w, h, 0).
 *
 * For every row i in [0, h):
 *   1. Vertical pass: rows i - fwidth/2 ... are combined per column and the
 *      result (sum + 128) >> 8 is written to buf.tmp.ref_convol and
 *      buf.tmp.dis_convol.
 *   2. PADDING_SQ_DATA_2 is invoked on the temporary row.
 *   3. Horizontal pass: the temporary row is filtered and
 *      (sum + 32768) >> 16 is written to buf.mu1 (from ref) and buf.mu2
 *      (from dis) at row offset i * (buf.stride_16 / sizeof(uint16_t)).
 *
 * Each pass handles the bulk of the row with AVX-512 (32 columns per step
 * vertically, 16 horizontally) and the remaining columns with scalar code.
 *
 * NOTE(review): the vector paths hard-code the kernel shape while the scalar
 * tails use `fwidth`. They only agree if fwidth == 9, the kernel is symmetric
 * (the horizontal path reuses coefficients 3..0 for taps 5..8) and
 * vif_filt_s1[9] == 0 (the vertical path multiplies a tenth row by it).
 * Confirm against the table definition.
 * NOTE(review): rows above 0 / below h - 1 and temporary-row indices below 0 /
 * above w - 1 are read; this presumably relies on padding provided by the
 * caller and by PADDING_SQ_DATA_2 -- verify.
 */
void vif_subsample_rd_8_avx512(VifBuffer buf, unsigned w, unsigned h)
{
    const unsigned fwidth = vif_filter1d_width[1];
    const uint16_t *taps = vif_filter1d_table[1];
    const uint8_t *ref_px = (uint8_t *)buf.ref;
    const uint8_t *dis_px = (uint8_t *)buf.dis;
    // Row pitch of mu1 / mu2 expressed in 16-bit elements.
    const ptrdiff_t out_stride = buf.stride_16 / sizeof(uint16_t);
    int fwidth_half = fwidth >> 1;

    // Rounding terms for the >> 8 (vertical) and >> 16 (horizontal) shifts.
    const __m512i round8 = _mm512_set1_epi32(128);
    const __m512i round16 = _mm512_set1_epi32(32768);

    // 16-bit word indices for _mm512_permutex2var_epi16, two per 32-bit lane
    // (upper index scaled by W). Indices 0..31 address the first operand and
    // 32..63 the second; every selected index is a multiple of 4, i.e. the low
    // word of every other 32-bit lane.
    const int W = 1 << 16;
    const __m512i pick_words = _mm512_set_epi32(
        60 * W + 56, 28 * W + 24, 52 * W + 48, 20 * W + 16,
        44 * W + 40, 12 * W + 8, 36 * W + 32, 4 * W + 0,
        60 * W + 56, 28 * W + 24, 52 * W + 48, 20 * W + 16,
        44 * W + 40, 12 * W + 8, 36 * W + 32, 4 * W + 0);

    // 64-bit element orders that undo the per-128-bit-lane interleave of the
    // vertical accumulators: first 16 columns, then the next 16.
    const __m512i order_first16 = _mm512_set_epi64(11, 10, 3, 2, 9, 8, 1, 0);
    const __m512i order_next16 = _mm512_set_epi64(15, 14, 7, 6, 13, 12, 5, 4);

    // Vertical coefficients, two adjacent taps per 32-bit lane.
    const __m512i vpair0 = _mm512_broadcastd_epi32(_mm_loadu_si128((const __m128i *)taps));
    const __m512i vpair1 = _mm512_broadcastd_epi32(_mm_loadu_si128((const __m128i *)(taps + 2)));
    const __m512i vpair2 = _mm512_broadcastd_epi32(_mm_loadu_si128((const __m128i *)(taps + 4)));
    const __m512i vpair3 = _mm512_broadcastd_epi32(_mm_loadu_si128((const __m128i *)(taps + 6)));
    const __m512i vpair4 = _mm512_broadcastd_epi32(_mm_loadu_si128((const __m128i *)(taps + 8)));

    // Horizontal coefficients, one tap replicated in every 16-bit lane.
    const __m512i hc0 = _mm512_broadcastw_epi16(_mm_loadu_si128((const __m128i *)taps));
    const __m512i hc1 = _mm512_broadcastw_epi16(_mm_loadu_si128((const __m128i *)(taps + 1)));
    const __m512i hc2 = _mm512_broadcastw_epi16(_mm_loadu_si128((const __m128i *)(taps + 2)));
    const __m512i hc3 = _mm512_broadcastw_epi16(_mm_loadu_si128((const __m128i *)(taps + 3)));
    const __m512i hc4 = _mm512_broadcastw_epi16(_mm_loadu_si128((const __m128i *)(taps + 4)));

    // Column bounds of the vectorised parts of each pass (loop invariant).
    const int blocks32 = w >> 5;
    const int vec32_end = blocks32 << 5;
    const int blocks16 = w >> 4;
    const int vec16_end = blocks16 << 4;

    for (unsigned i = 0; i < h; ++i)
    {
        // First input row touched by the kernel for output row i.
        const int top_row = i - fwidth_half;

        // ---- vertical pass, 32 columns per iteration ----
        for (int j = 0; j < vec32_end; j += 32)
        {
            const uint8_t *ref_col = ref_px + j;
            const uint8_t *dis_col = dis_px + j;
            __m512i ref_lo = _mm512_setzero_si512();
            __m512i ref_hi = _mm512_setzero_si512();
            __m512i dis_lo = _mm512_setzero_si512();
            __m512i dis_hi = _mm512_setzero_si512();

            rd8_vert_tap_pair(&dis_lo, &dis_hi,
                              dis_col + buf.stride * top_row,
                              dis_col + buf.stride * (top_row + 1), vpair0);
            rd8_vert_tap_pair(&dis_lo, &dis_hi,
                              dis_col + buf.stride * (top_row + 2),
                              dis_col + buf.stride * (top_row + 3), vpair1);
            rd8_vert_tap_pair(&dis_lo, &dis_hi,
                              dis_col + buf.stride * (top_row + 4),
                              dis_col + buf.stride * (top_row + 5), vpair2);
            rd8_vert_tap_pair(&dis_lo, &dis_hi,
                              dis_col + buf.stride * (top_row + 6),
                              dis_col + buf.stride * (top_row + 7), vpair3);
            rd8_vert_tap_pair(&dis_lo, &dis_hi,
                              dis_col + buf.stride * (top_row + 8),
                              dis_col + buf.stride * (top_row + 9), vpair4);

            rd8_vert_tap_pair(&ref_lo, &ref_hi,
                              ref_col + buf.stride * top_row,
                              ref_col + buf.stride * (top_row + 1), vpair0);
            rd8_vert_tap_pair(&ref_lo, &ref_hi,
                              ref_col + buf.stride * (top_row + 2),
                              ref_col + buf.stride * (top_row + 3), vpair1);
            rd8_vert_tap_pair(&ref_lo, &ref_hi,
                              ref_col + buf.stride * (top_row + 4),
                              ref_col + buf.stride * (top_row + 5), vpair2);
            rd8_vert_tap_pair(&ref_lo, &ref_hi,
                              ref_col + buf.stride * (top_row + 6),
                              ref_col + buf.stride * (top_row + 7), vpair3);
            rd8_vert_tap_pair(&ref_lo, &ref_hi,
                              ref_col + buf.stride * (top_row + 8),
                              ref_col + buf.stride * (top_row + 9), vpair4);

            _mm512_storeu_si512((__m512i *)(buf.tmp.ref_convol + j),
                                rd8_vert_finish(ref_lo, ref_hi, order_first16, round8));
            _mm512_storeu_si512((__m512i *)(buf.tmp.ref_convol + j + 16),
                                rd8_vert_finish(ref_lo, ref_hi, order_next16, round8));
            _mm512_storeu_si512((__m512i *)(buf.tmp.dis_convol + j),
                                rd8_vert_finish(dis_lo, dis_hi, order_first16, round8));
            _mm512_storeu_si512((__m512i *)(buf.tmp.dis_convol + j + 16),
                                rd8_vert_finish(dis_lo, dis_hi, order_next16, round8));
        }

        // ---- vertical pass, scalar tail for the remaining columns ----
        for (unsigned j = vec32_end; j < w; ++j)
        {
            uint32_t sum_ref = 0;
            uint32_t sum_dis = 0;
            for (unsigned fi = 0; fi < fwidth; ++fi)
            {
                const int row = top_row + fi;
                const uint16_t c = taps[fi];
                sum_ref += c * (uint32_t)ref_px[row * buf.stride + j];
                sum_dis += c * (uint32_t)dis_px[row * buf.stride + j];
            }
            buf.tmp.ref_convol[j] = (sum_ref + 128) >> 8;
            buf.tmp.dis_convol[j] = (sum_dis + 128) >> 8;
        }

        PADDING_SQ_DATA_2(buf, w, fwidth_half);

        // ---- horizontal pass, 16 columns per iteration ----
        // Each load covers 16 four-byte samples; only the low 16 bits of each
        // sample reach the output because pick_words keeps every other 32-bit
        // lane of the widened products.
        for (int j = 0; j < vec16_end; j += 16)
        {
            const int left = j - fwidth_half;
            __m512i ref_lo = _mm512_setzero_si512();
            __m512i ref_hi = _mm512_setzero_si512();
            __m512i dis_lo = _mm512_setzero_si512();
            __m512i dis_hi = _mm512_setzero_si512();

            rd8_hor_tap(&ref_lo, &ref_hi, buf.tmp.ref_convol + left, hc0);
            rd8_hor_tap(&ref_lo, &ref_hi, buf.tmp.ref_convol + left + 1, hc1);
            rd8_hor_tap(&ref_lo, &ref_hi, buf.tmp.ref_convol + left + 2, hc2);
            rd8_hor_tap(&ref_lo, &ref_hi, buf.tmp.ref_convol + left + 3, hc3);
            rd8_hor_tap(&ref_lo, &ref_hi, buf.tmp.ref_convol + left + 4, hc4);
            rd8_hor_tap(&ref_lo, &ref_hi, buf.tmp.ref_convol + left + 5, hc3);
            rd8_hor_tap(&ref_lo, &ref_hi, buf.tmp.ref_convol + left + 6, hc2);
            rd8_hor_tap(&ref_lo, &ref_hi, buf.tmp.ref_convol + left + 7, hc1);
            rd8_hor_tap(&ref_lo, &ref_hi, buf.tmp.ref_convol + left + 8, hc0);

            rd8_hor_tap(&dis_lo, &dis_hi, buf.tmp.dis_convol + left, hc0);
            rd8_hor_tap(&dis_lo, &dis_hi, buf.tmp.dis_convol + left + 1, hc1);
            rd8_hor_tap(&dis_lo, &dis_hi, buf.tmp.dis_convol + left + 2, hc2);
            rd8_hor_tap(&dis_lo, &dis_hi, buf.tmp.dis_convol + left + 3, hc3);
            rd8_hor_tap(&dis_lo, &dis_hi, buf.tmp.dis_convol + left + 4, hc4);
            rd8_hor_tap(&dis_lo, &dis_hi, buf.tmp.dis_convol + left + 5, hc3);
            rd8_hor_tap(&dis_lo, &dis_hi, buf.tmp.dis_convol + left + 6, hc2);
            rd8_hor_tap(&dis_lo, &dis_hi, buf.tmp.dis_convol + left + 7, hc1);
            rd8_hor_tap(&dis_lo, &dis_hi, buf.tmp.dis_convol + left + 8, hc0);

            _mm256_storeu_si256((__m256i *)(buf.mu1 + i * out_stride + j),
                                rd8_hor_finish(ref_lo, ref_hi, round16, pick_words));
            _mm256_storeu_si256((__m256i *)(buf.mu2 + i * out_stride + j),
                                rd8_hor_finish(dis_lo, dis_hi, round16, pick_words));
        }

        // ---- horizontal pass, scalar tail for the remaining columns ----
        for (unsigned j = vec16_end; j < w; ++j)
        {
            uint32_t sum_ref = 0;
            uint32_t sum_dis = 0;
            const int left = j - fwidth_half;
            for (unsigned fj = 0; fj < fwidth; ++fj)
            {
                const int col = left + fj;
                const uint16_t c = taps[fj];
                sum_ref += c * buf.tmp.ref_convol[col];
                sum_dis += c * buf.tmp.dis_convol[col];
            }
            buf.mu1[i * out_stride + j] = (uint16_t)((sum_ref + 32768) >> 16);
            buf.mu2[i * out_stride + j] = (uint16_t)((sum_dis + 32768) >> 16);
        }
    }

    decimate_and_pad(buf, w, h, 0);
}