in libvmaf/src/feature/x86/vif_avx2.c [1098:1384]
/*
 * vif_subsample_rd_8_avx2 -- combined 9-tap filter + 2x decimation of the
 * 8-bit ref/dis planes, producing the uint16 mu1/mu2 planes for the next
 * VIF scale.
 *
 * For each output row i (h/2 rows in total):
 *   1) VERTICAL:   convolve input rows 2*i-4 .. 2*i+4 with the symmetric
 *      scale-1 filter vif_filt_s1, round (+128, >>8) and store 32-bit
 *      intermediates into buf.tmp.ref_convol / buf.tmp.dis_convol.
 *   2) HORIZONTAL: convolve those intermediates with the same filter,
 *      round (+32768, >>16), keep only the even columns (2x horizontal
 *      decimation) and store uint16 results into buf.mu1 / buf.mu2.
 *
 * NOTE(review): the vectorized paths hard-code 9 taps (fcoeff0..fcoeff4
 * with mirrored reuse), which vif_filter1d_width[1] is assumed to
 * guarantee -- confirm against the filter tables.
 * NOTE(review): negative row/column indices (ii_check near the top of the
 * image, jj_check at the left edge) are assumed to land in padding set up
 * elsewhere (plane padding for the vertical pass, PADDING_SQ_DATA_2 for
 * the horizontal pass) -- confirm against the buffer setup code.
 */
void vif_subsample_rd_8_avx2(VifBuffer buf, unsigned w, unsigned h) {
const unsigned fwidth = vif_filter1d_width[1];
const uint16_t *vif_filt_s1 = vif_filter1d_table[1];
const uint8_t *ref = (uint8_t *)buf.ref;
const uint8_t *dis = (uint8_t *)buf.dis;
const ptrdiff_t stride = buf.stride_16 / sizeof(uint16_t);  /* element stride of the uint16 mu planes */
__m256i addnum = _mm256_set1_epi32(32768);  /* 0.5 in Q16: rounding term for the horizontal >>16 */
__m256i mask1 = _mm256_set_epi32(6, 4, 2, 0, 6, 4, 2, 0);  /* selects even 32-bit elements (2x decimation) */
__m256i x = _mm256_set1_epi32(128);  /* 0.5 in Q8: rounding term for the vertical >>8 */
int fwidth_half = fwidth >> 1;
/* Broadcast the 5 distinct taps of the symmetric 9-tap filter. */
__m256i fcoeff0 = _mm256_set1_epi16(vif_filt_s1[0]);
__m256i fcoeff1 = _mm256_set1_epi16(vif_filt_s1[1]);
__m256i fcoeff2 = _mm256_set1_epi16(vif_filt_s1[2]);
__m256i fcoeff3 = _mm256_set1_epi16(vif_filt_s1[3]);
__m256i fcoeff4 = _mm256_set1_epi16(vif_filt_s1[4]);
for (unsigned i = 0; i < h / 2; i ++) {
// VERTICAL
/* Main loop: 16 pixels per iteration. */
unsigned n = w >> 4;
for (unsigned j = 0; j < n << 4; j = j + 16) {
int ii = i * 2 - fwidth_half;  /* topmost source row of the 9-row window (may be negative; see padding note) */
int ii_check = ii;
__m256i accum_mu1_lo, accum_mu1_hi;
__m256i accum_mu2_lo, accum_mu2_hi;
__m256i g0, g1, g2, g3, g4, g5, g6, g7, g8;
__m256i s0, s1, s2, s3, s4, s5, s6, s7, s8;
/* Load 16 pixels from each of the 9 rows of ref (g*) and dis (s*),
 * widened u8 -> u16. */
g0 = _mm256_cvtepu8_epi16(_mm_loadu_si128(
(__m128i *)(ref + (buf.stride * ii_check) + j)));
g1 = _mm256_cvtepu8_epi16(_mm_loadu_si128(
(__m128i *)(ref + buf.stride * (ii_check + 1) + j)));
g2 = _mm256_cvtepu8_epi16(_mm_loadu_si128(
(__m128i *)(ref + buf.stride * (ii_check + 2) + j)));
g3 = _mm256_cvtepu8_epi16(_mm_loadu_si128(
(__m128i *)(ref + buf.stride * (ii_check + 3) + j)));
g4 = _mm256_cvtepu8_epi16(_mm_loadu_si128(
(__m128i *)(ref + buf.stride * (ii_check + 4) + j)));
g5 = _mm256_cvtepu8_epi16(_mm_loadu_si128(
(__m128i *)(ref + buf.stride * (ii_check + 5) + j)));
g6 = _mm256_cvtepu8_epi16(_mm_loadu_si128(
(__m128i *)(ref + buf.stride * (ii_check + 6) + j)));
g7 = _mm256_cvtepu8_epi16(_mm_loadu_si128(
(__m128i *)(ref + buf.stride * (ii_check + 7) + j)));
g8 = _mm256_cvtepu8_epi16(_mm_loadu_si128(
(__m128i *)(ref + buf.stride * (ii_check + 8) + j)));
s0 = _mm256_cvtepu8_epi16(_mm_loadu_si128(
(__m128i *)(dis + (buf.stride * ii_check) + j)));
s1 = _mm256_cvtepu8_epi16(_mm_loadu_si128(
(__m128i *)(dis + buf.stride * (ii_check + 1) + j)));
s2 = _mm256_cvtepu8_epi16(_mm_loadu_si128(
(__m128i *)(dis + buf.stride * (ii_check + 2) + j)));
s3 = _mm256_cvtepu8_epi16(_mm_loadu_si128(
(__m128i *)(dis + buf.stride * (ii_check + 3) + j)));
s4 = _mm256_cvtepu8_epi16(_mm_loadu_si128(
(__m128i *)(dis + buf.stride * (ii_check + 4) + j)));
s5 = _mm256_cvtepu8_epi16(_mm_loadu_si128(
(__m128i *)(dis + buf.stride * (ii_check + 5) + j)));
s6 = _mm256_cvtepu8_epi16(_mm_loadu_si128(
(__m128i *)(dis + buf.stride * (ii_check + 6) + j)));
s7 = _mm256_cvtepu8_epi16(_mm_loadu_si128(
(__m128i *)(dis + buf.stride * (ii_check + 7) + j)));
s8 = _mm256_cvtepu8_epi16(_mm_loadu_si128(
(__m128i *)(dis + buf.stride * (ii_check + 8) + j)));
/* Apply the symmetric vertical filter: the project macros
 * multiply2/multiply2_and_accumulate accumulate coeff*sample into
 * 32-bit lo/hi accumulator halves (presumably via mullo/mulhi --
 * see the macro definitions earlier in this file). Mirrored tap
 * pairs share a coefficient: (0,8), (1,7), (2,6), (3,5); tap 4 is
 * the center. */
multiply2(accum_mu2_lo, accum_mu2_hi, s4, fcoeff4);
multiply2_and_accumulate(accum_mu2_lo, accum_mu2_hi, s0, s8, fcoeff0);
multiply2_and_accumulate(accum_mu2_lo, accum_mu2_hi, s1, s7, fcoeff1);
multiply2_and_accumulate(accum_mu2_lo, accum_mu2_hi, s2, s6, fcoeff2);
multiply2_and_accumulate(accum_mu2_lo, accum_mu2_hi, s3, s5, fcoeff3);
multiply2(accum_mu1_lo, accum_mu1_hi, g4, fcoeff4);
multiply2_and_accumulate(accum_mu1_lo, accum_mu1_hi, g0, g8, fcoeff0);
multiply2_and_accumulate(accum_mu1_lo, accum_mu1_hi, g1, g7, fcoeff1);
multiply2_and_accumulate(accum_mu1_lo, accum_mu1_hi, g2, g6, fcoeff2);
multiply2_and_accumulate(accum_mu1_lo, accum_mu1_hi, g3, g5, fcoeff3);
/* Re-interleave the per-lane lo/hi accumulators into sequential
 * element order (0x20 = low 128-bit halves, 0x31 = high halves)
 * and add the Q8 rounding term. */
__m256i accumu1_lo = _mm256_add_epi32(
x, _mm256_permute2x128_si256(accum_mu1_lo, accum_mu1_hi, 0x20));
__m256i accumu1_hi = _mm256_add_epi32(
x, _mm256_permute2x128_si256(accum_mu1_lo, accum_mu1_hi, 0x31));
__m256i accumu2_lo = _mm256_add_epi32(
x, _mm256_permute2x128_si256(accum_mu2_lo, accum_mu2_hi, 0x20));
__m256i accumu2_hi = _mm256_add_epi32(
x, _mm256_permute2x128_si256(accum_mu2_lo, accum_mu2_hi, 0x31));
/* >>8: vertical-pass normalization, matching the scalar tail below. */
accumu1_lo = _mm256_srli_epi32(accumu1_lo, 0x08);
accumu1_hi = _mm256_srli_epi32(accumu1_hi, 0x08);
accumu2_lo = _mm256_srli_epi32(accumu2_lo, 0x08);
accumu2_hi = _mm256_srli_epi32(accumu2_hi, 0x08);
/* Store 16 x 32-bit intermediates per plane for the horizontal pass. */
_mm256_storeu_si256((__m256i *)(buf.tmp.ref_convol + j),
accumu1_lo);
_mm256_storeu_si256((__m256i *)(buf.tmp.ref_convol + j + 8),
accumu1_hi);
_mm256_storeu_si256((__m256i *)(buf.tmp.dis_convol + j),
accumu2_lo);
_mm256_storeu_si256((__m256i *)(buf.tmp.dis_convol + j + 8),
accumu2_hi);
}
/* Scalar tail: remaining columns when w is not a multiple of 16.
 * Same computation as the vector loop, using the generic fwidth. */
for (unsigned j = n << 4; j < w; ++j) {
uint32_t accum_ref = 0;
uint32_t accum_dis = 0;
for (unsigned fi = 0; fi < fwidth; ++fi) {
int ii = i * 2 - fwidth_half;
int ii_check = ii + fi;
const uint16_t fcoeff = vif_filt_s1[fi];
const uint8_t *ref = (uint8_t *)buf.ref;  /* shadows the outer ref/dis on purpose? identical values */
const uint8_t *dis = (uint8_t *)buf.dis;
accum_ref += fcoeff * (uint32_t)ref[ii_check * buf.stride + j];
accum_dis += fcoeff * (uint32_t)dis[ii_check * buf.stride + j];
}
buf.tmp.ref_convol[j] = (accum_ref + 128) >> 8;
buf.tmp.dis_convol[j] = (accum_dis + 128) >> 8;
}
/* Pad the tmp buffers by fwidth_half elements on each side so the
 * horizontal pass can read past the row ends (project macro --
 * presumably mirror padding; see its definition). */
PADDING_SQ_DATA_2(buf, w, fwidth_half);
// HORIZONTAL
/* Main loop: 8 input columns -> 4 decimated outputs per iteration. */
n = w >> 3;
for (unsigned j = 0; j < n << 3; j = j + 8) {
int jj = j - fwidth_half;  /* leftmost source column of the 9-wide window (may be negative; padded above) */
int jj_check = jj;
__m256i accumrlo, accumdlo, accumrhi, accumdhi;
accumrlo = accumdlo = accumrhi = accumdhi = _mm256_setzero_si256();
/* Three overlapping loads 4 elements (= one 128-bit lane) apart;
 * per-lane alignr by 4/8/12 bytes then yields vectors shifted by
 * 1..3 elements, giving all 9 tap positions without extra loads. */
__m256i refconvol0 = _mm256_loadu_si256((__m256i *)(buf.tmp.ref_convol + jj_check));
__m256i refconvol4 = _mm256_loadu_si256((__m256i*)(buf.tmp.ref_convol + jj_check + 4));
__m256i refconvol8 = _mm256_loadu_si256((__m256i*)(buf.tmp.ref_convol + jj_check + 8));
__m256i refconvol1 = _mm256_alignr_epi8(refconvol4, refconvol0, 4);
__m256i refconvol2 = _mm256_alignr_epi8(refconvol4, refconvol0, 8);
__m256i refconvol3 = _mm256_alignr_epi8(refconvol4, refconvol0, 12);
__m256i refconvol5 = _mm256_alignr_epi8(refconvol8, refconvol4, 4);
__m256i refconvol6 = _mm256_alignr_epi8(refconvol8, refconvol4, 8);
__m256i refconvol7 = _mm256_alignr_epi8(refconvol8, refconvol4, 12);
/* For each tap: the 32-bit intermediates are assumed to fit in 16
 * bits (8-bit source, filter gain, then >>8 -- NOTE(review):
 * relies on the tap sum being <= 2^16; confirm against the filter
 * table), so a 16-bit mullo/mulhi pair re-assembled with
 * unpacklo/unpackhi reconstructs the full 32-bit product, which is
 * accumulated in epi32. Taps reuse coefficients symmetrically:
 * fcoeff0..4 then fcoeff3..0. */
__m256i result2 = _mm256_mulhi_epu16(refconvol0, fcoeff0);
__m256i result2lo = _mm256_mullo_epi16(refconvol0, fcoeff0);
accumrlo = _mm256_add_epi32(
accumrlo, _mm256_unpacklo_epi16(result2lo, result2));
accumrhi = _mm256_add_epi32(
accumrhi, _mm256_unpackhi_epi16(result2lo, result2));
__m256i result3 = _mm256_mulhi_epu16(refconvol1, fcoeff1);
__m256i result3lo = _mm256_mullo_epi16(refconvol1, fcoeff1);
accumrlo = _mm256_add_epi32(
accumrlo, _mm256_unpacklo_epi16(result3lo, result3));
accumrhi = _mm256_add_epi32(
accumrhi, _mm256_unpackhi_epi16(result3lo, result3));
__m256i result4 = _mm256_mulhi_epu16(refconvol2, fcoeff2);
__m256i result4lo = _mm256_mullo_epi16(refconvol2, fcoeff2);
accumrlo = _mm256_add_epi32(
accumrlo, _mm256_unpacklo_epi16(result4lo, result4));
accumrhi = _mm256_add_epi32(
accumrhi, _mm256_unpackhi_epi16(result4lo, result4));
__m256i result5 = _mm256_mulhi_epu16(refconvol3, fcoeff3);
__m256i result5lo = _mm256_mullo_epi16(refconvol3, fcoeff3);
accumrlo = _mm256_add_epi32(
accumrlo, _mm256_unpacklo_epi16(result5lo, result5));
accumrhi = _mm256_add_epi32(
accumrhi, _mm256_unpackhi_epi16(result5lo, result5));
__m256i result6 = _mm256_mulhi_epu16(refconvol4, fcoeff4);
__m256i result6lo = _mm256_mullo_epi16(refconvol4, fcoeff4);
accumrlo = _mm256_add_epi32(
accumrlo, _mm256_unpacklo_epi16(result6lo, result6));
accumrhi = _mm256_add_epi32(
accumrhi, _mm256_unpackhi_epi16(result6lo, result6));
__m256i result7 = _mm256_mulhi_epu16(refconvol5, fcoeff3);
__m256i result7lo = _mm256_mullo_epi16(refconvol5, fcoeff3);
accumrlo = _mm256_add_epi32(
accumrlo, _mm256_unpacklo_epi16(result7lo, result7));
accumrhi = _mm256_add_epi32(
accumrhi, _mm256_unpackhi_epi16(result7lo, result7));
__m256i result8 = _mm256_mulhi_epu16(refconvol6, fcoeff2);
__m256i result8lo = _mm256_mullo_epi16(refconvol6, fcoeff2);
accumrlo = _mm256_add_epi32(
accumrlo, _mm256_unpacklo_epi16(result8lo, result8));
accumrhi = _mm256_add_epi32(
accumrhi, _mm256_unpackhi_epi16(result8lo, result8));
__m256i result9 = _mm256_mulhi_epu16(refconvol7, fcoeff1);
__m256i result9lo = _mm256_mullo_epi16(refconvol7, fcoeff1);
accumrlo = _mm256_add_epi32(
accumrlo, _mm256_unpacklo_epi16(result9lo, result9));
accumrhi = _mm256_add_epi32(
accumrhi, _mm256_unpackhi_epi16(result9lo, result9));
__m256i result10 = _mm256_mulhi_epu16(refconvol8, fcoeff0);
__m256i result10lo = _mm256_mullo_epi16(refconvol8, fcoeff0);
accumrlo = _mm256_add_epi32(
accumrlo, _mm256_unpacklo_epi16(result10lo, result10));
accumrhi = _mm256_add_epi32(
accumrhi, _mm256_unpackhi_epi16(result10lo, result10));
/* Same 9-tap horizontal convolution for the dis plane. */
__m256i disconvol0 =_mm256_loadu_si256((__m256i *)(buf.tmp.dis_convol + jj_check));
__m256i disconvol4 = _mm256_loadu_si256((__m256i*)(buf.tmp.dis_convol + jj_check + 4));
__m256i disconvol8 = _mm256_loadu_si256((__m256i*)(buf.tmp.dis_convol + jj_check + 8));
__m256i disconvol1 = _mm256_alignr_epi8(disconvol4, disconvol0, 4);
__m256i disconvol2 = _mm256_alignr_epi8(disconvol4, disconvol0, 8);
__m256i disconvol3 = _mm256_alignr_epi8(disconvol4, disconvol0, 12);
__m256i disconvol5 = _mm256_alignr_epi8(disconvol8, disconvol4, 4);
__m256i disconvol6 = _mm256_alignr_epi8(disconvol8, disconvol4, 8);
__m256i disconvol7 = _mm256_alignr_epi8(disconvol8, disconvol4, 12);
result2 = _mm256_mulhi_epu16(disconvol0, fcoeff0);
result2lo = _mm256_mullo_epi16(disconvol0, fcoeff0);
accumdlo = _mm256_add_epi32(
accumdlo, _mm256_unpacklo_epi16(result2lo, result2));
accumdhi = _mm256_add_epi32(
accumdhi, _mm256_unpackhi_epi16(result2lo, result2));
result3 = _mm256_mulhi_epu16(disconvol1, fcoeff1);
result3lo = _mm256_mullo_epi16(disconvol1, fcoeff1);
accumdlo = _mm256_add_epi32(
accumdlo, _mm256_unpacklo_epi16(result3lo, result3));
accumdhi = _mm256_add_epi32(
accumdhi, _mm256_unpackhi_epi16(result3lo, result3));
result4 = _mm256_mulhi_epu16(disconvol2, fcoeff2);
result4lo = _mm256_mullo_epi16(disconvol2, fcoeff2);
accumdlo = _mm256_add_epi32(
accumdlo, _mm256_unpacklo_epi16(result4lo, result4));
accumdhi = _mm256_add_epi32(
accumdhi, _mm256_unpackhi_epi16(result4lo, result4));
result5 = _mm256_mulhi_epu16(disconvol3, fcoeff3);
result5lo = _mm256_mullo_epi16(disconvol3, fcoeff3);
accumdlo = _mm256_add_epi32(
accumdlo, _mm256_unpacklo_epi16(result5lo, result5));
accumdhi = _mm256_add_epi32(
accumdhi, _mm256_unpackhi_epi16(result5lo, result5));
result6 = _mm256_mulhi_epu16(disconvol4, fcoeff4);
result6lo = _mm256_mullo_epi16(disconvol4, fcoeff4);
accumdlo = _mm256_add_epi32(
accumdlo, _mm256_unpacklo_epi16(result6lo, result6));
accumdhi = _mm256_add_epi32(
accumdhi, _mm256_unpackhi_epi16(result6lo, result6));
result7 = _mm256_mulhi_epu16(disconvol5, fcoeff3);
result7lo = _mm256_mullo_epi16(disconvol5, fcoeff3);
accumdlo = _mm256_add_epi32(
accumdlo, _mm256_unpacklo_epi16(result7lo, result7));
accumdhi = _mm256_add_epi32(
accumdhi, _mm256_unpackhi_epi16(result7lo, result7));
result8 = _mm256_mulhi_epu16(disconvol6, fcoeff2);
result8lo = _mm256_mullo_epi16(disconvol6, fcoeff2);
accumdlo = _mm256_add_epi32(
accumdlo, _mm256_unpacklo_epi16(result8lo, result8));
accumdhi = _mm256_add_epi32(
accumdhi, _mm256_unpackhi_epi16(result8lo, result8));
result9 = _mm256_mulhi_epu16(disconvol7, fcoeff1);
result9lo = _mm256_mullo_epi16(disconvol7, fcoeff1);
accumdlo = _mm256_add_epi32(
accumdlo, _mm256_unpacklo_epi16(result9lo, result9));
accumdhi = _mm256_add_epi32(
accumdhi, _mm256_unpackhi_epi16(result9lo, result9));
result10 = _mm256_mulhi_epu16(disconvol8, fcoeff0);
result10lo = _mm256_mullo_epi16(disconvol8, fcoeff0);
accumdlo = _mm256_add_epi32(
accumdlo, _mm256_unpacklo_epi16(result10lo, result10));
accumdhi = _mm256_add_epi32(
accumdhi, _mm256_unpackhi_epi16(result10lo, result10));
/* Round (+0.5 in Q16) and normalize (>>16), matching the scalar tail. */
accumdlo = _mm256_add_epi32(accumdlo, addnum);
accumdhi = _mm256_add_epi32(accumdhi, addnum);
accumrlo = _mm256_add_epi32(accumrlo, addnum);
accumrhi = _mm256_add_epi32(accumrhi, addnum);
accumdlo = _mm256_srli_epi32(accumdlo, 0x10);
accumdhi = _mm256_srli_epi32(accumdhi, 0x10);
accumrlo = _mm256_srli_epi32(accumrlo, 0x10);
accumrhi = _mm256_srli_epi32(accumrhi, 0x10);
/* Pack to 16 bits, keep only the even columns via mask1 (2x horizontal
 * decimation), and store the 4 surviving uint16 results per plane at
 * the decimated column j/2. (Naming is swapped here: "result" ends up
 * in mu2, "resultd" in mu1.) */
__m256i result = _mm256_packus_epi32(accumdlo, accumdhi);
__m256i resultd = _mm256_packus_epi32(accumrlo, accumrhi);
resultd = _mm256_permutevar8x32_epi32(resultd, mask1);
result = _mm256_permutevar8x32_epi32(result, mask1);
resultd = _mm256_packus_epi32(resultd, resultd);
result = _mm256_packus_epi32(result, result);
_mm_storel_epi64((__m128i *)(buf.mu1 + i * stride + (j >> 1)), _mm256_castsi256_si128(resultd));
_mm_storel_epi64((__m128i *)(buf.mu2 + i * stride + (j >> 1)), _mm256_castsi256_si128(result));
}
/* Scalar tail: remaining even columns when w is not a multiple of 8.
 * Note j advances by 2 (only even columns survive the decimation). */
for (unsigned j = n << 3; j < w; j += 2) {
uint32_t accum_ref = 0;
uint32_t accum_dis = 0;
int jj = j - fwidth_half;
int jj_check = jj;
for (unsigned fj = 0; fj < fwidth; ++fj, jj_check = jj + fj) {
const uint16_t fcoeff = vif_filt_s1[fj];
accum_ref += fcoeff * buf.tmp.ref_convol[jj_check];
accum_dis += fcoeff * buf.tmp.dis_convol[jj_check];
}
buf.mu1[i * stride + (j >> 1)] = (uint16_t)((accum_ref + 32768) >> 16);
buf.mu2[i * stride + (j >> 1)] = (uint16_t)((accum_dis + 32768) >> 16);
}
}
/* Pad the decimated planes for the next scale (project helper --
 * presumably edge padding of mu1/mu2; see its definition). */
copy_and_pad(buf, w, h, 0);
}