in libvmaf/src/feature/x86/vif_avx2.c [1386:1559]
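/*
 * Filter-and-subsample pass of the 16 bpc VIF path (AVX2).
 *
 * For every second source row, applies the next scale's 1-D filter
 * (vif_filter1d_table[scale + 1]) vertically, then horizontally, and
 * keeps every second column; i.e. per output pixel (i, j) it computes,
 * roughly,
 *
 *     sum_fi sum_fj filt[fi] * filt[fj]
 *         * src[2*i + fi - fwidth/2][2*j + fj - fwidth/2]
 *
 * with fixed-point rounding after each pass, writing the halved planes
 * back into buf.ref / buf.dis.
 */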
void vif_subsample_rd_16_avx2(VifBuffer buf, unsigned w, unsigned h, int scale,
                              int bpc) {
    const unsigned fwidth = vif_filter1d_width[scale + 1];
    const uint16_t *vif_filt = vif_filter1d_table[scale + 1];
    int32_t add_shift_round_VP, shift_VP;
    int fwidth_half = fwidth >> 1;
    const ptrdiff_t stride = buf.stride / sizeof(uint16_t);
    const ptrdiff_t stride16 = buf.stride_16 / sizeof(uint16_t);
    uint16_t *ref = buf.ref;
    uint16_t *dis = buf.dis;
    __m256i mask1 = _mm256_set_epi32(6, 4, 2, 0, 6, 4, 2, 0);
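    // _mm256_set_epi32 takes arguments high-to-low, so mask1 is the index
    // pattern 0, 2, 4, 6 repeated twice; the horizontal pass below uses it
    // with permutevar8x32 to gather the even 32-bit lanes into the low
    // 128 bits (the upper copy is unused).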
    if (scale == 0) {
        add_shift_round_VP = 1 << (bpc - 1);
        shift_VP = bpc;
    } else {
        add_shift_round_VP = 32768;
        shift_VP = 16;
    }
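    /*
     * Scale 0 filters the bpc-bit input directly, so the vertical pass
     * normalizes by bpc bits; later scales filter the 16-bit output of the
     * previous subsampling and normalize by 16 bits. In both cases
     * add_shift_round_VP is half the divisor, i.e. the round-to-nearest
     * term for the shift.
     */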
    for (unsigned i = 0; i < h / 2; i++) {
        // VERTICAL
        unsigned n = w >> 4;
        int ii = i * 2 - fwidth_half;
        for (unsigned j = 0; j < n << 4; j = j + 16) {
            int ii_check = ii;
            __m256i accumr_lo, accumr_hi, accumd_lo, accumd_hi, rmul1, rmul2,
                dmul1, dmul2;
            accumr_lo = accumr_hi = accumd_lo = accumd_hi = rmul1 = rmul2 =
                dmul1 = dmul2 = _mm256_setzero_si256();
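            /*
             * 16x16 -> 32-bit widening multiply: mullo/mulhi give the low
             * and high halves of each product, and unpacklo/unpackhi
             * re-interleave them into 32-bit lanes. E.g. for adjacent
             * 16-bit lanes (a, b), unpacklo_epi16(lo, hi) yields the full
             * 32-bit products a*f and b*f, which are then summed in the
             * epi32 accumulators.
             */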
            for (unsigned fi = 0; fi < fwidth; ++fi, ii_check = ii + fi) {
                __m256i f1 = _mm256_set1_epi16(vif_filt[fi]);
                __m256i ref1 = _mm256_loadu_si256(
                    (__m256i *)(ref + (ii_check * stride) + j));
                __m256i dis1 = _mm256_loadu_si256(
                    (__m256i *)(dis + (ii_check * stride) + j));
                __m256i result2 = _mm256_mulhi_epu16(ref1, f1);
                __m256i result2lo = _mm256_mullo_epi16(ref1, f1);
                rmul1 = _mm256_unpacklo_epi16(result2lo, result2);
                rmul2 = _mm256_unpackhi_epi16(result2lo, result2);
                accumr_lo = _mm256_add_epi32(accumr_lo, rmul1);
                accumr_hi = _mm256_add_epi32(accumr_hi, rmul2);
                __m256i d0 = _mm256_mulhi_epu16(dis1, f1);
                __m256i d0lo = _mm256_mullo_epi16(dis1, f1);
                dmul1 = _mm256_unpacklo_epi16(d0lo, d0);
                dmul2 = _mm256_unpackhi_epi16(d0lo, d0);
                accumd_lo = _mm256_add_epi32(accumd_lo, dmul1);
                accumd_hi = _mm256_add_epi32(accumd_hi, dmul2);
            }
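            /*
             * Round, normalize, and undo the unpack interleave: the lo/hi
             * accumulators hold elements 0-3/8-11 and 4-7/12-15, so the two
             * permute2x128 ops restore linear order before the 32-bit
             * results are staged in buf.tmp.ref_convol / dis_convol.
             */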
            __m256i addnum = _mm256_set1_epi32(add_shift_round_VP);
            accumr_lo = _mm256_add_epi32(accumr_lo, addnum);
            accumr_hi = _mm256_add_epi32(accumr_hi, addnum);
            accumr_lo = _mm256_srli_epi32(accumr_lo, shift_VP);
            accumr_hi = _mm256_srli_epi32(accumr_hi, shift_VP);
            __m256i accumu2_lo =
                _mm256_permute2x128_si256(accumr_lo, accumr_hi, 0x20);
            __m256i accumu2_hi =
                _mm256_permute2x128_si256(accumr_lo, accumr_hi, 0x31);
            _mm256_storeu_si256((__m256i *)(buf.tmp.ref_convol + j),
                                accumu2_lo);
            _mm256_storeu_si256((__m256i *)(buf.tmp.ref_convol + j + 8),
                                accumu2_hi);
            accumd_lo = _mm256_add_epi32(accumd_lo, addnum);
            accumd_hi = _mm256_add_epi32(accumd_hi, addnum);
            accumd_lo = _mm256_srli_epi32(accumd_lo, shift_VP);
            accumd_hi = _mm256_srli_epi32(accumd_hi, shift_VP);
            accumu2_lo = _mm256_permute2x128_si256(accumd_lo, accumd_hi, 0x20);
            accumu2_hi = _mm256_permute2x128_si256(accumd_lo, accumd_hi, 0x31);
            _mm256_storeu_si256((__m256i *)(buf.tmp.dis_convol + j),
                                accumu2_lo);
            _mm256_storeu_si256((__m256i *)(buf.tmp.dis_convol + j + 8),
                                accumu2_hi);
        }
        // VERTICAL, scalar tail for the columns the SIMD loop did not cover
        for (unsigned j = n << 4; j < w; ++j) {
            uint32_t accum_ref = 0;
            uint32_t accum_dis = 0;
            int ii_check = ii;
            for (unsigned fi = 0; fi < fwidth; ++fi, ii_check = ii + fi) {
                const uint16_t fcoeff = vif_filt[fi];
                accum_ref += fcoeff * ((uint32_t)ref[ii_check * stride + j]);
                accum_dis += fcoeff * ((uint32_t)dis[ii_check * stride + j]);
            }
            buf.tmp.ref_convol[j] =
                (uint16_t)((accum_ref + add_shift_round_VP) >> shift_VP);
            buf.tmp.dis_convol[j] =
                (uint16_t)((accum_dis + add_shift_round_VP) >> shift_VP);
        }
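        /*
         * Pad the intermediate rows beyond [0, w) so the horizontal pass
         * below can safely read fwidth_half samples past either edge.
         */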
        PADDING_SQ_DATA_2(buf, w, fwidth_half);
        // HORIZONTAL
        n = w >> 3;
        for (unsigned j = 0; j < n << 3; j = j + 8) {
            int jj = j - fwidth_half;
            int jj_check = jj;
            __m256i accumrlo, accumdlo, accumrhi, accumdhi;
            accumrlo = accumdlo = accumrhi = accumdhi = _mm256_setzero_si256();
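            /*
             * The staged convolution results are 32-bit values that fit in
             * 16 bits, so each 32-bit lane reads as (value, 0) when viewed
             * as two 16-bit lanes. The same mullo/mulhi + unpack trick as
             * above therefore yields the full 32-bit product in the even
             * 32-bit lanes and zeros in the odd ones.
             */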
            for (unsigned fj = 0; fj < fwidth; ++fj, jj_check = jj + fj) {
                __m256i refconvol = _mm256_loadu_si256(
                    (__m256i *)(buf.tmp.ref_convol + jj_check));
                __m256i fcoeff = _mm256_set1_epi16(vif_filt[fj]);
                __m256i result2 = _mm256_mulhi_epu16(refconvol, fcoeff);
                __m256i result2lo = _mm256_mullo_epi16(refconvol, fcoeff);
                accumrlo = _mm256_add_epi32(
                    accumrlo, _mm256_unpacklo_epi16(result2lo, result2));
                accumrhi = _mm256_add_epi32(
                    accumrhi, _mm256_unpackhi_epi16(result2lo, result2));
                __m256i disconvol = _mm256_loadu_si256(
                    (__m256i *)(buf.tmp.dis_convol + jj_check));
                result2 = _mm256_mulhi_epu16(disconvol, fcoeff);
                result2lo = _mm256_mullo_epi16(disconvol, fcoeff);
                accumdlo = _mm256_add_epi32(
                    accumdlo, _mm256_unpacklo_epi16(result2lo, result2));
                accumdhi = _mm256_add_epi32(
                    accumdhi, _mm256_unpackhi_epi16(result2lo, result2));
            }
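            /*
             * Round to nearest and drop 16 fraction bits, then pack the
             * 32-bit sums to 16 bits. The srli/blend pair moves each result
             * next to its neighbor within a 32-bit lane, and permutevar8x32
             * with mask1 (lanes 0, 2, 4, 6) compacts the eight 16-bit
             * results into the low 128 bits for the store.
             */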
            __m256i addnum = _mm256_set1_epi32(32768);
            accumdlo = _mm256_add_epi32(accumdlo, addnum);
            accumdhi = _mm256_add_epi32(accumdhi, addnum);
            accumrlo = _mm256_add_epi32(accumrlo, addnum);
            accumrhi = _mm256_add_epi32(accumrhi, addnum);
            accumdlo = _mm256_srli_epi32(accumdlo, 0x10);
            accumdhi = _mm256_srli_epi32(accumdhi, 0x10);
            accumrlo = _mm256_srli_epi32(accumrlo, 0x10);
            accumrhi = _mm256_srli_epi32(accumrhi, 0x10);
            // NB: "result" carries the distorted sums and "resultd" the
            // reference sums, despite the names.
            __m256i result = _mm256_packus_epi32(accumdlo, accumdhi);
            __m256i resultd = _mm256_packus_epi32(accumrlo, accumrhi);
            __m256i resulttmp = _mm256_srli_si256(resultd, 2);
            resultd = _mm256_blend_epi16(resultd, resulttmp, 0xAA);
            resultd = _mm256_permutevar8x32_epi32(resultd, mask1);
            _mm_storeu_si128((__m128i *)(buf.mu1 + i * stride16 + j),
                             _mm256_castsi256_si128(resultd));
            resulttmp = _mm256_srli_si256(result, 2);
            result = _mm256_blend_epi16(result, resulttmp, 0xAA);
            result = _mm256_permutevar8x32_epi32(result, mask1);
            _mm_storeu_si128((__m128i *)(buf.mu2 + i * stride16 + j),
                             _mm256_castsi256_si128(result));
        }
        // HORIZONTAL, scalar tail for the remaining columns
        for (unsigned j = n << 3; j < w; ++j) {
            uint32_t accum_ref = 0;
            uint32_t accum_dis = 0;
            int jj = j - fwidth_half;
            int jj_check = jj;
            for (unsigned fj = 0; fj < fwidth; ++fj, jj_check = jj + fj) {
                const uint16_t fcoeff = vif_filt[fj];
                accum_ref += fcoeff * ((uint32_t)buf.tmp.ref_convol[jj_check]);
                accum_dis += fcoeff * ((uint32_t)buf.tmp.dis_convol[jj_check]);
            }
            buf.mu1[i * stride16 + j] = (uint16_t)((accum_ref + 32768) >> 16);
            buf.mu2[i * stride16 + j] = (uint16_t)((accum_dis + 32768) >> 16);
        }
    }
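    /*
     * Decimate: every second column of the filtered rows in mu1/mu2 becomes
     * the next scale's reference/distorted plane (the rows were already
     * halved by stepping the source two lines per iteration above).
     */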
    ref = buf.ref;
    dis = buf.dis;
    for (unsigned i = 0; i < h / 2; ++i) {
        for (unsigned j = 0; j < w / 2; ++j) {
            ref[i * stride + j] = buf.mu1[i * stride16 + (j * 2)];
            dis[i * stride + j] = buf.mu2[i * stride16 + (j * 2)];
        }
    }
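    /*
     * Re-pad the top and bottom of the halved planes so subsequent vertical
     * filtering can read rows outside the image.
     */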
    pad_top_and_bottom(buf, h / 2, vif_filter1d_width[scale]);
}