in libvmaf/src/feature/x86/vif_avx2.c [549:1096]
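/*
 * Computes the integer VIF statistics for one scale of a 16-bit
 * reference/distorted pair: the log-domain numerator/denominator
 * accumulators plus the non-log accumulators used when the reference
 * variance falls below sigma_nsq. Filtering is separable: a vertical
 * pass writes filtered rows into buf.tmp, then a horizontal pass forms
 * mu1, mu2, sigma1_sq, sigma2_sq and sigma12 for 16 pixels at a time.
 */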
void vif_statistic_16_avx2(struct VifPublicState *s, float *num, float *den, unsigned w, unsigned h, int bpc, int scale) {
const unsigned fwidth = vif_filter1d_width[scale];
const uint16_t *vif_filt = vif_filter1d_table[scale];
VifBuffer buf = s->buf;
const ptrdiff_t stride = buf.stride / sizeof(uint16_t);
int fwidth_half = fwidth >> 1;
int32_t add_shift_round_VP, shift_VP;
int32_t add_shift_round_VP_sq, shift_VP_sq;
// sigma_nsq = 2.0 in Q16 fixed point (2 * 65536)
static const int32_t sigma_nsq = 65536 << 1;
int64_t accum_num_log = 0;
int64_t accum_den_log = 0;
int64_t accum_num_non_log = 0;
int64_t accum_den_non_log = 0;
const uint16_t *log2_table = s->log2_table;
double vif_enhn_gain_limit = s->vif_enhn_gain_limit;
// scratch arrays for the 16-sample-wide VIF block computation
ALIGNED(32) uint32_t xx[16];
ALIGNED(32) uint32_t yy[16];
ALIGNED(32) uint32_t xy[16];
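// Rounding constants and shifts for the vertical pass: at scale 0 the
// input samples are bpc-bit, so the mean products are shifted by bpc and
// the squared products by (bpc - 8) * 2; at coarser scales the inputs are
// already 16-bit filtered values, so both shifts are 16.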
if (scale == 0) {
shift_VP = bpc;
add_shift_round_VP = 1 << (bpc - 1);
shift_VP_sq = (bpc - 8) * 2;
add_shift_round_VP_sq = (bpc == 8) ? 0 : 1 << (shift_VP_sq - 1);
} else {
shift_VP = 16;
add_shift_round_VP = 32768;
shift_VP_sq = 16;
add_shift_round_VP_sq = 32768;
}
for (unsigned i = 0; i < h; ++i) {
// VERTICAL
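// Vertical pass: filter a window of fwidth rows centred on row i,
// producing per-column sums of ref, dis, ref*ref, dis*dis and ref*dis
// into the buf.tmp scratch rows.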
int ii = i - fwidth_half;
unsigned n = w >> 4;
for (unsigned j = 0; j < n << 4; j = j + 16) {
__m256i mask2 = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0);
int ii_check = ii;
uint16_t *ref = buf.ref;
uint16_t *dis = buf.dis;
__m256i accumr_lo, accumr_hi, accumd_lo, accumd_hi, rmul1, rmul2,
dmul1, dmul2, accumref1, accumref2, accumref3, accumref4,
accumrefdis1, accumrefdis2, accumrefdis3, accumrefdis4,
accumdis1, accumdis2, accumdis3, accumdis4;
accumr_lo = accumr_hi = accumd_lo = accumd_hi = rmul1 = rmul2 =
dmul1 = dmul2 = accumref1 = accumref2 = accumref3 = accumref4 =
accumrefdis1 = accumrefdis2 = accumrefdis3 = accumrefdis4 =
accumdis1 = accumdis2 = accumdis3 = accumdis4 =
_mm256_setzero_si256();
__m256i addnum = _mm256_set1_epi32(add_shift_round_VP);
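// One filter tap per iteration: multiply 16 ref/dis samples by
// vif_filt[fi], accumulating 32-bit sums for the means and 64-bit sums
// for the squared and cross products.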
for (unsigned fi = 0; fi < fwidth; ++fi, ii_check = ii + fi) {
__m256i f1 = _mm256_set1_epi16(vif_filt[fi]);
__m256i ref1 = _mm256_loadu_si256(
(__m256i *)(ref + (ii_check * stride) + j));
__m256i dis1 = _mm256_loadu_si256(
(__m256i *)(dis + (ii_check * stride) + j));
__m256i result2 = _mm256_mulhi_epu16(ref1, f1);
__m256i result2lo = _mm256_mullo_epi16(ref1, f1);
rmul1 = _mm256_unpacklo_epi16(result2lo, result2);
rmul2 = _mm256_unpackhi_epi16(result2lo, result2);
accumr_lo = _mm256_add_epi32(accumr_lo, rmul1);
accumr_hi = _mm256_add_epi32(accumr_hi, rmul2);
__m256i d0 = _mm256_mulhi_epu16(dis1, f1);
__m256i d0lo = _mm256_mullo_epi16(dis1, f1);
dmul1 = _mm256_unpacklo_epi16(d0lo, d0);
dmul2 = _mm256_unpackhi_epi16(d0lo, d0);
accumd_lo = _mm256_add_epi32(accumd_lo, dmul1);
accumd_hi = _mm256_add_epi32(accumd_hi, dmul2);
__m256i sg0 =
_mm256_cvtepu32_epi64(_mm256_castsi256_si128(rmul1));
__m256i sg1 =
_mm256_cvtepu32_epi64(_mm256_extracti128_si256(rmul1, 1));
__m256i sg2 =
_mm256_cvtepu32_epi64(_mm256_castsi256_si128(rmul2));
__m256i sg3 =
_mm256_cvtepu32_epi64(_mm256_extracti128_si256(rmul2, 1));
__m128i l0 = _mm256_castsi256_si128(ref1);
__m128i l1 = _mm256_extracti128_si256(ref1, 1);
accumref1 = _mm256_add_epi64(
accumref1,
_mm256_mul_epu32(sg0, _mm256_cvtepu16_epi64(l0)));
accumref2 = _mm256_add_epi64(
accumref2,
_mm256_mul_epu32(
sg2, _mm256_cvtepu16_epi64(_mm_bsrli_si128(l0, 8))));
accumref3 = _mm256_add_epi64(
accumref3,
_mm256_mul_epu32(sg1, _mm256_cvtepu16_epi64(l1)));
accumref4 = _mm256_add_epi64(
accumref4,
_mm256_mul_epu32(
sg3, _mm256_cvtepu16_epi64(_mm_bsrli_si128(l1, 8))));
l0 = _mm256_castsi256_si128(dis1);
l1 = _mm256_extracti128_si256(dis1, 1);
accumrefdis1 = _mm256_add_epi64(
accumrefdis1,
_mm256_mul_epu32(sg0, _mm256_cvtepu16_epi64(l0)));
accumrefdis2 = _mm256_add_epi64(
accumrefdis2,
_mm256_mul_epu32(
sg2, _mm256_cvtepu16_epi64(_mm_bsrli_si128(l0, 8))));
accumrefdis3 = _mm256_add_epi64(
accumrefdis3,
_mm256_mul_epu32(sg1, _mm256_cvtepu16_epi64(l1)));
accumrefdis4 = _mm256_add_epi64(
accumrefdis4,
_mm256_mul_epu32(
sg3, _mm256_cvtepu16_epi64(_mm_bsrli_si128(l1, 8))));
__m256i sd0 =
_mm256_cvtepu32_epi64(_mm256_castsi256_si128(dmul1));
__m256i sd1 =
_mm256_cvtepu32_epi64(_mm256_extracti128_si256(dmul1, 1));
__m256i sd2 =
_mm256_cvtepu32_epi64(_mm256_castsi256_si128(dmul2));
__m256i sd3 =
_mm256_cvtepu32_epi64(_mm256_extracti128_si256(dmul2, 1));
accumdis1 = _mm256_add_epi64(
accumdis1,
_mm256_mul_epu32(sd0, _mm256_cvtepu16_epi64(l0)));
accumdis2 = _mm256_add_epi64(
accumdis2,
_mm256_mul_epu32(
sd2, _mm256_cvtepu16_epi64(_mm_bsrli_si128(l0, 8))));
accumdis3 = _mm256_add_epi64(
accumdis3,
_mm256_mul_epu32(sd1, _mm256_cvtepu16_epi64(l1)));
accumdis4 = _mm256_add_epi64(
accumdis4,
_mm256_mul_epu32(
sd3, _mm256_cvtepu16_epi64(_mm_bsrli_si128(l1, 8))));
}
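// Round and shift the mean accumulators back to 32-bit values and store
// them to buf.tmp.mu1 / buf.tmp.mu2; the 128-bit lane permutes undo the
// interleaving introduced by unpacklo/unpackhi above.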
accumr_lo = _mm256_add_epi32(accumr_lo, addnum);
accumr_hi = _mm256_add_epi32(accumr_hi, addnum);
accumr_lo = _mm256_srli_epi32(accumr_lo, shift_VP);
accumr_hi = _mm256_srli_epi32(accumr_hi, shift_VP);
__m256i accumu2_lo =
_mm256_permute2x128_si256(accumr_lo, accumr_hi, 0x20);
__m256i accumu2_hi =
_mm256_permute2x128_si256(accumr_lo, accumr_hi, 0x31);
_mm256_storeu_si256((__m256i *)(buf.tmp.mu1 + j), accumu2_lo);
_mm256_storeu_si256((__m256i *)(buf.tmp.mu1 + j + 8), accumu2_hi);
accumd_lo = _mm256_add_epi32(accumd_lo, addnum);
accumd_hi = _mm256_add_epi32(accumd_hi, addnum);
accumd_lo = _mm256_srli_epi32(accumd_lo, shift_VP);
accumd_hi = _mm256_srli_epi32(accumd_hi, shift_VP);
__m256i accumu3_lo =
_mm256_permute2x128_si256(accumd_lo, accumd_hi, 0x20);
__m256i accumu3_hi =
_mm256_permute2x128_si256(accumd_lo, accumd_hi, 0x31);
_mm256_storeu_si256((__m256i *)(buf.tmp.mu2 + j), accumu3_lo);
_mm256_storeu_si256((__m256i *)(buf.tmp.mu2 + j + 8), accumu3_hi);
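// Same treatment for the 64-bit squared/cross accumulators: round, shift
// by shift_VP_sq, then pack the even/odd 64-bit lanes back into 16
// consecutive 32-bit values per scratch row.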
addnum = _mm256_set1_epi64x(add_shift_round_VP_sq);
accumref1 = _mm256_add_epi64(accumref1, addnum);
accumref2 = _mm256_add_epi64(accumref2, addnum);
accumref3 = _mm256_add_epi64(accumref3, addnum);
accumref4 = _mm256_add_epi64(accumref4, addnum);
accumref1 = _mm256_srli_epi64(accumref1, shift_VP_sq);
accumref2 = _mm256_srli_epi64(accumref2, shift_VP_sq);
accumref3 = _mm256_srli_epi64(accumref3, shift_VP_sq);
accumref4 = _mm256_srli_epi64(accumref4, shift_VP_sq);
accumref2 = _mm256_slli_si256(accumref2, 4);
accumref1 = _mm256_blend_epi32(accumref1, accumref2, 0xAA);
accumref1 = _mm256_permutevar8x32_epi32(accumref1, mask2);
_mm256_storeu_si256((__m256i *)(buf.tmp.ref + j), accumref1);
accumref4 = _mm256_slli_si256(accumref4, 4);
accumref3 = _mm256_blend_epi32(accumref3, accumref4, 0xAA);
accumref3 = _mm256_permutevar8x32_epi32(accumref3, mask2);
_mm256_storeu_si256((__m256i *)(buf.tmp.ref + j + 8), accumref3);
accumrefdis1 = _mm256_add_epi64(accumrefdis1, addnum);
accumrefdis2 = _mm256_add_epi64(accumrefdis2, addnum);
accumrefdis3 = _mm256_add_epi64(accumrefdis3, addnum);
accumrefdis4 = _mm256_add_epi64(accumrefdis4, addnum);
accumrefdis1 = _mm256_srli_epi64(accumrefdis1, shift_VP_sq);
accumrefdis2 = _mm256_srli_epi64(accumrefdis2, shift_VP_sq);
accumrefdis3 = _mm256_srli_epi64(accumrefdis3, shift_VP_sq);
accumrefdis4 = _mm256_srli_epi64(accumrefdis4, shift_VP_sq);
accumrefdis2 = _mm256_slli_si256(accumrefdis2, 4);
accumrefdis1 = _mm256_blend_epi32(accumrefdis1, accumrefdis2, 0xAA);
accumrefdis1 = _mm256_permutevar8x32_epi32(accumrefdis1, mask2);
_mm256_storeu_si256((__m256i *)(buf.tmp.ref_dis + j), accumrefdis1);
accumrefdis4 = _mm256_slli_si256(accumrefdis4, 4);
accumrefdis3 = _mm256_blend_epi32(accumrefdis3, accumrefdis4, 0xAA);
accumrefdis3 = _mm256_permutevar8x32_epi32(accumrefdis3, mask2);
_mm256_storeu_si256((__m256i *)(buf.tmp.ref_dis + j + 8),
accumrefdis3);
accumdis1 = _mm256_add_epi64(accumdis1, addnum);
accumdis2 = _mm256_add_epi64(accumdis2, addnum);
accumdis3 = _mm256_add_epi64(accumdis3, addnum);
accumdis4 = _mm256_add_epi64(accumdis4, addnum);
accumdis1 = _mm256_srli_epi64(accumdis1, shift_VP_sq);
accumdis2 = _mm256_srli_epi64(accumdis2, shift_VP_sq);
accumdis3 = _mm256_srli_epi64(accumdis3, shift_VP_sq);
accumdis4 = _mm256_srli_epi64(accumdis4, shift_VP_sq);
accumdis2 = _mm256_slli_si256(accumdis2, 4);
accumdis1 = _mm256_blend_epi32(accumdis1, accumdis2, 0xAA);
accumdis1 = _mm256_permutevar8x32_epi32(accumdis1, mask2);
_mm256_storeu_si256((__m256i *)(buf.tmp.dis + j), accumdis1);
accumdis4 = _mm256_slli_si256(accumdis4, 4);
accumdis3 = _mm256_blend_epi32(accumdis3, accumdis4, 0xAA);
accumdis3 = _mm256_permutevar8x32_epi32(accumdis3, mask2);
_mm256_storeu_si256((__m256i *)(buf.tmp.dis + j + 8), accumdis3);
}
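// Scalar tail for the right-most columns when w is not a multiple of 16;
// same vertical filtering and rounding as the SIMD path above.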
for (unsigned j = n << 4; j < w; ++j) {
uint32_t accum_mu1 = 0;
uint32_t accum_mu2 = 0;
uint64_t accum_ref = 0;
uint64_t accum_dis = 0;
uint64_t accum_ref_dis = 0;
int ii_check = ii;
for (unsigned fi = 0; fi < fwidth; ++fi, ii_check = ii + fi) {
const uint16_t fcoeff = vif_filt[fi];
uint16_t *ref = buf.ref;
uint16_t *dis = buf.dis;
uint16_t imgcoeff_ref = ref[ii_check * stride + j];
uint16_t imgcoeff_dis = dis[ii_check * stride + j];
uint32_t img_coeff_ref = fcoeff * (uint32_t)imgcoeff_ref;
uint32_t img_coeff_dis = fcoeff * (uint32_t)imgcoeff_dis;
accum_mu1 += img_coeff_ref;
accum_mu2 += img_coeff_dis;
accum_ref += img_coeff_ref * (uint64_t)imgcoeff_ref;
accum_dis += img_coeff_dis * (uint64_t)imgcoeff_dis;
accum_ref_dis += img_coeff_ref * (uint64_t)imgcoeff_dis;
}
buf.tmp.mu1[j] =
(uint16_t)((accum_mu1 + add_shift_round_VP) >> shift_VP);
buf.tmp.mu2[j] =
(uint16_t)((accum_mu2 + add_shift_round_VP) >> shift_VP);
buf.tmp.ref[j] =
(uint32_t)((accum_ref + add_shift_round_VP_sq) >> shift_VP_sq);
buf.tmp.ref_dis[j] = (uint32_t)(
(accum_ref_dis + add_shift_round_VP_sq) >> shift_VP_sq);
buf.tmp.dis[j] =
(uint32_t)((accum_dis + add_shift_round_VP_sq) >> shift_VP_sq);
}
PADDING_SQ_DATA(buf, w, fwidth_half);
// HORIZONTAL
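// Horizontal pass: convolve the scratch rows with the same filter,
// exploiting its symmetry (centre tap once, then mirrored offset pairs
// -fwidth/2 + fj and +fwidth/2 - fj sharing vif_filt[fj]). The
// PADDING_SQ_DATA call above extends the scratch rows so the reads
// outside [0, w) stay valid.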
for (unsigned jj = 0; jj < n << 4; jj += 16) {
__m256i mu1_lo;
__m256i mu1_hi;
__m256i mu1sq_lo;
__m256i mu1sq_hi;
__m256i mu2sq_lo;
__m256i mu2sq_hi;
__m256i mu1mu2_lo;
__m256i mu1mu2_hi;
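// Horizontally filter mu1 for 16 columns and form mu1*mu1, rounded back
// to 32 bits (the >> 32 with a 0x80000000 rounding term).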
{
__m256i fq = _mm256_set1_epi32(vif_filt[fwidth / 2]);
__m256i acc0 = _mm256_mullo_epi32(_mm256_loadu_si256((__m256i*)(buf.tmp.mu1 + jj + 0)), fq);
__m256i acc1 = _mm256_mullo_epi32(_mm256_loadu_si256((__m256i*)(buf.tmp.mu1 + jj + 8)), fq);
for (unsigned fj = 0; fj < fwidth / 2; ++fj) {
__m256i fq = _mm256_set1_epi32(vif_filt[fj]);
acc0 = _mm256_add_epi64(acc0, _mm256_mullo_epi32(_mm256_loadu_si256((__m256i*)(buf.tmp.mu1 + jj - fwidth / 2 + fj + 0)), fq));
acc1 = _mm256_add_epi64(acc1, _mm256_mullo_epi32(_mm256_loadu_si256((__m256i*)(buf.tmp.mu1 + jj - fwidth / 2 + fj + 8)), fq));
acc0 = _mm256_add_epi64(acc0, _mm256_mullo_epi32(_mm256_loadu_si256((__m256i*)(buf.tmp.mu1 + jj + fwidth / 2 - fj + 0)), fq));
acc1 = _mm256_add_epi64(acc1, _mm256_mullo_epi32(_mm256_loadu_si256((__m256i*)(buf.tmp.mu1 + jj + fwidth / 2 - fj + 8)), fq));
}
mu1_lo = acc0;
mu1_hi = acc1;
__m256i acc0_lo = _mm256_unpacklo_epi32(acc0, _mm256_setzero_si256());
__m256i acc0_hi = _mm256_unpackhi_epi32(acc0, _mm256_setzero_si256());
acc0_lo = _mm256_mul_epu32(acc0_lo, acc0_lo);
acc0_hi = _mm256_mul_epu32(acc0_hi, acc0_hi);
acc0_lo = _mm256_srli_epi64(_mm256_add_epi64(acc0_lo, _mm256_set1_epi64x(0x80000000)), 32);
acc0_hi = _mm256_srli_epi64(_mm256_add_epi64(acc0_hi, _mm256_set1_epi64x(0x80000000)), 32);
__m256i acc1_lo = _mm256_unpacklo_epi32(acc1, _mm256_setzero_si256());
__m256i acc1_hi = _mm256_unpackhi_epi32(acc1, _mm256_setzero_si256());
acc1_lo = _mm256_mul_epu32(acc1_lo, acc1_lo);
acc1_hi = _mm256_mul_epu32(acc1_hi, acc1_hi);
acc1_lo = _mm256_srli_epi64(_mm256_add_epi64(acc1_lo, _mm256_set1_epi64x(0x80000000)), 32);
acc1_hi = _mm256_srli_epi64(_mm256_add_epi64(acc1_hi, _mm256_set1_epi64x(0x80000000)), 32);
__m256i acc0_sq = _mm256_blend_epi32(acc0_lo, _mm256_slli_si256(acc0_hi, 4), 0xAA);
acc0_sq = _mm256_shuffle_epi32(acc0_sq, 0xD8);
__m256i acc1_sq = _mm256_blend_epi32(acc1_lo, _mm256_slli_si256(acc1_hi, 4), 0xAA);
acc1_sq = _mm256_shuffle_epi32(acc1_sq, 0xD8);
mu1sq_lo = acc0_sq;
mu1sq_hi = acc1_sq;
}
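// Horizontally filter mu2 for the same 16 columns; also form mu2*mu2 and
// mu1*mu2 with the same rounding back to 32 bits.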
{
__m256i fq = _mm256_set1_epi32(vif_filt[fwidth / 2]);
__m256i acc0 = _mm256_mullo_epi32(_mm256_loadu_si256((__m256i*)(buf.tmp.mu2 + jj + 0)), fq);
__m256i acc1 = _mm256_mullo_epi32(_mm256_loadu_si256((__m256i*)(buf.tmp.mu2 + jj + 8)), fq);
for (unsigned fj = 0; fj < fwidth / 2; ++fj) {
__m256i fq = _mm256_set1_epi32(vif_filt[fj]);
acc0 = _mm256_add_epi64(acc0, _mm256_mullo_epi32(_mm256_loadu_si256((__m256i*)(buf.tmp.mu2 + jj - fwidth / 2 + fj + 0)), fq));
acc1 = _mm256_add_epi64(acc1, _mm256_mullo_epi32(_mm256_loadu_si256((__m256i*)(buf.tmp.mu2 + jj - fwidth / 2 + fj + 8)), fq));
acc0 = _mm256_add_epi64(acc0, _mm256_mullo_epi32(_mm256_loadu_si256((__m256i*)(buf.tmp.mu2 + jj + fwidth / 2 - fj + 0)), fq));
acc1 = _mm256_add_epi64(acc1, _mm256_mullo_epi32(_mm256_loadu_si256((__m256i*)(buf.tmp.mu2 + jj + fwidth / 2 - fj + 8)), fq));
}
__m256i acc0_lo = _mm256_unpacklo_epi32(acc0, _mm256_setzero_si256());
__m256i acc0_hi = _mm256_unpackhi_epi32(acc0, _mm256_setzero_si256());
__m256i mu1lo_lo = _mm256_unpacklo_epi32(mu1_lo, _mm256_setzero_si256());
__m256i mu1lo_hi = _mm256_unpackhi_epi32(mu1_lo, _mm256_setzero_si256());
__m256i mu1hi_lo = _mm256_unpacklo_epi32(mu1_hi, _mm256_setzero_si256());
__m256i mu1hi_hi = _mm256_unpackhi_epi32(mu1_hi, _mm256_setzero_si256());
mu1lo_lo = _mm256_mul_epu32(mu1lo_lo, acc0_lo);
mu1lo_hi = _mm256_mul_epu32(mu1lo_hi, acc0_hi);
mu1lo_lo = _mm256_srli_epi64(_mm256_add_epi64(mu1lo_lo, _mm256_set1_epi64x(0x80000000)), 32);
mu1lo_hi = _mm256_srli_epi64(_mm256_add_epi64(mu1lo_hi, _mm256_set1_epi64x(0x80000000)), 32);
acc0_lo = _mm256_mul_epu32(acc0_lo, acc0_lo);
acc0_hi = _mm256_mul_epu32(acc0_hi, acc0_hi);
acc0_lo = _mm256_srli_epi64(_mm256_add_epi64(acc0_lo, _mm256_set1_epi64x(0x80000000)), 32);
acc0_hi = _mm256_srli_epi64(_mm256_add_epi64(acc0_hi, _mm256_set1_epi64x(0x80000000)), 32);
__m256i acc1_lo = _mm256_unpacklo_epi32(acc1, _mm256_setzero_si256());
__m256i acc1_hi = _mm256_unpackhi_epi32(acc1, _mm256_setzero_si256());
mu1hi_lo = _mm256_mul_epu32(mu1hi_lo, acc1_lo);
mu1hi_hi = _mm256_mul_epu32(mu1hi_hi, acc1_hi);
mu1hi_lo = _mm256_srli_epi64(_mm256_add_epi64(mu1hi_lo, _mm256_set1_epi64x(0x80000000)), 32);
mu1hi_hi = _mm256_srli_epi64(_mm256_add_epi64(mu1hi_hi, _mm256_set1_epi64x(0x80000000)), 32);
acc1_lo = _mm256_mul_epu32(acc1_lo, acc1_lo);
acc1_hi = _mm256_mul_epu32(acc1_hi, acc1_hi);
acc1_lo = _mm256_srli_epi64(_mm256_add_epi64(acc1_lo, _mm256_set1_epi64x(0x80000000)), 32);
acc1_hi = _mm256_srli_epi64(_mm256_add_epi64(acc1_hi, _mm256_set1_epi64x(0x80000000)), 32);
__m256i acc0_sq = _mm256_blend_epi32(acc0_lo, _mm256_slli_si256(acc0_hi, 4), 0xAA);
acc0_sq = _mm256_shuffle_epi32(acc0_sq, 0xD8);
__m256i acc1_sq = _mm256_blend_epi32(acc1_lo, _mm256_slli_si256(acc1_hi, 4), 0xAA);
acc1_sq = _mm256_shuffle_epi32(acc1_sq, 0xD8);
mu2sq_lo = acc0_sq;
mu2sq_hi = acc1_sq;
mu1mu2_lo = _mm256_blend_epi32(mu1lo_lo, _mm256_slli_si256(mu1lo_hi, 4), 0xAA);
mu1mu2_lo = _mm256_shuffle_epi32(mu1mu2_lo, 0xD8);
mu1mu2_hi = _mm256_blend_epi32(mu1hi_lo, _mm256_slli_si256(mu1hi_hi, 4), 0xAA);
mu1mu2_hi = _mm256_shuffle_epi32(mu1mu2_hi, 0xD8);
}
// Horizontally filter the ref*ref row and subtract mu1*mu1, leaving
// sigma1_sq for 16 columns in xx[].
{
__m256i rounder = _mm256_set1_epi64x(0x8000);
__m256i fq = _mm256_set1_epi64x(vif_filt[fwidth / 2]);
__m256i acc0 = _mm256_add_epi64(rounder, _mm256_mul_epu32(_mm256_cvtepu32_epi64(_mm_loadu_si128((__m128i*)(buf.tmp.ref + jj + 0))), fq));
__m256i acc1 = _mm256_add_epi64(rounder, _mm256_mul_epu32(_mm256_cvtepu32_epi64(_mm_loadu_si128((__m128i*)(buf.tmp.ref + jj + 4))), fq));
__m256i acc2 = _mm256_add_epi64(rounder, _mm256_mul_epu32(_mm256_cvtepu32_epi64(_mm_loadu_si128((__m128i*)(buf.tmp.ref + jj + 8))), fq));
__m256i acc3 = _mm256_add_epi64(rounder, _mm256_mul_epu32(_mm256_cvtepu32_epi64(_mm_loadu_si128((__m128i*)(buf.tmp.ref + jj + 12))), fq));
for (unsigned fj = 0; fj < fwidth / 2; ++fj) {
__m256i fq = _mm256_set1_epi64x(vif_filt[fj]);
acc0 = _mm256_add_epi64(acc0, _mm256_mul_epu32(_mm256_cvtepu32_epi64(_mm_loadu_si128((__m128i*)(buf.tmp.ref + jj - fwidth / 2 + fj + 0))), fq));
acc1 = _mm256_add_epi64(acc1, _mm256_mul_epu32(_mm256_cvtepu32_epi64(_mm_loadu_si128((__m128i*)(buf.tmp.ref + jj - fwidth / 2 + fj + 4))), fq));
acc2 = _mm256_add_epi64(acc2, _mm256_mul_epu32(_mm256_cvtepu32_epi64(_mm_loadu_si128((__m128i*)(buf.tmp.ref + jj - fwidth / 2 + fj + 8))), fq));
acc3 = _mm256_add_epi64(acc3, _mm256_mul_epu32(_mm256_cvtepu32_epi64(_mm_loadu_si128((__m128i*)(buf.tmp.ref + jj - fwidth / 2 + fj + 12))), fq));
acc0 = _mm256_add_epi64(acc0, _mm256_mul_epu32(_mm256_cvtepu32_epi64(_mm_loadu_si128((__m128i*)(buf.tmp.ref + jj + fwidth / 2 - fj + 0))), fq));
acc1 = _mm256_add_epi64(acc1, _mm256_mul_epu32(_mm256_cvtepu32_epi64(_mm_loadu_si128((__m128i*)(buf.tmp.ref + jj + fwidth / 2 - fj + 4))), fq));
acc2 = _mm256_add_epi64(acc2, _mm256_mul_epu32(_mm256_cvtepu32_epi64(_mm_loadu_si128((__m128i*)(buf.tmp.ref + jj + fwidth / 2 - fj + 8))), fq));
acc3 = _mm256_add_epi64(acc3, _mm256_mul_epu32(_mm256_cvtepu32_epi64(_mm_loadu_si128((__m128i*)(buf.tmp.ref + jj + fwidth / 2 - fj + 12))), fq));
}
acc0 = _mm256_srli_epi64(acc0, 16);
acc1 = _mm256_srli_epi64(acc1, 16);
acc2 = _mm256_srli_epi64(acc2, 16);
acc3 = _mm256_srli_epi64(acc3, 16);
__m256i mask1 = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0);
// pack acc0,acc1,acc2,acc3 to acc0, acc1
acc1 = _mm256_slli_si256(acc1, 4);
acc1 = _mm256_blend_epi32(acc0, acc1, 0xAA);
acc0 = _mm256_permutevar8x32_epi32(acc1, mask1);
acc3 = _mm256_slli_si256(acc3, 4);
acc3 = _mm256_blend_epi32(acc2, acc3, 0xAA);
acc1 = _mm256_permutevar8x32_epi32(acc3, mask1);
acc0 = _mm256_sub_epi32(acc0, mu1sq_lo);
acc1 = _mm256_sub_epi32(acc1, mu1sq_hi);
_mm256_storeu_si256((__m256i*) & xx[0], acc0);
_mm256_storeu_si256((__m256i*) & xx[8], acc1);
}
// Horizontally filter the dis*dis row and subtract mu2*mu2, clamping at
// zero, leaving sigma2_sq for 16 columns in yy[].
{
__m256i rounder = _mm256_set1_epi64x(0x8000);
__m256i fq = _mm256_set1_epi64x(vif_filt[fwidth / 2]);
__m256i acc0 = _mm256_add_epi64(rounder, _mm256_mul_epu32(_mm256_cvtepu32_epi64(_mm_loadu_si128((__m128i*)(buf.tmp.dis + jj + 0))), fq));
__m256i acc1 = _mm256_add_epi64(rounder, _mm256_mul_epu32(_mm256_cvtepu32_epi64(_mm_loadu_si128((__m128i*)(buf.tmp.dis + jj + 4))), fq));
__m256i acc2 = _mm256_add_epi64(rounder, _mm256_mul_epu32(_mm256_cvtepu32_epi64(_mm_loadu_si128((__m128i*)(buf.tmp.dis + jj + 8))), fq));
__m256i acc3 = _mm256_add_epi64(rounder, _mm256_mul_epu32(_mm256_cvtepu32_epi64(_mm_loadu_si128((__m128i*)(buf.tmp.dis + jj + 12))), fq));
for (unsigned fj = 0; fj < fwidth / 2; ++fj) {
__m256i fq = _mm256_set1_epi64x(vif_filt[fj]);
acc0 = _mm256_add_epi64(acc0, _mm256_mul_epu32(_mm256_cvtepu32_epi64(_mm_loadu_si128((__m128i*)(buf.tmp.dis + jj - fwidth / 2 + fj + 0))), fq));
acc1 = _mm256_add_epi64(acc1, _mm256_mul_epu32(_mm256_cvtepu32_epi64(_mm_loadu_si128((__m128i*)(buf.tmp.dis + jj - fwidth / 2 + fj + 4))), fq));
acc2 = _mm256_add_epi64(acc2, _mm256_mul_epu32(_mm256_cvtepu32_epi64(_mm_loadu_si128((__m128i*)(buf.tmp.dis + jj - fwidth / 2 + fj + 8))), fq));
acc3 = _mm256_add_epi64(acc3, _mm256_mul_epu32(_mm256_cvtepu32_epi64(_mm_loadu_si128((__m128i*)(buf.tmp.dis + jj - fwidth / 2 + fj + 12))), fq));
acc0 = _mm256_add_epi64(acc0, _mm256_mul_epu32(_mm256_cvtepu32_epi64(_mm_loadu_si128((__m128i*)(buf.tmp.dis + jj + fwidth / 2 - fj + 0))), fq));
acc1 = _mm256_add_epi64(acc1, _mm256_mul_epu32(_mm256_cvtepu32_epi64(_mm_loadu_si128((__m128i*)(buf.tmp.dis + jj + fwidth / 2 - fj + 4))), fq));
acc2 = _mm256_add_epi64(acc2, _mm256_mul_epu32(_mm256_cvtepu32_epi64(_mm_loadu_si128((__m128i*)(buf.tmp.dis + jj + fwidth / 2 - fj + 8))), fq));
acc3 = _mm256_add_epi64(acc3, _mm256_mul_epu32(_mm256_cvtepu32_epi64(_mm_loadu_si128((__m128i*)(buf.tmp.dis + jj + fwidth / 2 - fj + 12))), fq));
}
acc0 = _mm256_srli_epi64(acc0, 16);
acc1 = _mm256_srli_epi64(acc1, 16);
acc2 = _mm256_srli_epi64(acc2, 16);
acc3 = _mm256_srli_epi64(acc3, 16);
__m256i mask1 = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0);
// pack acc0,acc1,acc2,acc3 to acc0, acc1
acc1 = _mm256_slli_si256(acc1, 4);
acc1 = _mm256_blend_epi32(acc0, acc1, 0xAA);
acc0 = _mm256_permutevar8x32_epi32(acc1, mask1);
acc3 = _mm256_slli_si256(acc3, 4);
acc3 = _mm256_blend_epi32(acc2, acc3, 0xAA);
acc1 = _mm256_permutevar8x32_epi32(acc3, mask1);
acc0 = _mm256_sub_epi32(acc0, mu2sq_lo);
acc1 = _mm256_sub_epi32(acc1, mu2sq_hi);
_mm256_storeu_si256((__m256i*) & yy[0], _mm256_max_epi32(acc0, _mm256_setzero_si256()));
_mm256_storeu_si256((__m256i*) & yy[8], _mm256_max_epi32(acc1, _mm256_setzero_si256()));
}
// Horizontally filter the ref*dis row and subtract mu1*mu2, leaving
// sigma12 for the same 16 columns in xy[].
{
__m256i rounder = _mm256_set1_epi64x(0x8000);
__m256i fq = _mm256_set1_epi64x(vif_filt[fwidth / 2]);
__m256i acc0 = _mm256_add_epi64(rounder, _mm256_mul_epu32(_mm256_cvtepu32_epi64(_mm_loadu_si128((__m128i*)(buf.tmp.ref_dis + jj + 0))), fq));
__m256i acc1 = _mm256_add_epi64(rounder, _mm256_mul_epu32(_mm256_cvtepu32_epi64(_mm_loadu_si128((__m128i*)(buf.tmp.ref_dis + jj + 4))), fq));
__m256i acc2 = _mm256_add_epi64(rounder, _mm256_mul_epu32(_mm256_cvtepu32_epi64(_mm_loadu_si128((__m128i*)(buf.tmp.ref_dis + jj + 8))), fq));
__m256i acc3 = _mm256_add_epi64(rounder, _mm256_mul_epu32(_mm256_cvtepu32_epi64(_mm_loadu_si128((__m128i*)(buf.tmp.ref_dis + jj + 12))), fq));
for (unsigned fj = 0; fj < fwidth / 2; ++fj) {
__m256i fq = _mm256_set1_epi64x(vif_filt[fj]);
acc0 = _mm256_add_epi64(acc0, _mm256_mul_epu32(_mm256_cvtepu32_epi64(_mm_loadu_si128((__m128i*)(buf.tmp.ref_dis + jj - fwidth / 2 + fj + 0))), fq));
acc1 = _mm256_add_epi64(acc1, _mm256_mul_epu32(_mm256_cvtepu32_epi64(_mm_loadu_si128((__m128i*)(buf.tmp.ref_dis + jj - fwidth / 2 + fj + 4))), fq));
acc2 = _mm256_add_epi64(acc2, _mm256_mul_epu32(_mm256_cvtepu32_epi64(_mm_loadu_si128((__m128i*)(buf.tmp.ref_dis + jj - fwidth / 2 + fj + 8))), fq));
acc3 = _mm256_add_epi64(acc3, _mm256_mul_epu32(_mm256_cvtepu32_epi64(_mm_loadu_si128((__m128i*)(buf.tmp.ref_dis + jj - fwidth / 2 + fj + 12))), fq));
acc0 = _mm256_add_epi64(acc0, _mm256_mul_epu32(_mm256_cvtepu32_epi64(_mm_loadu_si128((__m128i*)(buf.tmp.ref_dis + jj + fwidth / 2 - fj + 0))), fq));
acc1 = _mm256_add_epi64(acc1, _mm256_mul_epu32(_mm256_cvtepu32_epi64(_mm_loadu_si128((__m128i*)(buf.tmp.ref_dis + jj + fwidth / 2 - fj + 4))), fq));
acc2 = _mm256_add_epi64(acc2, _mm256_mul_epu32(_mm256_cvtepu32_epi64(_mm_loadu_si128((__m128i*)(buf.tmp.ref_dis + jj + fwidth / 2 - fj + 8))), fq));
acc3 = _mm256_add_epi64(acc3, _mm256_mul_epu32(_mm256_cvtepu32_epi64(_mm_loadu_si128((__m128i*)(buf.tmp.ref_dis + jj + fwidth / 2 - fj + 12))), fq));
}
acc0 = _mm256_srli_epi64(acc0, 16);
acc1 = _mm256_srli_epi64(acc1, 16);
acc2 = _mm256_srli_epi64(acc2, 16);
acc3 = _mm256_srli_epi64(acc3, 16);
__m256i mask1 = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0);
// pack acc0,acc1,acc2,acc3 to acc0, acc1
acc1 = _mm256_slli_si256(acc1, 4);
acc1 = _mm256_blend_epi32(acc0, acc1, 0xAA);
acc0 = _mm256_permutevar8x32_epi32(acc1, mask1);
acc3 = _mm256_slli_si256(acc3, 4);
acc3 = _mm256_blend_epi32(acc2, acc3, 0xAA);
acc1 = _mm256_permutevar8x32_epi32(acc3, mask1);
acc0 = _mm256_sub_epi32(acc0, mu1mu2_lo);
acc1 = _mm256_sub_epi32(acc1, mu1mu2_hi);
_mm256_storeu_si256((__m256i*) & xy[0], acc0);
_mm256_storeu_si256((__m256i*) & xy[8], acc1);
}
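// xx/yy/xy now hold sigma1_sq, sigma2_sq and sigma12 for 16 pixels;
// accumulate their contributions to the VIF numerator and denominator.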
for (unsigned int j = 0; j < 16; j++) {
int32_t sigma1_sq = xx[j];
int32_t sigma2_sq = yy[j];
int32_t sigma12 = xy[j];
if (sigma1_sq >= sigma_nsq) {
/**
* log2 values are read from the look-up table produced by log_generate():
* log2_table[i] = log2(i) * 2048 for i = 16384..65535, indexed by the top
* 16 bits of the argument.
* In floating point, den_val = log2(1 + sigma1_sq / 2).
* Here it is computed as log2(2 + sigma1_sq) - log2(2), i.e.
* log2(2 * 65536 + sigma1_sq) - 17 in Q16, with the result scaled by 2048.
*/
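/*
* Floating-point sketch of this branch, for reference (matches the
* formulas quoted in the comments above and below):
*   den_val = log2(1.0 + sigma1_sq / 2.0);
*   g       = sigma12 / (sigma1_sq + eps);  // g further clamped to vif_enhn_gain_limit
*   sv_sq   = sigma2_sq - g * sigma12;
*   num_val = log2(1.0 + g * g * sigma1_sq / (sv_sq + 2.0));
* The fixed-point code keeps sigma_nsq = 2 in Q16 (2 * 65536) and log2()
* scaled by 2048; the division by 2048.0 at the end of the function
* undoes that scaling.
*/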
accum_den_log += log2_32(log2_table, sigma_nsq + sigma1_sq) - 2048 * 17;
if (sigma12 > 0 && sigma2_sq > 0) {
/**
* In floating point: numerator = log2(1.0 + (g * g * sigma1_sq) / (sv_sq + sigma_nsq))
*
* In fixed point this is computed as
* numerator = log2((sv_sq + sigma_nsq) + (g * g * sigma1_sq)) - log2(sv_sq + sigma_nsq)
*/
const double eps = 65536 * 1.0e-10;
double g = sigma12 / (sigma1_sq + eps); // this epsilon can go away
int32_t sv_sq = sigma2_sq - g * sigma12;
sv_sq = (uint32_t)(MAX(sv_sq, 0));
g = MIN(g, vif_enhn_gain_limit);
uint32_t numer1 = (sv_sq + sigma_nsq);
int64_t numer1_tmp = (int64_t)((g * g * sigma1_sq)) + numer1; //numerator
accum_num_log += log2_64(log2_table, numer1_tmp) - log2_64(log2_table, numer1);
}
}
else {
accum_num_non_log += sigma2_sq;
accum_den_non_log++;
}
}
}
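// Columns beyond the last full 16-wide block are handled by the shared
// scalar residual routine.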
if ((n << 4) != w) {
VifResiduals residuals =
vif_compute_line_residuals(s, n << 4, w, scale);
accum_num_log += residuals.accum_num_log;
accum_den_log += residuals.accum_den_log;
accum_num_non_log += residuals.accum_num_non_log;
accum_den_non_log += residuals.accum_den_non_log;
}
}
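// Convert the fixed-point accumulators to the float outputs: the log terms
// carry a 2048x scale from the log table, and the non-log terms are folded
// in with the 16384 * 65025 (65025 = 255 * 255) scaling used by the
// fixed-point low-variance path.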
num[0] = accum_num_log / 2048.0 + (accum_den_non_log - ((accum_num_non_log) / 16384.0) / (65025.0));
den[0] = accum_den_log / 2048.0 + accum_den_non_log;
}