in libvmaf/src/feature/x86/vif_avx2.c [121:547]
void vif_statistic_8_avx2(struct VifPublicState *s, float *num, float *den, unsigned w, unsigned h) {
assert(vif_filter1d_width[0] == 17);
static const unsigned fwidth = 17;
const uint16_t *vif_filt_s0 = vif_filter1d_table[0];
VifBuffer buf = s->buf;
// fixed-point equivalent of 2.0 in the scale of sigma1_sq (2 * 65536)
static const int32_t sigma_nsq = 65536 << 1;
double vif_enhn_gain_limit = s->vif_enhn_gain_limit;
int64_t accum_num_log = 0;
int64_t accum_den_log = 0;
int64_t accum_num_non_log = 0;
int64_t accum_den_non_log = 0;
uint16_t *log2_table = s->log2_table;
// scratch arrays used for the 16-sample-block VIF computation
ALIGNED(32) uint32_t xx[16];
ALIGNED(32) uint32_t yy[16];
ALIGNED(32) uint32_t xy[16];
// loop on row, each iteration produces one line of output
for (unsigned i = 0; i < h; ++i) {
// Filter vertically
// Process all full blocks of 16 samples first
unsigned n = w >> 4;
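// The 17-tap filter is symmetric: the centre tap is applied once up front,
// and each remaining coefficient vif_filt_s0[tap] is shared by the row pair
// (i - fwidth/2 + tap, i + fwidth/2 - tap), halving the coefficient broadcasts.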
for (unsigned jj = 0; jj < n << 4; jj += 16) {
__m256i accum_ref_left, accum_ref_right;
__m256i accum_dis_left, accum_dis_right;
__m256i accum_ref_dis_left, accum_ref_dis_right;
__m256i accum_mu2_left, accum_mu2_right;
__m256i accum_mu1_left, accum_mu1_right;
__m256i f0 = _mm256_set1_epi16(vif_filt_s0[fwidth / 2]);
__m256i r0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*)(((uint8_t*)buf.ref) + (buf.stride * i) + jj)));
__m256i d0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*)(((uint8_t*)buf.dis) + (buf.stride * i) + jj)));
// filtered r,d
multiply2(accum_mu1_left, accum_mu1_right, r0, f0);
multiply2(accum_mu2_left, accum_mu2_right, d0, f0);
// filtered(r * r, d * d, r * d)
multiply3(accum_ref_left, accum_ref_right, r0, r0, f0);
multiply3(accum_dis_left, accum_dis_right, d0, d0, f0);
multiply3(accum_ref_dis_left, accum_ref_dis_right, d0, r0, f0);
for (unsigned int tap = 0; tap < fwidth / 2; tap++) {
int ii_check = i - fwidth / 2 + tap;
int ii_check_1 = i + fwidth / 2 - tap;
__m256i f0 = _mm256_set1_epi16(vif_filt_s0[tap]);
__m256i r0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*)(((uint8_t*)buf.ref) + (buf.stride * ii_check) + jj)));
__m256i r1 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*)(((uint8_t*)buf.ref) + (buf.stride * (ii_check_1)) + jj)));
__m256i d0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*)(((uint8_t*)buf.dis) + (buf.stride * ii_check) + jj)));
__m256i d1 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*)(((uint8_t*)buf.dis) + (buf.stride * (ii_check_1)) + jj)));
// accumulate filtered r,d
multiply2_and_accumulate(accum_mu1_left, accum_mu1_right, r0, r1, f0);
multiply2_and_accumulate(accum_mu2_left, accum_mu2_right, d0, d1, f0);
// accumulate filtered(r * r, d * d, r * d)
multiply3_and_accumulate(accum_ref_left, accum_ref_right, r0, r0, f0);
multiply3_and_accumulate(accum_ref_left, accum_ref_right, r1, r1, f0);
multiply3_and_accumulate(accum_dis_left, accum_dis_right, d0, d0, f0);
multiply3_and_accumulate(accum_dis_left, accum_dis_right, d1, d1, f0);
multiply3_and_accumulate(accum_ref_dis_left, accum_ref_dis_right, d0, r0, f0);
multiply3_and_accumulate(accum_ref_dis_left, accum_ref_dis_right, d1, r1, f0);
}
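// Round the mean accumulators and drop 8 bits so the per-column mu1/mu2
// values fit in 16 bits; the horizontal pass below can then accumulate
// coeff * mu in 32-bit lanes without overflow. ref/dis/ref_dis keep their
// full 32-bit precision (they are filtered in 64-bit lanes later).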
__m256i x = _mm256_set1_epi32(128);
accum_mu1_left = _mm256_add_epi32(accum_mu1_left, x);
accum_mu1_right = _mm256_add_epi32(accum_mu1_right, x);
accum_mu2_left = _mm256_add_epi32(accum_mu2_left, x);
accum_mu2_right = _mm256_add_epi32(accum_mu2_right, x);
accum_mu1_left = _mm256_srli_epi32(accum_mu1_left, 0x08);
accum_mu1_right = _mm256_srli_epi32(accum_mu1_right, 0x08);
accum_mu2_left = _mm256_srli_epi32(accum_mu2_left, 0x08);
accum_mu2_right = _mm256_srli_epi32(accum_mu2_right, 0x08);
shuffle_and_save(buf.tmp.mu1 + jj, accum_mu1_left, accum_mu1_right);
shuffle_and_save(buf.tmp.mu2 + jj, accum_mu2_left, accum_mu2_right);
shuffle_and_save(buf.tmp.ref + jj, accum_ref_left, accum_ref_right);
shuffle_and_save(buf.tmp.dis + jj, accum_dis_left, accum_dis_right);
shuffle_and_save(buf.tmp.ref_dis + jj, accum_ref_dis_left, accum_ref_dis_right);
}
// Then consider the remaining elements individually
for (unsigned j = n << 4; j < w; ++j) {
uint32_t accum_mu1 = 0;
uint32_t accum_mu2 = 0;
uint64_t accum_ref = 0;
uint64_t accum_dis = 0;
uint64_t accum_ref_dis = 0;
for (unsigned fi = 0; fi < fwidth; ++fi) {
int ii = i - fwidth / 2;
int ii_check = ii + fi;
const uint16_t fcoeff = vif_filt_s0[fi];
const uint8_t *ref = (uint8_t*)buf.ref;
const uint8_t *dis = (uint8_t*)buf.dis;
uint16_t imgcoeff_ref = ref[ii_check * buf.stride + j];
uint16_t imgcoeff_dis = dis[ii_check * buf.stride + j];
uint32_t img_coeff_ref = fcoeff * (uint32_t)imgcoeff_ref;
uint32_t img_coeff_dis = fcoeff * (uint32_t)imgcoeff_dis;
accum_mu1 += img_coeff_ref;
accum_mu2 += img_coeff_dis;
accum_ref += img_coeff_ref * (uint64_t)imgcoeff_ref;
accum_dis += img_coeff_dis * (uint64_t)imgcoeff_dis;
accum_ref_dis += img_coeff_ref * (uint64_t)imgcoeff_dis;
}
buf.tmp.mu1[j] = (accum_mu1 + 128) >> 8;
buf.tmp.mu2[j] = (accum_mu2 + 128) >> 8;
buf.tmp.ref[j] = accum_ref;
buf.tmp.dis[j] = accum_dis;
buf.tmp.ref_dis[j] = accum_ref_dis;
}
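// Pad fwidth/2 samples on each side of the temporary rows so the horizontal
// filter below can safely read past both row ends.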
PADDING_SQ_DATA(buf, w, fwidth / 2);
// HORIZONTAL: filter the vertically filtered row stored in buf.tmp
for (unsigned j = 0; j < n << 4; j += 16) {
__m256i mu1_lo;
__m256i mu1_hi;
__m256i mu1sq_lo; // shuffled
__m256i mu1sq_hi; // shuffled
__m256i mu2sq_lo; // shuffled
__m256i mu2sq_hi; // shuffled
__m256i mu1mu2_lo; // shuffled
__m256i mu1mu2_hi; // shuffled
// compute horizontally filtered mu1 and its square mu1*mu1
{
__m256i fq = _mm256_set1_epi32(vif_filt_s0[fwidth / 2]);
mu1_lo = _mm256_mullo_epi32(_mm256_loadu_si256((__m256i*)(buf.tmp.mu1 + j + 0)), fq);
mu1_hi = _mm256_mullo_epi32(_mm256_loadu_si256((__m256i*)(buf.tmp.mu1 + j + 8)), fq);
for (unsigned fj = 0; fj < fwidth / 2; ++fj) {
__m256i fq = _mm256_set1_epi32(vif_filt_s0[fj]);
mu1_lo = _mm256_add_epi64(mu1_lo, _mm256_mullo_epi32(_mm256_loadu_si256((__m256i*)(buf.tmp.mu1 + j - fwidth / 2 + fj + 0)), fq));
mu1_hi = _mm256_add_epi64(mu1_hi, _mm256_mullo_epi32(_mm256_loadu_si256((__m256i*)(buf.tmp.mu1 + j - fwidth / 2 + fj + 8)), fq));
mu1_lo = _mm256_add_epi64(mu1_lo, _mm256_mullo_epi32(_mm256_loadu_si256((__m256i*)(buf.tmp.mu1 + j + fwidth / 2 - fj + 0)), fq));
mu1_hi = _mm256_add_epi64(mu1_hi, _mm256_mullo_epi32(_mm256_loadu_si256((__m256i*)(buf.tmp.mu1 + j + fwidth / 2 - fj + 8)), fq));
}
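// The unpack/mul/shift pairs below compute the rounded top 32 bits of the
// 64-bit square, (mu1 * mu1 + 0x80000000) >> 32, leaving mu1sq in an
// even/odd-interleaved ("shuffled") lane order that the later
// _mm256_shuffle_epi32(..., 0xD8) undoes before storing.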
__m256i acc0_lo = _mm256_unpacklo_epi32(mu1_lo, _mm256_setzero_si256());
__m256i acc0_hi = _mm256_unpackhi_epi32(mu1_lo, _mm256_setzero_si256());
acc0_lo = _mm256_mul_epu32(acc0_lo, acc0_lo);
acc0_hi = _mm256_mul_epu32(acc0_hi, acc0_hi);
acc0_lo = _mm256_srli_epi64(_mm256_add_epi64(acc0_lo, _mm256_set1_epi64x(0x80000000)), 32);
acc0_hi = _mm256_srli_epi64(_mm256_add_epi64(acc0_hi, _mm256_set1_epi64x(0x80000000)), 32);
__m256i acc1_lo = _mm256_unpacklo_epi32(mu1_hi, _mm256_setzero_si256());
__m256i acc1_hi = _mm256_unpackhi_epi32(mu1_hi, _mm256_setzero_si256());
acc1_lo = _mm256_mul_epu32(acc1_lo, acc1_lo);
acc1_hi = _mm256_mul_epu32(acc1_hi, acc1_hi);
acc1_lo = _mm256_srli_epi64(_mm256_add_epi64(acc1_lo, _mm256_set1_epi64x(0x80000000)), 32);
acc1_hi = _mm256_srli_epi64(_mm256_add_epi64(acc1_hi, _mm256_set1_epi64x(0x80000000)), 32);
__m256i acc0_sq = _mm256_blend_epi32(acc0_lo, _mm256_slli_si256(acc0_hi, 4), 0xAA);
__m256i acc1_sq = _mm256_blend_epi32(acc1_lo, _mm256_slli_si256(acc1_hi, 4), 0xAA);
mu1sq_lo = acc0_sq;
mu1sq_hi = acc1_sq;
}
// compute horizontally filtered mu2, its square mu2*mu2, and the product mu1*mu2
{
__m256i fq = _mm256_set1_epi32(vif_filt_s0[fwidth / 2]);
__m256i acc0 = _mm256_mullo_epi32(_mm256_loadu_si256((__m256i*)(buf.tmp.mu2 + j + 0)), fq);
__m256i acc1 = _mm256_mullo_epi32(_mm256_loadu_si256((__m256i*)(buf.tmp.mu2 + j + 8)), fq);
for (unsigned fj = 0; fj < fwidth / 2; ++fj) {
__m256i fq = _mm256_set1_epi32(vif_filt_s0[fj]);
acc0 = _mm256_add_epi64(acc0, _mm256_mullo_epi32(_mm256_loadu_si256((__m256i*)(buf.tmp.mu2 + j - fwidth / 2 + fj + 0)), fq));
acc1 = _mm256_add_epi64(acc1, _mm256_mullo_epi32(_mm256_loadu_si256((__m256i*)(buf.tmp.mu2 + j - fwidth / 2 + fj + 8)), fq));
acc0 = _mm256_add_epi64(acc0, _mm256_mullo_epi32(_mm256_loadu_si256((__m256i*)(buf.tmp.mu2 + j + fwidth / 2 - fj + 0)), fq));
acc1 = _mm256_add_epi64(acc1, _mm256_mullo_epi32(_mm256_loadu_si256((__m256i*)(buf.tmp.mu2 + j + fwidth / 2 - fj + 8)), fq));
}
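// Same rounded-high-half trick as for mu1sq: mu2sq = (mu2 * mu2 + 2^31) >> 32
// and mu1mu2 = (mu1 * mu2 + 2^31) >> 32, produced in the same shuffled lane
// order so they can be subtracted directly from the filtered sums below.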
__m256i acc0_lo = _mm256_unpacklo_epi32(acc0, _mm256_setzero_si256());
__m256i acc0_hi = _mm256_unpackhi_epi32(acc0, _mm256_setzero_si256());
__m256i mu1lo_lo = _mm256_unpacklo_epi32(mu1_lo, _mm256_setzero_si256());
__m256i mu1lo_hi = _mm256_unpackhi_epi32(mu1_lo, _mm256_setzero_si256());
__m256i mu1hi_lo = _mm256_unpacklo_epi32(mu1_hi, _mm256_setzero_si256());
__m256i mu1hi_hi = _mm256_unpackhi_epi32(mu1_hi, _mm256_setzero_si256());
mu1lo_lo = _mm256_mul_epu32(mu1lo_lo, acc0_lo);
mu1lo_hi = _mm256_mul_epu32(mu1lo_hi, acc0_hi);
mu1lo_lo = _mm256_srli_epi64(_mm256_add_epi64(mu1lo_lo, _mm256_set1_epi64x(0x80000000)), 32);
mu1lo_hi = _mm256_srli_epi64(_mm256_add_epi64(mu1lo_hi, _mm256_set1_epi64x(0x80000000)), 32);
acc0_lo = _mm256_mul_epu32(acc0_lo, acc0_lo);
acc0_hi = _mm256_mul_epu32(acc0_hi, acc0_hi);
acc0_lo = _mm256_srli_epi64(_mm256_add_epi64(acc0_lo, _mm256_set1_epi64x(0x80000000)), 32);
acc0_hi = _mm256_srli_epi64(_mm256_add_epi64(acc0_hi, _mm256_set1_epi64x(0x80000000)), 32);
__m256i acc1_lo = _mm256_unpacklo_epi32(acc1, _mm256_setzero_si256());
__m256i acc1_hi = _mm256_unpackhi_epi32(acc1, _mm256_setzero_si256());
mu1hi_lo = _mm256_mul_epu32(mu1hi_lo, acc1_lo);
mu1hi_hi = _mm256_mul_epu32(mu1hi_hi, acc1_hi);
mu1hi_lo = _mm256_srli_epi64(_mm256_add_epi64(mu1hi_lo, _mm256_set1_epi64x(0x80000000)), 32);
mu1hi_hi = _mm256_srli_epi64(_mm256_add_epi64(mu1hi_hi, _mm256_set1_epi64x(0x80000000)), 32);
acc1_lo = _mm256_mul_epu32(acc1_lo, acc1_lo);
acc1_hi = _mm256_mul_epu32(acc1_hi, acc1_hi);
acc1_lo = _mm256_srli_epi64(_mm256_add_epi64(acc1_lo, _mm256_set1_epi64x(0x80000000)), 32);
acc1_hi = _mm256_srli_epi64(_mm256_add_epi64(acc1_hi, _mm256_set1_epi64x(0x80000000)), 32);
mu2sq_lo = _mm256_blend_epi32(acc0_lo, _mm256_slli_si256(acc0_hi, 4), 0xAA);
mu2sq_hi = _mm256_blend_epi32(acc1_lo, _mm256_slli_si256(acc1_hi, 4), 0xAA);
mu1mu2_lo = _mm256_blend_epi32(mu1lo_lo, _mm256_slli_si256(mu1lo_hi, 4), 0xAA);
mu1mu2_hi = _mm256_blend_epi32(mu1hi_lo, _mm256_slli_si256(mu1hi_hi, 4), 0xAA);
}
// compute xx, that is refsq filtered - mu1 * mu1 (sigma1_sq)
{
__m256i rounder = _mm256_set1_epi64x(0x8000);
__m256i fq = _mm256_set1_epi64x(vif_filt_s0[fwidth / 2]);
__m256i m0 = _mm256_loadu_si256((__m256i*)(buf.tmp.ref + j + 0));
__m256i m1 = _mm256_loadu_si256((__m256i*)(buf.tmp.ref + j + 8));
__m256i acc0 = _mm256_add_epi64(rounder, _mm256_mul_epu32(_mm256_unpacklo_epi32(m0, _mm256_setzero_si256()), fq));
__m256i acc1 = _mm256_add_epi64(rounder, _mm256_mul_epu32(_mm256_unpackhi_epi32(m0, _mm256_setzero_si256()), fq));
__m256i acc2 = _mm256_add_epi64(rounder, _mm256_mul_epu32(_mm256_unpacklo_epi32(m1, _mm256_setzero_si256()), fq));
__m256i acc3 = _mm256_add_epi64(rounder, _mm256_mul_epu32(_mm256_unpackhi_epi32(m1, _mm256_setzero_si256()), fq));
for (unsigned fj = 0; fj < fwidth / 2; ++fj) {
__m256i fq = _mm256_set1_epi64x(vif_filt_s0[fj]);
__m256i m0 = _mm256_loadu_si256((__m256i*)(buf.tmp.ref + j - fwidth / 2 + fj + 0));
__m256i m1 = _mm256_loadu_si256((__m256i*)(buf.tmp.ref + j - fwidth / 2 + fj + 8));
__m256i m2 = _mm256_loadu_si256((__m256i*)(buf.tmp.ref + j + fwidth / 2 - fj + 0));
__m256i m3 = _mm256_loadu_si256((__m256i*)(buf.tmp.ref + j + fwidth / 2 - fj + 8));
acc0 = _mm256_add_epi64(acc0, _mm256_mul_epu32(_mm256_unpacklo_epi32(m0, _mm256_setzero_si256()), fq));
acc1 = _mm256_add_epi64(acc1, _mm256_mul_epu32(_mm256_unpackhi_epi32(m0, _mm256_setzero_si256()), fq));
acc2 = _mm256_add_epi64(acc2, _mm256_mul_epu32(_mm256_unpacklo_epi32(m1, _mm256_setzero_si256()), fq));
acc3 = _mm256_add_epi64(acc3, _mm256_mul_epu32(_mm256_unpackhi_epi32(m1, _mm256_setzero_si256()), fq));
acc0 = _mm256_add_epi64(acc0, _mm256_mul_epu32(_mm256_unpacklo_epi32(m2, _mm256_setzero_si256()), fq));
acc1 = _mm256_add_epi64(acc1, _mm256_mul_epu32(_mm256_unpackhi_epi32(m2, _mm256_setzero_si256()), fq));
acc2 = _mm256_add_epi64(acc2, _mm256_mul_epu32(_mm256_unpacklo_epi32(m3, _mm256_setzero_si256()), fq));
acc3 = _mm256_add_epi64(acc3, _mm256_mul_epu32(_mm256_unpackhi_epi32(m3, _mm256_setzero_si256()), fq));
}
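// Round the 64-bit filtered sums to 32 bits ((acc + 0x8000) >> 16), repack
// them into the shuffled lane order used for mu1sq, subtract mu1*mu1 and
// restore the natural order before storing sigma1_sq into xx[].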
acc0 = _mm256_srli_epi64(acc0, 16);
acc1 = _mm256_srli_epi64(acc1, 16);
acc2 = _mm256_srli_epi64(acc2, 16);
acc3 = _mm256_srli_epi64(acc3, 16);
acc0 = _mm256_blend_epi32(acc0, _mm256_slli_si256(acc1, 4), 0xAA);
acc1 = _mm256_blend_epi32(acc2, _mm256_slli_si256(acc3, 4), 0xAA);
//mu1sq is shuffled
acc0 = _mm256_sub_epi32(acc0, mu1sq_lo);
acc1 = _mm256_sub_epi32(acc1, mu1sq_hi);
acc0 = _mm256_shuffle_epi32(acc0, 0xD8);
acc1 = _mm256_shuffle_epi32(acc1, 0xD8);
_mm256_storeu_si256((__m256i*)& xx[0], acc0);
_mm256_storeu_si256((__m256i*)& xx[8], acc1);
}
// compute yy, that is dissq filtered - mu2 * mu2 (sigma2_sq)
{
__m256i rounder = _mm256_set1_epi64x(0x8000);
__m256i fq = _mm256_set1_epi64x(vif_filt_s0[fwidth / 2]);
__m256i m0 = _mm256_loadu_si256((__m256i*)(buf.tmp.dis + j + 0));
__m256i m1 = _mm256_loadu_si256((__m256i*)(buf.tmp.dis + j + 8));
__m256i acc0 = _mm256_add_epi64(rounder, _mm256_mul_epu32(_mm256_unpacklo_epi32(m0, _mm256_setzero_si256()), fq));
__m256i acc1 = _mm256_add_epi64(rounder, _mm256_mul_epu32(_mm256_unpackhi_epi32(m0, _mm256_setzero_si256()), fq));
__m256i acc2 = _mm256_add_epi64(rounder, _mm256_mul_epu32(_mm256_unpacklo_epi32(m1, _mm256_setzero_si256()), fq));
__m256i acc3 = _mm256_add_epi64(rounder, _mm256_mul_epu32(_mm256_unpackhi_epi32(m1, _mm256_setzero_si256()), fq));
for (unsigned fj = 0; fj < fwidth / 2; ++fj) {
__m256i fq = _mm256_set1_epi64x(vif_filt_s0[fj]);
__m256i m0 = _mm256_loadu_si256((__m256i*)(buf.tmp.dis + j - fwidth / 2 + fj + 0));
__m256i m1 = _mm256_loadu_si256((__m256i*)(buf.tmp.dis + j - fwidth / 2 + fj + 8));
__m256i m2 = _mm256_loadu_si256((__m256i*)(buf.tmp.dis + j + fwidth / 2 - fj + 0));
__m256i m3 = _mm256_loadu_si256((__m256i*)(buf.tmp.dis + j + fwidth / 2 - fj + 8));
acc0 = _mm256_add_epi64(acc0, _mm256_mul_epu32(_mm256_unpacklo_epi32(m0, _mm256_setzero_si256()), fq));
acc1 = _mm256_add_epi64(acc1, _mm256_mul_epu32(_mm256_unpackhi_epi32(m0, _mm256_setzero_si256()), fq));
acc2 = _mm256_add_epi64(acc2, _mm256_mul_epu32(_mm256_unpacklo_epi32(m1, _mm256_setzero_si256()), fq));
acc3 = _mm256_add_epi64(acc3, _mm256_mul_epu32(_mm256_unpackhi_epi32(m1, _mm256_setzero_si256()), fq));
acc0 = _mm256_add_epi64(acc0, _mm256_mul_epu32(_mm256_unpacklo_epi32(m2, _mm256_setzero_si256()), fq));
acc1 = _mm256_add_epi64(acc1, _mm256_mul_epu32(_mm256_unpackhi_epi32(m2, _mm256_setzero_si256()), fq));
acc2 = _mm256_add_epi64(acc2, _mm256_mul_epu32(_mm256_unpacklo_epi32(m3, _mm256_setzero_si256()), fq));
acc3 = _mm256_add_epi64(acc3, _mm256_mul_epu32(_mm256_unpackhi_epi32(m3, _mm256_setzero_si256()), fq));
}
acc0 = _mm256_srli_epi64(acc0, 16);
acc1 = _mm256_srli_epi64(acc1, 16);
acc2 = _mm256_srli_epi64(acc2, 16);
acc3 = _mm256_srli_epi64(acc3, 16);
acc0 = _mm256_blend_epi32(acc0, _mm256_slli_si256(acc1, 4), 0xAA);
acc1 = _mm256_blend_epi32(acc2, _mm256_slli_si256(acc3, 4), 0xAA);
//mu2sq is already shuffled
acc0 = _mm256_sub_epi32(acc0, mu2sq_lo);
acc1 = _mm256_sub_epi32(acc1, mu2sq_hi);
acc0 = _mm256_shuffle_epi32(acc0, 0xD8);
acc1 = _mm256_shuffle_epi32(acc1, 0xD8);
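// sigma2_sq can come out slightly negative because of fixed-point rounding,
// so clamp it at zero before storing.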
_mm256_storeu_si256((__m256i*) & yy[0], _mm256_max_epi32(acc0, _mm256_setzero_si256()));
_mm256_storeu_si256((__m256i*) & yy[8], _mm256_max_epi32(acc1, _mm256_setzero_si256()));
}
// compute xy, that is ref*dis filtered - mu1 * mu2 (sigma12)
{
__m256i rounder = _mm256_set1_epi64x(0x8000);
__m256i fq = _mm256_set1_epi64x(vif_filt_s0[fwidth / 2]);
__m256i m0 = _mm256_loadu_si256((__m256i*)(buf.tmp.ref_dis + j + 0));
__m256i m1 = _mm256_loadu_si256((__m256i*)(buf.tmp.ref_dis + j + 8));
__m256i acc0 = _mm256_add_epi64(rounder, _mm256_mul_epu32(_mm256_unpacklo_epi32(m0, _mm256_setzero_si256()), fq));
__m256i acc1 = _mm256_add_epi64(rounder, _mm256_mul_epu32(_mm256_unpackhi_epi32(m0, _mm256_setzero_si256()), fq));
__m256i acc2 = _mm256_add_epi64(rounder, _mm256_mul_epu32(_mm256_unpacklo_epi32(m1, _mm256_setzero_si256()), fq));
__m256i acc3 = _mm256_add_epi64(rounder, _mm256_mul_epu32(_mm256_unpackhi_epi32(m1, _mm256_setzero_si256()), fq));
for (unsigned fj = 0; fj < fwidth / 2; ++fj) {
__m256i fq = _mm256_set1_epi64x(vif_filt_s0[fj]);
__m256i m0 = _mm256_loadu_si256((__m256i*)(buf.tmp.ref_dis + j - fwidth / 2 + fj + 0));
__m256i m1 = _mm256_loadu_si256((__m256i*)(buf.tmp.ref_dis + j - fwidth / 2 + fj + 8));
__m256i m2 = _mm256_loadu_si256((__m256i*)(buf.tmp.ref_dis + j + fwidth / 2 - fj + 0));
__m256i m3 = _mm256_loadu_si256((__m256i*)(buf.tmp.ref_dis + j + fwidth / 2 - fj + 8));
acc0 = _mm256_add_epi64(acc0, _mm256_mul_epu32(_mm256_unpacklo_epi32(m0, _mm256_setzero_si256()), fq));
acc1 = _mm256_add_epi64(acc1, _mm256_mul_epu32(_mm256_unpackhi_epi32(m0, _mm256_setzero_si256()), fq));
acc2 = _mm256_add_epi64(acc2, _mm256_mul_epu32(_mm256_unpacklo_epi32(m1, _mm256_setzero_si256()), fq));
acc3 = _mm256_add_epi64(acc3, _mm256_mul_epu32(_mm256_unpackhi_epi32(m1, _mm256_setzero_si256()), fq));
acc0 = _mm256_add_epi64(acc0, _mm256_mul_epu32(_mm256_unpacklo_epi32(m2, _mm256_setzero_si256()), fq));
acc1 = _mm256_add_epi64(acc1, _mm256_mul_epu32(_mm256_unpackhi_epi32(m2, _mm256_setzero_si256()), fq));
acc2 = _mm256_add_epi64(acc2, _mm256_mul_epu32(_mm256_unpacklo_epi32(m3, _mm256_setzero_si256()), fq));
acc3 = _mm256_add_epi64(acc3, _mm256_mul_epu32(_mm256_unpackhi_epi32(m3, _mm256_setzero_si256()), fq));
}
acc0 = _mm256_srli_epi64(acc0, 16);
acc1 = _mm256_srli_epi64(acc1, 16);
acc2 = _mm256_srli_epi64(acc2, 16);
acc3 = _mm256_srli_epi64(acc3, 16);
acc0 = _mm256_blend_epi32(acc0, _mm256_slli_si256(acc1, 4), 0xAA);
acc1 = _mm256_blend_epi32(acc2, _mm256_slli_si256(acc3, 4), 0xAA);
//mu1mu2 is already shuffled
acc0 = _mm256_sub_epi32(acc0, mu1mu2_lo);
acc1 = _mm256_sub_epi32(acc1, mu1mu2_hi);
acc0 = _mm256_shuffle_epi32(acc0, 0xD8);
acc1 = _mm256_shuffle_epi32(acc1, 0xD8);
_mm256_storeu_si256((__m256i*) & xy[0], acc0);
_mm256_storeu_si256((__m256i*) & xy[8], acc1);
}
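// Accumulate the per-sample VIF terms for this 16-sample block:
// xx = sigma1_sq, yy = sigma2_sq, xy = sigma12.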
for (unsigned int b = 0; b < 16; b++) {
int32_t sigma1_sq = xx[b];
int32_t sigma2_sq = yy[b];
int32_t sigma12 = xy[b];
if (sigma1_sq >= sigma_nsq) {
/**
* log values are taken from the look-up table generated by
* log_generate(), which is called from integer_combo_threadfunc.
* In float the denominator term is log2(1 + sigma1_sq / 2); here it is
* computed as log2(2 + sigma1_sq) - log2(2), i.e.
* log2(2 * 65536 + sigma1_sq) - 17 in the fixed-point scale.
* The table stores log2(i) * 2048 for i = 16384..65535, so every log
* value is scaled by 2048 and only the top 16 bits of the argument
* are used for the lookup.
*/
accum_den_log += log2_32(log2_table, sigma_nsq + sigma1_sq) - 2048 * 17;
if (sigma12 > 0 && sigma2_sq > 0)
{
// num_val = log2f(1.0f + (g * g * sigma1_sq) / (sv_sq + sigma_nsq));
/**
* In floating point:
*   numerator = log2(1.0f + (g * g * sigma1_sq) / (sv_sq + sigma_nsq))
* In fixed point this becomes
*   numerator = log2((sv_sq + sigma_nsq) + (g * g * sigma1_sq)) - log2(sv_sq + sigma_nsq)
*/
const double eps = 65536 * 1.0e-10;
double g = sigma12 / (sigma1_sq + eps); // this epsilon can go away
int32_t sv_sq = sigma2_sq - g * sigma12;
sv_sq = (uint32_t)(MAX(sv_sq, 0));
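// Clamp the gain so that enhancement of the distorted frame cannot increase
// the numerator beyond the configured limit (vif_enhn_gain_limit, the knob
// the NEG variant tightens).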
g = MIN(g, vif_enhn_gain_limit);
uint32_t numer1 = (sv_sq + sigma_nsq);
int64_t numer1_tmp = (int64_t)((g * g * sigma1_sq)) + numer1; //numerator
accum_num_log += log2_64(log2_table, numer1_tmp) - log2_64(log2_table, numer1);
}
}
else {
accum_num_non_log += sigma2_sq;
accum_den_non_log++;
}
}
}
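// Columns not covered by the 16-wide SIMD loop: the horizontal pass and the
// accumulation for columns [n * 16, w) are handled by the scalar helper.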
if ((n << 4) != w) {
VifResiduals residuals = vif_compute_line_residuals(s, n << 4, w, 0);
accum_num_log += residuals.accum_num_log;
accum_den_log += residuals.accum_den_log;
accum_num_non_log += residuals.accum_num_non_log;
accum_den_non_log += residuals.accum_den_non_log;
}
}
//the log accumulators are divided by 2048 because log_value = log2(i) * 2048 for i = 16384..65535
//num[0] = accum_num_log / 2048.0 + (accum_den_non_log - (accum_num_non_log / 65536.0) / (255.0*255.0));
//den[0] = accum_den_log / 2048.0 + accum_den_non_log;
//changed calculation to increase performance
num[0] = accum_num_log / 2048.0 + (accum_den_non_log - ((accum_num_non_log) / 16384.0) / (65025.0));
den[0] = accum_den_log / 2048.0 + accum_den_non_log;
}