in libvmaf/src/feature/x86/vif_avx512.c [387:757]
void vif_statistic_16_avx512(struct VifPublicState *s, float *num, float *den, unsigned w, unsigned h, int bpc, int scale) {
const unsigned fwidth = vif_filter1d_width[scale];
const uint16_t *vif_filt = vif_filter1d_table[scale];
VifBuffer buf = s->buf;
const ptrdiff_t stride = buf.stride / sizeof(uint16_t);
int fwidth_half = fwidth >> 1;
int32_t add_shift_round_VP, shift_VP;
int32_t add_shift_round_VP_sq, shift_VP_sq;
const uint16_t *log2_table = s->log2_table;
double vif_enhn_gain_limit = s->vif_enhn_gain_limit;
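// mask2 gathers the even (low) 32-bit lanes from a pair of vectors of 64-bit
// values; it is used below to narrow shifted 64-bit sums back to 32 bits.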
__m512i mask2 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
Residuals512 residuals;
residuals.maccum_den_log = _mm512_setzero_si512();
residuals.maccum_num_log = _mm512_setzero_si512();
residuals.maccum_den_non_log = _mm512_setzero_si512();
residuals.maccum_num_non_log = _mm512_setzero_si512();
int64_t accum_num_log = 0;
int64_t accum_den_log = 0;
int64_t accum_num_non_log = 0;
int64_t accum_den_non_log = 0;
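// Rounding offsets and shifts that renormalize the vertical filter outputs:
// at scale 0 the inputs are raw bpc-bit pixels, at coarser scales they are
// already 16-bit filtered values.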
if (scale == 0)
{
shift_VP = bpc;
add_shift_round_VP = 1 << (bpc - 1);
shift_VP_sq = (bpc - 8) * 2;
add_shift_round_VP_sq = (bpc == 8) ? 0 : 1 << (shift_VP_sq - 1);
}
else
{
shift_VP = 16;
add_shift_round_VP = 32768;
shift_VP_sq = 16;
add_shift_round_VP_sq = 32768;
}
__m512i addnum64 = _mm512_set1_epi64(add_shift_round_VP_sq);
__m512i addnum = _mm512_set1_epi32(add_shift_round_VP);
uint16_t *ref = buf.ref;
uint16_t *dis = buf.dis;
for (unsigned i = 0; i < h; ++i)
{
// VERTICAL pass: convolve the 1-D filter down the columns, producing one row
// of mu1, mu2 and the second-moment sums (ref*ref, dis*dis, ref*dis) in buf.tmp.
int ii = i - fwidth_half;
int n = w >> 5;
for (int j = 0; j < n << 5; j = j + 32)
{
__m512i mask3 = _mm512_set_epi64(11, 10, 3, 2, 9, 8, 1, 0);   // low half of the two unpacked vectors, in ascending element order
__m512i mask4 = _mm512_set_epi64(15, 14, 7, 6, 13, 12, 5, 4); // high half of the two unpacked vectors, in ascending element order
int ii_check = ii;
__m512i accumr_lo, accumr_hi, accumd_lo, accumd_hi, rmul1, rmul2,
dmul1, dmul2, accumref1, accumref2, accumref3, accumref4,
accumrefdis1, accumrefdis2, accumrefdis3, accumrefdis4,
accumdis1, accumdis2, accumdis3, accumdis4;
accumr_lo = accumr_hi = accumd_lo = accumd_hi = _mm512_setzero_si512();
rmul1 = rmul2 = dmul1 = dmul2 = _mm512_setzero_si512();
accumref1 = accumref2 = accumref3 = accumref4 = _mm512_setzero_si512();
accumrefdis1 = accumrefdis2 = accumrefdis3 = accumrefdis4 = _mm512_setzero_si512();
accumdis1 = accumdis2 = accumdis3 = accumdis4 = _mm512_setzero_si512();
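// Vertical 1-D filter over 32 columns at once. Per column j this computes
// the same sums as the scalar fallback below:
//   accum_mu1     += fcoeff * ref[ii_check * stride + j]
//   accum_mu2     += fcoeff * dis[ii_check * stride + j]
//   accum_ref     += fcoeff * ref[ii_check * stride + j] * ref[ii_check * stride + j]
//   accum_dis     += fcoeff * dis[ii_check * stride + j] * dis[ii_check * stride + j]
//   accum_ref_dis += fcoeff * ref[ii_check * stride + j] * dis[ii_check * stride + j]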
for (unsigned fi = 0; fi < fwidth; ++fi, ii_check = ii + fi)
{
const uint16_t fcoeff = vif_filt[fi];
__m512i f1 = _mm512_set1_epi16(fcoeff);
__m512i ref1 = _mm512_loadu_si512(
(__m512i*)(ref + (ii_check * stride) + j));
__m512i dis1 = _mm512_loadu_si512(
(__m512i*)(dis + (ii_check * stride) + j));
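// Widen the 16x16-bit products to 32 bits: mulhi/mullo produce the high and
// low halves, unpacklo/unpackhi interleave them within each 128-bit lane,
// and mask3/mask4 restore ascending element order.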
__m512i result2 = _mm512_mulhi_epu16(ref1, f1);
__m512i result2lo = _mm512_mullo_epi16(ref1, f1);
__m512i rmult1 = _mm512_unpacklo_epi16(result2lo, result2);
__m512i rmult2 = _mm512_unpackhi_epi16(result2lo, result2);
rmul1 = _mm512_permutex2var_epi64(rmult1, mask3, rmult2);
rmul2 = _mm512_permutex2var_epi64(rmult1, mask4, rmult2);
accumr_lo = _mm512_add_epi32(accumr_lo, rmul1);
accumr_hi = _mm512_add_epi32(accumr_hi, rmul2);
__m512i d0 = _mm512_mulhi_epu16(dis1, f1);
__m512i d0lo = _mm512_mullo_epi16(dis1, f1);
__m512i dmult1 = _mm512_unpacklo_epi16(d0lo, d0);
__m512i dmult2 = _mm512_unpackhi_epi16(d0lo, d0);
dmul1 = _mm512_permutex2var_epi64(dmult1, mask3, dmult2);
dmul2 = _mm512_permutex2var_epi64(dmult1, mask4, dmult2);
accumd_lo = _mm512_add_epi32(accumd_lo, dmul1);
accumd_hi = _mm512_add_epi32(accumd_hi, dmul2);
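// For the second moments, widen again to 64 bits: sg0..sg3 hold the
// fcoeff*ref products and sd0..sd3 the fcoeff*dis products as 64-bit lanes;
// 32x32 -> 64 multiplies against the zero-extended source pixels accumulate
// the ref*ref, ref*dis and dis*dis sums.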
__m512i sg0 = _mm512_cvtepu32_epi64(_mm512_castsi512_si256(rmul1));
__m512i sg1 = _mm512_cvtepu32_epi64(_mm512_extracti64x4_epi64(rmul1, 1));
__m512i sg2 = _mm512_cvtepu32_epi64(_mm512_castsi512_si256(rmul2));
__m512i sg3 = _mm512_cvtepu32_epi64(_mm512_extracti64x4_epi64(rmul2, 1));
__m128i l0 = _mm512_castsi512_si128(ref1);
__m128i l1 = _mm512_extracti32x4_epi32(ref1, 1);
__m128i l2 = _mm512_extracti32x4_epi32(ref1, 2);
__m128i l3 = _mm512_extracti32x4_epi32(ref1, 3);
accumref1 = _mm512_add_epi64(accumref1,
_mm512_mul_epu32(sg0, _mm512_cvtepu16_epi64(l0)));
accumref2 = _mm512_add_epi64(accumref2,
_mm512_mul_epu32(sg2, _mm512_cvtepu16_epi64(l2)));
accumref3 = _mm512_add_epi64(accumref3,
_mm512_mul_epu32(sg1, _mm512_cvtepu16_epi64(l1)));
accumref4 = _mm512_add_epi64(accumref4,
_mm512_mul_epu32(sg3, _mm512_cvtepu16_epi64(l3)));
l0 = _mm512_castsi512_si128(dis1);
l1 = _mm512_extracti32x4_epi32(dis1, 1);
l2 = _mm512_extracti32x4_epi32(dis1, 2);
l3 = _mm512_extracti32x4_epi32(dis1, 3);
accumrefdis1 = _mm512_add_epi64(accumrefdis1,
_mm512_mul_epu32(sg0, _mm512_cvtepu16_epi64(l0)));
accumrefdis2 = _mm512_add_epi64(accumrefdis2,
_mm512_mul_epu32(sg2, _mm512_cvtepu16_epi64(l2)));
accumrefdis3 = _mm512_add_epi64(accumrefdis3,
_mm512_mul_epu32(sg1, _mm512_cvtepu16_epi64(l1)));
accumrefdis4 = _mm512_add_epi64(accumrefdis4,
_mm512_mul_epu32(sg3, _mm512_cvtepu16_epi64(l3)));
__m512i sd0 = _mm512_cvtepu32_epi64(_mm512_castsi512_si256(dmul1));
__m512i sd1 = _mm512_cvtepu32_epi64(_mm512_extracti64x4_epi64(dmul1, 1));
__m512i sd2 = _mm512_cvtepu32_epi64(_mm512_castsi512_si256(dmul2));
__m512i sd3 = _mm512_cvtepu32_epi64(_mm512_extracti64x4_epi64(dmul2, 1));
accumdis1 = _mm512_add_epi64(accumdis1,
_mm512_mul_epu32(sd0, _mm512_cvtepu16_epi64(l0)));
accumdis2 = _mm512_add_epi64(accumdis2,
_mm512_mul_epu32(sd2, _mm512_cvtepu16_epi64(l2)));
accumdis3 = _mm512_add_epi64(accumdis3,
_mm512_mul_epu32(sd1, _mm512_cvtepu16_epi64(l1)));
accumdis4 = _mm512_add_epi64(accumdis4,
_mm512_mul_epu32(sd3, _mm512_cvtepu16_epi64(l3)));
}
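// Add the rounding offset, shift back to working precision, and store one
// full row of vertical results; mask2 narrows the 64-bit second-moment sums
// to 32 bits on the way out.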
accumr_lo = _mm512_add_epi32(accumr_lo, addnum);
accumr_hi = _mm512_add_epi32(accumr_hi, addnum);
accumr_lo = _mm512_srli_epi32(accumr_lo, shift_VP);
accumr_hi = _mm512_srli_epi32(accumr_hi, shift_VP);
_mm512_storeu_si512((__m512i*)(buf.tmp.mu1 + j), accumr_lo);
_mm512_storeu_si512((__m512i*)(buf.tmp.mu1 + j + 16), accumr_hi);
accumd_lo = _mm512_add_epi32(accumd_lo, addnum);
accumd_hi = _mm512_add_epi32(accumd_hi, addnum);
accumd_lo = _mm512_srli_epi32(accumd_lo, shift_VP);
accumd_hi = _mm512_srli_epi32(accumd_hi, shift_VP);
_mm512_storeu_si512((__m512i*)(buf.tmp.mu2 + j), accumd_lo);
_mm512_storeu_si512((__m512i*)(buf.tmp.mu2 + j + 16), accumd_hi);
accumref1 = _mm512_add_epi64(accumref1, addnum64);
accumref2 = _mm512_add_epi64(accumref2, addnum64);
accumref3 = _mm512_add_epi64(accumref3, addnum64);
accumref4 = _mm512_add_epi64(accumref4, addnum64);
accumref1 = _mm512_srli_epi64(accumref1, shift_VP_sq);
accumref2 = _mm512_srli_epi64(accumref2, shift_VP_sq);
accumref3 = _mm512_srli_epi64(accumref3, shift_VP_sq);
accumref4 = _mm512_srli_epi64(accumref4, shift_VP_sq);
_mm512_storeu_si512((__m512i*)(buf.tmp.ref + j),
_mm512_permutex2var_epi32(accumref1, mask2, accumref3));
_mm512_storeu_si512((__m512i*)(buf.tmp.ref + 16 + j),
_mm512_permutex2var_epi32(accumref2, mask2, accumref4));
accumrefdis1 = _mm512_add_epi64(accumrefdis1, addnum64);
accumrefdis2 = _mm512_add_epi64(accumrefdis2, addnum64);
accumrefdis3 = _mm512_add_epi64(accumrefdis3, addnum64);
accumrefdis4 = _mm512_add_epi64(accumrefdis4, addnum64);
accumrefdis1 = _mm512_srli_epi64(accumrefdis1, shift_VP_sq);
accumrefdis2 = _mm512_srli_epi64(accumrefdis2, shift_VP_sq);
accumrefdis3 = _mm512_srli_epi64(accumrefdis3, shift_VP_sq);
accumrefdis4 = _mm512_srli_epi64(accumrefdis4, shift_VP_sq);
_mm512_storeu_si512((__m512i*)(buf.tmp.ref_dis + j),
_mm512_permutex2var_epi32(accumrefdis1, mask2, accumrefdis3));
_mm512_storeu_si512((__m512i*)(buf.tmp.ref_dis + 16 + j),
_mm512_permutex2var_epi32(accumrefdis2, mask2, accumrefdis4));
accumdis1 = _mm512_add_epi64(accumdis1, addnum64);
accumdis2 = _mm512_add_epi64(accumdis2, addnum64);
accumdis3 = _mm512_add_epi64(accumdis3, addnum64);
accumdis4 = _mm512_add_epi64(accumdis4, addnum64);
accumdis1 = _mm512_srli_epi64(accumdis1, shift_VP_sq);
accumdis2 = _mm512_srli_epi64(accumdis2, shift_VP_sq);
accumdis3 = _mm512_srli_epi64(accumdis3, shift_VP_sq);
accumdis4 = _mm512_srli_epi64(accumdis4, shift_VP_sq);
_mm512_storeu_si512((__m512i*)(buf.tmp.dis + j),
_mm512_permutex2var_epi32(accumdis1, mask2, accumdis3));
_mm512_storeu_si512((__m512i*)(buf.tmp.dis + 16 + j),
_mm512_permutex2var_epi32(accumdis2, mask2, accumdis4));
}
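// Scalar fallback for the rightmost columns when w is not a multiple of 32.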
for (unsigned j = n << 5; j < w; ++j)
{
uint32_t accum_mu1 = 0;
uint32_t accum_mu2 = 0;
uint64_t accum_ref = 0;
uint64_t accum_dis = 0;
uint64_t accum_ref_dis = 0;
for (unsigned fi = 0; fi < fwidth; ++fi)
{
int ii_check = ii + fi;
const uint16_t fcoeff = vif_filt[fi];
uint16_t imgcoeff_ref = ref[ii_check * stride + j];
uint16_t imgcoeff_dis = dis[ii_check * stride + j];
uint32_t img_coeff_ref = fcoeff * (uint32_t)imgcoeff_ref;
uint32_t img_coeff_dis = fcoeff * (uint32_t)imgcoeff_dis;
accum_mu1 += img_coeff_ref;
accum_mu2 += img_coeff_dis;
accum_ref += img_coeff_ref * (uint64_t)imgcoeff_ref;
accum_dis += img_coeff_dis * (uint64_t)imgcoeff_dis;
accum_ref_dis += img_coeff_ref * (uint64_t)imgcoeff_dis;
}
buf.tmp.mu1[j] = (uint16_t)((accum_mu1 + add_shift_round_VP) >> shift_VP);
buf.tmp.mu2[j] = (uint16_t)((accum_mu2 + add_shift_round_VP) >> shift_VP);
buf.tmp.ref[j] = (uint32_t)((accum_ref + add_shift_round_VP_sq) >> shift_VP_sq);
buf.tmp.dis[j] = (uint32_t)((accum_dis + add_shift_round_VP_sq) >> shift_VP_sq);
buf.tmp.ref_dis[j] = (uint32_t)((accum_ref_dis + add_shift_round_VP_sq) >> shift_VP_sq);
}
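// Pad the tmp rows at both ends so the horizontal filter below can read
// fwidth_half samples past the row boundaries.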
PADDING_SQ_DATA(buf, w, fwidth_half);
// HORIZONTAL pass: convolve the same filter along the row of vertical
// results, then form the variances and covariance needed by the VIF statistic.
n = w >> 4;
for (int j = 0; j < n << 4; j = j + 16)
{
__m512i mu1sq;
__m512i mu2sq;
__m512i mu1mu2;
__m512i xx;
__m512i yy;
__m512i xy;
__m512i mask5 = _mm512_set_epi32(30, 28, 14, 12, 26, 24, 10, 8, 22, 20, 6, 4, 18, 16, 2, 0); // low dwords of the unpacked qwords, back in ascending order
// compute mu1sq, mu2sq, mu1mu2
{
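// The filter is symmetric, so the center tap is applied once and each
// coefficient vif_filt[fj] is applied to the mirrored pair of samples at
// offsets -(fwidth/2 - fj) and +(fwidth/2 - fj). The squares and cross
// products below are rounded back from 64 to 32 bits via
// (x + 0x80000000) >> 32.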
__m512i fq = _mm512_set1_epi32(vif_filt[fwidth / 2]);
__m512i acc0 = _mm512_mullo_epi32(_mm512_loadu_si512((__m512i*)(buf.tmp.mu1 + j + 0)), fq);
__m512i acc1 = _mm512_mullo_epi32(_mm512_loadu_si512((__m512i*)(buf.tmp.mu2 + j + 0)), fq);
for (unsigned fj = 0; fj < fwidth / 2; ++fj) {
__m512i fq = _mm512_set1_epi32(vif_filt[fj]);
acc0 = _mm512_add_epi64(acc0, _mm512_mullo_epi32(_mm512_loadu_si512((__m512i*)(buf.tmp.mu1 + j - fwidth / 2 + fj + 0)), fq));
acc0 = _mm512_add_epi64(acc0, _mm512_mullo_epi32(_mm512_loadu_si512((__m512i*)(buf.tmp.mu1 + j + fwidth / 2 - fj + 0)), fq));
acc1 = _mm512_add_epi64(acc1, _mm512_mullo_epi32(_mm512_loadu_si512((__m512i*)(buf.tmp.mu2 + j - fwidth / 2 + fj + 0)), fq));
acc1 = _mm512_add_epi64(acc1, _mm512_mullo_epi32(_mm512_loadu_si512((__m512i*)(buf.tmp.mu2 + j + fwidth / 2 - fj + 0)), fq));
}
__m512i mu1 = acc0;
__m512i acc0_lo_512 = _mm512_unpacklo_epi32(acc0, _mm512_setzero_si512());
__m512i acc0_hi_512 = _mm512_unpackhi_epi32(acc0, _mm512_setzero_si512());
acc0_lo_512 = _mm512_mul_epu32(acc0_lo_512, acc0_lo_512);
acc0_hi_512 = _mm512_mul_epu32(acc0_hi_512, acc0_hi_512);
acc0_lo_512 = _mm512_srli_epi64(_mm512_add_epi64(acc0_lo_512, _mm512_set1_epi64(0x80000000)), 32);
acc0_hi_512 = _mm512_srli_epi64(_mm512_add_epi64(acc0_hi_512, _mm512_set1_epi64(0x80000000)), 32);
mu1sq = _mm512_permutex2var_epi32(acc0_lo_512, mask5, acc0_hi_512);
__m512i acc0lo_512 = _mm512_unpacklo_epi32(acc1, _mm512_setzero_si512());
__m512i acc0hi_512 = _mm512_unpackhi_epi32(acc1, _mm512_setzero_si512());
__m512i mu1lo_512 = _mm512_unpacklo_epi32(mu1, _mm512_setzero_si512());
__m512i mu1hi_512 = _mm512_unpackhi_epi32(mu1, _mm512_setzero_si512());
mu1lo_512 = _mm512_mul_epu32(mu1lo_512, acc0lo_512);
mu1hi_512 = _mm512_mul_epu32(mu1hi_512, acc0hi_512);
mu1lo_512 = _mm512_srli_epi64(_mm512_add_epi64(mu1lo_512, _mm512_set1_epi64(0x80000000)), 32);
mu1hi_512 = _mm512_srli_epi64(_mm512_add_epi64(mu1hi_512, _mm512_set1_epi64(0x80000000)), 32);
mu1mu2 = _mm512_permutex2var_epi32(mu1lo_512, mask5, mu1hi_512);
acc0lo_512 = _mm512_mul_epu32(acc0lo_512, acc0lo_512);
acc0hi_512 = _mm512_mul_epu32(acc0hi_512, acc0hi_512);
acc0lo_512 = _mm512_srli_epi64(_mm512_add_epi64(acc0lo_512, _mm512_set1_epi64(0x80000000)), 32);
acc0hi_512 = _mm512_srli_epi64(_mm512_add_epi64(acc0hi_512, _mm512_set1_epi64(0x80000000)), 32);
mu2sq = _mm512_permutex2var_epi32(acc0lo_512, mask5, acc0hi_512);
}
// compute xx, yy, xy
{
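// Same symmetric-filter walk over the vertical second-moment rows. Each
// 32-bit tmp value is zero-extended to 64 bits; the 16-bit taps act as Q16
// weights, so the 0x8000 rounder plus the >> 16 below round the weighted
// sums back to integers.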
__m512i rounder = _mm512_set1_epi64(0x8000);
__m512i fq = _mm512_set1_epi64(vif_filt[fwidth / 2]);
__m512i s0 = _mm512_cvtepu32_epi64(_mm256_loadu_si256((__m256i*)(buf.tmp.ref + j + 0)));
__m512i s2 = _mm512_cvtepu32_epi64(_mm256_loadu_si256((__m256i*)(buf.tmp.ref + j + 8)));
__m512i refsq_lo = _mm512_add_epi64(rounder, _mm512_mul_epu32(s0, fq));
__m512i refsq_hi = _mm512_add_epi64(rounder, _mm512_mul_epu32(s2, fq));
s0 = _mm512_cvtepu32_epi64(_mm256_loadu_si256((__m256i*)(buf.tmp.dis + j + 0)));
s2 = _mm512_cvtepu32_epi64(_mm256_loadu_si256((__m256i*)(buf.tmp.dis + j + 8)));
__m512i dissq_lo = _mm512_add_epi64(rounder, _mm512_mul_epu32(s0, fq));
__m512i dissq_hi = _mm512_add_epi64(rounder, _mm512_mul_epu32(s2, fq));
s0 = _mm512_cvtepu32_epi64(_mm256_loadu_si256((__m256i*)(buf.tmp.ref_dis + j + 0)));
s2 = _mm512_cvtepu32_epi64(_mm256_loadu_si256((__m256i*)(buf.tmp.ref_dis + j + 8)));
__m512i refdis_lo = _mm512_add_epi64(rounder, _mm512_mul_epu32(s0, fq));
__m512i refdis_hi = _mm512_add_epi64(rounder, _mm512_mul_epu32(s2, fq));
for (unsigned fj = 0; fj < fwidth / 2; ++fj) {
__m512i fq = _mm512_set1_epi64(vif_filt[fj]);
s0 = _mm512_cvtepu32_epi64(_mm256_loadu_si256((__m256i*)(buf.tmp.ref + j - fwidth / 2 + fj + 0)));
s2 = _mm512_cvtepu32_epi64(_mm256_loadu_si256((__m256i*)(buf.tmp.ref + j - fwidth / 2 + fj + 8)));
refsq_lo = _mm512_add_epi64(refsq_lo, _mm512_mul_epu32(s0, fq));
refsq_hi = _mm512_add_epi64(refsq_hi, _mm512_mul_epu32(s2, fq));
s0 = _mm512_cvtepu32_epi64(_mm256_loadu_si256((__m256i*)(buf.tmp.ref + j + fwidth / 2 - fj + 0)));
s2 = _mm512_cvtepu32_epi64(_mm256_loadu_si256((__m256i*)(buf.tmp.ref + j + fwidth / 2 - fj + 8)));
refsq_lo = _mm512_add_epi64(refsq_lo, _mm512_mul_epu32(s0, fq));
refsq_hi = _mm512_add_epi64(refsq_hi, _mm512_mul_epu32(s2, fq));
s0 = _mm512_cvtepu32_epi64(_mm256_loadu_si256((__m256i*)(buf.tmp.dis + j - fwidth / 2 + fj + 0)));
s2 = _mm512_cvtepu32_epi64(_mm256_loadu_si256((__m256i*)(buf.tmp.dis + j - fwidth / 2 + fj + 8)));
dissq_lo = _mm512_add_epi64(dissq_lo, _mm512_mul_epu32(s0, fq));
dissq_hi = _mm512_add_epi64(dissq_hi, _mm512_mul_epu32(s2, fq));
s0 = _mm512_cvtepu32_epi64(_mm256_loadu_si256((__m256i*)(buf.tmp.dis + j + fwidth / 2 - fj + 0)));
s2 = _mm512_cvtepu32_epi64(_mm256_loadu_si256((__m256i*)(buf.tmp.dis + j + fwidth / 2 - fj + 8)));
dissq_lo = _mm512_add_epi64(dissq_lo, _mm512_mul_epu32(s0, fq));
dissq_hi = _mm512_add_epi64(dissq_hi, _mm512_mul_epu32(s2, fq));
s0 = _mm512_cvtepu32_epi64(_mm256_loadu_si256((__m256i*)(buf.tmp.ref_dis + j - fwidth / 2 + fj + 0)));
s2 = _mm512_cvtepu32_epi64(_mm256_loadu_si256((__m256i*)(buf.tmp.ref_dis + j - fwidth / 2 + fj + 8)));
refdis_lo = _mm512_add_epi64(refdis_lo, _mm512_mul_epu32(s0, fq));
refdis_hi = _mm512_add_epi64(refdis_hi, _mm512_mul_epu32(s2, fq));
s0 = _mm512_cvtepu32_epi64(_mm256_loadu_si256((__m256i*)(buf.tmp.ref_dis + j + fwidth / 2 - fj + 0)));
s2 = _mm512_cvtepu32_epi64(_mm256_loadu_si256((__m256i*)(buf.tmp.ref_dis + j + fwidth / 2 - fj + 8)));
refdis_lo = _mm512_add_epi64(refdis_lo, _mm512_mul_epu32(s0, fq));
refdis_hi = _mm512_add_epi64(refdis_hi, _mm512_mul_epu32(s2, fq));
}
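// Drop the 16 fractional bits, narrow back to 32-bit lanes, and form the
// (co)variances: xx = E[ref^2] - mu1^2, yy = max(E[dis^2] - mu2^2, 0),
// xy = E[ref*dis] - mu1*mu2.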
refsq_lo = _mm512_srli_epi64(refsq_lo, 16);
refsq_hi = _mm512_srli_epi64(refsq_hi, 16);
__m512i refsq = _mm512_permutex2var_epi32(refsq_lo, mask2, refsq_hi);
xx = _mm512_sub_epi32(refsq, mu1sq);
dissq_lo = _mm512_srli_epi64(dissq_lo, 16);
dissq_hi = _mm512_srli_epi64(dissq_hi, 16);
__m512i dissq = _mm512_permutex2var_epi32(dissq_lo, mask2, dissq_hi);
yy = _mm512_max_epi32(_mm512_sub_epi32(dissq, mu2sq), _mm512_setzero_si512());
refdis_lo = _mm512_srli_epi64(refdis_lo, 16);
refdis_hi = _mm512_srli_epi64(refdis_hi, 16);
__m512i refdis = _mm512_permutex2var_epi32(refdis_lo, mask2, refdis_hi);
xy = _mm512_sub_epi32(refdis, mu1mu2);
}
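// Accumulate the per-lane numerator/denominator contributions (log and
// non-log) for these 16 pixels into the running SIMD residuals.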
vif_statistic_avx512(&residuals, xx, xy, yy, log2_table, vif_enhn_gain_limit);
}
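// Scalar handling of the rightmost columns when w is not a multiple of 16.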
if ((n << 4) != (int)w) {
VifResiduals line_residuals =
vif_compute_line_residuals(s, n << 4, w, scale);
accum_num_log += line_residuals.accum_num_log;
accum_den_log += line_residuals.accum_den_log;
accum_num_non_log += line_residuals.accum_num_non_log;
accum_den_non_log += line_residuals.accum_den_non_log;
}
}
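// Fold the per-lane SIMD accumulators into the scalar totals.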
accum_num_log += _mm512_reduce_add_epi64(residuals.maccum_num_log);
accum_den_log += _mm512_reduce_add_epi64(residuals.maccum_den_log);
accum_num_non_log += _mm512_reduce_add_epi64(residuals.maccum_num_non_log);
accum_den_non_log += _mm512_reduce_add_epi64(residuals.maccum_den_non_log);
/**
 * In floating point there are two kinds of numerator and denominator scores:
 * 1. when sigma1_sq < 2:  num = 1 - sigma1_sq * constant, den = 1,
 *    where constant = 4 / (255 * 255)
 * 2. otherwise:           num = log2(((sigma2_sq + 2) * sigma1_sq) /
 *                                    ((sigma2_sq + 2) * sigma1_sq - sigma12 * sigma12)),
 *                         den = log2(1 + sigma1_sq / 2)
 *
 * In fixed point, separate accumulators are kept for the non-log and the
 * log-based scores. The non-log numerator accumulator collects only the
 * fixed-point sigma1_sq * constant terms; the log-based values are
 * accumulated on their own. When the accumulators are combined, the non-log
 * one is converted so that it is equivalent to 1 - sigma1_sq * constant
 * (the 1's are counted by the non-log denominator accumulator).
 */
// The log accumulators must be divided by 2048 because
// log_value = log2(i * 2048) for i = 16384..65535.
// Reference formulation (sigma1_sq carries 16 fractional bits):
//   num[0] = accum_num_log / 2048.0
//          + (accum_den_non_log - (accum_num_non_log / 65536.0) * (4.0 / (255.0 * 255.0)));
//   den[0] = accum_den_log / 2048.0 + accum_den_non_log;
// The calculation below folds the factor 4 of the constant into the divisor
// (65536 / 4 == 16384, 255 * 255 == 65025) to increase performance.
num[0] = accum_num_log / 2048.0 + (accum_den_non_log - ((accum_num_non_log) / 16384.0) / (65025.0));
den[0] = accum_den_log / 2048.0 + accum_den_non_log;
}