void vif_statistic_16_avx512()

in libvmaf/src/feature/x86/vif_avx512.c [387:757]

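AVX-512 path for the 16-bit VIF statistic: each row is convolved vertically into temporary line buffers, padded, then convolved horizontally; the per-pixel numerator and denominator terms are accumulated in fixed point and reduced into num/den at the end.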

void vif_statistic_16_avx512(struct VifPublicState *s, float *num, float *den, unsigned w, unsigned h, int bpc, int scale) {
    const unsigned fwidth = vif_filter1d_width[scale];
    const uint16_t *vif_filt = vif_filter1d_table[scale];
    VifBuffer buf = s->buf;
    const ptrdiff_t stride = buf.stride / sizeof(uint16_t);
    int fwidth_half = fwidth >> 1;

    int32_t add_shift_round_VP, shift_VP;
    int32_t add_shift_round_VP_sq, shift_VP_sq;
    const uint16_t *log2_table = s->log2_table;
    double vif_enhn_gain_limit = s->vif_enhn_gain_limit;
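    // mask2 picks the even 32-bit lanes of two registers, i.e. the low halves
    // of sixteen 64-bit accumulators, packing them into one 512-bit vector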
    __m512i mask2 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);

    Residuals512 residuals;
    residuals.maccum_den_log = _mm512_setzero_si512();
    residuals.maccum_num_log = _mm512_setzero_si512();
    residuals.maccum_den_non_log = _mm512_setzero_si512();
    residuals.maccum_num_non_log = _mm512_setzero_si512();

    int64_t accum_num_log = 0;
    int64_t accum_den_log = 0;
    int64_t accum_num_non_log = 0;
    int64_t accum_den_non_log = 0;

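    // scale 0 operates on raw bpc-bit samples; later scales on 16-bit
    // intermediates, so the rounding shifts differ accordingly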
    if (scale == 0)
    {
        shift_VP = bpc;
        add_shift_round_VP = 1 << (bpc - 1);
        shift_VP_sq = (bpc - 8) * 2;
        add_shift_round_VP_sq = (bpc == 8) ? 0 : 1 << (shift_VP_sq - 1);
    }
    else
    {
        shift_VP = 16;
        add_shift_round_VP = 32768;
        shift_VP_sq = 16;
        add_shift_round_VP_sq = 32768;
    }
    __m512i addnum64 = _mm512_set1_epi64(add_shift_round_VP_sq);
    __m512i addnum = _mm512_set1_epi32(add_shift_round_VP);
    uint16_t *ref = buf.ref;
    uint16_t *dis = buf.dis;

    for (unsigned i = 0; i < h; ++i)
    {
        //VERTICAL
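        // convolve each pixel's column with vif_filt, 32 uint16 pixels (one
        // zmm register) per iteration; the filtered means and second-order
        // moments land in the buf.tmp line buffers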
        int ii = i - fwidth_half;
        int n = w >> 5;
        for (int j = 0; j < n << 5; j = j + 32)
        {

            __m512i mask3 = _mm512_set_epi64(11, 10, 3, 2, 9, 8, 1, 0);   //first half of 512
            __m512i mask4 = _mm512_set_epi64(15, 14, 7, 6, 13, 12, 5, 4); //second half of 512
            int ii_check = ii;
            __m512i accumr_lo, accumr_hi, accumd_lo, accumd_hi, rmul1, rmul2,
                dmul1, dmul2, accumref1, accumref2, accumref3, accumref4,
                accumrefdis1, accumrefdis2, accumrefdis3, accumrefdis4,
                accumdis1, accumdis2, accumdis3, accumdis4;
            accumr_lo = accumr_hi = accumd_lo = accumd_hi = rmul1 = rmul2 = dmul1 = dmul2 = accumref1 = accumref2 = accumref3 = accumref4 = accumrefdis1 = accumrefdis2 = accumrefdis3 =
                accumrefdis4 = accumdis1 = accumdis2 = accumdis3 = accumdis4 = _mm512_setzero_si512();

            for (unsigned fi = 0; fi < fwidth; ++fi, ii_check = ii + fi)
            {

                const uint16_t fcoeff = vif_filt[fi];
                __m512i f1 = _mm512_set1_epi16(fcoeff);
                __m512i ref1 = _mm512_loadu_si512(
                    (__m512i*)(ref + (ii_check * stride) + j));
                __m512i dis1 = _mm512_loadu_si512(
                    (__m512i*)(dis + (ii_check * stride) + j));
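                // 16x16 -> 32-bit products: mullo/mulhi give the low/high halves
                // and the unpacks interleave them; unpack works per 128-bit lane,
                // so mask3/mask4 restore ascending pixel order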
                __m512i result2 = _mm512_mulhi_epu16(ref1, f1);
                __m512i result2lo = _mm512_mullo_epi16(ref1, f1);
                __m512i rmult1 = _mm512_unpacklo_epi16(result2lo, result2);
                __m512i rmult2 = _mm512_unpackhi_epi16(result2lo, result2);
                rmul1 = _mm512_permutex2var_epi64(rmult1, mask3, rmult2);
                rmul2 = _mm512_permutex2var_epi64(rmult1, mask4, rmult2);
                accumr_lo = _mm512_add_epi32(accumr_lo, rmul1);
                accumr_hi = _mm512_add_epi32(accumr_hi, rmul2);
                __m512i d0 = _mm512_mulhi_epu16(dis1, f1);
                __m512i d0lo = _mm512_mullo_epi16(dis1, f1);
                __m512i dmult1 = _mm512_unpacklo_epi16(d0lo, d0);
                __m512i dmult2 = _mm512_unpackhi_epi16(d0lo, d0);
                dmul1 = _mm512_permutex2var_epi64(dmult1, mask3, dmult2);
                dmul2 = _mm512_permutex2var_epi64(dmult1, mask4, dmult2);
                accumd_lo = _mm512_add_epi32(accumd_lo, dmul1);
                accumd_hi = _mm512_add_epi32(accumd_hi, dmul2);

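                // widen the 32-bit fcoeff*ref products to 64 bits and
                // accumulate (fcoeff*ref)*ref and (fcoeff*ref)*dis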
                __m512i sg0 = _mm512_cvtepu32_epi64(_mm512_castsi512_si256(rmul1));
                __m512i sg1 = _mm512_cvtepu32_epi64(_mm512_extracti64x4_epi64(rmul1, 1));
                __m512i sg2 = _mm512_cvtepu32_epi64(_mm512_castsi512_si256(rmul2));
                __m512i sg3 = _mm512_cvtepu32_epi64(_mm512_extracti64x4_epi64(rmul2, 1));
                __m128i l0 = _mm512_castsi512_si128(ref1);
                __m128i l1 = _mm512_extracti32x4_epi32(ref1, 1);
                __m128i l2 = _mm512_extracti32x4_epi32(ref1, 2);
                __m128i l3 = _mm512_extracti32x4_epi32(ref1, 3);
                accumref1 = _mm512_add_epi64(accumref1,
                    _mm512_mul_epu32(sg0, _mm512_cvtepu16_epi64(l0)));
                accumref2 = _mm512_add_epi64(accumref2,
                    _mm512_mul_epu32(sg2, _mm512_cvtepu16_epi64(l2)));
                accumref3 = _mm512_add_epi64(accumref3,
                    _mm512_mul_epu32(sg1, _mm512_cvtepu16_epi64(l1)));
                accumref4 = _mm512_add_epi64(accumref4,
                    _mm512_mul_epu32(sg3, _mm512_cvtepu16_epi64(l3)));
                l0 = _mm512_castsi512_si128(dis1);
                l1 = _mm512_extracti32x4_epi32(dis1, 1);
                l2 = _mm512_extracti32x4_epi32(dis1, 2);
                l3 = _mm512_extracti32x4_epi32(dis1, 3);

                accumrefdis1 = _mm512_add_epi64(accumrefdis1,
                    _mm512_mul_epu32(sg0, _mm512_cvtepu16_epi64(l0)));
                accumrefdis2 = _mm512_add_epi64(accumrefdis2,
                    _mm512_mul_epu32(sg2, _mm512_cvtepu16_epi64(l2)));
                accumrefdis3 = _mm512_add_epi64(accumrefdis3,
                    _mm512_mul_epu32(sg1, _mm512_cvtepu16_epi64(l1)));
                accumrefdis4 = _mm512_add_epi64(accumrefdis4,
                    _mm512_mul_epu32(sg3, _mm512_cvtepu16_epi64(l3)));
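                // same for the dis side: accumulate (fcoeff*dis)*dis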
                __m512i sd0 = _mm512_cvtepu32_epi64(_mm512_castsi512_si256(dmul1));
                __m512i sd1 = _mm512_cvtepu32_epi64(_mm512_extracti64x4_epi64(dmul1, 1));
                __m512i sd2 = _mm512_cvtepu32_epi64(_mm512_castsi512_si256(dmul2));
                __m512i sd3 = _mm512_cvtepu32_epi64(_mm512_extracti64x4_epi64(dmul2, 1));
                accumdis1 = _mm512_add_epi64(accumdis1,
                    _mm512_mul_epu32(sd0, _mm512_cvtepu16_epi64(l0)));
                accumdis2 = _mm512_add_epi64(accumdis2,
                    _mm512_mul_epu32(sd2, _mm512_cvtepu16_epi64(l2)));
                accumdis3 = _mm512_add_epi64(accumdis3,
                    _mm512_mul_epu32(sd1, _mm512_cvtepu16_epi64(l1)));
                accumdis4 = _mm512_add_epi64(accumdis4,
                    _mm512_mul_epu32(sd3, _mm512_cvtepu16_epi64(l3)));
            }
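            // rounded right-shift back to working precision, then store the
            // filtered row into the tmp buffers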
            accumr_lo = _mm512_add_epi32(accumr_lo, addnum);
            accumr_hi = _mm512_add_epi32(accumr_hi, addnum);
            accumr_lo = _mm512_srli_epi32(accumr_lo, shift_VP);
            accumr_hi = _mm512_srli_epi32(accumr_hi, shift_VP);
            _mm512_storeu_si512((__m512i*)(buf.tmp.mu1 + j), accumr_lo);
            _mm512_storeu_si512((__m512i*)(buf.tmp.mu1 + j + 16), accumr_hi);

            accumd_lo = _mm512_add_epi32(accumd_lo, addnum);
            accumd_hi = _mm512_add_epi32(accumd_hi, addnum);
            accumd_lo = _mm512_srli_epi32(accumd_lo, shift_VP);
            accumd_hi = _mm512_srli_epi32(accumd_hi, shift_VP);
            _mm512_storeu_si512((__m512i*)(buf.tmp.mu2 + j), accumd_lo);
            _mm512_storeu_si512((__m512i*)(buf.tmp.mu2 + j + 16), accumd_hi);

            accumref1 = _mm512_add_epi64(accumref1, addnum64);
            accumref2 = _mm512_add_epi64(accumref2, addnum64);
            accumref3 = _mm512_add_epi64(accumref3, addnum64);
            accumref4 = _mm512_add_epi64(accumref4, addnum64);
            accumref1 = _mm512_srli_epi64(accumref1, shift_VP_sq);
            accumref2 = _mm512_srli_epi64(accumref2, shift_VP_sq);
            accumref3 = _mm512_srli_epi64(accumref3, shift_VP_sq);
            accumref4 = _mm512_srli_epi64(accumref4, shift_VP_sq);

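            // keep the low 32 bits of each 64-bit sum (mask2) and store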
            _mm512_storeu_si512((__m512i*)(buf.tmp.ref + j),
                _mm512_permutex2var_epi32(accumref1, mask2, accumref3));
            _mm512_storeu_si512((__m512i*)(buf.tmp.ref + 16 + j),
                _mm512_permutex2var_epi32(accumref2, mask2, accumref4));

            accumrefdis1 = _mm512_add_epi64(accumrefdis1, addnum64);
            accumrefdis2 = _mm512_add_epi64(accumrefdis2, addnum64);
            accumrefdis3 = _mm512_add_epi64(accumrefdis3, addnum64);
            accumrefdis4 = _mm512_add_epi64(accumrefdis4, addnum64);
            accumrefdis1 = _mm512_srli_epi64(accumrefdis1, shift_VP_sq);
            accumrefdis2 = _mm512_srli_epi64(accumrefdis2, shift_VP_sq);
            accumrefdis3 = _mm512_srli_epi64(accumrefdis3, shift_VP_sq);
            accumrefdis4 = _mm512_srli_epi64(accumrefdis4, shift_VP_sq);

            _mm512_storeu_si512((__m512i*)(buf.tmp.ref_dis + j),
                _mm512_permutex2var_epi32(accumrefdis1, mask2, accumrefdis3));
            _mm512_storeu_si512((__m512i*)(buf.tmp.ref_dis + 16 + j),
                _mm512_permutex2var_epi32(accumrefdis2, mask2, accumrefdis4));

            accumdis1 = _mm512_add_epi64(accumdis1, addnum64);
            accumdis2 = _mm512_add_epi64(accumdis2, addnum64);
            accumdis3 = _mm512_add_epi64(accumdis3, addnum64);
            accumdis4 = _mm512_add_epi64(accumdis4, addnum64);
            accumdis1 = _mm512_srli_epi64(accumdis1, shift_VP_sq);
            accumdis2 = _mm512_srli_epi64(accumdis2, shift_VP_sq);
            accumdis3 = _mm512_srli_epi64(accumdis3, shift_VP_sq);
            accumdis4 = _mm512_srli_epi64(accumdis4, shift_VP_sq);

            _mm512_storeu_si512((__m512i*)(buf.tmp.dis + j),
                _mm512_permutex2var_epi32(accumdis1, mask2, accumdis3));
            _mm512_storeu_si512((__m512i*)(buf.tmp.dis + 16 + j),
                _mm512_permutex2var_epi32(accumdis2, mask2, accumdis4));
        }

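        // scalar fallback for the rightmost columns when w is not a multiple of 32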
        for (unsigned j = n << 5; j < w; ++j)
        {
            uint32_t accum_mu1 = 0;
            uint32_t accum_mu2 = 0;
            uint64_t accum_ref = 0;
            uint64_t accum_dis = 0;
            uint64_t accum_ref_dis = 0;
            for (unsigned fi = 0; fi < fwidth; ++fi)
            {
                int ii = i - fwidth / 2;
                int ii_check = ii + fi;
                const uint16_t fcoeff = vif_filt[fi];
                const ptrdiff_t stride = buf.stride / sizeof(uint16_t);
                uint16_t *ref = buf.ref;
                uint16_t *dis = buf.dis;
                uint16_t imgcoeff_ref = ref[ii_check * stride + j];
                uint16_t imgcoeff_dis = dis[ii_check * stride + j];
                uint32_t img_coeff_ref = fcoeff * (uint32_t)imgcoeff_ref;
                uint32_t img_coeff_dis = fcoeff * (uint32_t)imgcoeff_dis;
                accum_mu1 += img_coeff_ref;
                accum_mu2 += img_coeff_dis;
                accum_ref += img_coeff_ref * (uint64_t)imgcoeff_ref;
                accum_dis += img_coeff_dis * (uint64_t)imgcoeff_dis;
                accum_ref_dis += img_coeff_ref * (uint64_t)imgcoeff_dis;
            }
            buf.tmp.mu1[j] = (uint16_t)((accum_mu1 + add_shift_round_VP) >> shift_VP);
            buf.tmp.mu2[j] = (uint16_t)((accum_mu2 + add_shift_round_VP) >> shift_VP);
            buf.tmp.ref[j] = (uint32_t)((accum_ref + add_shift_round_VP_sq) >> shift_VP_sq);
            buf.tmp.dis[j] = (uint32_t)((accum_dis + add_shift_round_VP_sq) >> shift_VP_sq);
            buf.tmp.ref_dis[j] = (uint32_t)((accum_ref_dis + add_shift_round_VP_sq) >> shift_VP_sq);
        }

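        // pad the tmp buffers so the horizontal pass below can read fwidth/2
        // elements past either end of the row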
        PADDING_SQ_DATA(buf, w, fwidth_half);

        //HORIZONTAL
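        // convolve along the row, 16 outputs per iteration; the filter is
        // symmetric, so the center tap is applied once and the remaining taps
        // are folded in mirrored pairs vif_filt[fj] * (x[j-k] + x[j+k])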
        n = w >> 4;
        for (int j = 0; j < n << 4; j = j + 16)
        {
            __m512i mu1sq;
            __m512i mu2sq;
            __m512i mu1mu2;
            __m512i xx;
            __m512i yy;
            __m512i xy;
            __m512i mask5 = _mm512_set_epi32(30, 28, 14, 12, 26, 24, 10, 8, 22, 20, 6, 4, 18, 16, 2, 0);
            // compute mu1sq, mu2sq, mu1mu2
            {
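                // horizontally filter the means; the squares and cross term are
                // formed as 32x32 -> 64-bit products rounded back to 32 bits via
                // (p + 0x80000000) >> 32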
                __m512i fq = _mm512_set1_epi32(vif_filt[fwidth / 2]);
                __m512i acc0 = _mm512_mullo_epi32(_mm512_loadu_si512((__m512i*)(buf.tmp.mu1 + j + 0)), fq);
                __m512i acc1 = _mm512_mullo_epi32(_mm512_loadu_si512((__m512i*)(buf.tmp.mu2 + j + 0)), fq);

                for (unsigned fj = 0; fj < fwidth / 2; ++fj) {
                    __m512i fq = _mm512_set1_epi32(vif_filt[fj]);
                    acc0 = _mm512_add_epi64(acc0, _mm512_mullo_epi32(_mm512_loadu_si512((__m512i*)(buf.tmp.mu1 + j - fwidth / 2 + fj + 0)), fq));
                    acc0 = _mm512_add_epi64(acc0, _mm512_mullo_epi32(_mm512_loadu_si512((__m512i*)(buf.tmp.mu1 + j + fwidth / 2 - fj + 0)), fq));
                    acc1 = _mm512_add_epi64(acc1, _mm512_mullo_epi32(_mm512_loadu_si512((__m512i*)(buf.tmp.mu2 + j - fwidth / 2 + fj + 0)), fq));
                    acc1 = _mm512_add_epi64(acc1, _mm512_mullo_epi32(_mm512_loadu_si512((__m512i*)(buf.tmp.mu2 + j + fwidth / 2 - fj + 0)), fq));
                }
                __m512i mu1 = acc0;
                __m512i acc0_lo_512 = _mm512_unpacklo_epi32(acc0, _mm512_setzero_si512());
                __m512i acc0_hi_512 = _mm512_unpackhi_epi32(acc0, _mm512_setzero_si512());
                acc0_lo_512 = _mm512_mul_epu32(acc0_lo_512, acc0_lo_512);
                acc0_hi_512 = _mm512_mul_epu32(acc0_hi_512, acc0_hi_512);
                acc0_lo_512 = _mm512_srli_epi64(_mm512_add_epi64(acc0_lo_512, _mm512_set1_epi64(0x80000000)), 32);
                acc0_hi_512 = _mm512_srli_epi64(_mm512_add_epi64(acc0_hi_512, _mm512_set1_epi64(0x80000000)), 32);
                mu1sq = _mm512_permutex2var_epi32(acc0_lo_512, mask5, acc0_hi_512);

                __m512i acc0lo_512 = _mm512_unpacklo_epi32(acc1, _mm512_setzero_si512());
                __m512i acc0hi_512 = _mm512_unpackhi_epi32(acc1, _mm512_setzero_si512());
                __m512i mu1lo_512 = _mm512_unpacklo_epi32(mu1, _mm512_setzero_si512());
                __m512i mu1hi_512 = _mm512_unpackhi_epi32(mu1, _mm512_setzero_si512());

                mu1lo_512 = _mm512_mul_epu32(mu1lo_512, acc0lo_512);
                mu1hi_512 = _mm512_mul_epu32(mu1hi_512, acc0hi_512);
                mu1lo_512 = _mm512_srli_epi64(_mm512_add_epi64(mu1lo_512, _mm512_set1_epi64(0x80000000)), 32);
                mu1hi_512 = _mm512_srli_epi64(_mm512_add_epi64(mu1hi_512, _mm512_set1_epi64(0x80000000)), 32);

                mu1mu2 = _mm512_permutex2var_epi32(mu1lo_512, mask5, mu1hi_512);
                acc0lo_512 = _mm512_mul_epu32(acc0lo_512, acc0lo_512);
                acc0hi_512 = _mm512_mul_epu32(acc0hi_512, acc0hi_512);
                acc0lo_512 = _mm512_srli_epi64(_mm512_add_epi64(acc0lo_512, _mm512_set1_epi64(0x80000000)), 32);
                acc0hi_512 = _mm512_srli_epi64(_mm512_add_epi64(acc0hi_512, _mm512_set1_epi64(0x80000000)), 32);
                mu2sq = _mm512_permutex2var_epi32(acc0lo_512, mask5, acc0hi_512);
            }

            // compute xx, yy, xy
            {
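                // the second-order sums are filtered at 64-bit precision with a
                // Q16 rounder (0x8000) and shifted back down by 16 afterwards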
                __m512i rounder = _mm512_set1_epi64(0x8000);
                __m512i fq = _mm512_set1_epi64(vif_filt[fwidth / 2]);
                __m512i s0 = _mm512_cvtepu32_epi64(_mm256_loadu_si256((__m256i*)(buf.tmp.ref + j + 0)));
                __m512i s2 = _mm512_cvtepu32_epi64(_mm256_loadu_si256((__m256i*)(buf.tmp.ref + j + 8)));
                __m512i refsq_lo = _mm512_add_epi64(rounder, _mm512_mul_epu32(s0, fq));
                __m512i refsq_hi = _mm512_add_epi64(rounder, _mm512_mul_epu32(s2, fq));

                s0 = _mm512_cvtepu32_epi64(_mm256_loadu_si256((__m256i*)(buf.tmp.dis + j + 0)));
                s2 = _mm512_cvtepu32_epi64(_mm256_loadu_si256((__m256i*)(buf.tmp.dis + j + 8)));
                __m512i dissq_lo = _mm512_add_epi64(rounder, _mm512_mul_epu32(s0, fq));
                __m512i dissq_hi = _mm512_add_epi64(rounder, _mm512_mul_epu32(s2, fq));

                s0 = _mm512_cvtepu32_epi64(_mm256_loadu_si256((__m256i*)(buf.tmp.ref_dis + j + 0)));
                s2 = _mm512_cvtepu32_epi64(_mm256_loadu_si256((__m256i*)(buf.tmp.ref_dis + j + 8)));
                __m512i refdis_lo = _mm512_add_epi64(rounder, _mm512_mul_epu32(s0, fq));
                __m512i refdis_hi = _mm512_add_epi64(rounder, _mm512_mul_epu32(s2, fq));

                for (unsigned fj = 0; fj < fwidth / 2; ++fj) {
                    __m512i fq = _mm512_set1_epi64(vif_filt[fj]);
                    s0 = _mm512_cvtepu32_epi64(_mm256_loadu_si256((__m256i*)(buf.tmp.ref + j - fwidth / 2 + fj + 0)));
                    s2 = _mm512_cvtepu32_epi64(_mm256_loadu_si256((__m256i*)(buf.tmp.ref + j - fwidth / 2 + fj + 8)));
                    refsq_lo = _mm512_add_epi64(refsq_lo, _mm512_mul_epu32(s0, fq));
                    refsq_hi = _mm512_add_epi64(refsq_hi, _mm512_mul_epu32(s2, fq));
                    s0 = _mm512_cvtepu32_epi64(_mm256_loadu_si256((__m256i*)(buf.tmp.ref + j + fwidth / 2 - fj + 0)));
                    s2 = _mm512_cvtepu32_epi64(_mm256_loadu_si256((__m256i*)(buf.tmp.ref + j + fwidth / 2 - fj + 8)));
                    refsq_lo = _mm512_add_epi64(refsq_lo, _mm512_mul_epu32(s0, fq));
                    refsq_hi = _mm512_add_epi64(refsq_hi, _mm512_mul_epu32(s2, fq));

                    s0 = _mm512_cvtepu32_epi64(_mm256_loadu_si256((__m256i*)(buf.tmp.dis + j - fwidth / 2 + fj + 0)));
                    s2 = _mm512_cvtepu32_epi64(_mm256_loadu_si256((__m256i*)(buf.tmp.dis + j - fwidth / 2 + fj + 8)));
                    dissq_lo = _mm512_add_epi64(dissq_lo, _mm512_mul_epu32(s0, fq));
                    dissq_hi = _mm512_add_epi64(dissq_hi, _mm512_mul_epu32(s2, fq));
                    s0 = _mm512_cvtepu32_epi64(_mm256_loadu_si256((__m256i*)(buf.tmp.dis + j + fwidth / 2 - fj + 0)));
                    s2 = _mm512_cvtepu32_epi64(_mm256_loadu_si256((__m256i*)(buf.tmp.dis + j + fwidth / 2 - fj + 8)));
                    dissq_lo = _mm512_add_epi64(dissq_lo, _mm512_mul_epu32(s0, fq));
                    dissq_hi = _mm512_add_epi64(dissq_hi, _mm512_mul_epu32(s2, fq));

                    s0 = _mm512_cvtepu32_epi64(_mm256_loadu_si256((__m256i*)(buf.tmp.ref_dis + j - fwidth / 2 + fj + 0)));
                    s2 = _mm512_cvtepu32_epi64(_mm256_loadu_si256((__m256i*)(buf.tmp.ref_dis + j - fwidth / 2 + fj + 8)));
                    refdis_lo = _mm512_add_epi64(refdis_lo, _mm512_mul_epu32(s0, fq));
                    refdis_hi = _mm512_add_epi64(refdis_hi, _mm512_mul_epu32(s2, fq));
                    s0 = _mm512_cvtepu32_epi64(_mm256_loadu_si256((__m256i*)(buf.tmp.ref_dis + j + fwidth / 2 - fj + 0)));
                    s2 = _mm512_cvtepu32_epi64(_mm256_loadu_si256((__m256i*)(buf.tmp.ref_dis + j + fwidth / 2 - fj + 8)));
                    refdis_lo = _mm512_add_epi64(refdis_lo, _mm512_mul_epu32(s0, fq));
                    refdis_hi = _mm512_add_epi64(refdis_hi, _mm512_mul_epu32(s2, fq));
                }
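                // variance/covariance estimates: xx = E[x^2] - mu1^2,
                // yy = max(E[y^2] - mu2^2, 0), xy = E[xy] - mu1*mu2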
                refsq_lo = _mm512_srli_epi64(refsq_lo, 16);
                refsq_hi = _mm512_srli_epi64(refsq_hi, 16);
                __m512i refsq = _mm512_permutex2var_epi32(refsq_lo, mask2, refsq_hi);
                xx = _mm512_sub_epi32(refsq, mu1sq);

                dissq_lo = _mm512_srli_epi64(dissq_lo, 16);
                dissq_hi = _mm512_srli_epi64(dissq_hi, 16);
                __m512i dissq = _mm512_permutex2var_epi32(dissq_lo, mask2, dissq_hi);
                yy = _mm512_max_epi32(_mm512_sub_epi32(dissq, mu2sq), _mm512_setzero_si512());

                refdis_lo = _mm512_srli_epi64(refdis_lo, 16);
                refdis_hi = _mm512_srli_epi64(refdis_hi, 16);
                __m512i refdis = _mm512_permutex2var_epi32(refdis_lo, mask2, refdis_hi);
                xy = _mm512_sub_epi32(refdis, mu1mu2);
            }
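            // accumulate this block's log and non-log num/den contributions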
            vif_statistic_avx512(&residuals, xx, xy, yy, log2_table, vif_enhn_gain_limit);
        }

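        // compute residuals for the horizontal tail (w not a multiple of 16) in scalar code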
        if ((n << 4) != (int)w) {
            VifResiduals residuals =
                vif_compute_line_residuals(s, n << 4, w, scale);
            accum_num_log += residuals.accum_num_log;
            accum_den_log += residuals.accum_den_log;
            accum_num_non_log += residuals.accum_num_non_log;
            accum_den_non_log += residuals.accum_den_non_log;
        }
    }

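    // reduce the vector residual accumulators into the scalar totals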
    accum_num_log += _mm512_reduce_add_epi64(residuals.maccum_num_log);
    accum_den_log += _mm512_reduce_add_epi64(residuals.maccum_den_log);
    accum_num_non_log += _mm512_reduce_add_epi64(residuals.maccum_num_non_log);
    accum_den_non_log += _mm512_reduce_add_epi64(residuals.maccum_den_non_log);


    /**
     * In floating point there are two kinds of numerator and denominator scores:
     * 1. num = 1 - sigma1_sq * constant, den = 1, when sigma1_sq < 2, with constant = 4/(255*255)
     * 2. num = log2(((sigma2_sq+2)*sigma1_sq) / ((sigma2_sq+2)*sigma1_sq - sigma12*sigma12)), den = log2(1 + sigma1_sq/2), otherwise
     *
     * In fixed point, separate accumulators are used for the non-log and the log-based scores.
     * For the non-log numerator, only sigma1_sq * constant is accumulated in fixed point;
     * the log-based values are accumulated separately.
     * When the two accumulators are combined, the non-log accumulator is converted so that it is
     * equivalent to 1 - sigma1_sq * constant (the 1's are accumulated with the non-log denominator accumulator).
     */
    // the log accumulators have to be divided by 2048, since log_value = log2(i*2048) for i = 16384 to 65535
    //num[0] = accum_num_log / 2048.0 + (accum_den_non_log - (accum_num_non_log / 65536.0) / (255.0*255.0));
    //den[0] = accum_den_log / 2048.0 + accum_den_non_log;

    // calculation rearranged to increase performance
    num[0] = accum_num_log / 2048.0 + (accum_den_non_log - ((accum_num_non_log) / 16384.0) / (65025.0));
    den[0] = accum_den_log / 2048.0 + accum_den_non_log;
}