void vif_subsample_rd_8_avx512()

in libvmaf/src/feature/x86/vif_avx512.c [759:1103]


/*
 * AVX-512 separable filter + subsample step for 8-bit input, using filter
 * index 1 of vif_filter1d_table.
 *
 * For every output row i:
 *   1. Vertical pass: the ref/dis bytes of rows (i - fwidth/2) onward are
 *      weighted with the filter coefficients, rounded with (+128) >> 8 and
 *      written to buf.tmp.ref_convol / buf.tmp.dis_convol.
 *   2. PADDING_SQ_DATA_2 is applied to those temporary rows.
 *   3. Horizontal pass: the temporary rows are filtered again, rounded with
 *      (+32768) >> 16 and written as 16-bit values to row i of buf.mu1 (ref)
 *      and buf.mu2 (dis).
 * Afterwards decimate_and_pad(buf, w, h, 0) is called.
 *
 * buf  working buffers; ref/dis are read as uint8_t with a row pitch of
 *      buf.stride, mu1/mu2 are written with a pitch of
 *      buf.stride_16 / sizeof(uint16_t) elements.
 * w,h  number of columns / rows processed.
 *
 * NOTE(review): rows with negative index and entries of the temporary rows
 * left of column 0 / right of column w-1 are read; this presumably relies on
 * padding supplied by the caller and by PADDING_SQ_DATA_2 -- confirm.
 * NOTE(review): the vector paths are hard-wired to a 9-tap symmetric filter.
 * The vertical vector path additionally multiplies a 10th row by
 * vif_filt_s1[9]; it only agrees with the scalar tail if that entry is zero
 * -- presumably the table row is zero padded, confirm against the table.
 */
void vif_subsample_rd_8_avx512(VifBuffer buf, unsigned w, unsigned h)
{
    /* Fixed tap counts assumed by the vectorised loops. */
    enum { VERT_TAP_PAIRS = 5, HORZ_TAPS = 9, HORZ_UNIQUE_TAPS = 5 };

    const unsigned fwidth = vif_filter1d_width[1];
    const uint16_t *vif_filt_s1 = vif_filter1d_table[1];
    const uint8_t *ref = (uint8_t *)buf.ref;
    const uint8_t *dis = (uint8_t *)buf.dis;
    const ptrdiff_t stride = buf.stride_16 / sizeof(uint16_t);
    int fwidth_half = fwidth >> 1;

    /* Rounding offsets: 2^7 ahead of the >> 8, 2^15 ahead of the >> 16. */
    const __m512i round_vert = _mm512_set1_epi32(128);
    const __m512i round_horz = _mm512_set1_epi32(32768);

    /*
     * 16-bit gather indices for _mm512_permutex2var_epi16: they collect the
     * low half-word of every second 32-bit accumulator element from the
     * lo/hi accumulators, giving 16 results in column order in the low 256
     * bits (the upper 256 bits repeat the pattern and are discarded).
     * Built from 32-bit pairs, i.e. (odd index << 16) | even index.
     */
    const __m512i pack_idx = _mm512_set_epi32(
        (60 << 16) | 56, (28 << 16) | 24, (52 << 16) | 48, (20 << 16) | 16,
        (44 << 16) | 40, (12 << 16) |  8, (36 << 16) | 32, ( 4 << 16) |  0,
        (60 << 16) | 56, (28 << 16) | 24, (52 << 16) | 48, (20 << 16) | 16,
        (44 << 16) | 40, (12 << 16) |  8, (36 << 16) | 32, ( 4 << 16) |  0);

    /*
     * 64-bit gather indices that undo the per-128-bit-lane split of
     * unpacklo/unpackhi: the first yields columns 0..15, the second 16..31.
     */
    const __m512i cols_00_15 = _mm512_set_epi64(11, 10, 3, 2, 9, 8, 1, 0);
    const __m512i cols_16_31 = _mm512_set_epi64(15, 14, 7, 6, 13, 12, 5, 4);

    /* Vertical taps: each vector repeats one (coef[2k], coef[2k+1]) pair. */
    __m512i vert_pair[VERT_TAP_PAIRS];
    for (int k = 0; k < VERT_TAP_PAIRS; ++k)
    {
        vert_pair[k] = _mm512_broadcastd_epi32(
            _mm_loadu_si128((const __m128i *)(vif_filt_s1 + 2 * k)));
    }

    /* Horizontal taps: each vector repeats the single coefficient coef[k]. */
    __m512i horz_tap[HORZ_UNIQUE_TAPS];
    for (int k = 0; k < HORZ_UNIQUE_TAPS; ++k)
    {
        horz_tap[k] = _mm512_broadcastw_epi16(
            _mm_loadu_si128((const __m128i *)(vif_filt_s1 + k)));
    }

    /* Column counts covered by the 32-wide and 16-wide vector loops. */
    const int vec_cols_32 = (int)(w >> 5) << 5;
    const int vec_cols_16 = (int)(w >> 4) << 4;

    for (unsigned i = 0; i < h; ++i)
    {
        /* First source row of the vertical window (may be negative). */
        const int top_row = i - fwidth_half;

        /* ---- vertical pass, 32 columns per iteration ---- */
        for (int j = 0; j < vec_cols_32; j += 32)
        {
            __m512i ref_lo = _mm512_setzero_si512();
            __m512i ref_hi = _mm512_setzero_si512();
            __m512i dis_lo = _mm512_setzero_si512();
            __m512i dis_hi = _mm512_setzero_si512();

            for (int k = 0; k < VERT_TAP_PAIRS; ++k)
            {
                const int row_a = top_row + 2 * k;
                const int row_b = top_row + 2 * k + 1;

                /* Widen 32 bytes of two consecutive rows to 16 bit. */
                const __m512i ref_a = _mm512_cvtepu8_epi16(_mm256_loadu_si256(
                    (const __m256i *)(ref + buf.stride * row_a + j)));
                const __m512i ref_b = _mm512_cvtepu8_epi16(_mm256_loadu_si256(
                    (const __m256i *)(ref + buf.stride * row_b + j)));
                const __m512i dis_a = _mm512_cvtepu8_epi16(_mm256_loadu_si256(
                    (const __m256i *)(dis + buf.stride * row_a + j)));
                const __m512i dis_b = _mm512_cvtepu8_epi16(_mm256_loadu_si256(
                    (const __m256i *)(dis + buf.stride * row_b + j)));

                /*
                 * Interleave the two rows so that one madd computes
                 * a * coef[2k] + b * coef[2k+1] per column.
                 */
                dis_lo = _mm512_add_epi32(
                    dis_lo, _mm512_madd_epi16(_mm512_unpacklo_epi16(dis_a, dis_b), vert_pair[k]));
                dis_hi = _mm512_add_epi32(
                    dis_hi, _mm512_madd_epi16(_mm512_unpackhi_epi16(dis_a, dis_b), vert_pair[k]));
                ref_lo = _mm512_add_epi32(
                    ref_lo, _mm512_madd_epi16(_mm512_unpacklo_epi16(ref_a, ref_b), vert_pair[k]));
                ref_hi = _mm512_add_epi32(
                    ref_hi, _mm512_madd_epi16(_mm512_unpackhi_epi16(ref_a, ref_b), vert_pair[k]));
            }

            /* Restore column order, then round and drop 8 fraction bits. */
            const __m512i ref_first = _mm512_srli_epi32(
                _mm512_add_epi32(round_vert, _mm512_permutex2var_epi64(ref_lo, cols_00_15, ref_hi)), 0x08);
            const __m512i ref_second = _mm512_srli_epi32(
                _mm512_add_epi32(round_vert, _mm512_permutex2var_epi64(ref_lo, cols_16_31, ref_hi)), 0x08);
            const __m512i dis_first = _mm512_srli_epi32(
                _mm512_add_epi32(round_vert, _mm512_permutex2var_epi64(dis_lo, cols_00_15, dis_hi)), 0x08);
            const __m512i dis_second = _mm512_srli_epi32(
                _mm512_add_epi32(round_vert, _mm512_permutex2var_epi64(dis_lo, cols_16_31, dis_hi)), 0x08);

            _mm512_storeu_si512((__m512i *)(buf.tmp.ref_convol + j), ref_first);
            _mm512_storeu_si512((__m512i *)(buf.tmp.ref_convol + j + 16), ref_second);
            _mm512_storeu_si512((__m512i *)(buf.tmp.dis_convol + j), dis_first);
            _mm512_storeu_si512((__m512i *)(buf.tmp.dis_convol + j + 16), dis_second);
        }

        /* ---- vertical pass, scalar tail for the remaining columns ---- */
        for (unsigned j = vec_cols_32; j < w; ++j)
        {
            uint32_t accum_ref = 0;
            uint32_t accum_dis = 0;
            for (unsigned fi = 0; fi < fwidth; ++fi)
            {
                const int src_row = top_row + fi;
                const uint16_t coeff = vif_filt_s1[fi];
                accum_ref += coeff * (uint32_t)ref[src_row * buf.stride + j];
                accum_dis += coeff * (uint32_t)dis[src_row * buf.stride + j];
            }
            buf.tmp.ref_convol[j] = (accum_ref + 128) >> 8;
            buf.tmp.dis_convol[j] = (accum_dis + 128) >> 8;
        }

        /* Presumably fills the border entries of the temporary rows that
         * the horizontal pass reads outside [0, w) -- confirm in the macro. */
        PADDING_SQ_DATA_2(buf, w, fwidth_half);

        /* ---- horizontal pass, 16 columns per iteration ---- */
        for (int j = 0; j < vec_cols_16; j += 16)
        {
            const int left_col = j - fwidth_half;
            __m512i ref_lo = _mm512_setzero_si512();
            __m512i ref_hi = _mm512_setzero_si512();
            __m512i dis_lo = _mm512_setzero_si512();
            __m512i dis_hi = _mm512_setzero_si512();

            for (int k = 0; k < HORZ_TAPS; ++k)
            {
                /* Symmetric filter: taps 5..8 reuse coefficients 3..0. */
                const __m512i tap =
                    horz_tap[(k < HORZ_UNIQUE_TAPS) ? k : (HORZ_TAPS - 1 - k)];

                const __m512i ref_in = _mm512_loadu_si512(
                    (const __m512i *)(buf.tmp.ref_convol + left_col + k));
                const __m512i dis_in = _mm512_loadu_si512(
                    (const __m512i *)(buf.tmp.dis_convol + left_col + k));

                /*
                 * Unsigned 16x16 -> 32 bit product rebuilt from its low and
                 * high halves; the inputs are loaded as 16 elements per
                 * vector, so only every second 32-bit sum is meaningful and
                 * pack_idx selects exactly those below.
                 */
                const __m512i ref_prod_hi = _mm512_mulhi_epu16(ref_in, tap);
                const __m512i ref_prod_lo = _mm512_mullo_epi16(ref_in, tap);
                ref_lo = _mm512_add_epi32(ref_lo, _mm512_unpacklo_epi16(ref_prod_lo, ref_prod_hi));
                ref_hi = _mm512_add_epi32(ref_hi, _mm512_unpackhi_epi16(ref_prod_lo, ref_prod_hi));

                const __m512i dis_prod_hi = _mm512_mulhi_epu16(dis_in, tap);
                const __m512i dis_prod_lo = _mm512_mullo_epi16(dis_in, tap);
                dis_lo = _mm512_add_epi32(dis_lo, _mm512_unpacklo_epi16(dis_prod_lo, dis_prod_hi));
                dis_hi = _mm512_add_epi32(dis_hi, _mm512_unpackhi_epi16(dis_prod_lo, dis_prod_hi));
            }

            /* Round and drop 16 fraction bits. */
            dis_lo = _mm512_srli_epi32(_mm512_add_epi32(dis_lo, round_horz), 0x10);
            dis_hi = _mm512_srli_epi32(_mm512_add_epi32(dis_hi, round_horz), 0x10);
            ref_lo = _mm512_srli_epi32(_mm512_add_epi32(ref_lo, round_horz), 0x10);
            ref_hi = _mm512_srli_epi32(_mm512_add_epi32(ref_hi, round_horz), 0x10);

            /* Pack the 16 results per signal into the low 256 bits. */
            const __m512i dis_packed = _mm512_permutex2var_epi16(dis_lo, pack_idx, dis_hi);
            const __m512i ref_packed = _mm512_permutex2var_epi16(ref_lo, pack_idx, ref_hi);

            _mm256_storeu_si256((__m256i *)(buf.mu1 + i * stride + j), _mm512_castsi512_si256(ref_packed));
            _mm256_storeu_si256((__m256i *)(buf.mu2 + i * stride + j), _mm512_castsi512_si256(dis_packed));
        }

        /* ---- horizontal pass, scalar tail for the remaining columns ---- */
        for (unsigned j = vec_cols_16; j < w; ++j)
        {
            uint32_t accum_ref = 0;
            uint32_t accum_dis = 0;
            const int left_col = j - fwidth_half;
            for (unsigned fj = 0; fj < fwidth; ++fj)
            {
                const int src_col = left_col + fj;
                const uint16_t coeff = vif_filt_s1[fj];
                accum_ref += coeff * buf.tmp.ref_convol[src_col];
                accum_dis += coeff * buf.tmp.dis_convol[src_col];
            }
            buf.mu1[i * stride + j] = (uint16_t)((accum_ref + 32768) >> 16);
            buf.mu2[i * stride + j] = (uint16_t)((accum_dis + 32768) >> 16);
        }
    }

    decimate_and_pad(buf, w, h, 0);
}