void vif_subsample_rd_16_neon()

in libvmaf/src/feature/arm64/vif_neon.c [373:505]


void vif_subsample_rd_16_neon(VifBuffer buf, unsigned int w, unsigned int h, int scale, int bpc)
{
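    // Separable 1-D filtering of the 16-bit ref/dis planes for the next VIF scale:
    // each row is filtered vertically into buf.tmp.ref_convol / buf.tmp.dis_convol,
    // then horizontally into buf.mu1 / buf.mu2, and the result is decimated at the end.
    // uiw15 bounds the main NEON loops: while j < w - 15 a full 16-lane block fits.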
    const unsigned int uiw15 = (w > 15 ? w - 15 : 0);
    const unsigned int fwidth = vif_filter1d_width[scale + 1];
    const uint16_t *vif_filt_s = vif_filter1d_table[scale + 1];
    int32_t add_shift_round_VP, shift_VP;

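    // Rounding term and right shift that bring the vertical accumulators back to
    // 16-bit range: bpc bits at scale 0, 16 bits at the coarser scales.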
    if (scale == 0)
    {
        add_shift_round_VP = 1 << (bpc - 1);
        shift_VP = bpc;
    }
    else
    {
        add_shift_round_VP = 32768;
        shift_VP = 16;
    }

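    // Vector copies of the rounding offsets and shift amounts; the shift counts are
    // negated so the shift macros can shift right via a left shift by a negative count.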
    const uint32x4_t add_shift_round_VP_vec = vdupq_n_u32(add_shift_round_VP);
    const int32x4_t shift_VP_vec = vdupq_n_s32(-shift_VP);
    const uint32x4_t offset_vec_h = vdupq_n_u32(32768);
    const int32x4_t shift_vec_h = vdupq_n_s32(-16);

    const uint16_t *ref = (uint16_t *)buf.ref;
    const uint16_t *dis = (uint16_t *)buf.dis;

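    // Strides in 16-bit elements: stride_v walks the source rows, stride_h the
    // mu1/mu2 output rows; i_dst_stride is the running offset of the current output row.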
    const ptrdiff_t stride_v = buf.stride / sizeof(uint16_t);
    const ptrdiff_t stride_h = buf.stride_16 / sizeof(uint16_t);
    ptrdiff_t i_dst_stride = 0;

    for (unsigned i = 0; i < h; ++i, i_dst_stride += stride_h)
    {

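        // Row index of the first (topmost) filter tap; negative near the top of the
        // frame, which relies on buf.ref / buf.dis pointing into vertically padded planes.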
        const int ii = (int)i - (int)(fwidth / 2);
        const uint16_t *p_ref = ref + ii * stride_v;
        const uint16_t *p_dis = dis + ii * stride_v;

        // VERTICAL Neon
        unsigned int j = 0;
        for (; j < uiw15; j += 16, p_ref += 16, p_dis += 16)
        {
            uint16x8_t ref_vec_16u_l = vld1q_u16(p_ref);
            uint16x8_t ref_vec_16u_h = vld1q_u16(p_ref + 8);
            uint16x8_t dis_vec_16u_l = vld1q_u16(p_dis);
            uint16x8_t dis_vec_16u_h = vld1q_u16(p_dis + 8);

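            // Seed the widened 32-bit accumulators (lo/hi halves of both 8-lane loads)
            // with the rounding offset plus the first tap times the first source row.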
            NEON_FILTER_INSTANCE_U32X4_INIT_MULL_U16X4_WITH_CONST_LO_HI_LH(accum_f_ref, add_shift_round_VP_vec, ref_vec_16u, vif_filt_s[0]);
            NEON_FILTER_INSTANCE_U32X4_INIT_MULL_U16X4_WITH_CONST_LO_HI_LH(accum_f_dis, add_shift_round_VP_vec, dis_vec_16u, vif_filt_s[0]);

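            // Accumulate the remaining taps, one source row per coefficient.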
            const uint16_t *pp_ref = p_ref + stride_v;
            const uint16_t *pp_dis = p_dis + stride_v;
            for (unsigned fi = 1; fi < fwidth; ++fi, pp_ref += stride_v, pp_dis += stride_v)
            {
                ref_vec_16u_l = vld1q_u16(pp_ref);
                ref_vec_16u_h = vld1q_u16(pp_ref + 8);
                dis_vec_16u_l = vld1q_u16(pp_dis);
                dis_vec_16u_h = vld1q_u16(pp_dis + 8);

                NEON_FILTER_UPDATE_ACCUM_U32X4_WITH_CONST_LO_HI_LH(accum_f_ref, ref_vec_16u, vif_filt_s[fi]);
                NEON_FILTER_UPDATE_ACCUM_U32X4_WITH_CONST_LO_HI_LH(accum_f_dis, dis_vec_16u, vif_filt_s[fi]);
            }

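            // Shift the accumulators right by shift_VP and store the 16 vertically
            // filtered values for this block of columns.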
            NEON_FILTER_SHIFT_STORE_U32X4_HI_LO_LH(accum_f_ref, shift_VP_vec, buf.tmp.ref_convol + j);
            NEON_FILTER_SHIFT_STORE_U32X4_HI_LO_LH(accum_f_dis, shift_VP_vec, buf.tmp.dis_convol + j);
        }

        // Scalar fallback for the vertical leftover columns (w not a multiple of 16).
        for (; j < w; ++j)
        {
            uint32_t accum_ref = 0;
            uint32_t accum_dis = 0;
            for (unsigned fi = 0; fi < fwidth; ++fi)
            {
                const int ii_check = ii + (int)fi;
                const uint16_t fcoeff = vif_filt_s[fi];
                accum_ref += fcoeff * ((uint32_t)ref[ii_check * stride_v + j]);
                accum_dis += fcoeff * ((uint32_t)dis[ii_check * stride_v + j]);
            }
            buf.tmp.ref_convol[j] = (uint16_t)((accum_ref + add_shift_round_VP) >> shift_VP);
            buf.tmp.dis_convol[j] = (uint16_t)((accum_dis + add_shift_round_VP) >> shift_VP);
        }

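        // Extend buf.tmp.ref_convol / dis_convol past both ends of the row so the
        // horizontal filter below can read fwidth/2 samples beyond each edge.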
        PADDING_SQ_DATA_2(buf, w, fwidth / 2);

        // HORIZONTAL
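        // The window pointers start fwidth/2 samples to the left of column 0 so that
        // column j sits at the centre of each filter window.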
        uint32_t *pRefConv = (uint32_t *)buf.tmp.ref_convol - (fwidth / 2);
        uint32_t *pDisConv = (uint32_t *)buf.tmp.dis_convol - (fwidth / 2);

        j = 0;
        for (; j < uiw15; j += 16, pRefConv += 16, pDisConv += 16)
        {
            NEON_FILTER_INSTANCE_AND_LOAD_U32X4_LU4(ref_conv_vec_u32, pRefConv);
            NEON_FILTER_INSTANCE_AND_LOAD_U32X4_LU4(dis_conv_vec_u32, pDisConv);

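            // Seed the horizontal accumulators with the 32768 rounding offset plus
            // the first tap applied to the left edge of the window.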
            NEON_FILTER_INSTANCE_U32X4_MULL_U32X4_WITH_CONST_LU4(accum_ref_conv, offset_vec_h, ref_conv_vec_u32, vif_filt_s[0]);
            NEON_FILTER_INSTANCE_U32X4_MULL_U32X4_WITH_CONST_LU4(accum_dis_conv, offset_vec_h, dis_conv_vec_u32, vif_filt_s[0]);

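            // Remaining taps: reload the window shifted right by fj and accumulate.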
            for (unsigned fj = 1; fj < fwidth; ++fj)
            {
                NEON_FILTER_LOAD_U32X4_LU4(ref_conv_vec_u32, pRefConv + fj);
                NEON_FILTER_LOAD_U32X4_LU4(dis_conv_vec_u32, pDisConv + fj);

                NEON_FILTER_UPDATE_U32X4_ACCUM_MULL_U32X4_WITH_CONST_LU4(accum_ref_conv, ref_conv_vec_u32, vif_filt_s[fj]);
                NEON_FILTER_UPDATE_U32X4_ACCUM_MULL_U32X4_WITH_CONST_LU4(accum_dis_conv, dis_conv_vec_u32, vif_filt_s[fj]);
            }

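            // Shift right by 16, narrow to 16 bits, and store this block of the
            // filtered row into mu1 / mu2.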
            NEON_FILTER_SHIFT_UNZIP_STORE_U32X4_TO_U16X8_LU2(accum_ref_conv, shift_vec_h, buf.mu1 + i_dst_stride + j);
            NEON_FILTER_SHIFT_UNZIP_STORE_U32X4_TO_U16X8_LU2(accum_dis_conv, shift_vec_h, buf.mu2 + i_dst_stride + j);
        }

        // Scalar fallback for the horizontal leftover columns.
        for (; j < w; ++j)
        {
            uint32_t accum_ref = 32768;
            uint32_t accum_dis = 32768;
            const int jj = (int)j - (int)(fwidth / 2);
            for (unsigned fj = 0; fj < fwidth; ++fj)
            {
                const int jj_check = jj + (int)fj;
                const uint16_t fcoeff = vif_filt_s[fj];
                accum_ref += fcoeff * buf.tmp.ref_convol[jj_check];
                accum_dis += fcoeff * buf.tmp.dis_convol[jj_check];
            }
            buf.mu1[i_dst_stride + j] = (uint16_t)(accum_ref >> 16);
            buf.mu2[i_dst_stride + j] = (uint16_t)(accum_dis >> 16);
        }
    }

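    // Downsample the filtered planes for the next scale and restore their padding.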
    decimate_and_pad(buf, w, h, scale);
}