in libvmaf/src/feature/arm64/vif_neon.c [373:505]
void vif_subsample_rd_16_neon(VifBuffer buf, unsigned int w, unsigned int h, int scale, int bpc)
{
const unsigned int uiw15 = (w > 15 ? w - 15 : 0);
const unsigned int fwidth = vif_filter1d_width[scale + 1];
const uint16_t *vif_filt_s = vif_filter1d_table[scale + 1];
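// The filter of the next scale (scale + 1) is used: this pass smooths the current scale before it is decimated.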
int32_t add_shift_round_VP, shift_VP;
if (scale == 0)
{
add_shift_round_VP = 1 << (bpc - 1);
shift_VP = bpc;
}
else
{
add_shift_round_VP = 32768;
shift_VP = 16;
}
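// Vertical-pass rounding offset and right shift: 2^(bpc-1) and bpc at scale 0 (raw bpc-bit samples),
// 32768 and 16 at coarser scales (16-bit filtered samples).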
const uint32x4_t add_shift_round_VP_vec = vdupq_n_u32(add_shift_round_VP);
const int32x4_t shift_VP_vec = vdupq_n_s32(-shift_VP);
const uint32x4_t offset_vec_h = vdupq_n_u32(32768);
const int32x4_t shift_vec_h = vdupq_n_s32(-16);
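// The horizontal pass always rounds with 32768 and shifts down by 16, narrowing the result back to 16 bits.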
const uint16_t *ref = (uint16_t *)buf.ref;
const uint16_t *dis = (uint16_t *)buf.dis;
const ptrdiff_t stride_v = buf.stride / sizeof(uint16_t);
const ptrdiff_t stride_h = buf.stride_16 / sizeof(uint16_t);
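// stride_v: element stride of the input ref/dis planes; stride_h: element stride of the 16-bit mu1/mu2 output planes.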
ptrdiff_t i_dst_stride = 0;
for (unsigned i = 0; i < h; ++i, i_dst_stride += stride_h)
{
int ii = (int)i - (int)(fwidth / 2);
const uint16_t *p_ref = ref + ii * stride_v;
const uint16_t *p_dis = dis + ii * stride_v;
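// ii is the topmost row of the vertical filter window centred on row i; it may be negative,
// in which case the rows outside [0, h) are assumed to be pre-padded by the caller.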
// VERTICAL Neon
unsigned int j = 0;
for (; j < uiw15; j += 16, p_ref += 16, p_dis += 16)
{
uint16x8_t ref_vec_16u_l = vld1q_u16(p_ref);
uint16x8_t ref_vec_16u_h = vld1q_u16(p_ref + 8);
uint16x8_t dis_vec_16u_l = vld1q_u16(p_dis);
uint16x8_t dis_vec_16u_h = vld1q_u16(p_dis + 8);
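// Seed four u32x4 accumulators per plane (low/high halves of both 8-lane loads) with the rounding
// offset plus the first tap, roughly accum = add_shift_round_VP_vec + vmull_n_u16(half, vif_filt_s[0])
// (a sketch; the macro's exact expansion may differ).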
NEON_FILTER_INSTANCE_U32X4_INIT_MULL_U16X4_WITH_CONST_LO_HI_LH(accum_f_ref, add_shift_round_VP_vec, ref_vec_16u, vif_filt_s[0]);
NEON_FILTER_INSTANCE_U32X4_INIT_MULL_U16X4_WITH_CONST_LO_HI_LH(accum_f_dis, add_shift_round_VP_vec, dis_vec_16u, vif_filt_s[0]);
const uint16_t *pp_ref = p_ref + stride_v;
const uint16_t *pp_dis = p_dis + stride_v;
for (unsigned fi = 1; fi < fwidth; ++fi, pp_ref += stride_v, pp_dis += stride_v)
{
ref_vec_16u_l = vld1q_u16(pp_ref);
ref_vec_16u_h = vld1q_u16(pp_ref + 8);
dis_vec_16u_l = vld1q_u16(pp_dis);
dis_vec_16u_h = vld1q_u16(pp_dis + 8);
NEON_FILTER_UPDATE_ACCUM_U32X4_WITH_CONST_LO_HI_LH(accum_f_ref, ref_vec_16u, vif_filt_s[fi]);
NEON_FILTER_UPDATE_ACCUM_U32X4_WITH_CONST_LO_HI_LH(accum_f_dis, dis_vec_16u, vif_filt_s[fi]);
}
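// Shift each accumulator right by shift_VP (vshl with a negative count) and store the 16
// vertically filtered ref/dis values into the temporary row buffers.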
NEON_FILTER_SHIFT_STORE_U32X4_HI_LO_LH(accum_f_ref, shift_VP_vec, buf.tmp.ref_convol + j);
NEON_FILTER_SHIFT_STORE_U32X4_HI_LO_LH(accum_f_dis, shift_VP_vec, buf.tmp.dis_convol + j);
}
// Scalar code for Vertical leftover.
for (; j < w; ++j)
{
uint32_t accum_ref = 0;
uint32_t accum_dis = 0;
for (unsigned fi = 0; fi < fwidth; ++fi)
{
int ii_check = ii + (int)fi;
const uint16_t fcoeff = vif_filt_s[fi];
accum_ref += fcoeff * ((uint32_t)ref[ii_check * stride_v + j]);
accum_dis += fcoeff * ((uint32_t)dis[ii_check * stride_v + j]);
}
buf.tmp.ref_convol[j] = (uint16_t)((accum_ref + add_shift_round_VP) >> shift_VP);
buf.tmp.dis_convol[j] = (uint16_t)((accum_dis + add_shift_round_VP) >> shift_VP);
}
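// Pad the temporary convolution rows beyond [0, w) so the horizontal pass can read fwidth / 2 samples past each edge.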
PADDING_SQ_DATA_2(buf, w, fwidth / 2);
// HORIZONTAL
uint32_t *pRefConv = (uint32_t *)buf.tmp.ref_convol - (fwidth / 2);
uint32_t *pDisConv = (uint32_t *)buf.tmp.dis_convol - (fwidth / 2);
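// Start fwidth / 2 entries before the row so that pRefConv[fj] addresses the sample for tap fj at output column j.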
j = 0;
for (; j < uiw15; j += 16, pRefConv += 16, pDisConv += 16)
{
NEON_FILTER_INSTANCE_AND_LOAD_U32X4_LU4(ref_conv_vec_u32, pRefConv);
NEON_FILTER_INSTANCE_AND_LOAD_U32X4_LU4(dis_conv_vec_u32, pDisConv);
NEON_FILTER_INSTANCE_U32X4_MULL_U32X4_WITH_CONST_LU4(accum_ref_conv, offset_vec_h, ref_conv_vec_u32, vif_filt_s[0]);
NEON_FILTER_INSTANCE_U32X4_MULL_U32X4_WITH_CONST_LU4(accum_dis_conv, offset_vec_h, dis_conv_vec_u32, vif_filt_s[0]);
for (unsigned fj = 1; fj < fwidth; ++fj)
{
NEON_FILTER_LOAD_U32X4_LU4(ref_conv_vec_u32, pRefConv + fj);
NEON_FILTER_LOAD_U32X4_LU4(dis_conv_vec_u32, pDisConv + fj);
NEON_FILTER_UPDATE_U32X4_ACCUM_MULL_U32X4_WITH_CONST_LU4(accum_ref_conv, ref_conv_vec_u32, vif_filt_s[fj]);
NEON_FILTER_UPDATE_U32X4_ACCUM_MULL_U32X4_WITH_CONST_LU4(accum_dis_conv, dis_conv_vec_u32, vif_filt_s[fj]);
}
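// Shift down by 16, narrow to u16 and store 16 horizontally filtered means into mu1/mu2.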
NEON_FILTER_SHIFT_UNZIP_STORE_U32X4_TO_U16X8_LU2(accum_ref_conv, shift_vec_h, buf.mu1 + i_dst_stride + j);
NEON_FILTER_SHIFT_UNZIP_STORE_U32X4_TO_U16X8_LU2(accum_dis_conv, shift_vec_h, buf.mu2 + i_dst_stride + j);
}
// Scalar code for Horizontal leftover.
for (; j < w; ++j)
{
uint32_t accum_ref = 32768;
uint32_t accum_dis = 32768;
for (unsigned fj = 0; fj < fwidth; ++fj)
{
int jj = (int)j - (int)(fwidth / 2);
int jj_check = jj + (int)fj;
const uint16_t fcoeff = vif_filt_s[fj];
accum_ref += fcoeff * buf.tmp.ref_convol[jj_check];
accum_dis += fcoeff * buf.tmp.dis_convol[jj_check];
}
buf.mu1[i_dst_stride + j] = (uint16_t)(accum_ref >> 16);
buf.mu2[i_dst_stride + j] = (uint16_t)(accum_dis >> 16);
}
}
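// Downsample the filtered planes for the next scale and pad their borders.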
decimate_and_pad(buf, w, h, scale);
}