in libvmaf/src/feature/arm64/vif_neon.c [508:772]
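// Scale-0 VIF statistic for 8-bit input: a separable 1-D filter is applied
// vertically and then horizontally to ref, dis and their second moments
// (ref*ref, dis*dis, ref*dis), and the per-pixel numerator/denominator
// contributions are accumulated in fixed point.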
void vif_statistic_8_neon(struct VifPublicState *s, float *num, float *den, unsigned w, unsigned h)
{
const unsigned int uiw15 = (w > 15 ? w - 15 : 0);
const unsigned int uiw7 = (w > 7 ? w - 7 : 0);
const unsigned int fwidth = vif_filter1d_width[0];
const uint16_t *vif_filt_s0 = vif_filter1d_table[0];
VifBuffer buf = s->buf;
uint16_t *log2_table = s->log2_table;
double vif_enhn_gain_limit = s->vif_enhn_gain_limit;
int64_t accum_num_log = 0;
int64_t accum_den_log = 0;
int64_t accum_num_non_log = 0;
int64_t accum_den_non_log = 0;
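// Fixed-point variance floor: 2.0 in Q16 (65536 represents 1.0), the
// integer counterpart of the float path's sigma_nsq = 2.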
static const int32_t sigma_nsq = 65536 << 1;
const uint8_t *ref = (uint8_t *)buf.ref;
const uint8_t *dis = (uint8_t *)buf.dis;
const ptrdiff_t dst_stride = buf.stride_32 / sizeof(uint32_t);
ptrdiff_t i_dst_stride = 0;
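// Rounding constants for the two passes: the vertical pass adds 128 and
// shifts right by 8, the horizontal pass adds 32768 and shifts right by
// 16 (vshlq with a negative count performs the right shift).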
const uint32x4_t offset_vec_v = vdupq_n_u32(128);
const int32x4_t shift_vec_v = vdupq_n_s32(-8);
const uint64x2_t offset_vec_h = vdupq_n_u64(32768);
const int64x2_t shift_vec_h = vdupq_n_s64(-16);
int32_t xx[8], yy[8], xy[8];
for (unsigned i = 0; i < h; ++i, i_dst_stride += dst_stride)
{
int ii = i - fwidth / 2;
const uint8_t *p_ref = ref + ii * buf.stride;
const uint8_t *p_dis = dis + ii * buf.stride;
// Vertical filter pass (NEON, 16 pixels per iteration).
unsigned int j = 0;
for (; j < uiw15; j += 16, p_ref += 16, p_dis += 16)
{
NEON_FILTER_LOAD_U8X8_MOVE_TO_U16X8_AND_SQR_LU2(ref_vec_8u, ref_vec_16u, ref_ref_vec_16u, p_ref);
NEON_FILTER_LOAD_U8X8_MOVE_TO_U16X8_AND_SQR_LU2(dis_vec_8u, dis_vec_16u, dis_dis_vec_16u, p_dis);
NEON_FILTER_INSTANCE_U32X4_NO_INIT_MULL_U16X4_WITH_CONST_LO_HI_LU2(accum_f_ref, ref_vec_16u, vif_filt_s0[0]);
NEON_FILTER_INSTANCE_U32X4_NO_INIT_MULL_U16X4_WITH_CONST_LO_HI_LU2(accum_f_dis, dis_vec_16u, vif_filt_s0[0]);
NEON_FILTER_INSTANCE_U32X4_NO_INIT_MULL_U16X4_WITH_CONST_LO_HI_LU2(accum_f_ref_ref, ref_ref_vec_16u, vif_filt_s0[0]);
NEON_FILTER_INSTANCE_U32X4_NO_INIT_MULL_U16X4_WITH_CONST_LO_HI_LU2(accum_f_dis_dis, dis_dis_vec_16u, vif_filt_s0[0]);
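// No LU2 macro covers the ref*dis cross term, so multiply the filtered
// dis lanes by the widened ref samples directly (fcoeff * dis * ref).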
uint32x4_t accum_f_ref_dis_0_l = vmulq_u32(accum_f_dis_0_l, vmovl_u16(vget_low_u16(ref_vec_16u_0)));
uint32x4_t accum_f_ref_dis_0_h = vmulq_u32(accum_f_dis_0_h, vmovl_high_u16(ref_vec_16u_0));
uint32x4_t accum_f_ref_dis_1_l = vmulq_u32(accum_f_dis_1_l, vmovl_u16(vget_low_u16(ref_vec_16u_1)));
uint32x4_t accum_f_ref_dis_1_h = vmulq_u32(accum_f_dis_1_h, vmovl_high_u16(ref_vec_16u_1));
const uint8_t *pp_ref = p_ref + buf.stride;
const uint8_t *pp_dis = p_dis + buf.stride;
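// Remaining fwidth - 1 taps: step down one image row per iteration and
// accumulate into the running sums.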
for (unsigned int fi = 1; fi < fwidth; ++fi, pp_ref += buf.stride, pp_dis += buf.stride)
{
NEON_FILTER_LOAD_U8X8_MOVE_TO_U16X8_AND_SQR_LU2(ref_vec_8u, ref_vec_16u, ref_ref_vec_16u, pp_ref);
NEON_FILTER_LOAD_U8X8_MOVE_TO_U16X8_AND_SQR_LU2(dis_vec_8u, dis_vec_16u, dis_dis_vec_16u, pp_dis);
NEON_FILTER_INSTANCE_U32X4_NO_INIT_MULL_U16X4_WITH_CONST_LO_HI_LU2(f_dis, dis_vec_16u, vif_filt_s0[fi]);
NEON_FILTER_UPDATE_ACCUM_U32X4_WITH_CONST_LO_HI_LU2(accum_f_ref, ref_vec_16u, vif_filt_s0[fi]);
NEON_FILTER_UPDATE_ACCUM_U32X4_WITH_CONST_LO_HI_LU2(accum_f_dis, dis_vec_16u, vif_filt_s0[fi]);
NEON_FILTER_UPDATE_ACCUM_U32X4_WITH_CONST_LO_HI_LU2(accum_f_ref_ref, ref_ref_vec_16u, vif_filt_s0[fi]);
NEON_FILTER_UPDATE_ACCUM_U32X4_WITH_CONST_LO_HI_LU2(accum_f_dis_dis, dis_dis_vec_16u, vif_filt_s0[fi]);
accum_f_ref_dis_0_l = vmlaq_u32(accum_f_ref_dis_0_l, f_dis_0_l, vmovl_u16(vget_low_u16(ref_vec_16u_0)));
accum_f_ref_dis_0_h = vmlaq_u32(accum_f_ref_dis_0_h, f_dis_0_h, vmovl_high_u16(ref_vec_16u_0));
accum_f_ref_dis_1_l = vmlaq_u32(accum_f_ref_dis_1_l, f_dis_1_l, vmovl_u16(vget_low_u16(ref_vec_16u_1)));
accum_f_ref_dis_1_h = vmlaq_u32(accum_f_ref_dis_1_h, f_dis_1_h, vmovl_high_u16(ref_vec_16u_1));
}
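// Round and store the means, and store the raw second-moment sums, into
// the line buffers consumed by the horizontal pass below.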
NEON_FILTER_OFFSET_SHIFT_STORE_U32X4_HI_LO_LU2(accum_f_ref, offset_vec_v, shift_vec_v, buf.tmp.mu1 + j);
NEON_FILTER_OFFSET_SHIFT_STORE_U32X4_HI_LO_LU2(accum_f_dis, offset_vec_v, shift_vec_v, buf.tmp.mu2 + j);
vst1q_u32(buf.tmp.ref + j, accum_f_ref_ref_0_l);
vst1q_u32(buf.tmp.ref + j + 4, accum_f_ref_ref_0_h);
vst1q_u32(buf.tmp.ref + j + 8, accum_f_ref_ref_1_l);
vst1q_u32(buf.tmp.ref + j + 12, accum_f_ref_ref_1_h);
vst1q_u32(buf.tmp.dis + j, accum_f_dis_dis_0_l);
vst1q_u32(buf.tmp.dis + j + 4, accum_f_dis_dis_0_h);
vst1q_u32(buf.tmp.dis + j + 8, accum_f_dis_dis_1_l);
vst1q_u32(buf.tmp.dis + j + 12, accum_f_dis_dis_1_h);
vst1q_u32(buf.tmp.ref_dis + j, accum_f_ref_dis_0_l);
vst1q_u32(buf.tmp.ref_dis + j + 4, accum_f_ref_dis_0_h);
vst1q_u32(buf.tmp.ref_dis + j + 8, accum_f_ref_dis_1_l);
vst1q_u32(buf.tmp.ref_dis + j + 12, accum_f_ref_dis_1_h);
}
// Scalar tail of the vertical pass for columns not covered by the
// 16-wide NEON loop.
for (; j < w; ++j)
{
uint32_t accum_mu1 = 128;
uint32_t accum_mu2 = 128;
uint32_t accum_ref = 0;
uint32_t accum_dis = 0;
uint32_t accum_ref_dis = 0;
for (unsigned fi = 0; fi < fwidth; ++fi)
{
int ii = i - fwidth / 2;
int ii_check = ii + fi;
const uint16_t fcoeff = vif_filt_s0[fi];
const uint8_t *ref = (uint8_t *)buf.ref;
const uint8_t *dis = (uint8_t *)buf.dis;
uint16_t imgcoeff_ref = ref[ii_check * buf.stride + j];
uint16_t imgcoeff_dis = dis[ii_check * buf.stride + j];
uint32_t img_coeff_ref = fcoeff * (uint32_t)imgcoeff_ref;
uint32_t img_coeff_dis = fcoeff * (uint32_t)imgcoeff_dis;
accum_mu1 += img_coeff_ref;
accum_mu2 += img_coeff_dis;
accum_ref += img_coeff_ref * (uint32_t)imgcoeff_ref;
accum_dis += img_coeff_dis * (uint32_t)imgcoeff_dis;
accum_ref_dis += img_coeff_ref * (uint32_t)imgcoeff_dis;
}
buf.tmp.mu1[j] = accum_mu1 >> 8;
buf.tmp.mu2[j] = accum_mu2 >> 8;
buf.tmp.ref[j] = accum_ref;
buf.tmp.dis[j] = accum_dis;
buf.tmp.ref_dis[j] = accum_ref_dis;
}
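// Pad the line buffers beyond both row edges so the horizontal filter
// can read fwidth / 2 samples outside the row.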
PADDING_SQ_DATA(buf, w, fwidth / 2);
// HORIZONTAL: rewind the line-buffer pointers by fwidth / 2 so the filter
// taps are centered on each output column.
uint32_t *pMul1 = (uint32_t *)buf.tmp.mu1 - (fwidth / 2);
uint32_t *pMul2 = (uint32_t *)buf.tmp.mu2 - (fwidth / 2);
uint32_t *pRef = (uint32_t *)buf.tmp.ref - (fwidth / 2);
uint32_t *pDis = (uint32_t *)buf.tmp.dis - (fwidth / 2);
uint32_t *pRefDis = (uint32_t *)buf.tmp.ref_dis - (fwidth / 2);
j = 0;
for (; j < uiw7; j += 8, pMul1 += 8, pMul2 += 8, pDis += 8, pRef += 8, pRefDis += 8)
{
uint32x4_t mul1_vec_u32_0 = vld1q_u32(pMul1);
uint32x4_t mul2_vec_u32_0 = vld1q_u32(pMul2);
uint32x4_t ref_vec_u32_0 = vld1q_u32(pRef);
uint32x4_t dis_vec_u32_0 = vld1q_u32(pDis);
uint32x4_t ref_dis_vec_u32_0 = vld1q_u32(pRefDis);
uint32x4_t mul1_vec_u32_1 = vld1q_u32(pMul1 + 4);
uint32x4_t mul2_vec_u32_1 = vld1q_u32(pMul2 + 4);
uint32x4_t ref_vec_u32_1 = vld1q_u32(pRef + 4);
uint32x4_t dis_vec_u32_1 = vld1q_u32(pDis + 4);
uint32x4_t ref_dis_vec_u32_1 = vld1q_u32(pRefDis + 4);
uint32x4_t accum_mu1_0 = vmulq_n_u32(mul1_vec_u32_0, vif_filt_s0[0]);
uint32x4_t accum_mu2_0 = vmulq_n_u32(mul2_vec_u32_0, vif_filt_s0[0]);
uint32x4_t accum_mu1_1 = vmulq_n_u32(mul1_vec_u32_1, vif_filt_s0[0]);
uint32x4_t accum_mu2_1 = vmulq_n_u32(mul2_vec_u32_1, vif_filt_s0[0]);
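// The second-moment sums need 64-bit accumulators (32-bit data times
// 16-bit taps), while the means still fit in 32 bits.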
NEON_FILTER_INSTANCE_U64X2_INIT_MULL_U32X2_WITH_CONST_LO_HI_LU2(accum_ref, offset_vec_h, ref_vec_u32, vif_filt_s0[0]);
NEON_FILTER_INSTANCE_U64X2_INIT_MULL_U32X2_WITH_CONST_LO_HI_LU2(accum_dis, offset_vec_h, dis_vec_u32, vif_filt_s0[0]);
NEON_FILTER_INSTANCE_U64X2_INIT_MULL_U32X2_WITH_CONST_LO_HI_LU2(accum_ref_dis, offset_vec_h, ref_dis_vec_u32, vif_filt_s0[0]);
for (unsigned fj = 1; fj < fwidth; ++fj)
{
mul1_vec_u32_0 = vld1q_u32(pMul1 + fj);
mul2_vec_u32_0 = vld1q_u32(pMul2 + fj);
ref_vec_u32_0 = vld1q_u32(pRef + fj);
dis_vec_u32_0 = vld1q_u32(pDis + fj);
ref_dis_vec_u32_0 = vld1q_u32(pRefDis + fj);
mul1_vec_u32_1 = vld1q_u32(pMul1 + 4 + fj);
mul2_vec_u32_1 = vld1q_u32(pMul2 + 4 + fj);
ref_vec_u32_1 = vld1q_u32(pRef + 4 + fj);
dis_vec_u32_1 = vld1q_u32(pDis + 4 + fj);
ref_dis_vec_u32_1 = vld1q_u32(pRefDis + 4 + fj);
accum_mu1_0 = vmlaq_n_u32(accum_mu1_0, mul1_vec_u32_0, vif_filt_s0[fj]);
accum_mu2_0 = vmlaq_n_u32(accum_mu2_0, mul2_vec_u32_0, vif_filt_s0[fj]);
accum_mu1_1 = vmlaq_n_u32(accum_mu1_1, mul1_vec_u32_1, vif_filt_s0[fj]);
accum_mu2_1 = vmlaq_n_u32(accum_mu2_1, mul2_vec_u32_1, vif_filt_s0[fj]);
NEON_FILTER_UPDATE_ACCUM_U64X2_WITH_CONST_LO_HI_LU2(accum_ref, ref_vec_u32, vif_filt_s0[fj]);
NEON_FILTER_UPDATE_ACCUM_U64X2_WITH_CONST_LO_HI_LU2(accum_dis, dis_vec_u32, vif_filt_s0[fj]);
NEON_FILTER_UPDATE_ACCUM_U64X2_WITH_CONST_LO_HI_LU2(accum_ref_dis, ref_dis_vec_u32, vif_filt_s0[fj]);
}
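// mu1*mu1, mu2*mu2 and mu1*mu2 as 64-bit products rounded back down to
// 32 bits: (a * b + 2^31) >> 32.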
NEON_FILTER_INSTANCE_U64X2_INIT_MULL_SHIFT_UNZIP_U32X4_LO_HI(mu1_sq_vec_l, vdupq_n_u64(2147483648), accum_mu1_0, accum_mu1_0, vdupq_n_s64(-32));
NEON_FILTER_INSTANCE_U64X2_INIT_MULL_SHIFT_UNZIP_U32X4_LO_HI(mu1_sq_vec_h, vdupq_n_u64(2147483648), accum_mu1_1, accum_mu1_1, vdupq_n_s64(-32));
NEON_FILTER_INSTANCE_U64X2_INIT_MULL_SHIFT_UNZIP_U32X4_LO_HI(mu2_sq_vec_l, vdupq_n_u64(2147483648), accum_mu2_0, accum_mu2_0, vdupq_n_s64(-32));
NEON_FILTER_INSTANCE_U64X2_INIT_MULL_SHIFT_UNZIP_U32X4_LO_HI(mu2_sq_vec_h, vdupq_n_u64(2147483648), accum_mu2_1, accum_mu2_1, vdupq_n_s64(-32));
NEON_FILTER_INSTANCE_U64X2_INIT_MULL_SHIFT_UNZIP_U32X4_LO_HI(mu1_mu2_sq_vec_l, vdupq_n_u64(2147483648), accum_mu1_0, accum_mu2_0, vdupq_n_s64(-32));
NEON_FILTER_INSTANCE_U64X2_INIT_MULL_SHIFT_UNZIP_U32X4_LO_HI(mu1_mu2_sq_vec_h, vdupq_n_u64(2147483648), accum_mu1_1, accum_mu2_1, vdupq_n_s64(-32));
NEON_FILTER_SHIFT_UNZIP_U64X2_TO_U32X4_LO_HI(accum_ref_0, shift_vec_h, xx_filt_vec_l);
NEON_FILTER_SHIFT_UNZIP_U64X2_TO_U32X4_LO_HI(accum_ref_1, shift_vec_h, xx_filt_vec_h);
NEON_FILTER_SHIFT_UNZIP_U64X2_TO_U32X4_LO_HI(accum_dis_0, shift_vec_h, yy_filt_vec_l);
NEON_FILTER_SHIFT_UNZIP_U64X2_TO_U32X4_LO_HI(accum_dis_1, shift_vec_h, yy_filt_vec_h);
NEON_FILTER_SHIFT_UNZIP_U64X2_TO_U32X4_LO_HI(accum_ref_dis_0, shift_vec_h, xy_filt_vec_l);
NEON_FILTER_SHIFT_UNZIP_U64X2_TO_U32X4_LO_HI(accum_ref_dis_1, shift_vec_h, xy_filt_vec_h);
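// Variances and covariance: sigma1_sq = E[x*x] - mu1^2, and likewise for
// sigma2_sq and sigma12. The differences can be slightly negative, hence
// the reinterpretation as signed lanes.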
int32x4_t sigma1_sq_vec_l = vreinterpretq_s32_u32(vsubq_u32(xx_filt_vec_l, mu1_sq_vec_l));
int32x4_t sigma1_sq_vec_h = vreinterpretq_s32_u32(vsubq_u32(xx_filt_vec_h, mu1_sq_vec_h));
int32x4_t sigma2_sq_vec_l = vreinterpretq_s32_u32(vsubq_u32(yy_filt_vec_l, mu2_sq_vec_l));
int32x4_t sigma2_sq_vec_h = vreinterpretq_s32_u32(vsubq_u32(yy_filt_vec_h, mu2_sq_vec_h));
int32x4_t sigma12_vec_l = vreinterpretq_s32_u32(vsubq_u32(xy_filt_vec_l, mu1_mu2_sq_vec_l));
int32x4_t sigma12_vec_h = vreinterpretq_s32_u32(vsubq_u32(xy_filt_vec_h, mu1_mu2_sq_vec_h));
vst1q_s32(xx, sigma1_sq_vec_l);
vst1q_s32(xx + 4, sigma1_sq_vec_h);
vst1q_s32(yy, sigma2_sq_vec_l);
vst1q_s32(yy + 4, sigma2_sq_vec_h);
vst1q_s32(xy, sigma12_vec_l);
vst1q_s32(xy + 4, sigma12_vec_h);
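// The per-pixel contributions need a division and log-table look-ups, so
// the eight lanes are finished in scalar code.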
for (unsigned int b = 0; b < 8; b++) {
int32_t sigma1_sq = xx[b];
int32_t sigma2_sq = yy[b];
int32_t sigma12 = xy[b];
if (sigma1_sq >= sigma_nsq) {
/**
 * log values come from the look-up table generated by log_generate(),
 * which is called from integer_combo_threadfunc.
 * In float, den_val is log2(1 + sigma1_sq / 2); here that is computed as
 * log2(2 + sigma1_sq) - log2(2), i.e. log2(2*65536 + sigma1_sq) - 17 in
 * Q16 fixed point.
 * Results carry a factor of 2048 because the table stores
 * log_value = log2(i) * 2048 for i = 16384..65535; only the top 16 bits
 * of the argument index the table.
 */
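// Worked example: for sigma1_sq == sigma_nsq (i.e. 2.0 in Q16) the
// argument is 4 * 65536 = 2^18, so the term is 2048*18 - 2048*17 = 2048,
// matching 2048 * log2(1 + 2.0/2) from the float formula.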
accum_den_log += log2_32(log2_table, sigma_nsq + sigma1_sq) - 2048 * 17;
if (sigma12 > 0 && sigma2_sq > 0)
{
// num_val = log2f(1.0f + (g * g * sigma1_sq) / (sv_sq + sigma_nsq));
/**
 * In floating point:
 * numerator = log2(1.0f + (g * g * sigma1_sq) / (sv_sq + sigma_nsq)).
 * In fixed point this is computed as
 * numerator = log2((sv_sq + sigma_nsq) + (g * g * sigma1_sq)) - log2(sv_sq + sigma_nsq).
 */
const double eps = 65536 * 1.0e-10;
double g = sigma12 / (sigma1_sq + eps); // this epsilon can go away
int32_t sv_sq = sigma2_sq - g * sigma12;
sv_sq = (uint32_t)(MAX(sv_sq, 0));
g = MIN(g, vif_enhn_gain_limit);
uint32_t numer1 = (sv_sq + sigma_nsq);
int64_t numer1_tmp = (int64_t)((g * g * sigma1_sq)) + numer1; //numerator
accum_num_log += log2_64(log2_table, numer1_tmp) - log2_64(log2_table, numer1);
}
}
else {
accum_num_non_log += sigma2_sq;
accum_den_non_log++;
}
}
}
}
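// Final reduction: the log-domain sums are in units of 2048 * log2, hence
// the 2048.0 divisor; 65025 is 255^2 (the squared 8-bit peak) and the
// 16384.0 presumably undoes the fixed-point scaling of sigma2_sq.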
num[0] = accum_num_log / 2048.0 + (accum_den_non_log - ((accum_num_non_log) / 16384.0) / (65025.0));
den[0] = accum_den_log / 2048.0 + accum_den_non_log;
}