in Extended/libwebp/src/dsp/enc_sse2.c [1220:1330]
// Quantizes a block of 16 coefficients using SSE2.
//   in:      the 16 input coefficients. On return they are overwritten with
//            the reconstructed values (quantized level * mtx->q_[]).
//   out:     receives the 16 signed quantized levels, in zigzag order.
//   sharpen: optional table of 16 offsets added to the coefficient magnitudes
//            before quantization. May be NULL, in which case nothing is added.
//   mtx:     supplies the iq_[], bias_[] and q_[] tables.
// Returns 1 if at least one quantized level is non-zero, 0 otherwise.
static WEBP_INLINE int DoQuantizeBlock_SSE2(int16_t in[16], int16_t out[16],
                                            const uint16_t* const sharpen,
                                            const VP8Matrix* const mtx) {
  const __m128i kZero = _mm_setzero_si128();
  const __m128i kMaxLevel = _mm_set1_epi16(MAX_LEVEL);

  // The block is handled as two vectors of eight 16-bit lanes:
  // '_lo' covers in[0..7] and '_hi' covers in[8..15].
  const __m128i src_lo = _mm_loadu_si128((__m128i*)&in[0]);
  const __m128i src_hi = _mm_loadu_si128((__m128i*)&in[8]);

  // Per-lane sign mask: 0xffff where the input is negative, 0x0000 otherwise.
  const __m128i neg_lo = _mm_cmpgt_epi16(kZero, src_lo);
  const __m128i neg_hi = _mm_cmpgt_epi16(kZero, src_hi);

  // Magnitude: abs(x) = (x ^ sign) - sign.
  __m128i mag_lo = _mm_sub_epi16(_mm_xor_si128(src_lo, neg_lo), neg_lo);
  __m128i mag_hi = _mm_sub_epi16(_mm_xor_si128(src_hi, neg_hi), neg_hi);

  __m128i level_lo, level_hi;   // quantized levels, natural order
  __m128i all_levels;           // the 16 zigzagged levels packed as bytes

  // Optional sharpening: magnitude += sharpen[].
  if (sharpen != NULL) {
    const __m128i sharpen_lo = _mm_loadu_si128((const __m128i*)&sharpen[0]);
    const __m128i sharpen_hi = _mm_loadu_si128((const __m128i*)&sharpen[8]);
    mag_lo = _mm_add_epi16(mag_lo, sharpen_lo);
    mag_hi = _mm_add_epi16(mag_hi, sharpen_hi);
  }

  // level = min((magnitude * iq + bias) >> QFIX, MAX_LEVEL)
  {
    const __m128i iq_lo = _mm_loadu_si128((const __m128i*)&mtx->iq_[0]);
    const __m128i iq_hi = _mm_loadu_si128((const __m128i*)&mtx->iq_[8]);
    const __m128i bias_0 = _mm_loadu_si128((const __m128i*)&mtx->bias_[0]);
    const __m128i bias_4 = _mm_loadu_si128((const __m128i*)&mtx->bias_[4]);
    const __m128i bias_8 = _mm_loadu_si128((const __m128i*)&mtx->bias_[8]);
    const __m128i bias_12 = _mm_loadu_si128((const __m128i*)&mtx->bias_[12]);

    // The 16b x 16b unsigned products are needed with 32b precision. SSE2
    // yields them as separate low and high 16-bit halves...
    const __m128i prod_lo_l = _mm_mullo_epi16(mag_lo, iq_lo);
    const __m128i prod_lo_h = _mm_mulhi_epu16(mag_lo, iq_lo);
    const __m128i prod_hi_l = _mm_mullo_epi16(mag_hi, iq_hi);
    const __m128i prod_hi_h = _mm_mulhi_epu16(mag_hi, iq_hi);

    // ...which are interleaved back into four vectors of four 32-bit lanes.
    const __m128i wide_0 = _mm_unpacklo_epi16(prod_lo_l, prod_lo_h);
    const __m128i wide_4 = _mm_unpackhi_epi16(prod_lo_l, prod_lo_h);
    const __m128i wide_8 = _mm_unpacklo_epi16(prod_hi_l, prod_hi_h);
    const __m128i wide_12 = _mm_unpackhi_epi16(prod_hi_l, prod_hi_h);

    // Add the rounding bias and drop the QFIX fractional bits.
    const __m128i quot_0 = _mm_srai_epi32(_mm_add_epi32(wide_0, bias_0), QFIX);
    const __m128i quot_4 = _mm_srai_epi32(_mm_add_epi32(wide_4, bias_4), QFIX);
    const __m128i quot_8 = _mm_srai_epi32(_mm_add_epi32(wide_8, bias_8), QFIX);
    const __m128i quot_12 =
        _mm_srai_epi32(_mm_add_epi32(wide_12, bias_12), QFIX);

    // Narrow back to 16 bits (signed saturation) and clamp to MAX_LEVEL.
    level_lo = _mm_min_epi16(_mm_packs_epi32(quot_0, quot_4), kMaxLevel);
    level_hi = _mm_min_epi16(_mm_packs_epi32(quot_8, quot_12), kMaxLevel);
  }

  // Re-apply the original sign: negate the lanes whose input was negative.
  level_lo = _mm_sub_epi16(_mm_xor_si128(level_lo, neg_lo), neg_lo);
  level_hi = _mm_sub_epi16(_mm_xor_si128(level_hi, neg_hi), neg_hi);

  // Write the reconstructed coefficients (level * q) back into 'in'.
  {
    const __m128i q_lo = _mm_loadu_si128((const __m128i*)&mtx->q_[0]);
    const __m128i q_hi = _mm_loadu_si128((const __m128i*)&mtx->q_[8]);
    _mm_storeu_si128((__m128i*)&in[0], _mm_mullo_epi16(level_lo, q_lo));
    _mm_storeu_si128((__m128i*)&in[8], _mm_mullo_epi16(level_hi, q_hi));
  }

  // Reorder the levels into zigzag order while storing them in 'out'.
  //
  // Three shuffles per half get almost all the way there: only two entries
  // end up exchanged, and they are fixed by the scalar swap further down.
  {
    __m128i zz_lo = _mm_shufflehi_epi16(level_lo, _MM_SHUFFLE(2, 1, 3, 0));
    __m128i zz_hi = _mm_shufflelo_epi16(level_hi, _MM_SHUFFLE(3, 0, 2, 1));
    zz_lo = _mm_shuffle_epi32(zz_lo, _MM_SHUFFLE(3, 1, 2, 0));
    zz_hi = _mm_shuffle_epi32(zz_hi, _MM_SHUFFLE(3, 1, 2, 0));
    zz_lo = _mm_shufflehi_epi16(zz_lo, _MM_SHUFFLE(3, 1, 0, 2));
    zz_hi = _mm_shufflelo_epi16(zz_hi, _MM_SHUFFLE(1, 3, 2, 0));
    _mm_storeu_si128((__m128i*)&out[0], zz_lo);
    _mm_storeu_si128((__m128i*)&out[8], zz_hi);
    // Saturating pack to bytes: a non-zero level always stays non-zero, and
    // the pending swap does not change whether any level is non-zero.
    all_levels = _mm_packs_epi16(zz_lo, zz_hi);
  }

  // Finish the zigzag: exchange the two entries left crossed by the shuffles.
  {
    const int16_t tmp = out[3];
    out[3] = out[12];
    out[12] = tmp;
  }

  // Each all-zero byte sets one bit of the mask, so a mask of 0xffff means
  // that every level is zero.
  return (_mm_movemask_epi8(_mm_cmpeq_epi8(all_levels, kZero)) != 0xffff);
}