in renderscript-toolkit/src/main/cpp/x86.cpp [412:472]
/*
 * Reads four consecutive bytes at p and returns them as one 32-bit value with
 * p[0] in the least significant byte (the value a native int load yields on
 * little-endian x86).
 *
 * Building the word from byte accesses avoids dereferencing an unsigned char
 * buffer through an int pointer, which is undefined behaviour (effective-type
 * rule, and misaligned access when p is not 4-byte aligned). The final
 * unsigned-to-int conversion is implementation-defined rather than undefined
 * when the top bit is set; it is expected to wrap on the x86 compilers this
 * file targets.
 */
static inline int rsdYuvLoad32(const unsigned char *p) {
    const uint32_t word = (uint32_t)p[0] |
                          ((uint32_t)p[1] << 8) |
                          ((uint32_t)p[2] << 16) |
                          ((uint32_t)p[3] << 24);
    return (int)word;
}

/*
 * Converts YUV input with a separate luma plane and an interleaved chroma
 * plane into 4-byte-per-pixel output, four pixels per loop iteration.
 *
 * dst    Output buffer. 16 bytes are written per iteration using unaligned
 *        stores, so no particular alignment is required.
 * pY     Luma bytes; 4 are consumed per iteration.
 * pUV    Interleaved chroma bytes; 4 are consumed per iteration (two pairs,
 *        each pair shared by two adjacent pixels). Within a pair the first
 *        byte feeds the V term and the second byte feeds the U term.
 * count  The loop runs (count << 1) iterations of 4 pixels each, so every unit
 *        of count covers 8 pixels.
 * param  Coefficient table: param[0..4] are the multipliers, param[8] is the
 *        bias subtracted from luma and param[16] the bias subtracted from
 *        chroma. The trailing comments show the values the original author
 *        expected to find there.
 *
 * NOTE(review): the per-pixel byte order described below assumes the
 * file-local helper packus_epi32 packs like _mm_packus_epi32 -- confirm
 * against its definition.
 */
void rsdIntrinsicYuv_K(void *dst,
                       const unsigned char *pY, const unsigned char *pUV,
                       uint32_t count, const short *param) {
    const __m128i biasY = _mm_set1_epi32(param[8]);   /*  16 */
    const __m128i biasUV = _mm_set1_epi32(param[16]); /* 128 */

    const __m128i c0 = _mm_set1_epi32(param[0]); /*  298 */
    const __m128i c1 = _mm_set1_epi32(param[1]); /*  409 */
    const __m128i c2 = _mm_set1_epi32(param[2]); /* -100 */
    const __m128i c3 = _mm_set1_epi32(param[3]); /*  516 */
    const __m128i c4 = _mm_set1_epi32(param[4]); /* -208 */

    /* Constant fourth channel written alongside R, G and B. */
    const __m128i A = _mm_set1_epi32(255);

    /*
     * Byte-wise 4x4 transpose mask: output byte k takes input byte
     * 4 * (k % 4) + k / 4, turning "four of each channel" order into
     * per-pixel interleaved order. Loop-invariant, so built once.
     */
    const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
                                      14, 10, 6, 2,
                                      13, 9, 5, 1,
                                      12, 8, 4, 0);

    __m128i *out = (__m128i *)dst;
    const uint32_t iterations = count << 1;

    for (uint32_t i = 0; i < iterations; ++i) {
        /* Widen 4 luma bytes and 4 chroma bytes to 32-bit lanes. */
        __m128i Y = cvtepu8_epi32(_mm_set1_epi32(rsdYuvLoad32(pY)));
        __m128i UV = cvtepu8_epi32(_mm_set1_epi32(rsdYuvLoad32(pUV)));

        Y = _mm_sub_epi32(Y, biasY);
        UV = _mm_sub_epi32(UV, biasUV);

        /* 0xf5 picks lanes (1,1,3,3); 0xa0 picks lanes (0,0,2,2). */
        const __m128i U = _mm_shuffle_epi32(UV, 0xf5);
        const __m128i V = _mm_shuffle_epi32(UV, 0xa0);

        Y = mullo_epi32(Y, c0);

        /*
         * Each channel is a fixed-point sum shifted right by 8. biasUV is
         * reused as the term added before the shift (128 per the comment on
         * param[16], i.e. half of 1 << 8).
         */
        __m128i R = _mm_add_epi32(Y, mullo_epi32(V, c1));
        R = _mm_add_epi32(R, biasUV);
        R = _mm_srai_epi32(R, 8);

        __m128i G = _mm_add_epi32(Y, mullo_epi32(U, c2));
        G = _mm_add_epi32(G, mullo_epi32(V, c4));
        G = _mm_add_epi32(G, biasUV);
        G = _mm_srai_epi32(G, 8);

        __m128i B = _mm_add_epi32(Y, mullo_epi32(U, c3));
        B = _mm_add_epi32(B, biasUV);
        B = _mm_srai_epi32(B, 8);

        /*
         * Narrow 32 -> 16 -> 8 bits, then transpose so each pixel's four
         * channel bytes are adjacent (presumably R,G,B,A given the pack
         * order -- see the NOTE above).
         */
        const __m128i rg = packus_epi32(R, G);
        const __m128i ba = packus_epi32(B, A);
        const __m128i planar = _mm_packus_epi16(rg, ba);
        const __m128i pixels = _mm_shuffle_epi8(planar, T4x4);

        _mm_storeu_si128(out, pixels);

        pY += 4;
        pUV += 4;
        ++out;
    }
}