void rsdIntrinsicYuv_K()

in renderscript-toolkit/src/main/cpp/x86.cpp [412:472]


/*
 * Assemble a 32-bit word from four consecutive bytes, with the byte at the
 * lowest address in the least significant position (the same value a native
 * little-endian x86 load yields).
 *
 * Reading byte by byte avoids dereferencing an `unsigned char *` as `int *`,
 * which is undefined when the address is not int-aligned. Optimizing
 * compilers typically merge the four reads into a single load.
 */
static inline int rsdYuvLoad4Bytes(const unsigned char *p) {
    const uint32_t word = (uint32_t)p[0]
                        | ((uint32_t)p[1] << 8)
                        | ((uint32_t)p[2] << 16)
                        | ((uint32_t)p[3] << 24);
    return (int)word;
}

/*
 * Convert semi-planar YUV samples to interleaved 8-bit output, four pixels
 * per loop iteration, using 32-bit fixed-point arithmetic.
 *
 * dst    Output buffer; 16 bytes are written per iteration with an unaligned
 *        store, so no particular alignment is required. 2 * count iterations
 *        run, i.e. 32 * count bytes are written in total.
 * pY     Luma samples; 4 bytes are consumed per iteration (8 * count total).
 * pUV    Interleaved chroma samples; 4 bytes are consumed per iteration
 *        (8 * count total). Even-indexed bytes are used as V and odd-indexed
 *        bytes as U; each chroma pair is shared by two adjacent pixels.
 * count  Loop runs (count << 1) times, evaluated in uint32_t.
 * param  Coefficient table; elements 0..4, 8 and 16 are read.
 *
 * Per pixel, with y = Y - param[8], u = U - param[16], v = V - param[16]:
 *   R = (param[0]*y + param[1]*v               + param[16]) >> 8
 *   G = (param[0]*y + param[2]*u + param[4]*v  + param[16]) >> 8
 *   B = (param[0]*y + param[3]*u               + param[16]) >> 8
 * The fourth channel is the constant 255.
 *
 * NOTE(review): the final byte order is R, G, B, A per pixel provided the
 * file-local packus_epi32 helper packs like _mm_packus_epi32 - confirm
 * against its definition.
 */
void rsdIntrinsicYuv_K(void *dst,
                       const unsigned char *pY, const unsigned char *pUV,
                       uint32_t count, const short *param) {
    __m128i biasY, biasUV;
    __m128i c0, c1, c2, c3, c4;

    biasY = _mm_set1_epi32(param[8]);   /*  16 */
    biasUV = _mm_set1_epi32(param[16]); /* 128 */

    c0 = _mm_set1_epi32(param[0]);  /*  298 */
    c1 = _mm_set1_epi32(param[1]);  /*  409 */
    c2 = _mm_set1_epi32(param[2]);  /* -100 */
    c3 = _mm_set1_epi32(param[3]);  /*  516 */
    c4 = _mm_set1_epi32(param[4]);  /* -208 */

    const __m128i A = _mm_set1_epi32(255);

    /*
     * Byte shuffle that transposes a 4x4 byte matrix: it turns the planar
     * layout c0[0..3] c1[0..3] c2[0..3] c3[0..3] into four interleaved
     * groups c0[i] c1[i] c2[i] c3[i]. Loop-invariant, so built once.
     */
    const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
                                      14, 10, 6, 2,
                                      13,  9, 5, 1,
                                      12,  8, 4, 0);

    __m128i Y, UV, U, V, R, G, B;
    uint32_t i;

    for (i = 0; i < (count << 1); ++i) {
        /* Load 4 luma and 4 chroma bytes without assuming any alignment,
         * then widen each byte to its own 32-bit lane. */
        Y = cvtepu8_epi32(_mm_set1_epi32(rsdYuvLoad4Bytes(pY)));
        UV = cvtepu8_epi32(_mm_set1_epi32(rsdYuvLoad4Bytes(pUV)));

        Y = _mm_sub_epi32(Y, biasY);
        UV = _mm_sub_epi32(UV, biasUV);

        /* 0xf5 selects lanes 1,1,3,3 (odd bytes); 0xa0 selects lanes
         * 0,0,2,2 (even bytes): every chroma sample serves two pixels. */
        U = _mm_shuffle_epi32(UV, 0xf5);
        V = _mm_shuffle_epi32(UV, 0xa0);

        Y = mullo_epi32(Y, c0);

        /* biasUV doubles as the rounding term added before the >> 8; that is
         * exactly one half in 8.8 fixed point only when param[16] is 128, as
         * the comment on its load indicates. */
        R = _mm_add_epi32(Y, mullo_epi32(V, c1));
        R = _mm_add_epi32(R, biasUV);
        R = _mm_srai_epi32(R, 8);

        G = _mm_add_epi32(Y, mullo_epi32(U, c2));
        G = _mm_add_epi32(G, mullo_epi32(V, c4));
        G = _mm_add_epi32(G, biasUV);
        G = _mm_srai_epi32(G, 8);

        B = _mm_add_epi32(Y, mullo_epi32(U, c3));
        B = _mm_add_epi32(B, biasUV);
        B = _mm_srai_epi32(B, 8);

        __m128i y1, y2, y3, y4;

        /* Narrow 32 -> 16 -> 8 bits, then interleave the four channel
         * groups into four 4-byte pixels. */
        y1 = packus_epi32(R, G);
        y2 = packus_epi32(B, A);
        y3 = _mm_packus_epi16(y1, y2);
        y4 = _mm_shuffle_epi8(y3, T4x4);
        _mm_storeu_si128((__m128i *)dst, y4);

        pY += 4;
        pUV += 4;
        dst = (__m128i *)dst + 1;
    }
}