void rsdIntrinsicBlendSrcAtop_K()

in renderscript-toolkit/src/main/cpp/x86.cpp [1079:1148]


void rsdIntrinsicBlendSrcAtop_K(void *dst, const void *src, uint32_t count8) {
    const __m128i M0001 = _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000);
    __m128i all1s, ina, outa, ins, outs;
    __m128i in0, in1, out0, out1;
    __m128i t0, t1, t2, t3;
    uint32_t i;

    all1s = _mm_set1_epi16(255);

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t0 = _mm_sub_epi16(all1s, ina);
        t0 = _mm_mullo_epi16(t0, outs);
        t0 = _mm_adds_epu16(t0, _mm_mullo_epi16(outa, ins));
        t0 = _mm_srli_epi16(t0, 8);

        ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t1 = _mm_sub_epi16(all1s, ina);
        t1 = _mm_mullo_epi16(t1, outs);
        t1 = _mm_adds_epu16(t1, _mm_mullo_epi16(outa, ins));
        t1 = _mm_srli_epi16(t1, 8);

        ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t2 = _mm_sub_epi16(all1s, ina);
        t2 = _mm_mullo_epi16(t2, outs);
        t2 = _mm_adds_epu16(t2, _mm_mullo_epi16(outa, ins));
        t2 = _mm_srli_epi16(t2, 8);

        ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t3 = _mm_sub_epi16(all1s, ina);
        t3 = _mm_mullo_epi16(t3, outs);
        t3 = _mm_adds_epu16(t3, _mm_mullo_epi16(outa, ins));
        t3 = _mm_srli_epi16(t3, 8);

        t0 = _mm_packus_epi16(t0, t1);
        t0 = blendv_epi8(t0, out0, M0001);
        t2 = _mm_packus_epi16(t2, t3);
        t2 = blendv_epi8(t2, out1, M0001);
        _mm_storeu_si128((__m128i *)dst, t0);
        _mm_storeu_si128((__m128i *)dst + 1, t2);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}