in renderscript-toolkit/src/main/cpp/x86.cpp [1150:1219]
/*
 * Copies the fourth 16-bit lane of each 64-bit half into all four lanes of
 * that half. With one pixel (four channels widened to 16 bits) per half, this
 * spreads the pixel's last channel -- the one this kernel treats as alpha --
 * across the whole pixel.
 */
static inline __m128i dstAtopSplatAlpha16(__m128i widePixels) {
    return _mm_shufflehi_epi16(_mm_shufflelo_epi16(widePixels, 0xFF), 0xFF);
}

/*
 * Per 16-bit lane: ((255 - dstAlpha) * src + srcAlpha * dst) >> 8.
 *
 * Both arguments hold two pixels whose channels were zero-extended to 16 bits.
 * The sum uses an unsigned saturating add, so it clamps at 0xFFFF before the
 * shift instead of wrapping.
 */
static inline __m128i dstAtopBlendWide16(__m128i srcWide, __m128i dstWide) {
    const __m128i srcAlpha = dstAtopSplatAlpha16(srcWide);
    const __m128i dstAlpha = dstAtopSplatAlpha16(dstWide);
    const __m128i invDstAlpha = _mm_sub_epi16(_mm_set1_epi16(255), dstAlpha);
    const __m128i srcTerm = _mm_mullo_epi16(invDstAlpha, srcWide);
    const __m128i dstTerm = _mm_mullo_epi16(srcAlpha, dstWide);
    return _mm_srli_epi16(_mm_adds_epu16(srcTerm, dstTerm), 8);
}

/*
 * SSE2 "destination atop" blend kernel.
 *
 * dst    - buffer that is read and then overwritten in place; accessed with
 *          unaligned loads/stores, so no alignment is required.
 * src    - buffer that is only read; also accessed unaligned.
 * count8 - number of iterations; each one consumes 32 bytes (two 128-bit
 *          vectors) from both buffers, i.e. eight 4-byte pixels.
 *
 * Every byte is blended as ((255 - dstAlpha) * src + srcAlpha * dst) >> 8,
 * where the alpha of a pixel is its fourth byte. The blended vector is then
 * combined with the original source vector through blendv_epi8() using a mask
 * that has only the top byte of each 32-bit element set.
 * NOTE(review): blendv_epi8() is defined outside this block; presumably it
 * takes the masked (alpha) bytes from the source vector -- confirm.
 */
void rsdIntrinsicBlendDstAtop_K(void *dst, const void *src, uint32_t count8) {
    const __m128i alphaByteMask = _mm_set1_epi32((int)0xff000000);
    const __m128i zero = _mm_setzero_si128();
    const __m128i *srcVec = (const __m128i *)src;
    __m128i *dstVec = (__m128i *)dst;

    for (uint32_t remaining = count8; remaining != 0; --remaining) {
        /* Read all four vectors up front, before anything is written back. */
        const __m128i srcA = _mm_loadu_si128(srcVec);
        const __m128i srcB = _mm_loadu_si128(srcVec + 1);
        const __m128i dstA = _mm_loadu_si128(dstVec);
        const __m128i dstB = _mm_loadu_si128(dstVec + 1);

        /* Widen each half to 16 bits per channel and blend two pixels at a time. */
        const __m128i blendA0 = dstAtopBlendWide16(_mm_unpacklo_epi8(srcA, zero),
                                                   _mm_unpacklo_epi8(dstA, zero));
        const __m128i blendA1 = dstAtopBlendWide16(_mm_unpackhi_epi8(srcA, zero),
                                                   _mm_unpackhi_epi8(dstA, zero));
        const __m128i blendB0 = dstAtopBlendWide16(_mm_unpacklo_epi8(srcB, zero),
                                                   _mm_unpacklo_epi8(dstB, zero));
        const __m128i blendB1 = dstAtopBlendWide16(_mm_unpackhi_epi8(srcB, zero),
                                                   _mm_unpackhi_epi8(dstB, zero));

        /* Narrow back to bytes (unsigned saturation), then merge with the source. */
        const __m128i resultA =
                blendv_epi8(_mm_packus_epi16(blendA0, blendA1), srcA, alphaByteMask);
        const __m128i resultB =
                blendv_epi8(_mm_packus_epi16(blendB0, blendB1), srcB, alphaByteMask);

        _mm_storeu_si128(dstVec, resultA);
        _mm_storeu_si128(dstVec + 1, resultB);

        srcVec += 2;
        dstVec += 2;
    }
}