in Support/STBImage/include/stb_image.h [3542:3673]
static void stbi__YCbCr_to_RGB_simd(stbi_uc *out, stbi_uc const *y, stbi_uc const *pcb, stbi_uc const *pcr, int count, int step)
{
int i = 0;
#ifdef STBI_SSE2
// step == 3 is pretty ugly on the final interleave, and i'm not convinced
// it's useful in practice (you wouldn't use it for textures, for example).
// so just accelerate step == 4 case.
if (step == 4) {
// this is a fairly straightforward implementation and not super-optimized.
__m128i signflip = _mm_set1_epi8(-0x80);
__m128i cr_const0 = _mm_set1_epi16( (short) ( 1.40200f*4096.0f+0.5f));
__m128i cr_const1 = _mm_set1_epi16( - (short) ( 0.71414f*4096.0f+0.5f));
__m128i cb_const0 = _mm_set1_epi16( - (short) ( 0.34414f*4096.0f+0.5f));
__m128i cb_const1 = _mm_set1_epi16( (short) ( 1.77200f*4096.0f+0.5f));
__m128i y_bias = _mm_set1_epi8((char) (unsigned char) 128);
__m128i xw = _mm_set1_epi16(255); // alpha channel
for (; i+7 < count; i += 8) {
// load
__m128i y_bytes = _mm_loadl_epi64((__m128i *) (y+i));
__m128i cr_bytes = _mm_loadl_epi64((__m128i *) (pcr+i));
__m128i cb_bytes = _mm_loadl_epi64((__m128i *) (pcb+i));
__m128i cr_biased = _mm_xor_si128(cr_bytes, signflip); // -128
__m128i cb_biased = _mm_xor_si128(cb_bytes, signflip); // -128
// unpack to short (and left-shift cr, cb by 8)
__m128i yw = _mm_unpacklo_epi8(y_bias, y_bytes);
__m128i crw = _mm_unpacklo_epi8(_mm_setzero_si128(), cr_biased);
__m128i cbw = _mm_unpacklo_epi8(_mm_setzero_si128(), cb_biased);
// color transform
__m128i yws = _mm_srli_epi16(yw, 4);
__m128i cr0 = _mm_mulhi_epi16(cr_const0, crw);
__m128i cb0 = _mm_mulhi_epi16(cb_const0, cbw);
__m128i cb1 = _mm_mulhi_epi16(cbw, cb_const1);
__m128i cr1 = _mm_mulhi_epi16(crw, cr_const1);
__m128i rws = _mm_add_epi16(cr0, yws);
__m128i gwt = _mm_add_epi16(cb0, yws);
__m128i bws = _mm_add_epi16(yws, cb1);
__m128i gws = _mm_add_epi16(gwt, cr1);
// descale
__m128i rw = _mm_srai_epi16(rws, 4);
__m128i bw = _mm_srai_epi16(bws, 4);
__m128i gw = _mm_srai_epi16(gws, 4);
// back to byte, set up for transpose
__m128i brb = _mm_packus_epi16(rw, bw);
__m128i gxb = _mm_packus_epi16(gw, xw);
// transpose to interleave channels
__m128i t0 = _mm_unpacklo_epi8(brb, gxb);
__m128i t1 = _mm_unpackhi_epi8(brb, gxb);
__m128i o0 = _mm_unpacklo_epi16(t0, t1);
__m128i o1 = _mm_unpackhi_epi16(t0, t1);
// store
_mm_storeu_si128((__m128i *) (out + 0), o0);
_mm_storeu_si128((__m128i *) (out + 16), o1);
out += 32;
}
}
#endif
#ifdef STBI_NEON
// in this version, step=3 support would be easy to add. but is there demand?
if (step == 4) {
// this is a fairly straightforward implementation and not super-optimized.
uint8x8_t signflip = vdup_n_u8(0x80);
int16x8_t cr_const0 = vdupq_n_s16( (short) ( 1.40200f*4096.0f+0.5f));
int16x8_t cr_const1 = vdupq_n_s16( - (short) ( 0.71414f*4096.0f+0.5f));
int16x8_t cb_const0 = vdupq_n_s16( - (short) ( 0.34414f*4096.0f+0.5f));
int16x8_t cb_const1 = vdupq_n_s16( (short) ( 1.77200f*4096.0f+0.5f));
for (; i+7 < count; i += 8) {
// load
uint8x8_t y_bytes = vld1_u8(y + i);
uint8x8_t cr_bytes = vld1_u8(pcr + i);
uint8x8_t cb_bytes = vld1_u8(pcb + i);
int8x8_t cr_biased = vreinterpret_s8_u8(vsub_u8(cr_bytes, signflip));
int8x8_t cb_biased = vreinterpret_s8_u8(vsub_u8(cb_bytes, signflip));
// expand to s16
int16x8_t yws = vreinterpretq_s16_u16(vshll_n_u8(y_bytes, 4));
int16x8_t crw = vshll_n_s8(cr_biased, 7);
int16x8_t cbw = vshll_n_s8(cb_biased, 7);
// color transform
int16x8_t cr0 = vqdmulhq_s16(crw, cr_const0);
int16x8_t cb0 = vqdmulhq_s16(cbw, cb_const0);
int16x8_t cr1 = vqdmulhq_s16(crw, cr_const1);
int16x8_t cb1 = vqdmulhq_s16(cbw, cb_const1);
int16x8_t rws = vaddq_s16(yws, cr0);
int16x8_t gws = vaddq_s16(vaddq_s16(yws, cb0), cr1);
int16x8_t bws = vaddq_s16(yws, cb1);
// undo scaling, round, convert to byte
uint8x8x4_t o;
o.val[0] = vqrshrun_n_s16(rws, 4);
o.val[1] = vqrshrun_n_s16(gws, 4);
o.val[2] = vqrshrun_n_s16(bws, 4);
o.val[3] = vdup_n_u8(255);
// store, interleaving r/g/b/a
vst4_u8(out, o);
out += 8*4;
}
}
#endif
for (; i < count; ++i) {
int y_fixed = (y[i] << 20) + (1<<19); // rounding
int r,g,b;
int cr = pcr[i] - 128;
int cb = pcb[i] - 128;
r = y_fixed + cr* stbi__float2fixed(1.40200f);
g = y_fixed + cr*-stbi__float2fixed(0.71414f) + ((cb*-stbi__float2fixed(0.34414f)) & 0xffff0000);
b = y_fixed + cb* stbi__float2fixed(1.77200f);
r >>= 20;
g >>= 20;
b >>= 20;
if ((unsigned) r > 255) { if (r < 0) r = 0; else r = 255; }
if ((unsigned) g > 255) { if (g < 0) g = 0; else g = 255; }
if ((unsigned) b > 255) { if (b < 0) b = 0; else b = 255; }
out[0] = (stbi_uc)r;
out[1] = (stbi_uc)g;
out[2] = (stbi_uc)b;
out[3] = 255;
out += step;
}
}