in simd/arm/jdmrgext-neon.c [61:336]
/* Merged horizontal chroma upsampling and YCbCr -> RGB conversion (NEON).
 *
 * One row of Y, Cb, and Cr samples is read from input_buf[0], input_buf[1],
 * and input_buf[2] at row index in_row_group_ctr.  Every Cb/Cr sample is
 * applied to two horizontally adjacent Y samples, and output_width converted
 * pixels are written to output_buf[0].
 *
 * The component order and size of an output pixel are selected by the
 * RGB_RED, RGB_GREEN, RGB_BLUE, RGB_ALPHA, and RGB_PIXELSIZE macros, which are
 * defined outside this function.  When RGB_ALPHA is defined, the alpha
 * component of every pixel is set to 0xFF.
 */
void jsimd_h2v1_merged_upsample_neon(JDIMENSION output_width,
                                     JSAMPIMAGE input_buf,
                                     JDIMENSION in_row_group_ctr,
                                     JSAMPARRAY output_buf)
{
  /* Fixed-point conversion coefficients.  Lanes 0 and 1 feed the G-Y term,
   * lane 2 the R-Y term, and lane 3 the B-Y term.
   */
  const int16x4_t coeffs = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
  /* -128 in every 16-bit lane, pre-reinterpreted as unsigned so that it can be
   * used directly as the accumulator of a widening add.
   */
  const uint16x8_t bias_u16 = vreinterpretq_u16_s16(vdupq_n_s16(-128));

  /* Source rows for Y, Cb, and Cr, and the destination pixel row. */
  JSAMPROW y_row = input_buf[0][in_row_group_ctr];
  JSAMPROW cb_row = input_buf[1][in_row_group_ctr];
  JSAMPROW cr_row = input_buf[2][in_row_group_ctr];
  JSAMPROW dst = output_buf[0];

  int pixels_left = output_width;

  /* Main loop: 16 output pixels (16 Y, 8 Cb, 8 Cr samples) per iteration. */
  while (pixels_left >= 16) {
    /* The de-interleaving load puts the even-indexed Y samples in val[0] and
     * the odd-indexed Y samples in val[1], so that each vector lines up with
     * the 8 chroma samples.
     */
    uint8x8x2_t luma = vld2_u8(y_row);
    uint8x8_t cb_u8 = vld1_u8(cb_row);
    uint8x8_t cr_u8 = vld1_u8(cr_row);

    /* Widen the chroma samples to 16 bits and re-center them: Cb - 128 and
     * Cr - 128.
     */
    int16x8_t cb_centered = vreinterpretq_s16_u16(vaddw_u8(bias_u16, cb_u8));
    int16x8_t cr_centered = vreinterpretq_s16_u16(vaddw_u8(bias_u16, cr_u8));

    /* G-Y = - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128), accumulated in
     * 32 bits for the low and high halves separately.
     */
    int32x4_t g_acc_lo = vmull_lane_s16(vget_low_s16(cb_centered), coeffs, 0);
    g_acc_lo = vmlsl_lane_s16(g_acc_lo, vget_low_s16(cr_centered), coeffs, 1);
    int32x4_t g_acc_hi = vmull_lane_s16(vget_high_s16(cb_centered), coeffs, 0);
    g_acc_hi = vmlsl_lane_s16(g_acc_hi, vget_high_s16(cr_centered), coeffs, 1);

    /* Descale G-Y with a rounding right shift by 15 and narrow to 16 bits. */
    uint16x8_t g_delta =
      vreinterpretq_u16_s16(vcombine_s16(vrshrn_n_s32(g_acc_lo, 15),
                                         vrshrn_n_s32(g_acc_hi, 15)));
    /* R-Y = 1.40200 * (Cr - 128) */
    uint16x8_t r_delta =
      vreinterpretq_u16_s16(vqrdmulhq_lane_s16(vshlq_n_s16(cr_centered, 1),
                                               coeffs, 2));
    /* B-Y = 1.77200 * (Cb - 128) */
    uint16x8_t b_delta =
      vreinterpretq_u16_s16(vqrdmulhq_lane_s16(vshlq_n_s16(cb_centered, 1),
                                               coeffs, 3));

    /* Adding the same chroma-derived delta to both the even and the odd Y
     * samples is what performs the horizontal chroma upsampling.
     */
    int16x8_t r_even = vreinterpretq_s16_u16(vaddw_u8(r_delta, luma.val[0]));
    int16x8_t r_odd = vreinterpretq_s16_u16(vaddw_u8(r_delta, luma.val[1]));
    int16x8_t g_even = vreinterpretq_s16_u16(vaddw_u8(g_delta, luma.val[0]));
    int16x8_t g_odd = vreinterpretq_s16_u16(vaddw_u8(g_delta, luma.val[1]));
    int16x8_t b_even = vreinterpretq_s16_u16(vaddw_u8(b_delta, luma.val[0]));
    int16x8_t b_odd = vreinterpretq_s16_u16(vaddw_u8(b_delta, luma.val[1]));

    /* Saturate each component to [0-255] while narrowing to 8 bits, then zip
     * the even and odd results back into pixel order.
     */
    uint8x8x2_t r_px = vzip_u8(vqmovun_s16(r_even), vqmovun_s16(r_odd));
    uint8x8x2_t g_px = vzip_u8(vqmovun_s16(g_even), vqmovun_s16(g_odd));
    uint8x8x2_t b_px = vzip_u8(vqmovun_s16(b_even), vqmovun_s16(b_odd));

#ifdef RGB_ALPHA
    uint8x16x4_t px;
    px.val[RGB_RED] = vcombine_u8(r_px.val[0], r_px.val[1]);
    px.val[RGB_GREEN] = vcombine_u8(g_px.val[0], g_px.val[1]);
    px.val[RGB_BLUE] = vcombine_u8(b_px.val[0], b_px.val[1]);
    /* Fully opaque alpha. */
    px.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
    /* Interleaving store of 16 four-component pixels. */
    vst4q_u8(dst, px);
#else
    uint8x16x3_t px;
    px.val[RGB_RED] = vcombine_u8(r_px.val[0], r_px.val[1]);
    px.val[RGB_GREEN] = vcombine_u8(g_px.val[0], g_px.val[1]);
    px.val[RGB_BLUE] = vcombine_u8(b_px.val[0], b_px.val[1]);
    /* Interleaving store of 16 three-component pixels. */
    vst3q_u8(dst, px);
#endif

    /* Advance to the next group of 16 pixels. */
    y_row += 16;
    cb_row += 8;
    cr_row += 8;
    dst += (RGB_PIXELSIZE * 16);
    pixels_left -= 16;
  }

  /* Nothing left over: the row was fully handled by the main loop. */
  if (pixels_left <= 0) {
    return;
  }

  /* Tail: 1 to 15 pixels remain.  The arithmetic is the same as in the main
   * loop; only the stores differ, so that nothing is written beyond the last
   * requested pixel.
   *
   * NOTE(review): the loads below still fetch a full 16 Y, 8 Cb, and 8 Cr
   * samples regardless of how many columns remain -- presumably the input rows
   * are padded enough for that; confirm against the buffer allocation.
   */
  uint8x8x2_t luma = vld2_u8(y_row);
  uint8x8_t cb_u8 = vld1_u8(cb_row);
  uint8x8_t cr_u8 = vld1_u8(cr_row);

  /* Cb - 128 and Cr - 128, widened to 16 bits. */
  int16x8_t cb_centered = vreinterpretq_s16_u16(vaddw_u8(bias_u16, cb_u8));
  int16x8_t cr_centered = vreinterpretq_s16_u16(vaddw_u8(bias_u16, cr_u8));

  /* G-Y = - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
  int32x4_t g_acc_lo = vmull_lane_s16(vget_low_s16(cb_centered), coeffs, 0);
  g_acc_lo = vmlsl_lane_s16(g_acc_lo, vget_low_s16(cr_centered), coeffs, 1);
  int32x4_t g_acc_hi = vmull_lane_s16(vget_high_s16(cb_centered), coeffs, 0);
  g_acc_hi = vmlsl_lane_s16(g_acc_hi, vget_high_s16(cr_centered), coeffs, 1);

  /* Descale G-Y with a rounding right shift by 15 and narrow to 16 bits. */
  uint16x8_t g_delta =
    vreinterpretq_u16_s16(vcombine_s16(vrshrn_n_s32(g_acc_lo, 15),
                                       vrshrn_n_s32(g_acc_hi, 15)));
  /* R-Y = 1.40200 * (Cr - 128) */
  uint16x8_t r_delta =
    vreinterpretq_u16_s16(vqrdmulhq_lane_s16(vshlq_n_s16(cr_centered, 1),
                                             coeffs, 2));
  /* B-Y = 1.77200 * (Cb - 128) */
  uint16x8_t b_delta =
    vreinterpretq_u16_s16(vqrdmulhq_lane_s16(vshlq_n_s16(cb_centered, 1),
                                             coeffs, 3));

  /* Apply each chroma-derived delta to both the even and the odd Y samples. */
  int16x8_t r_even = vreinterpretq_s16_u16(vaddw_u8(r_delta, luma.val[0]));
  int16x8_t r_odd = vreinterpretq_s16_u16(vaddw_u8(r_delta, luma.val[1]));
  int16x8_t g_even = vreinterpretq_s16_u16(vaddw_u8(g_delta, luma.val[0]));
  int16x8_t g_odd = vreinterpretq_s16_u16(vaddw_u8(g_delta, luma.val[1]));
  int16x8_t b_even = vreinterpretq_s16_u16(vaddw_u8(b_delta, luma.val[0]));
  int16x8_t b_odd = vreinterpretq_s16_u16(vaddw_u8(b_delta, luma.val[1]));

  /* Saturating narrow to [0-255], then restore pixel order.  After the zip,
   * val[0] holds pixels 0-7 and val[1] holds pixels 8-15.
   */
  uint8x8x2_t r_px = vzip_u8(vqmovun_s16(r_even), vqmovun_s16(r_odd));
  uint8x8x2_t g_px = vzip_u8(vqmovun_s16(g_even), vqmovun_s16(g_odd));
  uint8x8x2_t b_px = vzip_u8(vqmovun_s16(b_even), vqmovun_s16(b_odd));

#ifdef RGB_ALPHA
  /* Fully opaque alpha, shared by both halves. */
  const uint8x8_t opaque = vdup_n_u8(0xFF);

  /* Pixels 8-15. */
  uint8x8x4_t hi_px;
  hi_px.val[RGB_RED] = r_px.val[1];
  hi_px.val[RGB_GREEN] = g_px.val[1];
  hi_px.val[RGB_BLUE] = b_px.val[1];
  hi_px.val[RGB_ALPHA] = opaque;

  /* Pixels 0-7. */
  uint8x8x4_t lo_px;
  lo_px.val[RGB_RED] = r_px.val[0];
  lo_px.val[RGB_GREEN] = g_px.val[0];
  lo_px.val[RGB_BLUE] = b_px.val[0];
  lo_px.val[RGB_ALPHA] = opaque;

  /* Each case stores the last still-missing pixel and falls through to the
   * next lower one.  Once only the first 8 pixels are left, they are written
   * with a single full-vector store.
   */
  switch (pixels_left) {
  case 15:
    vst4_lane_u8(dst + 14 * RGB_PIXELSIZE, hi_px, 6);
    FALLTHROUGH /*FALLTHROUGH*/
  case 14:
    vst4_lane_u8(dst + 13 * RGB_PIXELSIZE, hi_px, 5);
    FALLTHROUGH /*FALLTHROUGH*/
  case 13:
    vst4_lane_u8(dst + 12 * RGB_PIXELSIZE, hi_px, 4);
    FALLTHROUGH /*FALLTHROUGH*/
  case 12:
    vst4_lane_u8(dst + 11 * RGB_PIXELSIZE, hi_px, 3);
    FALLTHROUGH /*FALLTHROUGH*/
  case 11:
    vst4_lane_u8(dst + 10 * RGB_PIXELSIZE, hi_px, 2);
    FALLTHROUGH /*FALLTHROUGH*/
  case 10:
    vst4_lane_u8(dst + 9 * RGB_PIXELSIZE, hi_px, 1);
    FALLTHROUGH /*FALLTHROUGH*/
  case 9:
    vst4_lane_u8(dst + 8 * RGB_PIXELSIZE, hi_px, 0);
    FALLTHROUGH /*FALLTHROUGH*/
  case 8:
    vst4_u8(dst, lo_px);
    break;
  case 7:
    vst4_lane_u8(dst + 6 * RGB_PIXELSIZE, lo_px, 6);
    FALLTHROUGH /*FALLTHROUGH*/
  case 6:
    vst4_lane_u8(dst + 5 * RGB_PIXELSIZE, lo_px, 5);
    FALLTHROUGH /*FALLTHROUGH*/
  case 5:
    vst4_lane_u8(dst + 4 * RGB_PIXELSIZE, lo_px, 4);
    FALLTHROUGH /*FALLTHROUGH*/
  case 4:
    vst4_lane_u8(dst + 3 * RGB_PIXELSIZE, lo_px, 3);
    FALLTHROUGH /*FALLTHROUGH*/
  case 3:
    vst4_lane_u8(dst + 2 * RGB_PIXELSIZE, lo_px, 2);
    FALLTHROUGH /*FALLTHROUGH*/
  case 2:
    vst4_lane_u8(dst + RGB_PIXELSIZE, lo_px, 1);
    FALLTHROUGH /*FALLTHROUGH*/
  case 1:
    vst4_lane_u8(dst, lo_px, 0);
    FALLTHROUGH /*FALLTHROUGH*/
  default:
    break;
  }
#else
  /* Pixels 8-15. */
  uint8x8x3_t hi_px;
  hi_px.val[RGB_RED] = r_px.val[1];
  hi_px.val[RGB_GREEN] = g_px.val[1];
  hi_px.val[RGB_BLUE] = b_px.val[1];

  /* Pixels 0-7. */
  uint8x8x3_t lo_px;
  lo_px.val[RGB_RED] = r_px.val[0];
  lo_px.val[RGB_GREEN] = g_px.val[0];
  lo_px.val[RGB_BLUE] = b_px.val[0];

  /* Each case stores the last still-missing pixel and falls through to the
   * next lower one.  Once only the first 8 pixels are left, they are written
   * with a single full-vector store.
   */
  switch (pixels_left) {
  case 15:
    vst3_lane_u8(dst + 14 * RGB_PIXELSIZE, hi_px, 6);
    FALLTHROUGH /*FALLTHROUGH*/
  case 14:
    vst3_lane_u8(dst + 13 * RGB_PIXELSIZE, hi_px, 5);
    FALLTHROUGH /*FALLTHROUGH*/
  case 13:
    vst3_lane_u8(dst + 12 * RGB_PIXELSIZE, hi_px, 4);
    FALLTHROUGH /*FALLTHROUGH*/
  case 12:
    vst3_lane_u8(dst + 11 * RGB_PIXELSIZE, hi_px, 3);
    FALLTHROUGH /*FALLTHROUGH*/
  case 11:
    vst3_lane_u8(dst + 10 * RGB_PIXELSIZE, hi_px, 2);
    FALLTHROUGH /*FALLTHROUGH*/
  case 10:
    vst3_lane_u8(dst + 9 * RGB_PIXELSIZE, hi_px, 1);
    FALLTHROUGH /*FALLTHROUGH*/
  case 9:
    vst3_lane_u8(dst + 8 * RGB_PIXELSIZE, hi_px, 0);
    FALLTHROUGH /*FALLTHROUGH*/
  case 8:
    vst3_u8(dst, lo_px);
    break;
  case 7:
    vst3_lane_u8(dst + 6 * RGB_PIXELSIZE, lo_px, 6);
    FALLTHROUGH /*FALLTHROUGH*/
  case 6:
    vst3_lane_u8(dst + 5 * RGB_PIXELSIZE, lo_px, 5);
    FALLTHROUGH /*FALLTHROUGH*/
  case 5:
    vst3_lane_u8(dst + 4 * RGB_PIXELSIZE, lo_px, 4);
    FALLTHROUGH /*FALLTHROUGH*/
  case 4:
    vst3_lane_u8(dst + 3 * RGB_PIXELSIZE, lo_px, 3);
    FALLTHROUGH /*FALLTHROUGH*/
  case 3:
    vst3_lane_u8(dst + 2 * RGB_PIXELSIZE, lo_px, 2);
    FALLTHROUGH /*FALLTHROUGH*/
  case 2:
    vst3_lane_u8(dst + RGB_PIXELSIZE, lo_px, 1);
    FALLTHROUGH /*FALLTHROUGH*/
  case 1:
    vst3_lane_u8(dst, lo_px, 0);
    FALLTHROUGH /*FALLTHROUGH*/
  default:
    break;
  }
#endif
}