void jsimd_h2v2_fancy_upsample_neon()

in simd/arm/jdsample-neon.c [205:375]


void jsimd_h2v2_fancy_upsample_neon(int max_v_samp_factor,
                                    JDIMENSION downsampled_width,
                                    JSAMPARRAY input_data,
                                    JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
  int inrow, outrow;
  unsigned colctr;
  /* Set up constants. */
  const uint16x8_t seven_u16 = vdupq_n_u16(7);
  const uint8x8_t three_u8 = vdup_n_u8(3);
  const uint16x8_t three_u16 = vdupq_n_u16(3);

  inrow = outrow = 0;
  while (outrow < max_v_samp_factor) {
    inptr0 = input_data[inrow - 1];
    inptr1 = input_data[inrow];
    inptr2 = input_data[inrow + 1];
    /* Suffixes 0 and 1 denote the upper and lower rows of output pixels,
     * respectively.
     */
    outptr0 = output_data[outrow++];
    outptr1 = output_data[outrow++];

    /* First pixel component value in this row of the original image */
    int s0colsum0 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr0);
    *outptr0 = (JSAMPLE)((s0colsum0 * 4 + 8) >> 4);
    int s0colsum1 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr2);
    *outptr1 = (JSAMPLE)((s0colsum1 * 4 + 8) >> 4);

    /* Step 1: Blend samples vertically in columns s0 and s1.
     * Leave the divide by 4 until the end, when it can be done for both
     * dimensions at once, right-shifting by 4.
     */

    /* Load and compute s0colsum0 and s0colsum1. */
    uint8x16_t s0A = vld1q_u8(inptr0);
    uint8x16_t s0B = vld1q_u8(inptr1);
    uint8x16_t s0C = vld1q_u8(inptr2);
    /* Multiplication makes vectors twice as wide.  '_l' and '_h' suffixes
     * denote low half and high half respectively.
     */
    uint16x8_t s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0A)),
                                      vget_low_u8(s0B), three_u8);
    uint16x8_t s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0A)),
                                      vget_high_u8(s0B), three_u8);
    uint16x8_t s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0C)),
                                      vget_low_u8(s0B), three_u8);
    uint16x8_t s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0C)),
                                      vget_high_u8(s0B), three_u8);
    /* Load and compute s1colsum0 and s1colsum1. */
    uint8x16_t s1A = vld1q_u8(inptr0 + 1);
    uint8x16_t s1B = vld1q_u8(inptr1 + 1);
    uint8x16_t s1C = vld1q_u8(inptr2 + 1);
    uint16x8_t s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1A)),
                                      vget_low_u8(s1B), three_u8);
    uint16x8_t s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1A)),
                                      vget_high_u8(s1B), three_u8);
    uint16x8_t s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1C)),
                                      vget_low_u8(s1B), three_u8);
    uint16x8_t s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1C)),
                                      vget_high_u8(s1B), three_u8);

    /* Step 2: Blend the already-blended columns. */

    uint16x8_t output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
    uint16x8_t output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
    uint16x8_t output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
    uint16x8_t output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
    uint16x8_t output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
    uint16x8_t output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
    uint16x8_t output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
    uint16x8_t output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
    /* Add ordered dithering bias to odd pixel values. */
    output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
    output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
    output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
    output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
    /* Right-shift by 4 (divide by 16), narrow to 8-bit, and combine. */
    uint8x16x2_t output_pixels0 = { {
      vcombine_u8(vshrn_n_u16(output0_p1_l, 4), vshrn_n_u16(output0_p1_h, 4)),
      vcombine_u8(vrshrn_n_u16(output0_p2_l, 4), vrshrn_n_u16(output0_p2_h, 4))
    } };
    uint8x16x2_t output_pixels1 = { {
      vcombine_u8(vshrn_n_u16(output1_p1_l, 4), vshrn_n_u16(output1_p1_h, 4)),
      vcombine_u8(vrshrn_n_u16(output1_p2_l, 4), vrshrn_n_u16(output1_p2_h, 4))
    } };

    /* Store pixel component values to memory.
     * The minimum size of the output buffer for each row is 64 bytes => no
     * need to worry about buffer overflow here.  See "Creation of 2-D sample
     * arrays" in jmemmgr.c for more details.
     */
    vst2q_u8(outptr0 + 1, output_pixels0);
    vst2q_u8(outptr1 + 1, output_pixels1);

    /* The first pixel of the image shifted our loads and stores by one byte.
     * We have to re-align on a 32-byte boundary at some point before the end
     * of the row (we do it now on the 32/33 pixel boundary) to stay within the
     * bounds of the sample buffers without having to resort to a slow scalar
     * tail case for the last (downsampled_width % 16) samples.  See "Creation
     * of 2-D sample arrays" in jmemmgr.c for more details.
     */
    for (colctr = 16; colctr < downsampled_width; colctr += 16) {
      /* Step 1: Blend samples vertically in columns s0 and s1. */

      /* Load and compute s0colsum0 and s0colsum1. */
      s0A = vld1q_u8(inptr0 + colctr - 1);
      s0B = vld1q_u8(inptr1 + colctr - 1);
      s0C = vld1q_u8(inptr2 + colctr - 1);
      s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0A)), vget_low_u8(s0B),
                             three_u8);
      s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0A)), vget_high_u8(s0B),
                             three_u8);
      s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0C)), vget_low_u8(s0B),
                             three_u8);
      s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0C)), vget_high_u8(s0B),
                             three_u8);
      /* Load and compute s1colsum0 and s1colsum1. */
      s1A = vld1q_u8(inptr0 + colctr);
      s1B = vld1q_u8(inptr1 + colctr);
      s1C = vld1q_u8(inptr2 + colctr);
      s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1A)), vget_low_u8(s1B),
                             three_u8);
      s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1A)), vget_high_u8(s1B),
                             three_u8);
      s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1C)), vget_low_u8(s1B),
                             three_u8);
      s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1C)), vget_high_u8(s1B),
                             three_u8);

      /* Step 2: Blend the already-blended columns. */

      output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
      output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
      output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
      output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
      output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
      output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
      output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
      output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
      /* Add ordered dithering bias to odd pixel values. */
      output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
      output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
      output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
      output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
      /* Right-shift by 4 (divide by 16), narrow to 8-bit, and combine. */
      output_pixels0.val[0] = vcombine_u8(vshrn_n_u16(output0_p1_l, 4),
                                          vshrn_n_u16(output0_p1_h, 4));
      output_pixels0.val[1] = vcombine_u8(vrshrn_n_u16(output0_p2_l, 4),
                                          vrshrn_n_u16(output0_p2_h, 4));
      output_pixels1.val[0] = vcombine_u8(vshrn_n_u16(output1_p1_l, 4),
                                          vshrn_n_u16(output1_p1_h, 4));
      output_pixels1.val[1] = vcombine_u8(vrshrn_n_u16(output1_p2_l, 4),
                                          vrshrn_n_u16(output1_p2_h, 4));
      /* Store pixel component values to memory. */
      vst2q_u8(outptr0 + 2 * colctr - 1, output_pixels0);
      vst2q_u8(outptr1 + 2 * colctr - 1, output_pixels1);
    }

    /* Last pixel component value in this row of the original image */
    int s1colsum0 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
                    GETJSAMPLE(inptr0[downsampled_width - 1]);
    outptr0[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum0 * 4 + 7) >> 4);
    int s1colsum1 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
                    GETJSAMPLE(inptr2[downsampled_width - 1]);
    outptr1[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum1 * 4 + 7) >> 4);
    inrow++;
  }
}