static INLINE void jsimd_idct_islow_pass1_regular()

in simd/arm/jidctint-neon.c [96:322]


/* Forward declaration of the "regular" (full) first pass for one 4x8 half of
 * the DCT block; being static, it is defined elsewhere in this file.
 *
 * row0-row7              the eight rows of coefficients for the half, four
 *                        16-bit values each
 * quant_row0-quant_row7  the matching rows of the quantization table
 * workspace_1,
 * workspace_2            destinations for the first-pass result.
 *                        jsimd_idct_islow_neon() passes positions inside its
 *                        workspace_l and workspace_r buffers, in that order.
 *
 * jsimd_idct_islow_neon() selects this variant when at least one coefficient
 * in rows 4-7 of the half is non-zero.
 */
static INLINE void jsimd_idct_islow_pass1_regular(int16x4_t row0,
                                                  int16x4_t row1,
                                                  int16x4_t row2,
                                                  int16x4_t row3,
                                                  int16x4_t row4,
                                                  int16x4_t row5,
                                                  int16x4_t row6,
                                                  int16x4_t row7,
                                                  int16x4_t quant_row0,
                                                  int16x4_t quant_row1,
                                                  int16x4_t quant_row2,
                                                  int16x4_t quant_row3,
                                                  int16x4_t quant_row4,
                                                  int16x4_t quant_row5,
                                                  int16x4_t quant_row6,
                                                  int16x4_t quant_row7,
                                                  int16_t *workspace_1,
                                                  int16_t *workspace_2);

/* Forward declaration of the "sparse" first pass for one 4x8 half of the DCT
 * block; being static, it is defined elsewhere in this file.
 *
 * Only rows 0-3 and their quantization rows are passed in:
 * jsimd_idct_islow_neon() selects this variant when every coefficient in
 * rows 4-7 of the half is zero but rows 1-3 are not all zero.
 *
 * workspace_1 and workspace_2 are the destinations for the first-pass result,
 * passed by the caller in the same way as for the "regular" first pass.
 */
static INLINE void jsimd_idct_islow_pass1_sparse(int16x4_t row0,
                                                 int16x4_t row1,
                                                 int16x4_t row2,
                                                 int16x4_t row3,
                                                 int16x4_t quant_row0,
                                                 int16x4_t quant_row1,
                                                 int16x4_t quant_row2,
                                                 int16x4_t quant_row3,
                                                 int16_t *workspace_1,
                                                 int16_t *workspace_2);

/* Forward declaration of the "regular" second pass; being static, it is
 * defined elsewhere in this file.
 *
 * workspace   one of the buffers filled by the first pass
 * output_buf,
 * output_col  forwarded unchanged from jsimd_idct_islow_neon()
 * buf_offset  0 for workspace_l and 4 for workspace_r in the visible caller
 *             (NOTE(review): presumably an offset into each output row --
 *             confirm against the definition)
 *
 * Used when the right 4x8 half of the block was not entirely zero in the
 * first pass.
 */
static INLINE void jsimd_idct_islow_pass2_regular(int16_t *workspace,
                                                  JSAMPARRAY output_buf,
                                                  JDIMENSION output_col,
                                                  unsigned buf_offset);

/* Forward declaration of the "sparse" second pass; being static, it is
 * defined elsewhere in this file.
 *
 * Takes the same arguments as the "regular" second pass.
 * jsimd_idct_islow_neon() selects this variant for both workspaces when
 * every coefficient (AC and DC) in the right 4x8 half of the block was zero
 * in the first pass; in that case the caller has stored nothing in the upper
 * part (offset 4 * DCTSIZE / 2 onward) of either workspace.
 */
static INLINE void jsimd_idct_islow_pass2_sparse(int16_t *workspace,
                                                 JSAMPARRAY output_buf,
                                                 JDIMENSION output_col,
                                                 unsigned buf_offset);


/* Perform dequantization and inverse DCT on one block of coefficients.  For
 * reference, the C implementation (jpeg_idct_islow()) can be found in
 * jidctint.c.
 *
 * Optimization techniques used for fast data access:
 *
 * In each pass, the inverse DCT is computed for the left and right 4x8 halves
 * of the DCT block.  This avoids spilling due to register pressure, and the
 * increased granularity allows for an optimized calculation depending on the
 * values of the DCT coefficients.  Between passes, intermediate data is stored
 * in 4x8 workspace buffers.
 *
 * Transposing the 8x8 DCT block after each pass can be achieved by transposing
 * each of the four 4x4 quadrants and swapping quadrants 1 and 2 (refer to the
 * diagram below.)  Swapping quadrants is cheap, since the second pass can just
 * swap the workspace buffer pointers.
 *
 *      +-------+-------+                   +-------+-------+
 *      |       |       |                   |       |       |
 *      |   0   |   1   |                   |   0   |   2   |
 *      |       |       |    transpose      |       |       |
 *      +-------+-------+     ------>       +-------+-------+
 *      |       |       |                   |       |       |
 *      |   2   |   3   |                   |   1   |   3   |
 *      |       |       |                   |       |       |
 *      +-------+-------+                   +-------+-------+
 *
 * Optimization techniques used to accelerate the inverse DCT calculation:
 *
 * In a DCT coefficient block, the coefficients are increasingly likely to be 0
 * as you move diagonally from top left to bottom right.  If whole rows of
 * coefficients are 0, then the inverse DCT calculation can be simplified.  On
 * the first pass of the inverse DCT, we test for three special cases before
 * defaulting to a full "regular" inverse DCT:
 *
 * 1) Coefficients in rows 4-7 are all zero.  In this case, we perform a
 *    "sparse" simplified inverse DCT on rows 0-3.
 * 2) AC coefficients (rows 1-7) are all zero.  In this case, the inverse DCT
 *    result is equal to the dequantized DC coefficients.
 * 3) AC and DC coefficients are all zero.  In this case, the inverse DCT
 *    result is all zero.  For the left 4x8 half, this is handled identically
 *    to Case 2 above.  For the right 4x8 half, we do no work and signal that
 *    the "sparse" algorithm is required for the second pass.
 *
 * In the second pass, only a single special case is tested: whether the AC and
 * DC coefficients were all zero in the right 4x8 block during the first pass
 * (refer to Case 3 above.)  If this is the case, then a "sparse" variant of
 * the second pass is performed for both the left and right halves of the DCT
 * block.  (The transposition after the first pass means that the right 4x8
 * block during the first pass becomes rows 4-7 during the second pass.)
 */

/* Dequantize one block of DCT coefficients and run the two-pass inverse DCT
 * on it, choosing a cheaper variant of each pass when whole groups of rows
 * are zero.
 *
 * dct_table   quantization table, read as ISLOW_MULT_TYPE values laid out in
 *             rows of DCTSIZE
 * coef_block  coefficients, read as 16-bit values laid out in rows of DCTSIZE
 * output_buf,
 * output_col  forwarded unchanged to the second-pass routines
 */
void jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
                           JSAMPARRAY output_buf, JDIMENSION output_col)
{
  ISLOW_MULT_TYPE *quantptr = dct_table;

  /* Intermediate buffers shared by the two passes.  First-pass work on the
   * left half is directed at the start of each buffer; first-pass work on the
   * right half is directed 4 * DCTSIZE / 2 elements further in.
   */
  int16_t workspace_l[8 * DCTSIZE / 2];
  int16_t workspace_r[8 * DCTSIZE / 2];
  int16_t *workspace_l_hi = workspace_l + 4 * DCTSIZE / 2;
  int16_t *workspace_r_hi = workspace_r + 4 * DCTSIZE / 2;

  /* Selects the second-pass variant.  It stays non-zero unless the right 4x8
   * half is found to contain nothing but zeros.
   */
  int64_t right_half_nonzero = 1;

  /* ---- First pass: left 4x8 half (columns 0-3 of every row) ---- */
  {
    int16x4_t quant0 = vld1_s16(quantptr + 0 * DCTSIZE);
    int16x4_t quant1 = vld1_s16(quantptr + 1 * DCTSIZE);
    int16x4_t quant2 = vld1_s16(quantptr + 2 * DCTSIZE);
    int16x4_t quant3 = vld1_s16(quantptr + 3 * DCTSIZE);
    int16x4_t quant4 = vld1_s16(quantptr + 4 * DCTSIZE);
    int16x4_t quant5 = vld1_s16(quantptr + 5 * DCTSIZE);
    int16x4_t quant6 = vld1_s16(quantptr + 6 * DCTSIZE);
    int16x4_t quant7 = vld1_s16(quantptr + 7 * DCTSIZE);

    int16x4_t coef0 = vld1_s16(coef_block + 0 * DCTSIZE);
    int16x4_t coef1 = vld1_s16(coef_block + 1 * DCTSIZE);
    int16x4_t coef2 = vld1_s16(coef_block + 2 * DCTSIZE);
    int16x4_t coef3 = vld1_s16(coef_block + 3 * DCTSIZE);
    int16x4_t coef4 = vld1_s16(coef_block + 4 * DCTSIZE);
    int16x4_t coef5 = vld1_s16(coef_block + 5 * DCTSIZE);
    int16x4_t coef6 = vld1_s16(coef_block + 6 * DCTSIZE);
    int16x4_t coef7 = vld1_s16(coef_block + 7 * DCTSIZE);

    /* OR rows together and view the four 16-bit lanes as one 64-bit value:
     * the value is zero only if every contributing coefficient is zero.
     */
    int16x4_t any_set = vorr_s16(vorr_s16(coef4, coef5),
                                 vorr_s16(coef6, coef7));
    int64_t rows_4567_bits = vget_lane_s64(vreinterpret_s64_s16(any_set), 0);

    if (rows_4567_bits != 0) {
      /* Something in rows 4-7 is set: no shortcut applies. */
      jsimd_idct_islow_pass1_regular(coef0, coef1, coef2, coef3, coef4, coef5,
                                     coef6, coef7, quant0, quant1, quant2,
                                     quant3, quant4, quant5, quant6, quant7,
                                     workspace_l, workspace_r);
    } else {
      any_set = vorr_s16(any_set, vorr_s16(coef1, vorr_s16(coef2, coef3)));
      int64_t ac_bits = vget_lane_s64(vreinterpret_s64_s16(any_set), 0);

      if (ac_bits != 0) {
        /* Rows 4-7 are zero but rows 1-3 are not. */
        jsimd_idct_islow_pass1_sparse(coef0, coef1, coef2, coef3, quant0,
                                      quant1, quant2, quant3, workspace_l,
                                      workspace_r);
      } else {
        /* Only row 0 can be non-zero: the result is the dequantized row 0,
         * shifted left by PASS1_BITS, repeated in every position.  An
         * all-zero left half also takes this route.
         */
        int16x4_t dc_scaled = vshl_n_s16(vmul_s16(coef0, quant0), PASS1_BITS);
        int16x4x4_t dc_quad;

        dc_quad.val[0] = dc_scaled;
        dc_quad.val[1] = dc_scaled;
        dc_quad.val[2] = dc_scaled;
        dc_quad.val[3] = dc_scaled;
        /* The interleaving store writes each 4x4 block transposed. */
        vst4_s16(workspace_l, dc_quad);
        vst4_s16(workspace_r, dc_quad);
      }
    }
  }

  /* ---- First pass: right 4x8 half (columns 4-7 of every row) ---- */
  {
    JCOEFPTR coef_right = coef_block + 4;
    ISLOW_MULT_TYPE *quant_right = quantptr + 4;

    int16x4_t quant0 = vld1_s16(quant_right + 0 * DCTSIZE);
    int16x4_t quant1 = vld1_s16(quant_right + 1 * DCTSIZE);
    int16x4_t quant2 = vld1_s16(quant_right + 2 * DCTSIZE);
    int16x4_t quant3 = vld1_s16(quant_right + 3 * DCTSIZE);
    int16x4_t quant4 = vld1_s16(quant_right + 4 * DCTSIZE);
    int16x4_t quant5 = vld1_s16(quant_right + 5 * DCTSIZE);
    int16x4_t quant6 = vld1_s16(quant_right + 6 * DCTSIZE);
    int16x4_t quant7 = vld1_s16(quant_right + 7 * DCTSIZE);

    int16x4_t coef0 = vld1_s16(coef_right + 0 * DCTSIZE);
    int16x4_t coef1 = vld1_s16(coef_right + 1 * DCTSIZE);
    int16x4_t coef2 = vld1_s16(coef_right + 2 * DCTSIZE);
    int16x4_t coef3 = vld1_s16(coef_right + 3 * DCTSIZE);
    int16x4_t coef4 = vld1_s16(coef_right + 4 * DCTSIZE);
    int16x4_t coef5 = vld1_s16(coef_right + 5 * DCTSIZE);
    int16x4_t coef6 = vld1_s16(coef_right + 6 * DCTSIZE);
    int16x4_t coef7 = vld1_s16(coef_right + 7 * DCTSIZE);

    /* Same zero tests as for the left half, computed up front. */
    int16x4_t any_set = vorr_s16(vorr_s16(coef4, coef5),
                                 vorr_s16(coef6, coef7));
    int64_t rows_4567_bits = vget_lane_s64(vreinterpret_s64_s16(any_set), 0);

    any_set = vorr_s16(any_set, vorr_s16(coef1, vorr_s16(coef2, coef3)));
    int64_t ac_bits = vget_lane_s64(vreinterpret_s64_s16(any_set), 0);

    /* ac_bits covers rows 1-7, so non-zero rows 4-7 imply non-zero ac_bits;
     * testing rows 4-7 first therefore picks the same branch as testing
     * ac_bits first.
     */
    if (rows_4567_bits != 0) {
      jsimd_idct_islow_pass1_regular(coef0, coef1, coef2, coef3, coef4, coef5,
                                     coef6, coef7, quant0, quant1, quant2,
                                     quant3, quant4, quant5, quant6, quant7,
                                     workspace_l_hi, workspace_r_hi);
    } else if (ac_bits != 0) {
      jsimd_idct_islow_pass1_sparse(coef0, coef1, coef2, coef3, quant0,
                                    quant1, quant2, quant3, workspace_l_hi,
                                    workspace_r_hi);
    } else {
      /* Rows 1-7 are zero; fold in row 0 to learn whether the whole half is
       * zero.
       */
      any_set = vorr_s16(any_set, coef0);
      right_half_nonzero = vget_lane_s64(vreinterpret_s64_s16(any_set), 0);

      if (right_half_nonzero != 0) {
        int16x4_t dc_scaled = vshl_n_s16(vmul_s16(coef0, quant0), PASS1_BITS);
        int16x4x4_t dc_quad;

        dc_quad.val[0] = dc_scaled;
        dc_quad.val[1] = dc_scaled;
        dc_quad.val[2] = dc_scaled;
        dc_quad.val[3] = dc_scaled;
        /* The interleaving store writes each 4x4 block transposed. */
        vst4_s16(workspace_l_hi, dc_quad);
        vst4_s16(workspace_r_hi, dc_quad);
      }
      /* Otherwise nothing is stored for this half; the zero flag makes the
       * second pass use its "sparse" variant instead.
       */
    }
  }

  /* ---- Second pass: IDCT on the rows held in the workspaces ---- */

  if (right_half_nonzero != 0) {
    jsimd_idct_islow_pass2_regular(workspace_l, output_buf, output_col, 0);
    jsimd_idct_islow_pass2_regular(workspace_r, output_buf, output_col, 4);
  } else {
    /* The right 4x8 half was entirely zero in the first pass. */
    jsimd_idct_islow_pass2_sparse(workspace_l, output_buf, output_col, 0);
    jsimd_idct_islow_pass2_sparse(workspace_r, output_buf, output_col, 4);
  }
}