in simd/arm/jidctint-neon.c [96:322]
/* Forward declaration: first-pass helper for one 4x8 half of the DCT block,
 * taking all eight coefficient rows (row0-row7) and the matching eight rows
 * of the quantization table (quant_row0-quant_row7).
 *
 * jsimd_idct_islow_neon() calls this when rows 4-7 of the half contain at
 * least one non-zero coefficient.  workspace_1 and workspace_2 are the
 * destinations for the intermediate data; the caller passes its left and right
 * workspace buffers (advanced by 4 * DCTSIZE / 2 elements when processing the
 * right half).
 */
static INLINE void jsimd_idct_islow_pass1_regular(int16x4_t row0,
int16x4_t row1,
int16x4_t row2,
int16x4_t row3,
int16x4_t row4,
int16x4_t row5,
int16x4_t row6,
int16x4_t row7,
int16x4_t quant_row0,
int16x4_t quant_row1,
int16x4_t quant_row2,
int16x4_t quant_row3,
int16x4_t quant_row4,
int16x4_t quant_row5,
int16x4_t quant_row6,
int16x4_t quant_row7,
int16_t *workspace_1,
int16_t *workspace_2);
/* Forward declaration: "sparse" first-pass helper for one 4x8 half of the DCT
 * block.  It receives only rows 0-3 and their quantization rows.
 *
 * jsimd_idct_islow_neon() calls this when rows 4-7 of the half are all zero
 * but at least one of rows 1-3 is not.  workspace_1 and workspace_2 are the
 * destinations for the intermediate data, passed in the same way as for the
 * "regular" first-pass helper.
 */
static INLINE void jsimd_idct_islow_pass1_sparse(int16x4_t row0,
int16x4_t row1,
int16x4_t row2,
int16x4_t row3,
int16x4_t quant_row0,
int16x4_t quant_row1,
int16x4_t quant_row2,
int16x4_t quant_row3,
int16_t *workspace_1,
int16_t *workspace_2);
/* Forward declaration: "regular" second-pass helper operating on one workspace
 * buffer produced by the first pass.
 *
 * jsimd_idct_islow_neon() calls it once per workspace buffer, with
 * buf_offset = 0 for the left buffer and buf_offset = 4 for the right buffer.
 * NOTE(review): buf_offset presumably selects which part of output_buf the
 * results are written to -- confirm against the definition.
 */
static INLINE void jsimd_idct_islow_pass2_regular(int16_t *workspace,
JSAMPARRAY output_buf,
JDIMENSION output_col,
unsigned buf_offset);
/* Forward declaration: "sparse" second-pass helper operating on one workspace
 * buffer produced by the first pass.
 *
 * jsimd_idct_islow_neon() selects this variant for both workspace buffers when
 * every coefficient in the right 4x8 block was zero during the first pass; it
 * is called with buf_offset = 0 for the left buffer and buf_offset = 4 for the
 * right buffer.
 * NOTE(review): buf_offset presumably selects which part of output_buf the
 * results are written to -- confirm against the definition.
 */
static INLINE void jsimd_idct_islow_pass2_sparse(int16_t *workspace,
JSAMPARRAY output_buf,
JDIMENSION output_col,
unsigned buf_offset);
/* Perform dequantization and inverse DCT on one block of coefficients. For
* reference, the C implementation (jpeg_idct_islow()) can be found in
* jidctint.c.
*
* Optimization techniques used for fast data access:
*
* In each pass, the inverse DCT is computed for the left and right 4x8 halves
* of the DCT block. This avoids spilling due to register pressure, and the
* increased granularity allows for an optimized calculation depending on the
* values of the DCT coefficients. Between passes, intermediate data is stored
* in 4x8 workspace buffers.
*
* Transposing the 8x8 DCT block after each pass can be achieved by transposing
* each of the four 4x4 quadrants and swapping quadrants 1 and 2 (refer to the
* diagram below.) Swapping quadrants is cheap, since the second pass can just
* swap the workspace buffer pointers.
*
*    +-------+-------+                   +-------+-------+
*    |       |       |                   |       |       |
*    |   0   |   1   |                   |   0   |   2   |
*    |       |       |    transpose      |       |       |
*    +-------+-------+     ------>       +-------+-------+
*    |       |       |                   |       |       |
*    |   2   |   3   |                   |   1   |   3   |
*    |       |       |                   |       |       |
*    +-------+-------+                   +-------+-------+
*
* Optimization techniques used to accelerate the inverse DCT calculation:
*
* In a DCT coefficient block, the coefficients are increasingly likely to be 0
* as you move diagonally from top left to bottom right. If whole rows of
* coefficients are 0, then the inverse DCT calculation can be simplified. On
* the first pass of the inverse DCT, we test for three special cases before
* defaulting to a full "regular" inverse DCT:
*
* 1) Coefficients in rows 4-7 are all zero. In this case, we perform a
* "sparse" simplified inverse DCT on rows 0-3.
* 2) AC coefficients (rows 1-7) are all zero. In this case, the inverse DCT
* result is equal to the dequantized DC coefficients.
* 3) AC and DC coefficients are all zero. In this case, the inverse DCT
* result is all zero. For the left 4x8 half, this is handled identically
* to Case 2 above. For the right 4x8 half, we do no work and signal that
* the "sparse" algorithm is required for the second pass.
*
* In the second pass, only a single special case is tested: whether the AC and
* DC coefficients were all zero in the right 4x8 block during the first pass
* (refer to Case 3 above.) If this is the case, then a "sparse" variant of
* the second pass is performed for both the left and right halves of the DCT
* block. (The transposition after the first pass means that the right 4x8
* block during the first pass becomes rows 4-7 during the second pass.)
*/
void jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
                           JSAMPARRAY output_buf, JDIMENSION output_col)
{
  ISLOW_MULT_TYPE *quantptr = dct_table;

  /* Intermediate data handed from the first pass to the second pass. */
  int16_t ws_left[8 * DCTSIZE / 2];
  int16_t ws_right[8 * DCTSIZE / 2];
  /* Second half of each buffer; filled while processing the right 4x8 block. */
  int16_t *ws_left_hi = ws_left + 4 * DCTSIZE / 2;
  int16_t *ws_right_hi = ws_right + 4 * DCTSIZE / 2;

  /* Remains non-zero unless every coefficient of the right 4x8 block is zero.
   * A zero value selects the "sparse" second pass.
   */
  int64_t right_all_bits = 1;

  /* ---- First pass: left 4x8 block (columns 0-3 of every row) ---- */
  {
    /* DCT coefficients. */
    int16x4_t c0 = vld1_s16(coef_block + 0 * DCTSIZE);
    int16x4_t c1 = vld1_s16(coef_block + 1 * DCTSIZE);
    int16x4_t c2 = vld1_s16(coef_block + 2 * DCTSIZE);
    int16x4_t c3 = vld1_s16(coef_block + 3 * DCTSIZE);
    int16x4_t c4 = vld1_s16(coef_block + 4 * DCTSIZE);
    int16x4_t c5 = vld1_s16(coef_block + 5 * DCTSIZE);
    int16x4_t c6 = vld1_s16(coef_block + 6 * DCTSIZE);
    int16x4_t c7 = vld1_s16(coef_block + 7 * DCTSIZE);

    /* Quantization table entries. */
    int16x4_t q0 = vld1_s16(quantptr + 0 * DCTSIZE);
    int16x4_t q1 = vld1_s16(quantptr + 1 * DCTSIZE);
    int16x4_t q2 = vld1_s16(quantptr + 2 * DCTSIZE);
    int16x4_t q3 = vld1_s16(quantptr + 3 * DCTSIZE);
    int16x4_t q4 = vld1_s16(quantptr + 4 * DCTSIZE);
    int16x4_t q5 = vld1_s16(quantptr + 5 * DCTSIZE);
    int16x4_t q6 = vld1_s16(quantptr + 6 * DCTSIZE);
    int16x4_t q7 = vld1_s16(quantptr + 7 * DCTSIZE);

    /* OR rows 4-7 together: the 64-bit view is zero only if all are zero. */
    int16x4_t acc = vorr_s16(vorr_s16(vorr_s16(c7, c6), c5), c4);
    int64_t lower_rows_bits = vget_lane_s64(vreinterpret_s64_s16(acc), 0);

    if (lower_rows_bits != 0) {
      /* Something in rows 4-7 is set: full first pass. */
      jsimd_idct_islow_pass1_regular(c0, c1, c2, c3, c4, c5, c6, c7,
                                     q0, q1, q2, q3, q4, q5, q6, q7,
                                     ws_left, ws_right);
    } else {
      /* Extend the test to rows 1-3, i.e. to all AC rows. */
      acc = vorr_s16(vorr_s16(vorr_s16(acc, c3), c2), c1);
      int64_t ac_bits = vget_lane_s64(vreinterpret_s64_s16(acc), 0);

      if (ac_bits != 0) {
        /* Only rows 0-3 carry data: simplified first pass. */
        jsimd_idct_islow_pass1_sparse(c0, c1, c2, c3, q0, q1, q2, q3,
                                      ws_left, ws_right);
      } else {
        /* DC only: the result is the dequantized, scaled DC row. */
        int16x4_t dequant_dc = vmul_s16(c0, q0);
        int16x4_t dc = vshl_n_s16(dequant_dc, PASS1_BITS);
        int16x4x4_t tile = { { dc, dc, dc, dc } };
        /* The interleaving store transposes the 4x4 tile as it is written. */
        vst4_s16(ws_left, tile);
        vst4_s16(ws_right, tile);
      }
    }
  }

  /* ---- First pass: right 4x8 block (columns 4-7 of every row) ---- */
  {
    JCOEFPTR rcoef = coef_block + 4;
    ISLOW_MULT_TYPE *rquant = quantptr + 4;

    /* DCT coefficients. */
    int16x4_t c0 = vld1_s16(rcoef + 0 * DCTSIZE);
    int16x4_t c1 = vld1_s16(rcoef + 1 * DCTSIZE);
    int16x4_t c2 = vld1_s16(rcoef + 2 * DCTSIZE);
    int16x4_t c3 = vld1_s16(rcoef + 3 * DCTSIZE);
    int16x4_t c4 = vld1_s16(rcoef + 4 * DCTSIZE);
    int16x4_t c5 = vld1_s16(rcoef + 5 * DCTSIZE);
    int16x4_t c6 = vld1_s16(rcoef + 6 * DCTSIZE);
    int16x4_t c7 = vld1_s16(rcoef + 7 * DCTSIZE);

    /* Quantization table entries. */
    int16x4_t q0 = vld1_s16(rquant + 0 * DCTSIZE);
    int16x4_t q1 = vld1_s16(rquant + 1 * DCTSIZE);
    int16x4_t q2 = vld1_s16(rquant + 2 * DCTSIZE);
    int16x4_t q3 = vld1_s16(rquant + 3 * DCTSIZE);
    int16x4_t q4 = vld1_s16(rquant + 4 * DCTSIZE);
    int16x4_t q5 = vld1_s16(rquant + 5 * DCTSIZE);
    int16x4_t q6 = vld1_s16(rquant + 6 * DCTSIZE);
    int16x4_t q7 = vld1_s16(rquant + 7 * DCTSIZE);

    /* Zero tests for rows 4-7 and for all AC rows (1-7). */
    int16x4_t acc = vorr_s16(vorr_s16(vorr_s16(c7, c6), c5), c4);
    int64_t lower_rows_bits = vget_lane_s64(vreinterpret_s64_s16(acc), 0);
    acc = vorr_s16(vorr_s16(vorr_s16(acc, c3), c2), c1);
    int64_t ac_bits = vget_lane_s64(vreinterpret_s64_s16(acc), 0);

    if (ac_bits != 0) {
      if (lower_rows_bits != 0) {
        jsimd_idct_islow_pass1_regular(c0, c1, c2, c3, c4, c5, c6, c7,
                                       q0, q1, q2, q3, q4, q5, q6, q7,
                                       ws_left_hi, ws_right_hi);
      } else {
        jsimd_idct_islow_pass1_sparse(c0, c1, c2, c3, q0, q1, q2, q3,
                                      ws_left_hi, ws_right_hi);
      }
    } else {
      /* No AC data: fold in row 0 to learn whether the block is all zero. */
      acc = vorr_s16(acc, c0);
      right_all_bits = vget_lane_s64(vreinterpret_s64_s16(acc), 0);

      if (right_all_bits != 0) {
        /* DC only: the result is the dequantized, scaled DC row. */
        int16x4_t dequant_dc = vmul_s16(c0, q0);
        int16x4_t dc = vshl_n_s16(dequant_dc, PASS1_BITS);
        int16x4x4_t tile = { { dc, dc, dc, dc } };
        /* The interleaving store transposes the 4x4 tile as it is written. */
        vst4_s16(ws_left_hi, tile);
        vst4_s16(ws_right_hi, tile);
      }
      /* Otherwise nothing is stored; the sparse second pass is used below. */
    }
  }

  /* ---- Second pass: one call per workspace buffer ---- */
  if (right_all_bits != 0) {
    jsimd_idct_islow_pass2_regular(ws_left, output_buf, output_col, 0);
    jsimd_idct_islow_pass2_regular(ws_right, output_buf, output_col, 4);
  } else {
    /* The right 4x8 block was entirely zero in the first pass. */
    jsimd_idct_islow_pass2_sparse(ws_left, output_buf, output_col, 0);
    jsimd_idct_islow_pass2_sparse(ws_right, output_buf, output_col, 4);
  }
}