void jsimd_idct_ifast_altivec()

in simd/powerpc/jidctfst-altivec.c [112:255]


/*
 * Inverse DCT of one 8x8 coefficient block using AltiVec intrinsics (the
 * "ifast" variant, per the function name).
 *
 * dct_table_   multiplier table; accessed as short and read as up to eight
 *              vectors of eight shorts (byte offsets 0..112)
 * coef_block   input coefficients; read as eight vectors of eight shorts
 *              (byte offsets 0..112)
 * output_buf   array of output row pointers; entries 0..7 are used
 * output_col   offset added to each row pointer before storing
 *
 * Eight bytes are stored at output_buf[n] + output_col for n = 0..7.
 * Nothing is returned.
 *
 * NOTE(review): vec_ld ignores the low four bits of the effective address, so
 * coef_block and dct_table are presumably 16-byte aligned -- confirm against
 * the callers.
 *
 * NOTE(review): DO_IDCT and TRANSPOSE are macros defined outside this
 * function.  From their use here they appear to operate on the numbered
 * variable families (col0..7, row0..7, out0..7) by name, and the locals that
 * are never referenced directly below (tmp0, tmp4..tmp13, z*, pw_F*,
 * pre_multiply_scale_bits) are presumably consumed by DO_IDCT.  Do not rename
 * or remove locals without checking the macro definitions.
 */
void jsimd_idct_ifast_altivec(void *dct_table_, JCOEFPTR coef_block,
                              JSAMPARRAY output_buf, JDIMENSION output_col)
{
  short *dct_table = (short *)dct_table_;
  int *outptr;

  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
    col0, col1, col2, col3, col4, col5, col6, col7,
    quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,
    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
    z5, z10, z10s, z11, z12s, z13,
    out0, out1, out2, out3, out4, out5, out6, out7;
  __vector signed char outb;

  /* Constants */
  /* For pw_MF1613 the negated constant is converted to unsigned short before
   * the shift, so the left shift is never applied to a negative value; the
   * result is then converted back to short.
   */
  __vector short pw_zero = { __8X(0) },
    pw_F1414 = { __8X(F_1_414 << CONST_SHIFT) },
    pw_F1847 = { __8X(F_1_847 << CONST_SHIFT) },
    pw_MF1613 = { __8X((short)((unsigned short)(-F_1_613) << CONST_SHIFT)) },
    pw_F1082 = { __8X(F_1_082 << CONST_SHIFT) };
  __vector unsigned short
    pre_multiply_scale_bits = { __8X(PRE_MULTIPLY_SCALE_BITS) },
    pass1_bits3 = { __8X(PASS1_BITS + 3) };
  __vector signed char pb_centerjsamp = { __16X((signed char)CENTERJSAMPLE) };

  /* Pass 1: process columns */

  /* Load the whole coefficient block: eight 16-byte vectors. */
  col0 = vec_ld(0, coef_block);
  col1 = vec_ld(16, coef_block);
  col2 = vec_ld(32, coef_block);
  col3 = vec_ld(48, coef_block);
  col4 = vec_ld(64, coef_block);
  col5 = vec_ld(80, coef_block);
  col6 = vec_ld(96, coef_block);
  col7 = vec_ld(112, coef_block);

  /* OR together vectors 1..7 (everything except the first one) so that a
   * single comparison against zero can detect the all-zero case below.
   */
  tmp1 = vec_or(col1, col2);
  tmp2 = vec_or(col3, col4);
  tmp1 = vec_or(tmp1, tmp2);
  tmp3 = vec_or(col5, col6);
  tmp3 = vec_or(tmp3, col7);
  tmp1 = vec_or(tmp1, tmp3);

  /* The first vector is scaled by its table entries on both paths.  With a
   * zero addend, vec_mladd acts as an element-wise multiply.
   */
  quant0 = vec_ld(0, dct_table);
  col0 = vec_mladd(col0, quant0, pw_zero);

  if (vec_all_eq(tmp1, pw_zero)) {
    /* AC terms all zero */

    /* Shortcut: skip the first DO_IDCT/TRANSPOSE and the remaining table
     * loads.  rowN receives element N of the scaled first vector replicated
     * across all eight lanes.
     */
    row0 = vec_splat(col0, 0);
    row1 = vec_splat(col0, 1);
    row2 = vec_splat(col0, 2);
    row3 = vec_splat(col0, 3);
    row4 = vec_splat(col0, 4);
    row5 = vec_splat(col0, 5);
    row6 = vec_splat(col0, 6);
    row7 = vec_splat(col0, 7);

  } else {

    /* General path: load the rest of the table and scale vectors 1..7. */
    quant1 = vec_ld(16, dct_table);
    quant2 = vec_ld(32, dct_table);
    quant3 = vec_ld(48, dct_table);
    quant4 = vec_ld(64, dct_table);
    quant5 = vec_ld(80, dct_table);
    quant6 = vec_ld(96, dct_table);
    quant7 = vec_ld(112, dct_table);

    col1 = vec_mladd(col1, quant1, pw_zero);
    col2 = vec_mladd(col2, quant2, pw_zero);
    col3 = vec_mladd(col3, quant3, pw_zero);
    col4 = vec_mladd(col4, quant4, pw_zero);
    col5 = vec_mladd(col5, quant5, pw_zero);
    col6 = vec_mladd(col6, quant6, pw_zero);
    col7 = vec_mladd(col7, quant7, pw_zero);

    /* NOTE(review): presumably reads col0..col7 and writes out0..out7, which
     * TRANSPOSE then rearranges into row0..row7 -- confirm in the macros.
     */
    DO_IDCT(col);

    TRANSPOSE(out, row);
  }

  /* Pass 2: process rows */

  DO_IDCT(row);

  /* Arithmetic right shift of every result by PASS1_BITS + 3. */
  out0 = vec_sra(out0, pass1_bits3);
  out1 = vec_sra(out1, pass1_bits3);
  out2 = vec_sra(out2, pass1_bits3);
  out3 = vec_sra(out3, pass1_bits3);
  out4 = vec_sra(out4, pass1_bits3);
  out5 = vec_sra(out5, pass1_bits3);
  out6 = vec_sra(out6, pass1_bits3);
  out7 = vec_sra(out7, pass1_bits3);

  TRANSPOSE(out, col);

  /* Store stage, repeated for n = 0..7:
   *   - vec_packs(colN, colN) narrows the eight shorts to signed bytes with
   *     saturation; packing the vector with itself places the same eight
   *     bytes in both halves of outb.
   *   - pb_centerjsamp (CENTERJSAMPLE as a signed char) is added to every
   *     byte.
   *   - two 4-byte vec_ste stores at byte offsets 0 and 4 write eight bytes
   *     to output_buf[n] + output_col.
   * NOTE(review): vec_ste selects which vector element to store from the low
   * bits of the target address; the duplicated halves presumably make this
   * correct for any 8-byte-aligned destination -- confirm that callers
   * guarantee that alignment.
   */
  outb = vec_packs(col0, col0);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[0] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col1, col1);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[1] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col2, col2);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[2] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col3, col3);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[3] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col4, col4);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[4] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col5, col5);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[5] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col6, col6);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[6] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col7, col7);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[7] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);
}