in simd/powerpc/jquanti-altivec.c [116:250]
/*
 * Quantize one block of coefficients using AltiVec.
 *
 * Eight 16-byte vectors (eight signed 16-bit lanes each) are read from
 * workspace.  For every lane the routine:
 *   1. takes the absolute value,
 *   2. adds the correction found DCTSIZE2 * 2 bytes into divisors,
 *   3. applies MULTIPLY with the reciprocal found at the start of divisors,
 *   4. applies MULTIPLY with the scale found DCTSIZE2 * 4 bytes into divisors,
 *   5. puts the original sign back.
 * The eight resulting vectors are then written to coef_block.
 *
 * All loads are completed before the first store, so the outcome does not
 * depend on whether coef_block overlaps either input.
 *
 * NOTE(review): vec_ld()/vec_st() operate on 16-byte-aligned addresses, so all
 * three pointers are presumably 16-byte aligned -- confirm against callers.
 */
void jsimd_quantize_altivec(JCOEFPTR coef_block, DCTELEM *divisors,
                            DCTELEM *workspace)
{
  /* The block is handled as 8 vectors placed 16 bytes apart. */
  enum { NUM_VECS = 8, VEC_BYTES = 16 };

  __vector short result[NUM_VECS];
  __vector short val, sign, correction, reciprocal, scale;
  /* tmpe, tmpo, and shift_pack_index are not referenced directly below;
   * presumably they are scratch space and a permute pattern used by the
   * MULTIPLY() macro defined elsewhere in this file, so their names must not
   * change -- confirm against the macro definition.
   */
  __vector unsigned int tmpe, tmpo;
  int i, offset;

  /* Constants */
  __vector unsigned short pw_word_bit_m1 = { __8X(WORD_BIT - 1) };
#ifdef __BIG_ENDIAN__
  __vector unsigned char shift_pack_index =
    {  0,  1, 16, 17,  4,  5, 20, 21,  8,  9, 24, 25, 12, 13, 28, 29 };
#else
  __vector unsigned char shift_pack_index =
    {  2,  3, 18, 19,  6,  7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31 };
#endif

  for (i = 0; i < NUM_VECS; i++) {
    offset = i * VEC_BYTES;

    val = vec_ld(offset, workspace);

    /* Branch-less absolute value: the arithmetic shift by WORD_BIT - 1
     * builds a per-lane sign mask, and (x ^ mask) - mask negates exactly the
     * lanes whose mask is all ones.
     */
    sign = vec_sra(val, pw_word_bit_m1);
    val = vec_xor(val, sign);
    val = vec_sub(val, sign);

    correction = vec_ld(DCTSIZE2 * 2 + offset, divisors);
    val = vec_add(val, correction);

    reciprocal = vec_ld(offset, divisors);
    MULTIPLY(val, reciprocal, val);

    scale = vec_ld(DCTSIZE2 * 4 + offset, divisors);
    MULTIPLY(val, scale, val);

    /* Re-apply the sign that was stripped above (same xor/subtract trick). */
    val = vec_xor(val, sign);
    result[i] = vec_sub(val, sign);
  }

  /* Stores are deferred until every input vector has been read. */
  for (i = 0; i < NUM_VECS; i++) {
    vec_st(result[i], i * VEC_BYTES, coef_block);
  }
}