in NYC_Summit18_Developer_Workshop/workspace/IDCT/src/idct.cpp [685:826]
void idctSoft(const int16_t block[64],
const uint16_t q[64],
int16_t outp[64],
bool ignore_dc) {
int32_t intermed[64];
const uint16_t w1 = 2841; // 2048*sqrt(2)*cos(1*pi/16)
const uint16_t w2 = 2676; // 2048*sqrt(2)*cos(2*pi/16)
const uint16_t w3 = 2408; // 2048*sqrt(2)*cos(3*pi/16)
const uint16_t w5 = 1609; // 2048*sqrt(2)*cos(5*pi/16)
const uint16_t w6 = 1108; // 2048*sqrt(2)*cos(6*pi/16)
const uint16_t w7 = 565; // 2048*sqrt(2)*cos(7*pi/16)
const uint16_t w1pw7 = w1 + w7;
const uint16_t w1mw7 = w1 - w7;
const uint16_t w2pw6 = w2 + w6;
const uint16_t w2mw6 = w2 - w6;
const uint16_t w3pw5 = w3 + w5;
const uint16_t w3mw5 = w3 - w5;
const uint16_t r2 = 181; // 256/sqrt(2)
// Horizontal 1-D IDCT.
for (int y = 0; y < 8; ++y) {
int y8 = y * 8;
int32_t x0 = (((ignore_dc && y == 0)
? 0 : (block[y8 + 0] * q[y8 + 0]) << 11)) + 128;
int32_t x1 = (block[y8 + 4] * q[y8 + 4]) << 11;
int32_t x2 = block[y8 + 6] * q[y8 + 6];
int32_t x3 = block[y8 + 2] * q[y8 + 2];
int32_t x4 = block[y8 + 1] * q[y8 + 1];
int32_t x5 = block[y8 + 7] * q[y8 + 7];
int32_t x6 = block[y8 + 5] * q[y8 + 5];
int32_t x7 = block[y8 + 3] * q[y8 + 3];
// If all the AC components are zero, then the IDCT is trivial.
if (x1 ==0 && x2 == 0 && x3 == 0 && x4 == 0 && x5 == 0 && x6 == 0 && x7 == 0) {
int32_t dc = (x0 - 128) >> 8; // coefficients[0] << 3
intermed[y8 + 0] = dc;
intermed[y8 + 1] = dc;
intermed[y8 + 2] = dc;
intermed[y8 + 3] = dc;
intermed[y8 + 4] = dc;
intermed[y8 + 5] = dc;
intermed[y8 + 6] = dc;
intermed[y8 + 7] = dc;
continue;
}
// Prescale.
// Stage 1.
int32_t x8 = w7 * (x4 + x5);
x4 = x8 + w1mw7*x4;
x5 = x8 - w1pw7*x5;
x8 = w3 * (x6 + x7);
x6 = x8 - w3mw5*x6;
x7 = x8 - w3pw5*x7;
// Stage 2.
x8 = x0 + x1;
x0 -= x1;
x1 = w6 * (x3 + x2);
x2 = x1 - w2pw6*x2;
x3 = x1 + w2mw6*x3;
x1 = x4 + x6;
x4 -= x6;
x6 = x5 + x7;
x5 -= x7;
// Stage 3.
x7 = x8 + x3;
x8 -= x3;
x3 = x0 + x2;
x0 -= x2;
x2 = (r2*(x4+x5) + 128) >> 8;
x4 = (r2*(x4-x5) + 128) >> 8;
// Stage 4.
intermed[y8+0] = (x7 + x1) >> 8;
intermed[y8+1] = (x3 + x2) >> 8;
intermed[y8+2] = (x0 + x4) >> 8;
intermed[y8+3] = (x8 + x6) >> 8;
intermed[y8+4] = (x8 - x6) >> 8;
intermed[y8+5] = (x0 - x4) >> 8;
intermed[y8+6] = (x3 - x2) >> 8;
intermed[y8+7] = (x7 - x1) >> 8;
}
// Vertical 1-D IDCT.
for (int32_t x = 0; x < 8; ++x) {
// Similar to the horizontal 1-D IDCT case, if all the AC components are zero, then the IDCT is trivial.
// However, after performing the horizontal 1-D IDCT, there are typically non-zero AC components, so
// we do not bother to check for the all-zero case.
// Prescale.
int32_t y0 = (intermed[8*0+x] << 8) + 8192;
int32_t y1 = intermed[8*4+x] << 8;
int32_t y2 = intermed[8*6+x];
int32_t y3 = intermed[8*2+x];
int32_t y4 = intermed[8*1+x];
int32_t y5 = intermed[8*7+x];
int32_t y6 = intermed[8*5+x];
int32_t y7 = intermed[8*3+x];
// Stage 1.
int32_t y8 = w7*(y4+y5) + 4;
y4 = (y8 + w1mw7*y4) >> 3;
y5 = (y8 - w1pw7*y5) >> 3;
y8 = w3*(y6+y7) + 4;
y6 = (y8 - w3mw5*y6) >> 3;
y7 = (y8 - w3pw5*y7) >> 3;
// Stage 2.
y8 = y0 + y1;
y0 -= y1;
y1 = w6*(y3+y2) + 4;
y2 = (y1 - w2pw6*y2) >> 3;
y3 = (y1 + w2mw6*y3) >> 3;
y1 = y4 + y6;
y4 -= y6;
y6 = y5 + y7;
y5 -= y7;
// Stage 3.
y7 = y8 + y3;
y8 -= y3;
y3 = y0 + y2;
y0 -= y2;
y2 = (r2*(y4+y5) + 128) >> 8;
y4 = (r2*(y4-y5) + 128) >> 8;
// Stage 4.
outp[8*0+x] = (y7 + y1) >> 11;
outp[8*1+x] = (y3 + y2) >> 11;
outp[8*2+x] = (y0 + y4) >> 11;
outp[8*3+x] = (y8 + y6) >> 11;
outp[8*4+x] = (y8 - y6) >> 11;
outp[8*5+x] = (y0 - y4) >> 11;
outp[8*6+x] = (y3 - y2) >> 11;
outp[8*7+x] = (y7 - y1) >> 11;
}
}