in d3d/archive/images/d3d11/tessellator.cpp [101:289]
INT32 floatToIDotF( const float& input )
{
    // Converts a 32-bit IEEE-754 float to a 32-bit fixed point value by
    // operating directly on the float's bit pattern (integer and bitwise
    // operations only).
    //
    // c_uIBits, c_uFBits and c_bSigned are not declared inside this function;
    // presumably they are compile-time (template) parameters of the enclosing
    // declaration -- confirm against the full file.
    //
    // Result, in the order the cases are tested:
    //   NaN                                   -> 0
    //   positive and >= largest fixed value   -> c_iMaxResult (saturate)
    //   negative and <= smallest fixed value  -> c_iMinResult (saturate)
    //   exponent below -(c_uFBits + 1)        -> 0
    //   anything else                         -> nearest fixed point value,
    //                                            ties rounded to even
    //
    // ------------------------------------------------------------------------
    // output fixed point format
    // 32-bit result:
    //
    // [sign-extend]i.f
    // |              |
    // MSB(31)...LSB(0)
    //
    // f             fractional part of the number, an unsigned
    //               value with c_uFBits bits
    //
    // .             implied decimal
    //
    // i             integer part of the number, a 2's complement
    //               value with c_uIBits bits
    //
    // [sign-extend] MSB of i conditionally replicated
    //
    // ------------------------------------------------------------------------
    // Define fixed point bit counts
    //
    // Commenting out C_ASSERT below to minimise #includes:
    // C_ASSERT( 2 <= c_uIBits && c_uIBits <= 32 && c_uFBits <= 32 && c_uIBits + c_uFBits <= 32 );

    // Define most negative and most positive fixed point values.
    //   signed:   min = -2^(i+f-1),  max = 2^(i+f-1) - 1  (== ~min)
    //   unsigned: min = 0,           max = 2^(i+f) - 1
    // The unsigned maximum must NOT be derived as ~min: that yields all ones
    // (-1 as INT32), which is only correct when i + f == 32.
    const INT32 c_iMinResult = (c_bSigned ? INT32( -1 ) << (c_uIBits + c_uFBits - 1) : 0);
    const INT32 c_iMaxResult = (c_bSigned ? ~c_iMinResult
                                          : INT32( ~0u >> (32 - (c_uIBits + c_uFBits)) ));

    // ------------------------------------------------------------------------
    // constant float properties
    // ------------------------------------------------------------------------
    const UINT8 _fltMantissaBitCount = 23;
    const UINT8 _fltExponentBitCount = 8;
    const INT32 _fltExponentBias = (INT32( 1 ) << (_fltExponentBitCount - 1)) - 1;
    const INT32 _fltHiddenBit = INT32( 1 ) << _fltMantissaBitCount;
    const INT32 _fltMantissaMask = _fltHiddenBit - 1;
    const INT32 _fltExponentMask = ((INT32( 1 ) << _fltExponentBitCount) - 1) << _fltMantissaBitCount;
    const INT32 _fltSignBit = INT32( 1 ) << (_fltExponentBitCount + _fltMantissaBitCount);

    // ------------------------------------------------------------------------
    // define min and max values as floats (clamp to these bounds)
    //
    // Both bounds are stored as float bit patterns WITHOUT the sign bit so
    // that they can be compared against the input with integer compares.
    // ------------------------------------------------------------------------
    INT32 _fxpMaxPosValueFloat;
    INT32 _fxpMaxNegValueFloat;
    if (c_bSigned)
    {
        // The maximum positive fixed point value is 2^(i-1) - 2^(-f).
        // Start from the float bit pattern of 2^(i-1) ...
        _fxpMaxPosValueFloat = (_fltExponentBias + c_uIBits - 1) << _fltMantissaBitCount;
        // ... and, if 2^(-f) is representable at that magnitude, step the
        // pattern down by the mantissa bit whose weight is 2^(-f). When it is
        // not representable (iShift < 0) the bound stays at 2^(i-1).
        const INT32 iShift = _fltMantissaBitCount + 2 - c_uIBits - c_uFBits;
        if (iShift >= 0)
        {
            // assert( iShift < 32 );
#pragma warning( suppress : 4293 )
            _fxpMaxPosValueFloat -= INT32( 1 ) << iShift;
        }
        // The maximum negative fixed point value is -2^(i-1).
        // This is the float bit pattern of its magnitude (sign bit omitted).
        _fxpMaxNegValueFloat = (_fltExponentBias + c_uIBits - 1) << _fltMantissaBitCount;
    }
    else
    {
        // The maximum positive fixed point value is 2^(i) - 2^(-f).
        // Same construction as the signed case, one power of two higher.
        _fxpMaxPosValueFloat = (_fltExponentBias + c_uIBits) << _fltMantissaBitCount;
        const INT32 iShift = _fltMantissaBitCount + 1 - c_uIBits - c_uFBits;
        if (iShift >= 0)
        {
            // assert( iShift < 32 );
#pragma warning( suppress : 4293 )
            _fxpMaxPosValueFloat -= INT32( 1 ) << iShift;
        }
        // The maximum negative fixed point value is 0, so every negative
        // input (including -0) saturates to c_iMinResult == 0.
        _fxpMaxNegValueFloat = 0;
    }

    // ------------------------------------------------------------------------
    // float -> fixed conversion
    // ------------------------------------------------------------------------
    // ------------------------------------------------------------------------
    // examine input float
    // ------------------------------------------------------------------------
    // Reinterpret the float's bits as an integer (assumes float and INT32 are
    // both 32 bits wide and that the compiler tolerates this type pun).
    INT32 output = *(INT32*)&input;
    INT32 unbiasedExponent = ((output & _fltExponentMask) >> _fltMantissaBitCount) - _fltExponentBias;
    INT32 isNegative = output & _fltSignBit; // non-zero iff the sign bit is set

    // ------------------------------------------------------------------------
    // nan
    // ------------------------------------------------------------------------
    if (unbiasedExponent == (_fltExponentBias + 1) && (output & _fltMantissaMask))
    {
        // nan converts to 0
        output = 0;
    }
    // ------------------------------------------------------------------------
    // too large positive
    // ------------------------------------------------------------------------
    // Integer compare: for non-negative floats the bit patterns order the
    // same way as the values, so this also catches +infinity.
    else if (!isNegative && output >= _fxpMaxPosValueFloat)
    {
        output = c_iMaxResult;
    }
    // ------------------------------------------------------------------------
    // too large negative
    // ------------------------------------------------------------------------
    // Integer compare on the magnitude (sign bit masked off).
    else if (isNegative && (output & ~_fltSignBit) >= _fxpMaxNegValueFloat)
    {
        output = c_iMinResult;
    }
    // ------------------------------------------------------------------------
    // too small
    // ------------------------------------------------------------------------
    // Magnitude is below half of the smallest fixed point step; zeros and
    // denormals (exponent field 0) also land here.
    else if (unbiasedExponent < -c_uFBits - 1)
    {
        // clamp to 0
        output = 0;
    }
    // ------------------------------------------------------------------------
    // within range
    // ------------------------------------------------------------------------
    else
    {
        // copy mantissa, add hidden bit
        output = (output & _fltMantissaMask) | _fltHiddenBit;

        // Number of low mantissa bits that lie below the fixed point LSB.
        //
        // From the range checks above, unbiasedExponent <= c_uIBits - 1, so
        //     extraBits >= (_fltMantissaBitCount + 1) - (c_uIBits + c_uFBits).
        // When c_uIBits + c_uFBits <= _fltMantissaBitCount + 1 this is never
        // negative and only the right-shift path below is taken. Wider
        // formats can produce a negative value, handled by the left-shift
        // path.
        INT32 extraBits = _fltMantissaBitCount - c_uFBits - unbiasedExponent;
        if (extraBits >= 0)
        {
            // 2's complement if negative
            if (isNegative)
            {
                output = ~output + 1;
            }

            INT32 LSB = 1 << extraBits;         // last bit being kept
            INT32 extraBitsMask = LSB - 1;      // bits that will be discarded
            INT32 half = LSB >> 1;              // round bias

            // round to nearest-even at LSB: add the bias when the kept LSB is
            // odd (so an exact tie carries up to even) or when the discarded
            // bits are strictly more than half; an exact tie with an even LSB
            // is left to truncate.
            if ((output & LSB) || (output & extraBitsMask) > half)
            {
                output += half;
            }

            // shift off the extra bits (sign extending)
            output >>= extraBits;
        }
        else
        {
            // The value has fewer fractional bits than the fixed point
            // format: scale up, no rounding required.
            output <<= -extraBits;

            // 2's complement if negative
            if (isNegative)
            {
                output = ~output + 1;
            }
        }
    }
    return output;
}