in Kits/DirectXTex/DirectXTexConvert.cpp [3906:4368]
bool DirectX::_StoreScanlineDither(
    void* pDestination,
    size_t size,
    DXGI_FORMAT format,
    XMVECTOR* pSource,
    size_t count,
    float threshold,
    size_t y,
    size_t z,
    XMVECTOR* pDiffusionErrors) noexcept
{
    assert(pDestination != nullptr);
    assert(IsValid(format) && !IsTypeless(format) && !IsCompressed(format) && !IsPlanar(format) && !IsPalettized(format));

    if (!size || !count)
        return false;

    const XMVECTOR* __restrict sPtr = pSource;
    if (!sPtr)
        return false;

    assert((reinterpret_cast<uintptr_t>(pSource) & 0xF) == 0);

    XMVECTOR ordered[4];
    if (pDiffusionErrors)
    {
        // If pDiffusionErrors != 0, this function performs error-diffusion dithering (a.k.a. Floyd-Steinberg dithering).
        // To avoid the need for another temporary scanline buffer, we allow this function to overwrite the source buffer in-place.
        // Given the intended usage in the conversion routines, this is not a problem.
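        // The error buffer holds count + 2 vectors: this scanline reads slots [1, count],
        // while the distribution code below writes slots [0, count + 1], so the extra slot
        // at each end absorbs spill at the row edges without per-pixel bounds checks.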
        XMVECTOR* ptr = pSource;
        const XMVECTOR* err = pDiffusionErrors + 1;
        for (size_t i = 0; i < count; ++i)
        {
            // Add contribution from previous scanline
            XMVECTOR v = XMVectorAdd(*ptr, *err++);
            *ptr++ = v;
        }

        // Reset errors for next scanline
        memset(pDiffusionErrors, 0, sizeof(XMVECTOR) * (count + 2));
    }
    else
    {
        // If pDiffusionErrors == 0, then this function performs ordered dithering
        XMVECTOR dither = XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(g_Dither + (z & 3) + ((y & 3) * 8)));

        ordered[0] = XMVectorSplatX(dither);
        ordered[1] = XMVectorSplatY(dither);
        ordered[2] = XMVectorSplatZ(dither);
        ordered[3] = XMVectorSplatW(dither);
    }
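
    // The four splatted dither offsets are cycled per-pixel below via ordered[index & 3];
    // combined with the (y & 3) / (z & 3) table lookup above, this tiles a small
    // ordered-dither pattern across columns, rows, and slices.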

    const void* ePtr = static_cast<const uint8_t*>(pDestination) + size;

#ifdef _PREFAST_
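    // Analysis-only dummy write so PREfast treats pDestination as a written buffer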
    *reinterpret_cast<uint8_t*>(pDestination) = 0;
#endif

    XMVECTOR vError = XMVectorZero();

    switch (static_cast<int>(format))
    {
    case DXGI_FORMAT_R16G16B16A16_UNORM:
        STORE_SCANLINE(XMUSHORTN4, g_Scale16pc, true, true, uint16_t, 0xFFFF, y, false)

    case DXGI_FORMAT_R16G16B16A16_UINT:
        STORE_SCANLINE(XMUSHORT4, g_Scale16pc, true, false, uint16_t, 0xFFFF, y, false)

    case DXGI_FORMAT_R16G16B16A16_SNORM:
        STORE_SCANLINE(XMSHORTN4, g_Scale15pc, false, true, int16_t, 0xFFFF, y, false)

    case DXGI_FORMAT_R16G16B16A16_SINT:
        STORE_SCANLINE(XMSHORT4, g_Scale15pc, false, false, int16_t, 0xFFFF, y, false)

    case DXGI_FORMAT_R10G10B10A2_UNORM:
        STORE_SCANLINE(XMUDECN4, g_Scale10pc, true, true, uint16_t, 0x3FF, y, false)

    case DXGI_FORMAT_R10G10B10A2_UINT:
        STORE_SCANLINE(XMUDEC4, g_Scale10pc, true, false, uint16_t, 0x3FF, y, false)
    case DXGI_FORMAT_R10G10B10_XR_BIAS_A2_UNORM:
        if (size >= sizeof(XMUDEC4))
        {
            static const XMVECTORF32 Scale = { { { 510.0f, 510.0f, 510.0f, 3.0f } } };
            static const XMVECTORF32 Bias = { { { 384.0f, 384.0f, 384.0f, 0.0f } } };
            static const XMVECTORF32 MinXR = { { { -0.7529f, -0.7529f, -0.7529f, 0.f } } };
            static const XMVECTORF32 MaxXR = { { { 1.2529f, 1.2529f, 1.2529f, 1.0f } } };
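
            // XR_BIAS is a 2.8 biased fixed-point encoding: packed = value * 510 + 384,
            // i.e. float = (packed - 384) / 510, which gives the representable range
            // [-384/510, 639/510] ~= [-0.7529, 1.2529] used for the clamp below.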

            XMUDEC4 * __restrict dest = static_cast<XMUDEC4*>(pDestination);
            for (size_t i = 0; i < count; ++i)
            {
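                // Serpentine scan: odd rows run right-to-left, and delta mirrors the
                // below-left / below-right error taps used when distributing error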
                auto index = static_cast<ptrdiff_t>((y & 1) ? (count - i - 1) : i);
                ptrdiff_t delta = (y & 1) ? -2 : 0;

                XMVECTOR v = XMVectorClamp(sPtr[index], MinXR, MaxXR);
                v = XMVectorMultiplyAdd(v, Scale, vError);

                XMVECTOR target;
                if (pDiffusionErrors)
                {
                    target = XMVectorRound(v);
                    vError = XMVectorSubtract(v, target);
                    vError = XMVectorDivide(vError, Scale);
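
                    // Classic Floyd-Steinberg weights: 7/16 of the error carries to the next
                    // pixel in scan order; 3/16, 5/16, and 1/16 go to the below-left, below,
                    // and below-right neighbors on the next scanline (g_ErrorWeight7/5/3/1
                    // are assumed to hold those fractions)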
                    // Distribute error to next scanline and next pixel
                    pDiffusionErrors[index - delta] = XMVectorMultiplyAdd(g_ErrorWeight3, vError, pDiffusionErrors[index - delta]);
                    pDiffusionErrors[index + 1] = XMVectorMultiplyAdd(g_ErrorWeight5, vError, pDiffusionErrors[index + 1]);
                    pDiffusionErrors[index + 2 + delta] = XMVectorMultiplyAdd(g_ErrorWeight1, vError, pDiffusionErrors[index + 2 + delta]);
                    vError = XMVectorMultiply(vError, g_ErrorWeight7);
                }
                else
                {
                    // Apply ordered dither
                    target = XMVectorAdd(v, ordered[index & 3]);
                    target = XMVectorRound(target);
                }

                target = XMVectorAdd(target, Bias);
                target = XMVectorClamp(target, g_XMZero, g_Scale10pc);

                XMFLOAT4A tmp;
                XMStoreFloat4A(&tmp, target);

                auto dPtr = &dest[index];
                if (dPtr >= ePtr) break;
                dPtr->x = uint16_t(static_cast<uint16_t>(tmp.x) & 0x3FF);
                dPtr->y = uint16_t(static_cast<uint16_t>(tmp.y) & 0x3FF);
                dPtr->z = uint16_t(static_cast<uint16_t>(tmp.z) & 0x3FF);
                dPtr->w = static_cast<uint16_t>(tmp.w);
            }
            return true;
        }
        return false;
    case DXGI_FORMAT_R8G8B8A8_UNORM:
    case DXGI_FORMAT_R8G8B8A8_UNORM_SRGB:
        STORE_SCANLINE(XMUBYTEN4, g_Scale8pc, true, true, uint8_t, 0xFF, y, false)

    case DXGI_FORMAT_R8G8B8A8_UINT:
        STORE_SCANLINE(XMUBYTE4, g_Scale8pc, true, false, uint8_t, 0xFF, y, false)

    case DXGI_FORMAT_R8G8B8A8_SNORM:
        STORE_SCANLINE(XMBYTEN4, g_Scale7pc, false, true, int8_t, 0xFF, y, false)

    case DXGI_FORMAT_R8G8B8A8_SINT:
        STORE_SCANLINE(XMBYTE4, g_Scale7pc, false, false, int8_t, 0xFF, y, false)

    case DXGI_FORMAT_R16G16_UNORM:
        STORE_SCANLINE2(XMUSHORTN2, g_Scale16pc, true, true, uint16_t, 0xFFFF, y)

    case DXGI_FORMAT_R16G16_UINT:
        STORE_SCANLINE2(XMUSHORT2, g_Scale16pc, true, false, uint16_t, 0xFFFF, y)

    case DXGI_FORMAT_R16G16_SNORM:
        STORE_SCANLINE2(XMSHORTN2, g_Scale15pc, false, true, int16_t, 0xFFFF, y)

    case DXGI_FORMAT_R16G16_SINT:
        STORE_SCANLINE2(XMSHORT2, g_Scale15pc, false, false, int16_t, 0xFFFF, y)
    case DXGI_FORMAT_D24_UNORM_S8_UINT:
        if (size >= sizeof(uint32_t))
        {
            static const XMVECTORF32 Clamp = { { { 1.f, 255.f, 0.f, 0.f } } };
            static const XMVECTORF32 Scale = { { { 16777215.f, 1.f, 0.f, 0.f } } };
            static const XMVECTORF32 Scale2 = { { { 16777215.f, 255.f, 0.f, 0.f } } };
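
            // Depth in lane x scales to 24-bit UNORM (2^24 - 1 = 16777215); stencil in
            // lane y is kept as an integer count in [0, 255] and is not rescaled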
            uint32_t * __restrict dest = static_cast<uint32_t*>(pDestination);
            for (size_t i = 0; i < count; ++i)
            {
                auto index = static_cast<ptrdiff_t>((y & 1) ? (count - i - 1) : i);
                ptrdiff_t delta = (y & 1) ? -2 : 0;

                XMVECTOR v = XMVectorClamp(sPtr[index], g_XMZero, Clamp);
                v = XMVectorAdd(v, vError);
                v = XMVectorMultiply(v, Scale);

                XMVECTOR target;
                if (pDiffusionErrors)
                {
                    target = XMVectorRound(v);
                    vError = XMVectorSubtract(v, target);
                    vError = XMVectorDivide(vError, Scale);

                    // Distribute error to next scanline and next pixel
                    pDiffusionErrors[index - delta] = XMVectorMultiplyAdd(g_ErrorWeight3, vError, pDiffusionErrors[index - delta]);
                    pDiffusionErrors[index + 1] = XMVectorMultiplyAdd(g_ErrorWeight5, vError, pDiffusionErrors[index + 1]);
                    pDiffusionErrors[index + 2 + delta] = XMVectorMultiplyAdd(g_ErrorWeight1, vError, pDiffusionErrors[index + 2 + delta]);
                    vError = XMVectorMultiply(vError, g_ErrorWeight7);
                }
                else
                {
                    // Apply ordered dither
                    target = XMVectorAdd(v, ordered[index & 3]);
                    target = XMVectorRound(target);
                }

                target = XMVectorClamp(target, g_XMZero, Scale2);

                XMFLOAT4A tmp;
                XMStoreFloat4A(&tmp, target);

                auto dPtr = &dest[index];
                if (dPtr >= ePtr) break;
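                // Pack D24S8: depth fills the low 24 bits, stencil the high 8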
                *dPtr = (static_cast<uint32_t>(tmp.x) & 0xFFFFFF)
                    | ((static_cast<uint32_t>(tmp.y) & 0xFF) << 24);
            }
            return true;
        }
        return false;
    case DXGI_FORMAT_R8G8_UNORM:
        STORE_SCANLINE2(XMUBYTEN2, g_Scale8pc, true, true, uint8_t, 0xFF, y)

    case DXGI_FORMAT_R8G8_UINT:
        STORE_SCANLINE2(XMUBYTE2, g_Scale8pc, true, false, uint8_t, 0xFF, y)

    case DXGI_FORMAT_R8G8_SNORM:
        STORE_SCANLINE2(XMBYTEN2, g_Scale7pc, false, true, int8_t, 0xFF, y)

    case DXGI_FORMAT_R8G8_SINT:
        STORE_SCANLINE2(XMBYTE2, g_Scale7pc, false, false, int8_t, 0xFF, y)

    case DXGI_FORMAT_D16_UNORM:
    case DXGI_FORMAT_R16_UNORM:
        STORE_SCANLINE1(uint16_t, g_Scale16pc, true, true, 0xFFFF, y, false)

    case DXGI_FORMAT_R16_UINT:
        STORE_SCANLINE1(uint16_t, g_Scale16pc, true, false, 0xFFFF, y, false)

    case DXGI_FORMAT_R16_SNORM:
        STORE_SCANLINE1(int16_t, g_Scale15pc, false, true, 0xFFFF, y, false)

    case DXGI_FORMAT_R16_SINT:
        STORE_SCANLINE1(int16_t, g_Scale15pc, false, false, 0xFFFF, y, false)

    case DXGI_FORMAT_R8_UNORM:
        STORE_SCANLINE1(uint8_t, g_Scale8pc, true, true, 0xFF, y, false)

    case DXGI_FORMAT_R8_UINT:
        STORE_SCANLINE1(uint8_t, g_Scale8pc, true, false, 0xFF, y, false)

    case DXGI_FORMAT_R8_SNORM:
        STORE_SCANLINE1(int8_t, g_Scale7pc, false, true, 0xFF, y, false)

    case DXGI_FORMAT_R8_SINT:
        STORE_SCANLINE1(int8_t, g_Scale7pc, false, false, 0xFF, y, false)

    case DXGI_FORMAT_A8_UNORM:
        STORE_SCANLINE1(uint8_t, g_Scale8pc, true, true, 0xFF, y, true)
    case DXGI_FORMAT_B5G6R5_UNORM:
        if (size >= sizeof(XMU565))
        {
            XMU565 * __restrict dest = static_cast<XMU565*>(pDestination);
            for (size_t i = 0; i < count; ++i)
            {
                auto index = static_cast<ptrdiff_t>((y & 1) ? (count - i - 1) : i);
                ptrdiff_t delta = (y & 1) ? -2 : 0;
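
                // Swizzle RGBA -> BGRA so blue ends up in the low 5 bits of the 565 word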
                XMVECTOR v = XMVectorSwizzle<2, 1, 0, 3>(sPtr[index]);
                v = XMVectorSaturate(v);
                v = XMVectorAdd(v, vError);
                v = XMVectorMultiply(v, g_Scale565pc);

                XMVECTOR target;
                if (pDiffusionErrors)
                {
                    target = XMVectorRound(v);
                    vError = XMVectorSubtract(v, target);
                    vError = XMVectorDivide(vError, g_Scale565pc);

                    // Distribute error to next scanline and next pixel
                    pDiffusionErrors[index - delta] = XMVectorMultiplyAdd(g_ErrorWeight3, vError, pDiffusionErrors[index - delta]);
                    pDiffusionErrors[index + 1] = XMVectorMultiplyAdd(g_ErrorWeight5, vError, pDiffusionErrors[index + 1]);
                    pDiffusionErrors[index + 2 + delta] = XMVectorMultiplyAdd(g_ErrorWeight1, vError, pDiffusionErrors[index + 2 + delta]);
                    vError = XMVectorMultiply(vError, g_ErrorWeight7);
                }
                else
                {
                    // Apply ordered dither
                    target = XMVectorAdd(v, ordered[index & 3]);
                    target = XMVectorRound(target);
                }

                target = XMVectorClamp(target, g_XMZero, g_Scale565pc);

                XMFLOAT4A tmp;
                XMStoreFloat4A(&tmp, target);

                auto dPtr = &dest[index];
                if (dPtr >= ePtr) break;
                dPtr->x = uint16_t(static_cast<uint16_t>(tmp.x) & 0x1F);
                dPtr->y = uint16_t(static_cast<uint16_t>(tmp.y) & 0x3F);
                dPtr->z = uint16_t(static_cast<uint16_t>(tmp.z) & 0x1F);
            }
            return true;
        }
        return false;
    case DXGI_FORMAT_B5G5R5A1_UNORM:
        if (size >= sizeof(XMU555))
        {
            XMU555 * __restrict dest = static_cast<XMU555*>(pDestination);
            for (size_t i = 0; i < count; ++i)
            {
                auto index = static_cast<ptrdiff_t>((y & 1) ? (count - i - 1) : i);
                ptrdiff_t delta = (y & 1) ? -2 : 0;

                XMVECTOR v = XMVectorSwizzle<2, 1, 0, 3>(sPtr[index]);
                v = XMVectorSaturate(v);
                v = XMVectorAdd(v, vError);
                v = XMVectorMultiply(v, g_Scale5551pc);

                XMVECTOR target;
                if (pDiffusionErrors)
                {
                    target = XMVectorRound(v);
                    vError = XMVectorSubtract(v, target);
                    vError = XMVectorDivide(vError, g_Scale5551pc);

                    // Distribute error to next scanline and next pixel
                    pDiffusionErrors[index - delta] = XMVectorMultiplyAdd(g_ErrorWeight3, vError, pDiffusionErrors[index - delta]);
                    pDiffusionErrors[index + 1] = XMVectorMultiplyAdd(g_ErrorWeight5, vError, pDiffusionErrors[index + 1]);
                    pDiffusionErrors[index + 2 + delta] = XMVectorMultiplyAdd(g_ErrorWeight1, vError, pDiffusionErrors[index + 2 + delta]);
                    vError = XMVectorMultiply(vError, g_ErrorWeight7);
                }
                else
                {
                    // Apply ordered dither
                    target = XMVectorAdd(v, ordered[index & 3]);
                    target = XMVectorRound(target);
                }

                target = XMVectorClamp(target, g_XMZero, g_Scale5551pc);

                XMFLOAT4A tmp;
                XMStoreFloat4A(&tmp, target);

                auto dPtr = &dest[index];
                if (dPtr >= ePtr) break;
                dPtr->x = uint16_t(static_cast<uint16_t>(tmp.x) & 0x1F);
                dPtr->y = uint16_t(static_cast<uint16_t>(tmp.y) & 0x1F);
                dPtr->z = uint16_t(static_cast<uint16_t>(tmp.z) & 0x1F);
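                // The 1-bit alpha is not dithered: it comes from the caller-supplied threshold test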
                dPtr->w = (XMVectorGetW(target) > threshold) ? 1u : 0u;
            }
            return true;
        }
        return false;
    case DXGI_FORMAT_B8G8R8A8_UNORM:
    case DXGI_FORMAT_B8G8R8A8_UNORM_SRGB:
        STORE_SCANLINE(XMUBYTEN4, g_Scale8pc, true, true, uint8_t, 0xFF, y, true)

    case DXGI_FORMAT_B8G8R8X8_UNORM:
    case DXGI_FORMAT_B8G8R8X8_UNORM_SRGB:
        if (size >= sizeof(XMUBYTEN4))
        {
            XMUBYTEN4 * __restrict dest = static_cast<XMUBYTEN4*>(pDestination);
            for (size_t i = 0; i < count; ++i)
            {
                auto index = static_cast<ptrdiff_t>((y & 1) ? (count - i - 1) : i);
                ptrdiff_t delta = (y & 1) ? -2 : 0;

                XMVECTOR v = XMVectorSwizzle<2, 1, 0, 3>(sPtr[index]);
                v = XMVectorSaturate(v);
                v = XMVectorAdd(v, vError);
                v = XMVectorMultiply(v, g_Scale8pc);

                XMVECTOR target;
                if (pDiffusionErrors)
                {
                    target = XMVectorRound(v);
                    vError = XMVectorSubtract(v, target);
                    vError = XMVectorDivide(vError, g_Scale8pc);

                    // Distribute error to next scanline and next pixel
                    pDiffusionErrors[index - delta] = XMVectorMultiplyAdd(g_ErrorWeight3, vError, pDiffusionErrors[index - delta]);
                    pDiffusionErrors[index + 1] = XMVectorMultiplyAdd(g_ErrorWeight5, vError, pDiffusionErrors[index + 1]);
                    pDiffusionErrors[index + 2 + delta] = XMVectorMultiplyAdd(g_ErrorWeight1, vError, pDiffusionErrors[index + 2 + delta]);
                    vError = XMVectorMultiply(vError, g_ErrorWeight7);
                }
                else
                {
                    // Apply ordered dither
                    target = XMVectorAdd(v, ordered[index & 3]);
                    target = XMVectorRound(target);
                }

                target = XMVectorClamp(target, g_XMZero, g_Scale8pc);

                XMFLOAT4A tmp;
                XMStoreFloat4A(&tmp, target);

                auto dPtr = &dest[index];
                if (dPtr >= ePtr) break;
                dPtr->x = uint8_t(static_cast<uint8_t>(tmp.x) & 0xFF);
                dPtr->y = uint8_t(static_cast<uint8_t>(tmp.y) & 0xFF);
                dPtr->z = uint8_t(static_cast<uint8_t>(tmp.z) & 0xFF);
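                // X8 formats carry no alpha; the unused byte is written as zero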
                dPtr->w = 0;
            }
            return true;
        }
        return false;

    case DXGI_FORMAT_B4G4R4A4_UNORM:
        STORE_SCANLINE(XMUNIBBLE4, g_Scale4pc, true, true, uint8_t, 0xF, y, true)

    case XBOX_DXGI_FORMAT_R10G10B10_SNORM_A2_UNORM:
        STORE_SCANLINE(XMXDECN4, g_Scale9pc, false, true, uint16_t, 0x3FF, y, false)
    case XBOX_DXGI_FORMAT_R4G4_UNORM:
        if (size >= sizeof(uint8_t))
        {
            uint8_t * __restrict dest = static_cast<uint8_t*>(pDestination);
            for (size_t i = 0; i < count; ++i)
            {
                auto index = static_cast<ptrdiff_t>((y & 1) ? (count - i - 1) : i);
                ptrdiff_t delta = (y & 1) ? -2 : 0;

                XMVECTOR v = XMVectorSaturate(sPtr[index]);
                v = XMVectorAdd(v, vError);
                v = XMVectorMultiply(v, g_Scale4pc);

                XMVECTOR target;
                if (pDiffusionErrors)
                {
                    target = XMVectorRound(v);
                    vError = XMVectorSubtract(v, target);
                    vError = XMVectorDivide(vError, g_Scale4pc);

                    // Distribute error to next scanline and next pixel
                    pDiffusionErrors[index - delta] = XMVectorMultiplyAdd(g_ErrorWeight3, vError, pDiffusionErrors[index - delta]);
                    pDiffusionErrors[index + 1] = XMVectorMultiplyAdd(g_ErrorWeight5, vError, pDiffusionErrors[index + 1]);
                    pDiffusionErrors[index + 2 + delta] = XMVectorMultiplyAdd(g_ErrorWeight1, vError, pDiffusionErrors[index + 2 + delta]);
                    vError = XMVectorMultiply(vError, g_ErrorWeight7);
                }
                else
                {
                    // Apply ordered dither
                    target = XMVectorAdd(v, ordered[index & 3]);
                    target = XMVectorRound(target);
                }

                target = XMVectorClamp(target, g_XMZero, g_Scale4pc);

                XMFLOAT4A tmp;
                XMStoreFloat4A(&tmp, target);

                auto dPtr = &dest[index];
                if (dPtr >= ePtr) break;
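                // Pack the two 4-bit channels: x in the low nibble, y in the high nibble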
                *dPtr = static_cast<uint8_t>((unsigned(tmp.x) & 0xF) | ((unsigned(tmp.y) & 0xF) << 4));
            }
            return true;
        }
        return false;

    default:
        return _StoreScanline(pDestination, size, format, pSource, count, threshold);
    }
}
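
// Usage sketch (illustrative only, not part of the library): callers are expected to
// provide a zero-initialized, 16-byte-aligned error buffer of width + 2 XMVECTORs and
// reuse it across scanlines; the allocator, buffer, and loop names below are hypothetical.
//
//     auto errors = make_AlignedArrayXMVECTOR(width + 2);    // hypothetical aligned allocation
//     memset(errors.get(), 0, sizeof(XMVECTOR) * (width + 2));
//     for (size_t row = 0; row < height; ++row)
//     {
//         // ...load and convert the scanline into scanlineBuffer (XMVECTOR[width])...
//         _StoreScanlineDither(destRow, rowPitch, format, scanlineBuffer, width,
//             threshold, row, slice, errors.get());
//         destRow += rowPitch;
//     }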