in CoreCLRProfiler/native/coreclr_headers/src/pal/src/locale/utf8.cpp [2141:2529]
// Encodes a buffer of UTF-16 code units as UTF-8.
//
// Parameters:
//   chars     - first UTF-16 code unit to encode; asserted non-null.
//   charCount - number of code units readable at `chars`; asserted >= 0.
//   bytes     - destination buffer; asserted non-null.
//   byteCount - capacity of `bytes`, in bytes; asserted >= 0.
//
// Returns:
//   The number of bytes written to `bytes` (pTarget - bytes).
//
// Behavior:
//   * A high surrogate followed by a low surrogate is combined into one scalar
//     value and written as a 4-byte sequence.
//   * A surrogate that cannot be paired is passed to the encoder fallback
//     buffer (created lazily from `encoderFallback`); any characters that
//     buffer hands back are encoded in place of the bad input.
//   * When the next sequence does not fit in the output, the source position
//     (or the fallback buffer) is rewound, ThrowBytesOverflow(pTarget == bytes)
//     is called and, if it returns, encoding stops with the bytes written so far.
//   * When FASTLOOP is defined, runs of ASCII and well-formed characters are
//     handled by an optimistic inner loop that drops back to the fully checked
//     loop for every special case.
int GetBytes(WCHAR* chars, int charCount, BYTE* bytes, int byteCount)
{
    Contract::Assert(chars != nullptr, "[UTF8Encoding.GetBytes]chars!=nullptr");
    Contract::Assert(byteCount >= 0, "[UTF8Encoding.GetBytes]byteCount >=0");
    Contract::Assert(charCount >= 0, "[UTF8Encoding.GetBytes]charCount >=0");
    Contract::Assert(bytes != nullptr, "[UTF8Encoding.GetBytes]bytes!=nullptr");

    // The fallback buffer is only needed for malformed input, so it is created
    // on first use (see EncodeChar below) and stays null for well-formed text.
    EncoderFallbackBuffer* fallbackBuffer = nullptr;

    WCHAR *pSrc = chars;
    BYTE *pTarget = bytes;

    WCHAR *pEnd = pSrc + charCount;
    BYTE *pAllocatedBufferEnd = pTarget + byteCount;

    // Current character. 0 means "nothing pending"; a value in the high
    // surrogate range means "waiting for the matching low surrogate".
    int ch = 0;

    for (;;) {
        // SLOWLOOP: does all range checks, handles all special cases, but it is slow

        if (pSrc >= pEnd) {
            // Source exhausted; drain whatever is still pending.
            if (ch == 0) {
                // Check if there's anything left to get out of the fallback buffer
                ch = fallbackBuffer != nullptr ? fallbackBuffer->InternalGetNextChar() : 0;
                if (ch > 0) {
                    goto ProcessChar;
                }
            }
            else {
                // Case of leftover surrogates in the fallback buffer
                if (fallbackBuffer != nullptr && fallbackBuffer->bFallingBack) {
                    Contract::Assert(ch >= 0xD800 && ch <= 0xDBFF,
                        "[UTF8Encoding.GetBytes]expected high surrogate");

                    int cha = ch;

                    ch = fallbackBuffer->InternalGetNextChar();

                    if (InRange(ch, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) {
                        // Combine the pending high surrogate with the low one just fetched.
                        ch = ch + (cha << 10) + (0x10000 - CharUnicodeInfo::LOW_SURROGATE_START - (CharUnicodeInfo::HIGH_SURROGATE_START << 10));
                        goto EncodeChar;
                    }
                    else if (ch > 0) {
                        goto ProcessChar;
                    }
                    else {
                        break;
                    }
                }
            }

            // attempt to encode the partial surrogate (will fail or ignore)
            if (ch > 0)
                goto EncodeChar;

            // We're done
            break;
        }

        if (ch > 0) {
            // We have a high surrogate left over from a previous loop.
            Contract::Assert(ch >= 0xD800 && ch <= 0xDBFF,
                "[UTF8Encoding.GetBytes]expected high surrogate");

            // use separate helper variables for local contexts so that the jit optimizations
            // won't get confused about the variable lifetimes
            int cha = *pSrc;

            // The previous code unit was a high surrogate, so a low surrogate is expected here.
            if (InRange(cha, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) {
                ch = cha + (ch << 10) +
                    (0x10000
                    - CharUnicodeInfo::LOW_SURROGATE_START
                    - (CharUnicodeInfo::HIGH_SURROGATE_START << 10));

                pSrc++;
            }
            // else ch is still high surrogate and encoding will fail

            // attempt to encode the surrogate or partial surrogate
            goto EncodeChar;
        }

        // If we've used a fallback, then we have to check for it
        if (fallbackBuffer != nullptr)
        {
            ch = fallbackBuffer->InternalGetNextChar();
            if (ch > 0) goto ProcessChar;
        }

        // read next char. Kept as two statements ("ch = *pSrc; pSrc++;") as in the
        // code this was ported from.
        ch = *pSrc;
        pSrc++;

    ProcessChar:
        if (InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::HIGH_SURROGATE_END)) {
            // Keep the high surrogate in ch and go look for its low surrogate.
            continue;
        }
        // either good char or partial surrogate

    EncodeChar:
        // A value still inside the surrogate range at this point is unpaired.
        if (InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END))
        {
            // Lone surrogates aren't allowed, we have to do fallback for them
            // Have to make a fallback buffer if we don't have one
            if (fallbackBuffer == nullptr)
            {
                fallbackBuffer = encoderFallback->CreateFallbackBuffer();

                // Set our internal fallback interesting things.
                fallbackBuffer->InternalInitialize(chars, pEnd, true);
            }

            // Do our fallback. Actually we already know its a mixed up surrogate,
            // so the ref pSrc isn't gonna do anything.
            fallbackBuffer->InternalFallback((WCHAR)ch, &pSrc);

            // Ignore it if we don't throw
            ch = 0;
            continue;
        }

        // Count bytes needed
        int bytesNeeded = 1;
        if (ch > 0x7F) {
            if (ch > 0x7FF) {
                if (ch > 0xFFFF) {
                    bytesNeeded++;  // 4 bytes (surrogate pair)
                }
                bytesNeeded++;      // 3 bytes (800-FFFF)
            }
            bytesNeeded++;          // 2 bytes (80-7FF)
        }

        // Compare remaining capacity against bytesNeeded. The subtraction is done
        // between two pointers into the output buffer rather than as
        // "pAllocatedBufferEnd - bytesNeeded", which would form a pointer before
        // the start of `bytes` whenever fewer than bytesNeeded bytes of capacity
        // exist (for example byteCount == 0).
        if ((pAllocatedBufferEnd - pTarget) < bytesNeeded) {
            // Left over surrogate from last time will cause pSrc == chars, so we'll throw
            if (fallbackBuffer != nullptr && fallbackBuffer->bFallingBack)
            {
                fallbackBuffer->MovePrevious();              // Didn't use this fallback char
                if (ch > 0xFFFF)
                    fallbackBuffer->MovePrevious();          // Was surrogate, didn't use 2nd part either
            }
            else
            {
                pSrc--;                                     // Didn't use this char
                if (ch > 0xFFFF)
                    pSrc--;                                 // Was surrogate, didn't use 2nd part either
            }
            Contract::Assert(pSrc >= chars || pTarget == bytes,
                "[UTF8Encoding.GetBytes]Expected pSrc to be within buffer or to throw with insufficient room.");

            if (pTarget == bytes) {
                // Nothing has been written, so ThrowBytesOverflow is being asked to
                // throw. NOTE(review): that it throws for `true` is assumed from its
                // name; it is defined outside this block. Release the fallback
                // buffer first so it is not leaked if the call does not return.
                // This is harmless if the call does return: the pointer is nulled and
                // the final InternalDelete below already accepts a null pointer.
                InternalDelete(fallbackBuffer);
                fallbackBuffer = nullptr;
            }
            ThrowBytesOverflow(pTarget == bytes);            // Throw if we must
            ch = 0;                                          // Nothing left over (we backed up to start of pair if supplementary)
            break;
        }

        if (ch <= 0x7F) {
            *pTarget = (BYTE)ch;
        }
        else {
            // use separate helper variables for local contexts so that the jit optimizations
            // won't get confused about the variable lifetimes
            int chb;
            if (ch <= 0x7FF) {
                // 2 BYTE encoding: lead byte
                chb = (BYTE)(0xC0 | (ch >> 6));
            }
            else
            {
                if (ch <= 0xFFFF) {
                    // 3 BYTE encoding: lead byte
                    chb = (BYTE)(0xE0 | (ch >> 12));
                }
                else
                {
                    // 4 BYTE encoding: lead byte, then the first continuation byte
                    *pTarget = (BYTE)(0xF0 | (ch >> 18));
                    pTarget++;

                    chb = 0x80 | ((ch >> 12) & 0x3F);
                }
                *pTarget = (BYTE)chb;
                pTarget++;

                chb = 0x80 | ((ch >> 6) & 0x3F);
            }
            *pTarget = (BYTE)chb;
            pTarget++;

            // Final continuation byte. The cast covers the whole expression so the
            // store is an explicit narrowing, matching the fast-loop code below.
            *pTarget = (BYTE)(0x80 | (ch & 0x3F));
        }
        pTarget++;

#ifdef FASTLOOP
        // If still have fallback don't do fast loop
        if (fallbackBuffer != nullptr && (ch = fallbackBuffer->InternalGetNextChar()) != 0)
            goto ProcessChar;

        int availableChars = PtrDiff(pEnd, pSrc);
        int availableBytes = PtrDiff(pAllocatedBufferEnd, pTarget);

        // don't fall into the fast decoding loop if we don't have enough characters
        // Note that if we don't have enough bytes, pStop will prevent us from entering the fast loop.
        if (availableChars <= 13) {
            // we are hoping for 1 BYTE per char
            if (availableBytes < availableChars) {
                // not enough output room.  no pending bits at this point
                ch = 0;
                continue;
            }

            // try to get over the remainder of the ascii characters fast though
            WCHAR* pLocalEnd = pEnd; // local copy of the end pointer for the tight loop
            while (pSrc < pLocalEnd) {
                ch = *pSrc;
                pSrc++;

                // Not ASCII, need more than 1 BYTE per char
                if (ch > 0x7F)
                    goto ProcessChar;

                *pTarget = (BYTE)ch;
                pTarget++;
            }
            // we are done, let ch be 0 to clear encoder
            ch = 0;
            break;
        }

        // we need at least 1 BYTE per character, but Convert might allow us to convert
        // only part of the input, so try as much as we can.  Reduce charCount if necessary
        if (availableBytes < availableChars)
        {
            availableChars = availableBytes;
        }

        // FASTLOOP:
        // - optimistic range checks
        // - fallbacks to the slow loop for all special cases, exception throwing, etc.

        // To compute the upper bound, assume that all characters are ASCII characters at this point,
        //  the boundary will be decreased for every non-ASCII character we encounter
        // Also, we need 5 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates
        // If there aren't enough bytes for the output, pStop is set to pSrc and the loop is bypassed.
        // (Computing "pSrc + availableChars - 5" unconditionally could form a pointer
        // before the start of `chars` once availableChars has been clamped to a small
        // availableBytes; the guarded form skips the loop in exactly the same cases.)
        WCHAR *pStop = (availableChars > 5) ? pSrc + (availableChars - 5) : pSrc;

        while (pSrc < pStop) {
            ch = *pSrc;
            pSrc++;

            if (ch > 0x7F) {
                goto LongCode;
            }
            *pTarget = (BYTE)ch;
            pTarget++;

            // get pSrc aligned
            if (((size_t)pSrc & 0x2) != 0) {
                ch = *pSrc;
                pSrc++;
                if (ch > 0x7F) {
                    goto LongCode;
                }
                *pTarget = (BYTE)ch;
                pTarget++;
            }

            // Run 4 characters at a time!
            // NOTE(review): the two int loads and the 0xFF80FF80 mask assume a
            // 16-bit WCHAR and a 32-bit int - confirm for the target platform.
            while (pSrc < pStop) {
                ch = *(int*)pSrc;
                int chc = *(int*)(pSrc + 2);
                if (((ch | chc) & (int)0xFF80FF80) != 0) {
                    goto LongCodeWithMask;
                }

                // Unfortunately, this is endianess sensitive
#if BIGENDIAN
                *pTarget = (BYTE)(ch >> 16);
                *(pTarget + 1) = (BYTE)ch;
                pSrc += 4;
                *(pTarget + 2) = (BYTE)(chc >> 16);
                *(pTarget + 3) = (BYTE)chc;
                pTarget += 4;
#else // BIGENDIAN
                *pTarget = (BYTE)ch;
                *(pTarget + 1) = (BYTE)(ch >> 16);
                pSrc += 4;
                *(pTarget + 2) = (BYTE)chc;
                *(pTarget + 3) = (BYTE)(chc >> 16);
                pTarget += 4;
#endif // BIGENDIAN
            }
            continue;

        LongCodeWithMask:
            // Extract the first of the two code units packed in ch.
#if BIGENDIAN
            // be careful about the sign extension
            ch = (int)(((uint)ch) >> 16);
#else // BIGENDIAN
            ch = (WCHAR)ch;
#endif // BIGENDIAN
            pSrc++;

            if (ch > 0x7F) {
                goto LongCode;
            }
            *pTarget = (BYTE)ch;
            pTarget++;
            continue;

        LongCode:
            // use separate helper variables for slow and fast loop so that the jit optimizations
            // won't get confused about the variable lifetimes
            int chd;
            if (ch <= 0x7FF) {
                // 2 BYTE encoding
                chd = 0xC0 | (ch >> 6);
            }
            else {
                if (!InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) {
                    // 3 BYTE encoding
                    chd = 0xE0 | (ch >> 12);
                }
                else
                {
                    // 4 BYTE encoding - high surrogate + low surrogate
                    if (ch > CharUnicodeInfo::HIGH_SURROGATE_END) {
                        // low without high -> bad, try again in slow loop
                        pSrc -= 1;
                        break;
                    }

                    chd = *pSrc;
                    pSrc++;

                    if (!InRange(chd, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) {
                        // high not followed by low -> bad, try again in slow loop
                        pSrc -= 2;
                        break;
                    }

                    ch = chd + (ch << 10) +
                        (0x10000
                        - CharUnicodeInfo::LOW_SURROGATE_START
                        - (CharUnicodeInfo::HIGH_SURROGATE_START << 10));

                    *pTarget = (BYTE)(0xF0 | (ch >> 18));
                    // pStop - this BYTE is compensated by the second surrogate character
                    // 2 input chars require 4 output bytes.  2 have been anticipated already
                    // and 2 more will be accounted for by the 2 pStop-- calls below.
                    pTarget++;

                    chd = 0x80 | ((ch >> 12) & 0x3F);
                }
                *pTarget = (BYTE)chd;
                pStop--;                    // 3 BYTE sequence for 1 char, so need pStop-- and the one below too.
                pTarget++;

                chd = 0x80 | ((ch >> 6) & 0x3F);
            }
            *pTarget = (BYTE)chd;
            pStop--;                        // 2 BYTE sequence for 1 char so need pStop--.
            pTarget++;

            *pTarget = (BYTE)(0x80 | (ch & 0x3F));
            // pStop - this BYTE is already included
            pTarget++;
        }

        Contract::Assert(pTarget <= pAllocatedBufferEnd, "[UTF8Encoding.GetBytes]pTarget <= pAllocatedBufferEnd");

#endif // FASTLOOP

        // no pending char at this point
        ch = 0;
    }

    // fallbackBuffer is null here unless a lone surrogate was encountered.
    InternalDelete(fallbackBuffer);

    return (int)(pTarget - bytes);
}