in CoreCLRProfiler/native/coreclr_headers/src/pal/src/locale/utf8.cpp [1637:2139]
// Decodes the UTF-8 bytes in [bytes, bytes + byteCount) into WCHAR code units written to
// chars, which has room for charCount units.
//
// Parameters:
//   bytes     - start of the UTF-8 input; asserted non-null.
//   byteCount - number of input bytes; asserted >= 0.
//   chars     - start of the output buffer; asserted non-null.
//   charCount - capacity of chars in WCHAR units; asserted >= 0.
//
// Returns: the number of WCHAR units written, computed as PtrDiff(pTarget, chars).
//
// Decoder state: `ch` doubles as the "pending sequence" register. ch == 0 means no partial
// sequence. After a lead byte it holds the payload bits plus marker bits built from
// FinalByte / SupplimentarySeq / ThreeByteSeq (constants defined outside this view); each
// trailing byte shifts it left by 6 and the sequence is complete once (ch & FinalByte) != 0.
//
// Error handling: malformed sequences are handed to a DecoderFallbackBuffer created lazily
// from decoderFallback (a name not declared in this block). When output space runs out the
// function calls ThrowCharsOverflow(pTarget == chars); the existing comments indicate it may
// return instead of throwing, in which case decoding stops and the partial count is returned.
//
// Structure: one outer loop (the "slow loop") that handles every case, and, only when
// FASTLOOP is defined, an inner fast path for runs of ASCII and well-formed multi-byte
// sequences that bails back to the slow loop via BadLongCode on anything suspicious.
int GetChars(BYTE* bytes, int byteCount, WCHAR* chars, int charCount)
{
Contract::Assert(chars != nullptr, "[UTF8Encoding.GetChars]chars!=nullptr");
Contract::Assert(byteCount >= 0, "[UTF8Encoding.GetChars]byteCount >=0");
Contract::Assert(charCount >= 0, "[UTF8Encoding.GetChars]charCount >=0");
Contract::Assert(bytes != nullptr, "[UTF8Encoding.GetChars]bytes!=nullptr");
// Read cursor / write cursor and their respective one-past-the-end limits.
BYTE *pSrc = bytes;
WCHAR *pTarget = chars;
BYTE *pEnd = pSrc + byteCount;
WCHAR *pAllocatedBufferEnd = pTarget + charCount;
// Pending-sequence register; 0 means "not in the middle of a multi-byte sequence".
int ch = 0;
// Created on first invalid sequence only; released by InternalDelete at the end.
DecoderFallbackBuffer *fallback = nullptr;
for (;;)
{
// SLOWLOOP: does all range checks, handles all special cases, but it is slow
if (pSrc >= pEnd) {
break;
}
// read next byte. The JIT optimization seems to be getting confused when
// compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
// (cha is declared before the goto below so the jump does not skip its initialization;
// pSrc is only advanced once we know this byte is consumed as a trailing byte.)
int cha = *pSrc;
if (ch == 0) {
// no pending bits
goto ReadChar;
}
pSrc++;
// we are expecting to see trailing bytes like 10vvvvvv
if ((cha & 0xC0) != 0x80) {
// This can be a valid starting byte for another UTF8 byte sequence, so let's put
// the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence
pSrc--;
goto InvalidByteSequence;
}
// fold in the new byte
// NOTE(review): ch may have bit 30 set (seeded by the lead-byte code below), so this is a
// left shift of a signed int past its range - formally undefined before C++20; confirm the
// supported compilers treat it as a plain two's-complement shift.
ch = (ch << 6) | (cha & 0x3F);
if ((ch & FinalByte) == 0) {
// Not at last byte yet
Contract::Assert((ch & (SupplimentarySeq | ThreeByteSeq)) != 0,
"[UTF8Encoding.GetChars]Invariant volation");
if ((ch & SupplimentarySeq) != 0) {
// It's a 4-byte supplementary sequence
// (FinalByte >> 6) set means exactly one more shift completes the sequence.
if ((ch & (FinalByte >> 6)) != 0) {
// this is 3rd byte of 4 byte sequence - nothing to do
continue;
}
// 2nd byte of 4 bytes
// check for non-shortest form of surrogate and the valid surrogate
// range 0x000000 - 0x10FFFF at the same time
// (ch & 0x1F0 isolates bits 4-8 of the value accumulated so far, i.e. the bits that
// will become bits 16-20 of the final code point.)
if (!InRange(ch & 0x1F0, 0x10, 0x100)) {
goto InvalidByteSequence;
}
}
else {
// Must be 2nd byte of a 3-byte sequence
// check for non-shortest form of 3 byte seq
// Bits 5-9 here become bits 11-15 of the code point: all zero means the value would
// have fit in two bytes; 11011 in those bits means U+D800..U+DFFF.
if ((ch & (0x1F << 5)) == 0 || // non-shortest form
(ch & (0xF800 >> 6)) == (0xD800 >> 6)) // illegal individually encoded surrogate
{
goto InvalidByteSequence;
}
}
continue;
}
// ready to punch
// surrogate in shortest form?
// Might be possible to get rid of this? Already did non-shortest check for 4-byte sequence when reading 2nd byte?
// True when the SupplimentarySeq marker is set together with any of bits 16-20, i.e. the
// decoded value needs a surrogate pair (assumes SupplimentarySeq lies above bit 20 - TODO confirm).
if ((ch & (SupplimentarySeq | 0x1F0000)) > SupplimentarySeq) {
// let the range check for the second char throw the exception
// If there is no room even for the high surrogate, ch is left as the full value and
// EncodeChar's overflow path treats it as "4 bytes, nothing stored".
if (pTarget < pAllocatedBufferEnd) {
*pTarget = (WCHAR)(((ch >> 10) & 0x7FF) +
(SHORT)((CharUnicodeInfo::HIGH_SURROGATE_START - (0x10000 >> 10))));
pTarget++;
// ch now becomes the low surrogate, which EncodeChar writes (or backs out on overflow).
ch = (ch & 0x3FF) +
(int)(CharUnicodeInfo::LOW_SURROGATE_START);
}
}
goto EncodeChar;
InvalidByteSequence:
// this code fragment should be close to the gotos referencing it
// Have to do fallback for invalid bytes
if (fallback == nullptr)
{
fallback = decoderFallback->CreateFallbackBuffer();
fallback->InternalInitialize(bytes, pAllocatedBufferEnd);
}
// That'll back us up the appropriate # of bytes if we didn't get anywhere
// (pSrc and pTarget are passed by address so the helper can move both cursors.)
if (!FallbackInvalidByteSequence(&pSrc, ch, fallback, &pTarget))
{
// Ran out of buffer space
// Need to throw an exception?
Contract::Assert(pSrc >= bytes || pTarget == chars,
"[UTF8Encoding.GetChars]Expected to throw or remain in byte buffer after fallback");
fallback->InternalReset();
ThrowCharsOverflow(pTarget == chars);
// Reached only if ThrowCharsOverflow returned: stop decoding with no pending state.
ch = 0;
break;
}
Contract::Assert(pSrc >= bytes,
"[UTF8Encoding.GetChars]Expected invalid byte sequence to have remained within the byte array");
ch = 0;
continue;
ReadChar:
// Start of a new character: consume one byte as a potential lead byte.
ch = *pSrc;
pSrc++;
ProcessChar:
// Also entered from the short ASCII loop below with ch already holding a non-ASCII byte.
if (ch > 0x7F) {
// If it's > 0x7F, it's the start of a new multi-byte sequence
// bit 6 has to be non-zero
// (otherwise this is a stray 10vvvvvv trailing byte)
if ((ch & 0x40) == 0) {
goto InvalidByteSequence;
}
// start a new long code
if ((ch & 0x20) != 0) {
if ((ch & 0x10) != 0) {
// 4 byte encoding - supplementary character (2 surrogates)
ch &= 0x0F;
// check that bit 4 is zero and the valid supplementary character
// range 0x000000 - 0x10FFFF at the same time
if (ch > 0x04) {
// Restore the original 0xFx lead byte value so the fallback sees the real byte.
ch |= 0xf0;
goto InvalidByteSequence;
}
// Seed the marker bits: FinalByte pre-shifted down by three 6-bit steps so it lands on
// FinalByte after the three trailing bytes, and SupplimentarySeq replicated at each
// 6-bit step so the marker still tests as set after every shift.
ch |= (FinalByte >> 3 * 6) | (1 << 30) | (3 << (30 - 2 * 6)) |
(SupplimentarySeq) | (SupplimentarySeq >> 6) |
(SupplimentarySeq >> 2 * 6) | (SupplimentarySeq >> 3 * 6);
}
else {
// 3 byte encoding
// Same scheme with two trailing bytes and the ThreeByteSeq marker.
ch = (ch & 0x0F) | ((FinalByte >> 2 * 6) | (1 << 30) |
(ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2 * 6));
}
}
else {
// 2 byte encoding
ch &= 0x1F;
// check for non-shortest form
if (ch <= 1) {
// Restore the original 0xC0/0xC1 lead byte value for the fallback.
ch |= 0xc0;
goto InvalidByteSequence;
}
// One trailing byte expected: a single shift moves this bit onto FinalByte.
ch |= (FinalByte >> 6);
}
continue;
}
EncodeChar:
// write the pending character
if (pTarget >= pAllocatedBufferEnd)
{
// Fix chars so we make sure to throw if we didn't output anything
// Strip the marker bits, then rewind pSrc by the length of the sequence that produced ch
// (1 to 4 bytes, inferred from the magnitude of the decoded value) so that the
// un-emitted character is not counted as consumed.
ch &= 0x1fffff;
if (ch > 0x7f)
{
if (ch > 0x7ff)
{
// A low surrogate here can only come from the 4-byte path above, since 3-byte
// encodings of surrogates were rejected when their 2nd byte was read.
if (ch >= CharUnicodeInfo::LOW_SURROGATE_START &&
ch <= CharUnicodeInfo::LOW_SURROGATE_END)
{
pSrc--; // It was 4 bytes
pTarget--; // 1 was stored already, but we can't remember 1/2, so back up
}
else if (ch > 0xffff)
{
pSrc--; // It was 4 bytes, nothing was stored
}
pSrc--; // It was at least 3 bytes
}
pSrc--; // It was at least 2 bytes
}
pSrc--;
// Throw that we don't have enough room (pSrc could be < bytes if we had started to process
// a 4 byte sequence already)
Contract::Assert(pSrc >= bytes || pTarget == chars,
"[UTF8Encoding.GetChars]Expected pSrc to be within input buffer or throw due to no output]");
ThrowCharsOverflow(pTarget == chars);
// Don't store ch in decoder, we already backed up to its start
ch = 0;
// Didn't throw, just use this buffer size.
break;
}
*pTarget = (WCHAR)ch;
pTarget++;
#ifdef FASTLOOP
// Everything up to the matching #endif is compiled only when FASTLOOP is defined.
int availableChars = PtrDiff(pAllocatedBufferEnd, pTarget);
int availableBytes = PtrDiff(pEnd, pSrc);
// don't fall into the fast decoding loop if we don't have enough bytes
// Test for availableChars is done because pStop would be <= pTarget.
if (availableBytes <= 13) {
// we may need as many as 1 character per byte
if (availableChars < availableBytes) {
// not enough output room. no pending bits at this point
// Go back to the slow loop, which checks the output bound per character.
ch = 0;
continue;
}
// try to get over the remainder of the ascii characters fast though
BYTE* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
while (pSrc < pLocalEnd) {
ch = *pSrc;
pSrc++;
if (ch > 0x7F)
goto ProcessChar;
*pTarget = (WCHAR)ch;
pTarget++;
}
// we are done
// (this break leaves the outer for loop: all input has been consumed)
ch = 0;
break;
}
// we may need as many as 1 character per byte, so reduce the byte count if necessary.
// If availableChars is too small, pStop will be before pTarget and we won't do fast loop.
if (availableChars < availableBytes) {
availableBytes = availableChars;
}
// To compute the upper bound, assume that all characters are ASCII characters at this point,
// the boundary will be decreased for every non-ASCII character we encounter
// Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences
WCHAR *pStop = pTarget + availableBytes - 7;
while (pTarget < pStop) {
ch = *pSrc;
pSrc++;
if (ch > 0x7F) {
goto LongCode;
}
*pTarget = (WCHAR)ch;
pTarget++;
// get pSrc to be 2-byte aligned
if ((((size_t)pSrc) & 0x1) != 0) {
ch = *pSrc;
pSrc++;
if (ch > 0x7F) {
goto LongCode;
}
*pTarget = (WCHAR)ch;
pTarget++;
}
// get pSrc to be 4-byte aligned
if ((((size_t)pSrc) & 0x2) != 0) {
// Two input bytes are loaded with a single 16-bit read; 0x8080 tests both high bits.
ch = *(USHORT*)pSrc;
if ((ch & 0x8080) != 0) {
goto LongCodeWithMask16;
}
// Unfortunately, this is endianness sensitive
#if BIGENDIAN
*pTarget = (WCHAR)((ch >> 8) & 0x7F);
pSrc += 2;
*(pTarget + 1) = (WCHAR)(ch & 0x7F);
pTarget += 2;
#else // BIGENDIAN
*pTarget = (WCHAR)(ch & 0x7F);
pSrc += 2;
*(pTarget + 1) = (WCHAR)((ch >> 8) & 0x7F);
pTarget += 2;
#endif // BIGENDIAN
}
// Run 8 characters at a time!
while (pTarget < pStop) {
// Two 32-bit loads cover eight input bytes; any high bit set means non-ASCII.
ch = *(int*)pSrc;
int chb = *(int*)(pSrc + 4);
if (((ch | chb) & (int)0x80808080) != 0) {
goto LongCodeWithMask32;
}
// Unfortunately, this is endianness sensitive
#if BIGENDIAN
*pTarget = (WCHAR)((ch >> 24) & 0x7F);
*(pTarget + 1) = (WCHAR)((ch >> 16) & 0x7F);
*(pTarget + 2) = (WCHAR)((ch >> 8) & 0x7F);
*(pTarget + 3) = (WCHAR)(ch & 0x7F);
pSrc += 8;
*(pTarget + 4) = (WCHAR)((chb >> 24) & 0x7F);
*(pTarget + 5) = (WCHAR)((chb >> 16) & 0x7F);
*(pTarget + 6) = (WCHAR)((chb >> 8) & 0x7F);
*(pTarget + 7) = (WCHAR)(chb & 0x7F);
pTarget += 8;
#else // BIGENDIAN
*pTarget = (WCHAR)(ch & 0x7F);
*(pTarget + 1) = (WCHAR)((ch >> 8) & 0x7F);
*(pTarget + 2) = (WCHAR)((ch >> 16) & 0x7F);
*(pTarget + 3) = (WCHAR)((ch >> 24) & 0x7F);
pSrc += 8;
*(pTarget + 4) = (WCHAR)(chb & 0x7F);
*(pTarget + 5) = (WCHAR)((chb >> 8) & 0x7F);
*(pTarget + 6) = (WCHAR)((chb >> 16) & 0x7F);
*(pTarget + 7) = (WCHAR)((chb >> 24) & 0x7F);
pTarget += 8;
#endif // BIGENDIAN
}
// pTarget reached pStop: leave the fast loop and resume the slow loop.
break;
// The mask-hit paths below reduce the multi-byte load in ch to just its first input byte
// (pSrc has not been advanced past the loaded word yet), then consume that one byte.
#if BIGENDIAN
LongCodeWithMask32 :
// be careful about the sign extension
// (falls through to the 16-bit case, for a total shift of 24 bits)
ch = (int)(((uint)ch) >> 16);
LongCodeWithMask16:
ch = (int)(((uint)ch) >> 8);
#else // BIGENDIAN
LongCodeWithMask32:
LongCodeWithMask16:
ch &= 0xFF;
#endif // BIGENDIAN
pSrc++;
if (ch <= 0x7F) {
// The non-ASCII byte was later in the word; emit this one and retry the fast loop.
*pTarget = (WCHAR)ch;
pTarget++;
continue;
}
LongCode:
// ch holds an already-consumed lead byte; chc takes the first trailing byte.
int chc = *pSrc;
pSrc++;
if (
// bit 6 has to be zero
(ch & 0x40) == 0 ||
// we are expecting to see trailing bytes like 10vvvvvv
(chc & 0xC0) != 0x80)
{
goto BadLongCode;
}
chc &= 0x3F;
// start a new long code
if ((ch & 0x20) != 0) {
// fold the first two bytes together
chc |= (ch & 0x0F) << 6;
if ((ch & 0x10) != 0) {
// 4 byte encoding - surrogate
// The 3rd and 4th bytes are only peeked at until validated, so that BadLongCode's
// fixed two-byte rewind stays correct.
ch = *pSrc;
if (
// check that bit 4 is zero, the non-shortest form of surrogate
// and the valid surrogate range 0x000000 - 0x10FFFF at the same time
!InRange(chc >> 4, 0x01, 0x10) ||
// we are expecting to see trailing bytes like 10vvvvvv
(ch & 0xC0) != 0x80)
{
goto BadLongCode;
}
chc = (chc << 6) | (ch & 0x3F);
ch = *(pSrc + 1);
// we are expecting to see trailing bytes like 10vvvvvv
if ((ch & 0xC0) != 0x80) {
goto BadLongCode;
}
pSrc += 2;
ch = (chc << 6) | (ch & 0x3F);
// Write the high surrogate now; the low surrogate is stored by the shared write below.
*pTarget = (WCHAR)(((ch >> 10) & 0x7FF) +
(SHORT)(CharUnicodeInfo::HIGH_SURROGATE_START - (0x10000 >> 10)));
pTarget++;
// The SHORT cast makes this addend negative; the result is only correct after the
// truncating (WCHAR) cast at the shared write below.
ch = (ch & 0x3FF) +
(SHORT)(CharUnicodeInfo::LOW_SURROGATE_START);
// extra byte, we're already planning 2 chars for 2 of these bytes,
// but the big loop is testing the target against pStop, so we need
// to subtract 2 more or we risk overrunning the input. Subtract
// one here and one below.
pStop--;
}
else {
// 3 byte encoding
// The 3rd byte is only peeked at until validated (see BadLongCode).
ch = *pSrc;
if (
// check for non-shortest form of 3 byte seq
(chc & (0x1F << 5)) == 0 ||
// Can't have surrogates here.
(chc & (0xF800 >> 6)) == (0xD800 >> 6) ||
// we are expecting to see trailing bytes like 10vvvvvv
(ch & 0xC0) != 0x80)
{
goto BadLongCode;
}
pSrc++;
ch = (chc << 6) | (ch & 0x3F);
// extra byte, we're only expecting 1 char for each of these 3 bytes,
// but the loop is testing the target (not source) against pStop, so
// we need to subtract 2 more or we risk overrunning the input.
// Subtract 1 here and one more below
pStop--;
}
}
else {
// 2 byte encoding
ch &= 0x1F;
// check for non-shortest form
if (ch <= 1) {
goto BadLongCode;
}
ch = (ch << 6) | chc;
}
// Shared write for the 2-, 3- and 4-byte (low surrogate) cases.
*pTarget = (WCHAR)ch;
pTarget++;
// extra byte, we're only expecting 1 char for each of these 2 bytes,
// but the loop is testing the target (not source) against pStop.
// subtract an extra count from pStop so that we don't overrun the input.
pStop--;
}
#endif // FASTLOOP
Contract::Assert(pTarget <= pAllocatedBufferEnd, "[UTF8Encoding.GetChars]pTarget <= pAllocatedBufferEnd");
// no pending bits at this point
ch = 0;
continue;
BadLongCode:
// Reached only by goto from the fast loop (the label is unused when FASTLOOP is not
// defined). Rewind over the lead byte and the first trailing byte consumed at LongCode so
// the slow loop re-reads the sequence and routes it through the fallback.
pSrc -= 2;
ch = 0;
continue;
}
// A non-zero ch here means the loop ended (input exhausted) in the middle of a sequence.
if (ch != 0)
{
// Have to do fallback for invalid bytes
if (fallback == nullptr)
{
fallback = decoderFallback->CreateFallbackBuffer();
fallback->InternalInitialize(bytes, pAllocatedBufferEnd);
}
// This'll back us up the appropriate # of bytes if we didn't get anywhere
// NOTE(review): unlike the in-loop call at InvalidByteSequence, this call passes pSrc by
// value and does not pass &pTarget, so it must resolve to a different overload (not visible
// here). Confirm that overload writes the fallback output and that its return value means
// "success" when negated like this.
if (!FallbackInvalidByteSequence(pSrc, ch, fallback))
{
Contract::Assert(pSrc >= bytes || pTarget == chars,
"[UTF8Encoding.GetChars]Expected to throw or remain in byte buffer while flushing");
// Ran out of buffer space
// Need to throw an exception?
fallback->InternalReset();
ThrowCharsOverflow(pTarget == chars);
}
Contract::Assert(pSrc >= bytes,
"[UTF8Encoding.GetChars]Expected flushing invalid byte sequence to have remained within the byte array");
ch = 0;
}
// Shouldn't have anything in fallback buffer for GetChars
// (don't have to check m_throwOnOverflow for chars)
Contract::Assert(fallback == nullptr || fallback->GetRemaining() == 0,
"[UTF8Encoding.GetChars]Expected empty fallback buffer at end");
// Called unconditionally, i.e. also when fallback is still nullptr.
InternalDelete(fallback);
return PtrDiff(pTarget, chars);
}