in CoreCLRProfiler/native/coreclr_headers/src/pal/src/locale/utf8.cpp [1269:1635]
// Counts the chars that decoding `count` bytes of UTF-8 starting at `bytes` would yield,
// without writing any output.
//
// The running total starts at `count` (one char per byte) and is decremented as multi-byte
// sequences are recognized, so that a 2-byte or 3-byte sequence ends up counting as one char
// and a 4-byte sequence as two chars. Every invalid or truncated sequence instead adds whatever
// FallbackInvalidByteSequence() returns for it.
//
// Parameters:
//   bytes - start of the input; asserted to be non-null.
//   count - number of bytes to examine; asserted to be non-negative.
// Returns:
//   the computed char count.
//
// State: `ch` holds the partially decoded sequence plus bookkeeping flags (FinalByte,
// SupplimentarySeq, ThreeByteSeq - declared outside this block) that are shifted left by 6
// together with the data bits on every trail byte; `ch == 0` means no sequence is pending.
// The top bits of `ch` (read back with `ch >> 30`) carry the charCount correction to apply if
// the pending sequence has to be abandoned.
//
// NOTE(review): `ch` is a signed int, so `ch << 6` can push flag bits past bit 31 and
// `ch >> 30` is expected to produce a negative correction for an abandoned 4-byte sequence
// (see the lead-byte setup below). This presumably relies on a 32-bit int with wrapping left
// shift and arithmetic right shift - confirm for the target compilers.
// NOTE(review): the fallback buffer is created lazily and released only on the normal return
// path; whether the helpers called here can throw is not visible from this block.
int GetCharCount(BYTE* bytes, int count)
{
Contract::Assert(bytes != nullptr, "[UTF8Encoding.GetCharCount]bytes!=nullptr");
Contract::Assert(count >= 0, "[UTF8Encoding.GetCharCount]count >=0");
// Initialize the read cursor and the one-past-the-end sentinel
BYTE *pSrc = bytes;
BYTE *pEnd = pSrc + count;
// Start by assuming we have as many as count, charCount always includes the adjustment
// for the character being decoded
int charCount = count;
// Pending-sequence state; 0 means "no partially decoded sequence"
int ch = 0;
// Created on first invalid sequence, deleted before returning
DecoderFallbackBuffer *fallback = nullptr;
for (;;)
{
// SLOWLOOP: does all range checks, handles all special cases, but it is slow
if (pSrc >= pEnd) {
break;
}
// Peek at the next byte without consuming it: it is only consumed below when a sequence
// is pending; otherwise ReadChar re-reads the same byte into ch and advances pSrc.
// (Original note: the JIT optimization seems to be getting confused when
// compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead.)
int cha = *pSrc;
if (ch == 0) {
// no pending bits
goto ReadChar;
}
// A sequence is pending, so consume the peeked byte as a candidate trail byte
pSrc++;
// we are expecting to see trailing bytes like 10vvvvvv
if ((cha & 0xC0) != 0x80) {
// This can be a valid starting byte for another UTF8 byte sequence, so let's put
// the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence
pSrc--;
// Undo the adjustment that was made when the lead byte of the abandoned sequence was seen
charCount += (ch >> 30);
goto InvalidByteSequence;
}
// fold in the new byte
ch = (ch << 6) | (cha & 0x3F);
// FinalByte not yet shifted into place: more trail bytes are still expected
if ((ch & FinalByte) == 0) {
Contract::Assert((ch & (SupplimentarySeq | ThreeByteSeq)) != 0,
"[UTF8Encoding.GetChars]Invariant volation");
if ((ch & SupplimentarySeq) != 0) {
if ((ch & (FinalByte >> 6)) != 0) {
// this is 3rd byte (of 4 byte supplementary) - nothing to do
continue;
}
// 2nd byte, check for non-shortest form of supplementary char and the valid
// supplementary characters in range 0x010000 - 0x10FFFF at the same time
if (!InRange(ch & 0x1F0, 0x10, 0x100)) {
goto InvalidByteSequence;
}
}
else {
// Must be 2nd byte of a 3-byte sequence
// check for non-shortest form of 3 byte seq
if ((ch & (0x1F << 5)) == 0 || // non-shortest form
(ch & (0xF800 >> 6)) == (0xD800 >> 6)) // illegal individually encoded surrogate
{
goto InvalidByteSequence;
}
}
continue;
}
// ready to punch: the sequence is complete
// adjust for surrogates in non-shortest form
if ((ch & (SupplimentarySeq | 0x1F0000)) == SupplimentarySeq) {
charCount--;
}
goto EncodeChar;
InvalidByteSequence:
// this code fragment should be close to the gotos referencing it
// Have to do fallback for invalid bytes; the fallback buffer is created on first use
if (fallback == nullptr)
{
fallback = decoderFallback->CreateFallbackBuffer();
fallback->InternalInitialize(bytes, nullptr);
}
// Add whatever the fallback reports for this invalid sequence
// (presumably the number of replacement chars - confirm against FallbackInvalidByteSequence)
charCount += FallbackInvalidByteSequence(pSrc, ch, fallback);
// Drop the pending state and resume scanning at pSrc
ch = 0;
continue;
ReadChar:
// No sequence pending: consume the next byte as the start of a new character
ch = *pSrc;
pSrc++;
ProcessChar:
// Entered by fall-through from ReadChar, or from the FASTLOOP tail with ch already loaded
if (ch > 0x7F) {
// If it's > 0x7F, it's the start of a new multi-byte sequence
// Long sequence, so unreserve our char.
charCount--;
// bit 6 has to be non-zero for start of multibyte chars.
if ((ch & 0x40) == 0) {
// Unexpected trail byte
goto InvalidByteSequence;
}
// start a new long code
if ((ch & 0x20) != 0) {
if ((ch & 0x10) != 0) {
// 4 byte encoding - supplementary character (2 surrogates)
ch &= 0x0F;
// check that bit 4 is zero and the valid supplementary character
// range 0x000000 - 0x10FFFF at the same time
if (ch > 0x04) {
// Put the 1111xxxx marker bits back so ch again equals the byte that was read
// before it is handed to the fallback
ch |= 0xf0;
goto InvalidByteSequence;
}
// Add bit flags so that when we check new characters & rotate we'll be flagged correctly.
// Final byte flag, count fix if we don't make final byte & supplementary sequence flag.
ch |= (FinalByte >> 3 * 6) | // Final byte is 3 more bytes from now
(1 << 30) | // If it dies on next byte we'll need an extra char
(3 << (30 - 2 * 6)) | // If it dies on last byte we'll need to subtract a char
(SupplimentarySeq) | (SupplimentarySeq >> 6) |
(SupplimentarySeq >> 2 * 6) | (SupplimentarySeq >> 3 * 6);
// Our character count will be 2 characters for these 4 bytes, so subtract another char
charCount--;
}
else {
// 3 byte encoding
// Add bit flags so that when we check new characters & rotate we'll be flagged correctly.
ch = (ch & 0x0F) | ((FinalByte >> 2 * 6) | (1 << 30) |
(ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2 * 6));
// We'll expect 1 character for these 3 bytes, so subtract another char.
charCount--;
}
}
else {
// 2 byte encoding
ch &= 0x1F;
// check for non-shortest form (lead bytes 0xC0 and 0xC1)
if (ch <= 1) {
// Put the 110xxxxx marker bits back so ch again equals the byte that was read
// before it is handed to the fallback
ch |= 0xc0;
goto InvalidByteSequence;
}
// Add bit flags so we'll be flagged correctly
ch |= (FinalByte >> 6);
}
// Lead byte recorded; the trail bytes are handled at the top of the loop
continue;
}
EncodeChar:
// A complete character (ASCII or multi-byte) has been accounted for at this point.
// The block below is compiled only when FASTLOOP is defined.
#ifdef FASTLOOP
int availableBytes = PtrDiff(pEnd, pSrc);
// don't fall into the fast decoding loop if we don't have enough bytes
if (availableBytes <= 13) {
// try to get over the remainder of the ascii characters fast though
BYTE* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
while (pSrc < pLocalEnd) {
ch = *pSrc;
pSrc++;
// Non-ASCII byte: hand it to the slow path with ch already loaded
if (ch > 0x7F)
goto ProcessChar;
}
// we are done
ch = 0;
break;
}
// To compute the upper bound, assume that all characters are ASCII characters at this point,
// the boundary will be decreased for every non-ASCII character we encounter
// Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences
BYTE *pStop = pSrc + availableBytes - 7;
while (pSrc < pStop) {
ch = *pSrc;
pSrc++;
if (ch > 0x7F) {
goto LongCode;
}
// get pSrc 2-byte aligned
if (((size_t)pSrc & 0x1) != 0) {
ch = *pSrc;
pSrc++;
if (ch > 0x7F) {
goto LongCode;
}
}
// get pSrc 4-byte aligned
if (((size_t)pSrc & 0x2) != 0) {
// Read two bytes at once; a set high bit in either byte means non-ASCII
ch = *(USHORT*)pSrc;
if ((ch & 0x8080) != 0) {
goto LongCodeWithMask16;
}
pSrc += 2;
}
// Run 8 + 8 characters at a time!
while (pSrc < pStop) {
// Read eight bytes as two ints; a set high bit in any byte means non-ASCII
ch = *(int*)pSrc;
int chb = *(int*)(pSrc + 4);
if (((ch | chb) & (int)0x80808080) != 0) {
goto LongCodeWithMask32;
}
pSrc += 8;
// This is a really small loop - unroll it
if (pSrc >= pStop)
break;
ch = *(int*)pSrc;
chb = *(int*)(pSrc + 4);
if (((ch | chb) & (int)0x80808080) != 0) {
goto LongCodeWithMask32;
}
pSrc += 8;
}
// Leave the outer fast loop; the remaining bytes go through the code after #endif
break;
// The LongCodeWithMask labels reduce the multi-byte read in ch to the single byte at pSrc
// (the first byte in memory order), which depends on the byte order of the target.
#if BIGENDIAN
LongCodeWithMask32 :
// be careful about the sign extension
ch = (int)(((uint)ch) >> 16);
LongCodeWithMask16:
ch = (int)(((uint)ch) >> 8);
#else // BIGENDIAN
LongCodeWithMask32:
LongCodeWithMask16:
ch &= 0xFF;
#endif // BIGENDIAN
// Consume that one byte; if it is ASCII, the non-ASCII byte is further on, so keep looping
pSrc++;
if (ch <= 0x7F) {
continue;
}
LongCode:
// ch is a consumed non-ASCII lead byte; read and consume the byte that follows it
int chc = *pSrc;
pSrc++;
if (
// bit 6 has to be set in a lead byte (if it is clear, ch is a stray trail byte)
(ch & 0x40) == 0 ||
// we are expecting to see trailing bytes like 10vvvvvv
(chc & 0xC0) != 0x80)
{
goto BadLongCode;
}
chc &= 0x3F;
// start a new long code
if ((ch & 0x20) != 0) {
// fold the first two bytes together
chc |= (ch & 0x0F) << 6;
if ((ch & 0x10) != 0) {
// 4 byte encoding - surrogate
// Peek at the third byte; pSrc is advanced only after both remaining bytes validate
ch = *pSrc;
if (
// check that bit 4 is zero, the non-shortest form of surrogate
// and the valid surrogate range 0x000000 - 0x10FFFF at the same time
!InRange(chc >> 4, 0x01, 0x10) ||
// we are expecting to see trailing bytes like 10vvvvvv
(ch & 0xC0) != 0x80)
{
goto BadLongCode;
}
chc = (chc << 6) | (ch & 0x3F);
// Peek at the fourth byte
ch = *(pSrc + 1);
// we are expecting to see trailing bytes like 10vvvvvv
if ((ch & 0xC0) != 0x80) {
goto BadLongCode;
}
// Consume the third and fourth bytes
pSrc += 2;
// extra byte
charCount--;
}
else {
// 3 byte encoding
// Peek at the third byte; it is consumed only if the whole sequence validates
ch = *pSrc;
if (
// check for non-shortest form of 3 byte seq
(chc & (0x1F << 5)) == 0 ||
// Can't have surrogates here.
(chc & (0xF800 >> 6)) == (0xD800 >> 6) ||
// we are expecting to see trailing bytes like 10vvvvvv
(ch & 0xC0) != 0x80)
{
goto BadLongCode;
}
pSrc++;
// extra byte
charCount--;
}
}
else {
// 2 byte encoding
// check for non-shortest form
if ((ch & 0x1E) == 0) {
goto BadLongCode;
}
}
// extra byte
charCount--;
}
#endif // FASTLOOP
// no pending bits at this point
ch = 0;
continue;
BadLongCode:
// Only targeted from the FASTLOOP section. Every jump here happens with pSrc exactly two
// bytes past the lead byte, so rewind both and let the slow path above re-examine the
// sequence and run the fallback.
pSrc -= 2;
ch = 0;
continue;
}
// May have a problem if we have to flush: the input ended in the middle of a sequence
if (ch != 0)
{
// We were already adjusting for these, so need to unadjust
charCount += (ch >> 30);
// Have to do fallback for invalid bytes
if (fallback == nullptr)
{
fallback = decoderFallback->CreateFallbackBuffer();
fallback->InternalInitialize(bytes, nullptr);
}
charCount += FallbackInvalidByteSequence(pSrc, ch, fallback);
}
// Shouldn't have anything in fallback buffer for GetCharCount
// (don't have to check m_throwOnOverflow for count)
Contract::Assert(fallback == nullptr || fallback->GetRemaining() == 0,
"[UTF8Encoding.GetCharCount]Expected empty fallback buffer at end");
// Release the fallback buffer (fallback may still be nullptr here if no invalid bytes were seen)
InternalDelete(fallback);
return charCount;
}