in CoreCLRProfiler/native/coreclr_headers/src/pal/src/locale/utf8.cpp [2531:2858]
// Counts how many UTF-8 bytes are needed to encode the UTF-16 code units in
// [chars, chars + count).
//
// Accounting scheme: the running total starts at `count` (one byte reserved per
// input code unit) and is then corrected as the input is scanned:
//   * code units above 0x7F add one extra byte, code units above 0x7FF add two;
//   * a high surrogate gives back its reserved byte and is kept pending in `ch`;
//     when the following low surrogate arrives the pair is counted as a single
//     character above 0x7FF, which totals 4 bytes for the pair;
//   * a lone (unpaired) surrogate is handed to a fallback buffer that is created
//     on first use from the `encoderFallback` member; its reserved byte is given
//     back, and every char the fallback buffer subsequently returns is counted
//     as one more input char.
//
// Parameters:
//   chars - pointer to the first UTF-16 code unit to measure.
//   count - number of code units to measure.
//
// Returns: the computed byte count.
//
// Throws: ArgumentException("Conversion buffer overflow.") when WIN64 is defined
// and the running total has become negative. The fallback buffer is released
// before that exception is thrown.
int GetByteCount(WCHAR *chars, int count)
{
    // For fallback we may need a fallback buffer.
    // We wait to initialize it though in case we don't have any broken input unicode
    EncoderFallbackBuffer* fallbackBuffer = nullptr;
    WCHAR *pSrc = chars;
    WCHAR *pEnd = pSrc + count;

    // Start by assuming we have as many as count
    int byteCount = count;

    // Current / pending char. At the top of the loop a value > 0 means a high
    // surrogate is waiting for its low surrogate (see the asserts below); 0 means
    // nothing is pending.
    int ch = 0;

    for (;;) {
        // SLOWLOOP: does all range checks, handles all special cases, but it is slow
        if (pSrc >= pEnd) {
            // Input is exhausted; only fallback output and/or a pending surrogate remain.
            if (ch == 0) {
                // Unroll any fallback that happens at the end
                ch = fallbackBuffer != nullptr ? fallbackBuffer->InternalGetNextChar() : 0;
                if (ch > 0) {
                    // Fallback chars were not part of the initial estimate.
                    byteCount++;
                    goto ProcessChar;
                }
            }
            else {
                // Case of surrogates in the fallback.
                if (fallbackBuffer != nullptr && fallbackBuffer->bFallingBack) {
                    Contract::Assert(ch >= 0xD800 && ch <= 0xDBFF,
                        "[UTF8Encoding.GetBytes]expected high surrogate");

                    ch = fallbackBuffer->InternalGetNextChar();
                    byteCount++;

                    if (InRange(ch, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) {
                        // Pair completed: only the size matters, so use any char > 0x7FF.
                        ch = 0xfffd;
                        byteCount++;
                        goto EncodeChar;
                    }
                    else if (ch > 0){
                        goto ProcessChar;
                    }
                    else {
                        byteCount--; // ignore last one.
                        break;
                    }
                }
            }

            if (ch <= 0) {
                break;
            }

            // attempt to encode the partial surrogate (will fallback or ignore it), it'll also subtract 1.
            byteCount++;
            goto EncodeChar;
        }

        if (ch > 0) {
            Contract::Assert(ch >= 0xD800 && ch <= 0xDBFF,
                "[UTF8Encoding.GetBytes]expected high surrogate");

            // use separate helper variables for local contexts so that the jit optimizations
            // won't get confused about the variable lifetimes
            int cha = *pSrc;

            // count the pending surrogate
            byteCount++;

            // In previous byte, we encountered a high surrogate, so we are expecting a low surrogate here.
            if (InRange(cha, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) {
                // Don't need a real # because we're just counting, anything > 0x7ff ('cept surrogate) will do.
                ch = 0xfffd;

                // Use this next char
                pSrc++;
            }
            // else ch is still high surrogate and encoding will fail (so don't add count)

            // attempt to encode the surrogate or partial surrogate
            goto EncodeChar;
        }

        // If we've used a fallback, then we have to check for it
        if (fallbackBuffer != nullptr)
        {
            ch = fallbackBuffer->InternalGetNextChar();
            if (ch > 0)
            {
                // We have an extra byte we weren't expecting.
                byteCount++;
                goto ProcessChar;
            }
        }

        // read next char. The JIT optimization seems to be getting confused when
        // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
        ch = *pSrc;
        pSrc++;

    ProcessChar:
        if (InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::HIGH_SURROGATE_END)) {
            // we will count this surrogate next time around
            byteCount--;
            continue;
        }
        // either good char or partial surrogate

    EncodeChar:
        // throw exception on partial surrogate if necessary
        if (InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END))
        {
            // Lone surrogates aren't allowed
            // Have to make a fallback buffer if we don't have one
            if (fallbackBuffer == nullptr)
            {
                // wait on fallbacks if we can
                // For fallback we may need a fallback buffer
                fallbackBuffer = encoderFallback->CreateFallbackBuffer();

                // Set our internal fallback interesting things.
                fallbackBuffer->InternalInitialize(chars, chars + count, false);
            }

            // Do our fallback. Actually we already know its a mixed up surrogate,
            // so the ref pSrc isn't gonna do anything.
            // NOTE(review): if InternalFallback can throw, fallbackBuffer is not
            // released on that path - confirm against the fallback implementations.
            fallbackBuffer->InternalFallback((WCHAR)ch, &pSrc);

            // Ignore it if we don't throw (we had preallocated this ch)
            byteCount--;
            ch = 0;
            continue;
        }

        // Count them
        if (ch > 0x7F) {
            if (ch > 0x7FF) {
                // the extra surrogate byte was compensated by the second surrogate character
                // (2 surrogates make 4 bytes. We've already counted 2 bytes, 1 per char)
                byteCount++;
            }
            byteCount++;
        }

#if WIN64
        // check for overflow
        if (byteCount < 0) {
            break;
        }
#endif

#ifdef FASTLOOP
        // If still have fallback don't do fast loop
        if (fallbackBuffer != nullptr && (ch = fallbackBuffer->InternalGetNextChar()) != 0)
        {
            // We're reserving 1 byte for each char by default
            byteCount++;
            goto ProcessChar;
        }

        int availableChars = PtrDiff(pEnd, pSrc);

        // don't fall into the fast decoding loop if we don't have enough characters
        if (availableChars <= 13) {
            // try to get over the remainder of the ascii characters fast though
            WCHAR* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
            while (pSrc < pLocalEnd) {
                ch = *pSrc;
                pSrc++;
                if (ch > 0x7F)
                    goto ProcessChar;
            }

            // we are done
            break;
        }

#if WIN64
        // make sure that we won't get a silent overflow inside the fast loop
        // (Fall out to slow loop if we have this many characters)
        availableChars &= 0x0FFFFFFF;
#endif

        // To compute the upper bound, assume that all characters are ASCII characters at this point,
        // the boundary will be decreased for every non-ASCII character we encounter
        // Also, we need 3 + 4 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates
        WCHAR *pStop = pSrc + availableChars - (3 + 4);

        while (pSrc < pStop) {
            ch = *pSrc;
            pSrc++;

            if (ch > 0x7F) // Not ASCII
            {
                if (ch > 0x7FF) // Not 2 Byte
                {
                    if ((ch & 0xF800) == 0xD800) // See if its a Surrogate
                        goto LongCode;
                    byteCount++;
                }
                byteCount++;
            }

            // get pSrc aligned
            if (((size_t)pSrc & 0x2) != 0) {
                ch = *pSrc;
                pSrc++;
                if (ch > 0x7F) // Not ASCII
                {
                    if (ch > 0x7FF) // Not 2 Byte
                    {
                        if ((ch & 0xF800) == 0xD800) // See if its a Surrogate
                            goto LongCode;
                        byteCount++;
                    }
                    byteCount++;
                }
            }

            // Run 2 * 4 characters at a time!
            // Each int load below covers two adjacent code units, so one mask test
            // checks four code units (ch and chc) at once.
            while (pSrc < pStop) {
                ch = *(int*)pSrc;
                int chc = *(int*)(pSrc + 2);
                if (((ch | chc) & (int)0xFF80FF80) != 0) // See if not ASCII
                {
                    if (((ch | chc) & (int)0xF800F800) != 0) // See if not 2 Byte
                    {
                        goto LongCodeWithMask;
                    }

                    if ((ch & (int)0xFF800000) != 0) // Actually 0x07800780 is all we care about (4 bits)
                        byteCount++;
                    if ((ch & (int)0xFF80) != 0)
                        byteCount++;
                    if ((chc & (int)0xFF800000) != 0)
                        byteCount++;
                    if ((chc & (int)0xFF80) != 0)
                        byteCount++;
                }
                pSrc += 4;

                ch = *(int*)pSrc;
                chc = *(int*)(pSrc + 2);
                if (((ch | chc) & (int)0xFF80FF80) != 0) // See if not ASCII
                {
                    if (((ch | chc) & (int)0xF800F800) != 0) // See if not 2 Byte
                    {
                        goto LongCodeWithMask;
                    }

                    if ((ch & (int)0xFF800000) != 0)
                        byteCount++;
                    if ((ch & (int)0xFF80) != 0)
                        byteCount++;
                    if ((chc & (int)0xFF800000) != 0)
                        byteCount++;
                    if ((chc & (int)0xFF80) != 0)
                        byteCount++;
                }
                pSrc += 4;
            }
            break;

        LongCodeWithMask:
            // Reduce the two-code-unit load in ch to the single code unit at pSrc,
            // then step past it.
#if BIGENDIAN
            // be careful about the sign extension
            ch = (int)(((uint)ch) >> 16);
#else // BIGENDIAN
            ch = (WCHAR)ch;
#endif // BIGENDIAN
            pSrc++;

            if (ch <= 0x7F) {
                continue;
            }

        LongCode:
            // use separate helper variables for slow and fast loop so that the jit optimizations
            // won't get confused about the variable lifetimes
            if (ch > 0x7FF) {
                if (InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) {
                    // 4 byte encoding - high surrogate + low surrogate
                    int chd = *pSrc;
                    if (
                        ch > CharUnicodeInfo::HIGH_SURROGATE_END ||
                        !InRange(chd, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END))
                    {
                        // Back up and drop out to slow loop to figure out error
                        pSrc--;
                        break;
                    }
                    pSrc++;

                    // byteCount - this byte is compensated by the second surrogate character
                }
                byteCount++;
            }
            byteCount++;

            // byteCount - the last byte is already included
        }
#endif // FASTLOOP

        // no pending char at this point
        ch = 0;
    }

#if WIN64
    // check for overflow
    if (byteCount < 0) {
        // Release the fallback buffer before throwing: the InternalDelete at the
        // end of the function is not reached on this path, so the buffer would
        // otherwise be leaked. InternalDelete is already invoked below with a
        // possibly-null pointer, so no null check is added here.
        InternalDelete(fallbackBuffer);
        throw ArgumentException("Conversion buffer overflow.");
    }
#endif

    Contract::Assert(fallbackBuffer == nullptr || fallbackBuffer->GetRemaining() == 0,
        "[UTF8Encoding.GetByteCount]Expected Empty fallback buffer");

    InternalDelete(fallbackBuffer);

    return byteCount;
}