int GetByteCount()

in CoreCLRProfiler/native/coreclr_headers/src/pal/src/locale/utf8.cpp [2531:2858]


    // Computes how many UTF-8 bytes are needed to encode `count` UTF-16 code units
    // starting at `chars`.
    //
    // Accounting scheme: one byte is pre-charged for every input code unit
    // (byteCount starts at `count`); the loop then adds one extra byte for code
    // units above 0x7F and a second extra byte for those above 0x7FF. A valid
    // high/low surrogate pair ends up contributing four bytes in total. A lone
    // surrogate is handed to a fallback buffer obtained lazily from
    // encoderFallback->CreateFallbackBuffer(); its own pre-charged byte is removed
    // and every character the fallback buffer subsequently yields is counted as if
    // it were input.
    //
    // Parameters:
    //   chars - first UTF-16 code unit to measure (assumed to reference at least
    //           `count` readable code units - TODO confirm against callers).
    //   count - number of code units to measure.
    //
    // Returns: the computed UTF-8 byte count.
    //
    // Throws: ArgumentException("Conversion buffer overflow.") when built with
    //         WIN64 and the running total has become negative. Any exception raised
    //         while counting (for example by the fallback buffer) is propagated
    //         after the fallback buffer has been released.
    int GetByteCount(WCHAR *chars, int count)
    {
        // For fallback we may need a fallback buffer.
        // We wait to initialize it though in case we don't have any broken input unicode
        EncoderFallbackBuffer* fallbackBuffer = nullptr;
        WCHAR *pSrc = chars;
        WCHAR *pEnd = pSrc + count;

        // Start by assuming we have as many as count
        int byteCount = count;

        // Pending code unit: 0 means "nothing pending"; a positive value at the top
        // of the loop is a high surrogate still waiting for its low surrogate.
        int ch = 0;

        // The whole counting phase runs inside a try block so that the lazily
        // created fallback buffer is also released when an exception escapes
        // (the overflow throw below, or anything raised by the fallback itself).
        try
        {
            for (;;) {
                // SLOWLOOP: does all range checks, handles all special cases, but it is slow
                if (pSrc >= pEnd) {

                    if (ch == 0) {
                        // Unroll any fallback that happens at the end
                        ch = fallbackBuffer != nullptr ? fallbackBuffer->InternalGetNextChar() : 0;
                        if (ch > 0) {
                            // fallback characters were not part of the initial estimate
                            byteCount++;
                            goto ProcessChar;
                        }
                    }
                    else {
                        // Case of surrogates in the fallback.
                        if (fallbackBuffer != nullptr && fallbackBuffer->bFallingBack) {
                            Contract::Assert(ch >= 0xD800 && ch <= 0xDBFF,
                                "[UTF8Encoding.GetByteCount]expected high surrogate");

                            ch = fallbackBuffer->InternalGetNextChar();
                            byteCount++;

                            if (InRange(ch, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) {
                                // complete pair: any non-surrogate value > 0x7FF gives the right count
                                ch = 0xfffd;
                                byteCount++;
                                goto EncodeChar;
                            }
                            else if (ch > 0){
                                goto ProcessChar;
                            }
                            else {
                                byteCount--; // ignore last one.
                                break;
                            }
                        }
                    }

                    if (ch <= 0) {
                        break;
                    }

                    // attempt to encode the partial surrogate (will fallback or ignore it), it'll also subtract 1.
                    byteCount++;
                    goto EncodeChar;
                }

                if (ch > 0) {
                    Contract::Assert(ch >= 0xD800 && ch <= 0xDBFF,
                        "[UTF8Encoding.GetByteCount]expected high surrogate");

                    // use separate helper variables for local contexts so that the jit optimizations
                    // won't get confused about the variable lifetimes
                    int cha = *pSrc;

                    // count the pending surrogate
                    byteCount++;

                    // In previous byte, we encountered a high surrogate, so we are expecting a low surrogate here.
                    // if (IsLowSurrogate(cha)) {
                    if (InRange(cha, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) {
                        // Don't need a real # because we're just counting, anything > 0x7ff ('cept surrogate) will do.
                        ch = 0xfffd;
                        //                        ch = cha + (ch << 10) +
                        //                            (0x10000
                        //                            - CharUnicodeInfo::LOW_SURROGATE_START
                        //                            - (CharUnicodeInfo::HIGH_SURROGATE_START << 10) );

                        // Use this next char
                        pSrc++;
                    }
                    // else ch is still high surrogate and encoding will fail (so don't add count)

                    // attempt to encode the surrogate or partial surrogate
                    goto EncodeChar;
                }

                // If we've used a fallback, then we have to check for it
                if (fallbackBuffer != nullptr)
                {
                    ch = fallbackBuffer->InternalGetNextChar();
                    if (ch > 0)
                    {
                        // We have an extra byte we weren't expecting.
                        byteCount++;
                        goto ProcessChar;
                    }
                }

                // read next char. The JIT optimization seems to be getting confused when
                // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
                ch = *pSrc;
                pSrc++;

            ProcessChar:
                if (InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::HIGH_SURROGATE_END)) {
                    // we will count this surrogate next time around
                    byteCount--;
                    continue;
                }
                // either good char or partial surrogate

            EncodeChar:
                // throw exception on partial surrogate if necessary
                if (InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END))
                {
                    // Lone surrogates aren't allowed
                    // Have to make a fallback buffer if we don't have one
                    if (fallbackBuffer == nullptr)
                    {
                        // wait on fallbacks if we can
                        // For fallback we may need a fallback buffer
                        fallbackBuffer = encoderFallback->CreateFallbackBuffer();

                        // Set our internal fallback interesting things.
                        fallbackBuffer->InternalInitialize(chars, chars + count, false);
                    }

                    // Do our fallback.  Actually we already know its a mixed up surrogate,
                    // so the ref pSrc isn't gonna do anything.
                    fallbackBuffer->InternalFallback((WCHAR)ch, &pSrc);

                    // Ignore it if we don't throw (we had preallocated this ch)
                    byteCount--;
                    ch = 0;
                    continue;
                }

                // Count them
                if (ch > 0x7F) {
                    if (ch > 0x7FF) {
                        // the extra surrogate byte was compensated by the second surrogate character
                        // (2 surrogates make 4 bytes.  We've already counted 2 bytes, 1 per char)
                        byteCount++;
                    }
                    byteCount++;
                }

#if WIN64
                // check for overflow
                if (byteCount < 0) {
                    break;
                }
#endif

#ifdef FASTLOOP
                // If still have fallback don't do fast loop
                if (fallbackBuffer != nullptr && (ch = fallbackBuffer->InternalGetNextChar()) != 0)
                {
                    // We're reserving 1 byte for each char by default
                    byteCount++;
                    goto ProcessChar;
                }

                int availableChars = PtrDiff(pEnd, pSrc);

                // don't fall into the fast decoding loop if we don't have enough characters
                if (availableChars <= 13) {
                    // try to get over the remainder of the ascii characters fast though
                    WCHAR* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
                    while (pSrc < pLocalEnd) {
                        ch = *pSrc;
                        pSrc++;
                        // anything non-ASCII goes back through the fully checked slow path
                        if (ch > 0x7F)
                            goto ProcessChar;
                    }

                    // we are done
                    break;
                }

#if WIN64
                // make sure that we won't get a silent overflow inside the fast loop
                // (Fall out to slow loop if we have this many characters)
                availableChars &= 0x0FFFFFFF;
#endif

                // To compute the upper bound, assume that all characters are ASCII characters at this point,
                //  the boundary will be decreased for every non-ASCII character we encounter
                // Also, we need 3 + 4 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates
                WCHAR *pStop = pSrc + availableChars - (3 + 4);

                while (pSrc < pStop) {
                    ch = *pSrc;
                    pSrc++;

                    if (ch > 0x7F)                                                  // Not ASCII
                    {
                        if (ch > 0x7FF)                                             // Not 2 Byte
                        {
                            if ((ch & 0xF800) == 0xD800)                            // See if its a Surrogate
                                goto LongCode;
                            byteCount++;
                        }
                        byteCount++;
                    }

                    // get pSrc aligned
                    if (((size_t)pSrc & 0x2) != 0) {
                        ch = *pSrc;
                        pSrc++;
                        if (ch > 0x7F)                                              // Not ASCII
                        {
                            if (ch > 0x7FF)                                         // Not 2 Byte
                            {
                                if ((ch & 0xF800) == 0xD800)                        // See if its a Surrogate
                                    goto LongCode;
                                byteCount++;
                            }
                            byteCount++;
                        }
                    }

                    // Run 2 * 4 characters at a time!
                    // Each int load below covers two adjacent code units.
                    while (pSrc < pStop) {
                        ch = *(int*)pSrc;
                        int chc = *(int*)(pSrc + 2);
                        if (((ch | chc) & (int)0xFF80FF80) != 0)         // See if not ASCII
                        {
                            if (((ch | chc) & (int)0xF800F800) != 0)     // See if not 2 Byte
                            {
                                goto LongCodeWithMask;
                            }


                            if ((ch & (int)0xFF800000) != 0)             // Actually 0x07800780 is all we care about (4 bits)
                                byteCount++;
                            if ((ch & (int)0xFF80) != 0)
                                byteCount++;
                            if ((chc & (int)0xFF800000) != 0)
                                byteCount++;
                            if ((chc & (int)0xFF80) != 0)
                                byteCount++;
                        }
                        pSrc += 4;

                        ch = *(int*)pSrc;
                        chc = *(int*)(pSrc + 2);
                        if (((ch | chc) & (int)0xFF80FF80) != 0)         // See if not ASCII
                        {
                            if (((ch | chc) & (int)0xF800F800) != 0)     // See if not 2 Byte
                            {
                                goto LongCodeWithMask;
                            }

                            if ((ch & (int)0xFF800000) != 0)
                                byteCount++;
                            if ((ch & (int)0xFF80) != 0)
                                byteCount++;
                            if ((chc & (int)0xFF800000) != 0)
                                byteCount++;
                            if ((chc & (int)0xFF80) != 0)
                                byteCount++;
                        }
                        pSrc += 4;
                    }
                    break;

                LongCodeWithMask:
                    // Reduce the two-unit load back to the single code unit at pSrc,
                    // then step past it and handle it individually.
#if BIGENDIAN
                    // be careful about the sign extension
                    ch = (int)(((uint)ch) >> 16);
#else // BIGENDIAN
                    ch = (WCHAR)ch;
#endif // BIGENDIAN
                    pSrc++;

                    if (ch <= 0x7F) {
                        continue;
                    }

                LongCode:
                    // use separate helper variables for slow and fast loop so that the jit optimizations
                    // won't get confused about the variable lifetimes
                    if (ch > 0x7FF) {
                        if (InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) {
                            // 4 byte encoding - high surrogate + low surrogate

                            int chd = *pSrc;
                            if (
                                ch > CharUnicodeInfo::HIGH_SURROGATE_END ||
                                !InRange(chd, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END))
                            {
                                // Back up and drop out to slow loop to figure out error
                                pSrc--;
                                break;
                            }
                            pSrc++;

                            // byteCount - this byte is compensated by the second surrogate character
                        }
                        byteCount++;
                    }
                    byteCount++;

                    // byteCount - the last byte is already included
                }
#endif // FASTLOOP

                // no pending char at this point
                ch = 0;
            }

#if WIN64
            // check for overflow
            if (byteCount < 0) {
                throw ArgumentException("Conversion buffer overflow.");
            }
#endif
        }
        catch (...)
        {
            // Release the fallback buffer before letting the exception continue;
            // fallbackBuffer may still be nullptr here, exactly as on the normal path below.
            InternalDelete(fallbackBuffer);
            throw;
        }

        Contract::Assert(fallbackBuffer == nullptr || fallbackBuffer->GetRemaining() == 0,
            "[UTF8Encoding.GetByteCount]Expected Empty fallback buffer");

        InternalDelete(fallbackBuffer);

        return byteCount;
    }