int GetBytes()

in CoreCLRProfiler/native/coreclr_headers/src/pal/src/locale/utf8.cpp [2141:2529]


    // Encodes a run of UTF-16 code units as UTF-8.
    //
    // Parameters:
    //   chars     - input UTF-16 code units; must not be null.
    //   charCount - number of code units to read from chars; must be >= 0.
    //   bytes     - output buffer; must not be null.
    //   byteCount - capacity of bytes, in bytes; must be >= 0.
    //
    // Returns the number of bytes written to bytes (pTarget - bytes).
    //
    // Behavior visible in this function:
    //   - A high surrogate followed by a low surrogate is combined into one
    //     supplementary code point and emitted as a 4 byte sequence.
    //   - Any lone surrogate is handed to a fallback buffer that is created
    //     lazily from encoderFallback; characters produced by the fallback are
    //     then encoded before reading further input.
    //   - When the next sequence does not fit in the output, the input/fallback
    //     position is rewound, ThrowBytesOverflow(pTarget == bytes) is called
    //     (presumably it throws only when nothing has been written - confirm
    //     against its definition) and, if it returns, encoding stops.
    //
    // NOTE(review): fallbackBuffer is released only by the InternalDelete call at
    // the end; if ThrowBytesOverflow or InternalFallback throws, that call is
    // skipped - confirm whether the buffer is cleaned up elsewhere.
    int GetBytes(WCHAR* chars, int charCount, BYTE* bytes, int byteCount)
    {
        Contract::Assert(chars != nullptr, "[UTF8Encoding.GetBytes]chars!=nullptr");
        Contract::Assert(byteCount >= 0, "[UTF8Encoding.GetBytes]byteCount >=0");
        Contract::Assert(charCount >= 0, "[UTF8Encoding.GetBytes]charCount >=0");
        Contract::Assert(bytes != nullptr, "[UTF8Encoding.GetBytes]bytes!=nullptr");

        // The fallback buffer is only needed for broken input (lone surrogates),
        // so its creation is deferred until the first such character is seen.
        EncoderFallbackBuffer* fallbackBuffer = nullptr;
        WCHAR *pSrc = chars;
        BYTE *pTarget = bytes;

        WCHAR *pEnd = pSrc + charCount;
        BYTE *pAllocatedBufferEnd = pTarget + byteCount;

        // Pending character: 0 means "nothing pending"; otherwise it holds the
        // character being processed or a high surrogate waiting for its partner.
        int ch = 0;

        for (;;) {
            // SLOWLOOP: does all range checks, handles all special cases, but it is slow

            if (pSrc >= pEnd) {
                // Input is exhausted; drain the fallback buffer and any pending surrogate.

                if (ch == 0) {
                    // Check if there's anything left to get out of the fallback buffer
                    ch = fallbackBuffer != nullptr ? fallbackBuffer->InternalGetNextChar() : 0;
                    if (ch > 0) {
                        goto ProcessChar;
                    }
                }
                else {
                    // Case of leftover surrogates in the fallback buffer
                    if (fallbackBuffer != nullptr && fallbackBuffer->bFallingBack) {
                        Contract::Assert(ch >= 0xD800 && ch <= 0xDBFF,
                            "[UTF8Encoding.GetBytes]expected high surrogate");

                        // Keep the pending high surrogate while fetching its partner.
                        int cha = ch;

                        ch = fallbackBuffer->InternalGetNextChar();

                        if (InRange(ch, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) {
                            // Combine the pair:
                            // 0x10000 + ((high - HIGH_START) << 10) + (low - LOW_START)
                            ch = ch + (cha << 10) + (0x10000 - CharUnicodeInfo::LOW_SURROGATE_START - (CharUnicodeInfo::HIGH_SURROGATE_START << 10));
                            goto EncodeChar;
                        }
                        else if (ch > 0){
                            // The fallback produced something other than a low surrogate.
                            goto ProcessChar;
                        }
                        else {
                            // The fallback buffer has nothing more to give.
                            break;
                        }
                    }
                }

                // attempt to encode the partial surrogate (will fail or ignore)
                if (ch > 0)
                    goto EncodeChar;

                // We're done
                break;
            }

            if (ch > 0) {
                // We have a high surrogate left over from a previous loop.
                Contract::Assert(ch >= 0xD800 && ch <= 0xDBFF,
                    "[UTF8Encoding.GetBytes]expected high surrogate");

                // Peek at the next input code unit without consuming it yet.
                int cha = *pSrc;

                // The previous code unit was a high surrogate, so a low surrogate is expected here.
                if (InRange(cha, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) {
                    // Combine the pair:
                    // 0x10000 + ((high - HIGH_START) << 10) + (low - LOW_START)
                    ch = cha + (ch << 10) +
                        (0x10000
                        - CharUnicodeInfo::LOW_SURROGATE_START
                        - (CharUnicodeInfo::HIGH_SURROGATE_START << 10));

                    pSrc++;
                }
                // else ch is still high surrogate and encoding will fail

                // attempt to encode the surrogate or partial surrogate
                goto EncodeChar;
            }

            // If we've used a fallback, then we have to check for it
            if (fallbackBuffer != nullptr)
            {
                ch = fallbackBuffer->InternalGetNextChar();
                if (ch > 0) goto ProcessChar;
            }

            // read next char
            ch = *pSrc;
            pSrc++;

        ProcessChar:
            if (InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::HIGH_SURROGATE_END)) {
                // Keep the high surrogate pending in ch; the next iteration pairs it up.
                continue;
            }
            // either good char or partial surrogate

        EncodeChar:
            // At this point any value still inside the surrogate range is a lone surrogate.
            if (InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END))
            {
                // Lone surrogates aren't allowed, we have to do fallback for them
                // Have to make a fallback buffer if we don't have one
                if (fallbackBuffer == nullptr)
                {
                    // wait on fallbacks if we can
                    // For fallback we may need a fallback buffer
                    fallbackBuffer = encoderFallback->CreateFallbackBuffer();

                    // Set our internal fallback interesting things.
                    fallbackBuffer->InternalInitialize(chars, pEnd, true);
                }

                // Do our fallback.  Actually we already know its a mixed up surrogate,
                // so the ref pSrc isn't gonna do anything.
                fallbackBuffer->InternalFallback((WCHAR)ch, &pSrc);

                // Ignore it if we don't throw
                ch = 0;
                continue;
            }

            // Count bytes needed
            int bytesNeeded = 1;
            if (ch > 0x7F) {
                if (ch > 0x7FF) {
                    if (ch > 0xFFFF) {
                        bytesNeeded++;  // 4 bytes (surrogate pair)
                    }
                    bytesNeeded++;      // 3 bytes (800-FFFF)
                }
                bytesNeeded++;          // 2 bytes (80-7FF)
            }

            // Compare against the remaining space rather than computing
            // pAllocatedBufferEnd - bytesNeeded, which would form a pointer before
            // the start of the buffer when byteCount < bytesNeeded.
            if ((pAllocatedBufferEnd - pTarget) < bytesNeeded) {
                // Left over surrogate from last time will cause pSrc == chars, so we'll throw
                if (fallbackBuffer != nullptr && fallbackBuffer->bFallingBack)
                {
                    fallbackBuffer->MovePrevious();              // Didn't use this fallback char
                    if (ch > 0xFFFF)
                        fallbackBuffer->MovePrevious();          // Was surrogate, didn't use 2nd part either
                }
                else
                {
                    pSrc--;                                     // Didn't use this char
                    if (ch > 0xFFFF)
                        pSrc--;                                 // Was surrogate, didn't use 2nd part either
                }
                Contract::Assert(pSrc >= chars || pTarget == bytes,
                    "[UTF8Encoding.GetBytes]Expected pSrc to be within buffer or to throw with insufficient room.");
                ThrowBytesOverflow(pTarget == bytes);  // Throw if we must
                ch = 0;                                         // Nothing left over (we backed up to start of pair if supplementary)
                break;
            }

            if (ch <= 0x7F) {
                // 1 BYTE encoding
                *pTarget = (BYTE)ch;
            }
            else {
                // chb carries the next byte to store; lead byte first, then the
                // continuation bytes, each holding 6 payload bits.
                int chb;
                if (ch <= 0x7FF) {
                    // 2 BYTE encoding
                    chb = (BYTE)(0xC0 | (ch >> 6));
                }
                else
                {
                    if (ch <= 0xFFFF) {
                        // 3 BYTE encoding
                        chb = (BYTE)(0xE0 | (ch >> 12));
                    }
                    else
                    {
                        // 4 BYTE encoding
                        *pTarget = (BYTE)(0xF0 | (ch >> 18));
                        pTarget++;

                        chb = 0x80 | ((ch >> 12) & 0x3F);
                    }
                    *pTarget = (BYTE)chb;
                    pTarget++;

                    chb = 0x80 | ((ch >> 6) & 0x3F);
                }
                *pTarget = (BYTE)chb;
                pTarget++;

                // Final continuation byte; the cast covers the whole expression.
                *pTarget = (BYTE)(0x80 | (ch & 0x3F));
            }
            pTarget++;


#ifdef FASTLOOP
            // If still have fallback don't do fast loop
            if (fallbackBuffer != nullptr && (ch = fallbackBuffer->InternalGetNextChar()) != 0)
                goto ProcessChar;

            int availableChars = PtrDiff(pEnd, pSrc);
            int availableBytes = PtrDiff(pAllocatedBufferEnd, pTarget);

            // don't fall into the fast decoding loop if we don't have enough characters
            // Note that if we don't have enough bytes, pStop will prevent us from entering the fast loop.
            if (availableChars <= 13) {
                // we are hoping for 1 BYTE per char
                if (availableBytes < availableChars) {
                    // not enough output room.  no pending bits at this point
                    ch = 0;
                    continue;
                }

                // try to get over the remainder of the ascii characters fast though
                WCHAR* pLocalEnd = pEnd; // local copy of the end pointer for the tight loop
                while (pSrc < pLocalEnd) {
                    ch = *pSrc;
                    pSrc++;

                    // Not ASCII, need more than 1 BYTE per char
                    if (ch > 0x7F)
                        goto ProcessChar;

                    *pTarget = (BYTE)ch;
                    pTarget++;
                }
                // we are done, let ch be 0 to clear encoder
                ch = 0;
                break;
            }

            // we need at least 1 BYTE per character, but Convert might allow us to convert
            // only part of the input, so try as much as we can.  Reduce charCount if necessary
            if (availableBytes < availableChars)
            {
                availableChars = availableBytes;
            }

            // FASTLOOP:
            // - optimistic range checks
            // - fallbacks to the slow loop for all special cases, exception throwing, etc.

            // To compute the upper bound, assume that all characters are ASCII characters at this point,
            //  the boundary will be decreased for every non-ASCII character we encounter
            // Also, we need 5 chars reserve for the unrolled ansi decoding loop and for decoding of surrogates
            // If there aren't enough bytes for the output, then pStop will be <= pSrc and will bypass the loop.
            WCHAR *pStop = pSrc + availableChars - 5;

            while (pSrc < pStop) {
                ch = *pSrc;
                pSrc++;

                if (ch > 0x7F) {
                    goto LongCode;
                }
                *pTarget = (BYTE)ch;
                pTarget++;

                // get pSrc aligned
                if (((size_t)pSrc & 0x2) != 0) {
                    ch = *pSrc;
                    pSrc++;
                    if (ch > 0x7F) {
                        goto LongCode;
                    }
                    *pTarget = (BYTE)ch;
                    pTarget++;
                }

                // Run 4 characters at a time!
                // NOTE(review): the int loads through a WCHAR* below assume WCHAR is
                // 16 bits wide and that the compiler tolerates this type punning - confirm.
                while (pSrc < pStop) {
                    ch = *(int*)pSrc;
                    int chc = *(int*)(pSrc + 2);
                    // Any bit outside the low 7 bits of either 16-bit half means non-ASCII.
                    if (((ch | chc) & (int)0xFF80FF80) != 0) {
                        goto LongCodeWithMask;
                    }

                    // Unfortunately, this is endianness sensitive
#if BIGENDIAN
                    *pTarget = (BYTE)(ch >> 16);
                    *(pTarget + 1) = (BYTE)ch;
                    pSrc += 4;
                    *(pTarget + 2) = (BYTE)(chc >> 16);
                    *(pTarget + 3) = (BYTE)chc;
                    pTarget += 4;
#else // BIGENDIAN
                    *pTarget = (BYTE)ch;
                    *(pTarget + 1) = (BYTE)(ch >> 16);
                    pSrc += 4;
                    *(pTarget + 2) = (BYTE)chc;
                    *(pTarget + 3) = (BYTE)(chc >> 16);
                    pTarget += 4;
#endif // BIGENDIAN
                }
                continue;

            LongCodeWithMask:
                // Extract the first of the code units that were loaded as one int.
#if BIGENDIAN
                // be careful about the sign extension
                ch = (int)(((uint)ch) >> 16);
#else // BIGENDIAN
                ch = (WCHAR)ch;
#endif // BIGENDIAN
                pSrc++;

                if (ch > 0x7F) {
                    goto LongCode;
                }
                *pTarget = (BYTE)ch;
                pTarget++;
                continue;

            LongCode:
                // chd carries the next byte to store (and, briefly, the low surrogate).
                int chd;
                if (ch <= 0x7FF) {
                    // 2 BYTE encoding
                    chd = 0xC0 | (ch >> 6);
                }
                else {
                    if (!InRange(ch, CharUnicodeInfo::HIGH_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) {
                        // 3 BYTE encoding
                        chd = 0xE0 | (ch >> 12);
                    }
                    else
                    {
                        // 4 BYTE encoding - high surrogate + low surrogate
                        if (ch > CharUnicodeInfo::HIGH_SURROGATE_END) {
                            // low without high -> bad, try again in slow loop
                            pSrc -= 1;
                            break;
                        }

                        chd = *pSrc;
                        pSrc++;

                        if (!InRange(chd, CharUnicodeInfo::LOW_SURROGATE_START, CharUnicodeInfo::LOW_SURROGATE_END)) {
                            // high not followed by low -> bad, try again in slow loop
                            pSrc -= 2;
                            break;
                        }

                        // Combine the pair:
                        // 0x10000 + ((high - HIGH_START) << 10) + (low - LOW_START)
                        ch = chd + (ch << 10) +
                            (0x10000
                            - CharUnicodeInfo::LOW_SURROGATE_START
                            - (CharUnicodeInfo::HIGH_SURROGATE_START << 10));

                        *pTarget = (BYTE)(0xF0 | (ch >> 18));
                        // pStop - this BYTE is compensated by the second surrogate character
                        // 2 input chars require 4 output bytes.  2 have been anticipated already
                        // and 2 more will be accounted for by the 2 pStop-- calls below.
                        pTarget++;

                        chd = 0x80 | ((ch >> 12) & 0x3F);
                    }
                    *pTarget = (BYTE)chd;
                    pStop--;                    // 3 BYTE sequence for 1 char, so need pStop-- and the one below too.
                    pTarget++;

                    chd = 0x80 | ((ch >> 6) & 0x3F);
                }
                *pTarget = (BYTE)chd;
                pStop--;                        // 2 BYTE sequence for 1 char so need pStop--.
                pTarget++;

                *pTarget = (BYTE)(0x80 | (ch & 0x3F));
                // pStop - this BYTE is already included
                pTarget++;
            }

            Contract::Assert(pTarget <= pAllocatedBufferEnd, "[UTF8Encoding.GetBytes]pTarget <= pAllocatedBufferEnd");

#endif // FASTLOOP

            // no pending char at this point
            ch = 0;
        }

        // Release the fallback buffer, if one was created.
        InternalDelete(fallbackBuffer);

        return (int)(pTarget - bytes);
    }