int GetChars()

in CoreCLRProfiler/native/coreclr_headers/src/pal/src/locale/utf8.cpp [1637:2139]
337 lines of code
59 McCabe index (conditional complexity)

    // Decodes the UTF-8 bytes in [bytes, bytes + byteCount) into UTF-16 code units
    // written to [chars, chars + charCount).
    //
    // Parameters:
    //   bytes     - input buffer (asserted non-null)
    //   byteCount - number of input bytes (asserted >= 0)
    //   chars     - output buffer (asserted non-null)
    //   charCount - capacity of the output buffer in WCHARs (asserted >= 0)
    //
    // Returns the number of WCHARs written to chars.
    //
    // Invalid or truncated byte sequences are handed to a fallback buffer created
    // lazily from decoderFallback. When the output buffer is too small,
    // ThrowCharsOverflow is called with a flag telling whether nothing at all was
    // written; if that call returns, decoding stops and the count written so far
    // is returned.
    //
    // Implementation notes: 'ch' doubles as decoder state. Zero means "no pending
    // bits"; otherwise it holds the bits folded so far for a multi-byte sequence,
    // combined with the marker flags FinalByte / ThreeByteSeq / SupplimentarySeq
    // (defined outside this function) that track how many trail bytes remain.
    int GetChars(BYTE* bytes, int byteCount, WCHAR* chars, int charCount)
    {
        Contract::Assert(chars != nullptr, "[UTF8Encoding.GetChars]chars!=nullptr");
        Contract::Assert(byteCount >= 0, "[UTF8Encoding.GetChars]byteCount >=0");
        Contract::Assert(charCount >= 0, "[UTF8Encoding.GetChars]charCount >=0");
        Contract::Assert(bytes != nullptr, "[UTF8Encoding.GetChars]bytes!=nullptr");

        // Read and write cursors
        BYTE *pSrc = bytes;
        WCHAR *pTarget = chars;

        // One-past-the-end of the input and output buffers
        BYTE *pEnd = pSrc + byteCount;
        WCHAR *pAllocatedBufferEnd = pTarget + charCount;

        // Pending sequence state; 0 means nothing is pending
        int ch = 0;

        // Created only when the first invalid sequence is encountered
        DecoderFallbackBuffer *fallback = nullptr;

        for (;;)
        {
            // SLOWLOOP: does all range checks, handles all special cases, but it is slow

            if (pSrc >= pEnd) {
                break;
            }

            // Peek at the next byte. It is only consumed (pSrc++) below when a
            // multi-byte sequence is pending; otherwise ReadChar re-reads it into ch.
            // The JIT optimization seems to be getting confused when
            // compiling "ch = *pSrc++;", so rather use "ch = *pSrc; pSrc++;" instead
            int cha = *pSrc;

            if (ch == 0) {
                // no pending bits
                goto ReadChar;
            }

            pSrc++;

            // we are expecting to see trailing bytes like 10vvvvvv
            if ((cha & 0xC0) != 0x80) {
                // This can be a valid starting byte for another UTF8 byte sequence, so let's put
                // the current byte back, and try to see if this is a valid byte for another UTF8 byte sequence
                pSrc--;
                goto InvalidByteSequence;
            }

            // fold in the new byte
            ch = (ch << 6) | (cha & 0x3F);

            if ((ch & FinalByte) == 0) {
                // Not at last byte yet
                Contract::Assert((ch & (SupplimentarySeq | ThreeByteSeq)) != 0,
                    "[UTF8Encoding.GetChars]Invariant volation");

                if ((ch & SupplimentarySeq) != 0) {
                    // Its a 4-byte supplimentary sequence
                    if ((ch & (FinalByte >> 6)) != 0) {
                        // this is 3rd byte of 4 byte sequence - nothing to do
                        continue;
                    }

                    // 2nd byte of 4 bytes
                    // check for non-shortest form of surrogate and the valid surrogate
                    // range 0x000000 - 0x10FFFF at the same time
                    if (!InRange(ch & 0x1F0, 0x10, 0x100)) {
                        goto InvalidByteSequence;
                    }
                }
                else {
                    // Must be 2nd byte of a 3-byte sequence
                    // check for non-shortest form of 3 byte seq
                    if ((ch & (0x1F << 5)) == 0 ||                  // non-shortest form
                        (ch & (0xF800 >> 6)) == (0xD800 >> 6))     // illegal individually encoded surrogate
                    {
                        goto InvalidByteSequence;
                    }
                }
                continue;
            }

            // ready to punch

            // surrogate in shortest form?
            // Might be possible to get rid of this?  Already did non-shortest check for 4-byte sequence when reading 2nd byte?
            if ((ch & (SupplimentarySeq | 0x1F0000)) > SupplimentarySeq) {
                // let the range check for the second char throw the exception
                if (pTarget < pAllocatedBufferEnd) {
                    // Emit the high surrogate now; the low surrogate is left in ch
                    // and written by EncodeChar below.
                    *pTarget = (WCHAR)(((ch >> 10) & 0x7FF) +
                        (SHORT)((CharUnicodeInfo::HIGH_SURROGATE_START - (0x10000 >> 10))));
                    pTarget++;

                    ch = (ch & 0x3FF) +
                        (int)(CharUnicodeInfo::LOW_SURROGATE_START);
                }
            }

            goto EncodeChar;

        InvalidByteSequence:
            // this code fragment should be close to the gotos referencing it
            // Have to do fallback for invalid bytes
            if (fallback == nullptr)
            {
                fallback = decoderFallback->CreateFallbackBuffer();
                fallback->InternalInitialize(bytes, pAllocatedBufferEnd);
            }

            // That'll back us up the appropriate # of bytes if we didn't get anywhere
            if (!FallbackInvalidByteSequence(&pSrc, ch, fallback, &pTarget))
            {
                // Ran out of buffer space
                // Need to throw an exception?
                Contract::Assert(pSrc >= bytes || pTarget == chars,
                    "[UTF8Encoding.GetChars]Expected to throw or remain in byte buffer after fallback");
                fallback->InternalReset();
                ThrowCharsOverflow(pTarget == chars);
                // Didn't throw: drop the pending state and stop with what fit
                ch = 0;
                break;
            }
            Contract::Assert(pSrc >= bytes,
                "[UTF8Encoding.GetChars]Expected invalid byte sequence to have remained within the byte array");
            ch = 0;
            continue;

        ReadChar:
            ch = *pSrc;
            pSrc++;

        ProcessChar:
            if (ch > 0x7F) {
                // If its > 0x7F, its start of a new multi-byte sequence

                // bit 6 has to be non-zero
                if ((ch & 0x40) == 0) {
                    goto InvalidByteSequence;
                }

                // start a new long code
                if ((ch & 0x20) != 0) {
                    if ((ch & 0x10) != 0) {
                        // 4 byte encoding - supplimentary character (2 surrogates)

                        ch &= 0x0F;

                        // check that bit 4 is zero and the valid supplimentary character
                        // range 0x000000 - 0x10FFFF at the same time
                        if (ch > 0x04) {
                            // restore the original lead byte value before falling back
                            ch |= 0xf0;
                            goto InvalidByteSequence;
                        }

                        ch |= (FinalByte >> 3 * 6) | (1 << 30) | (3 << (30 - 2 * 6)) |
                            (SupplimentarySeq) | (SupplimentarySeq >> 6) |
                            (SupplimentarySeq >> 2 * 6) | (SupplimentarySeq >> 3 * 6);
                    }
                    else {
                        // 3 byte encoding
                        ch = (ch & 0x0F) | ((FinalByte >> 2 * 6) | (1 << 30) |
                            (ThreeByteSeq) | (ThreeByteSeq >> 6) | (ThreeByteSeq >> 2 * 6));
                    }
                }
                else {
                    // 2 byte encoding

                    ch &= 0x1F;

                    // check for non-shortest form
                    if (ch <= 1) {
                        // restore the original lead byte value before falling back
                        ch |= 0xc0;
                        goto InvalidByteSequence;
                    }

                    ch |= (FinalByte >> 6);
                }
                continue;
            }

        EncodeChar:
            // write the pending character
            if (pTarget >= pAllocatedBufferEnd)
            {
                // Fix chars so we make sure to throw if we didn't output anything.
                // The decoded value tells how many source bytes the character used,
                // so pSrc can be moved back to the start of its sequence.
                ch &= 0x1fffff;
                if (ch > 0x7f)
                {
                    if (ch > 0x7ff)
                    {
                        if (ch >= CharUnicodeInfo::LOW_SURROGATE_START &&
                            ch <= CharUnicodeInfo::LOW_SURROGATE_END)
                        {
                            pSrc--;     // It was 4 bytes
                            pTarget--;  // 1 was stored already, but we can't remember 1/2, so back up
                        }
                        else if (ch > 0xffff)
                        {
                            pSrc--;     // It was 4 bytes, nothing was stored
                        }
                        pSrc--;         // It was at least 3 bytes
                    }
                    pSrc--;             // It was at least 2 bytes
                }
                pSrc--;

                // Throw that we don't have enough room (pSrc could be < bytes if we had started to process
                // a 4 byte sequence already)
                Contract::Assert(pSrc >= bytes || pTarget == chars,
                    "[UTF8Encoding.GetChars]Expected pSrc to be within input buffer or throw due to no output]");
                ThrowCharsOverflow(pTarget == chars);

                // Don't store ch in decoder, we already backed up to its start
                ch = 0;

                // Didn't throw, just use this buffer size.
                break;
            }
            *pTarget = (WCHAR)ch;
            pTarget++;

#ifdef FASTLOOP
            int availableChars = PtrDiff(pAllocatedBufferEnd, pTarget);
            int availableBytes = PtrDiff(pEnd, pSrc);

            // don't fall into the fast decoding loop if we don't have enough bytes
            // Test for availableChars is done because pStop would be <= pTarget.
            if (availableBytes <= 13) {
                // we may need as many as 1 character per byte
                if (availableChars < availableBytes) {
                    // not enough output room.  no pending bits at this point
                    ch = 0;
                    continue;
                }

                // try to get over the remainder of the ascii characters fast though
                BYTE* pLocalEnd = pEnd; // hint to get pLocalEnd enregistered
                while (pSrc < pLocalEnd) {
                    ch = *pSrc;
                    pSrc++;

                    if (ch > 0x7F)
                        goto ProcessChar;

                    *pTarget = (WCHAR)ch;
                    pTarget++;
                }
                // we are done
                ch = 0;
                break;
            }

            // we may need as many as 1 character per byte, so reduce the byte count if necessary.
            // If availableChars is too small, pStop will be before pTarget and we won't do fast loop.
            if (availableChars < availableBytes) {
                availableBytes = availableChars;
            }

            // To compute the upper bound, assume that all characters are ASCII characters at this point,
            //  the boundary will be decreased for every non-ASCII character we encounter
            // Also, we need 7 chars reserve for the unrolled ansi decoding loop and for decoding of multibyte sequences
            WCHAR *pStop = pTarget + availableBytes - 7;

            while (pTarget < pStop) {
                ch = *pSrc;
                pSrc++;

                if (ch > 0x7F) {
                    goto LongCode;
                }
                *pTarget = (WCHAR)ch;
                pTarget++;

                // get pSrc to be 2-byte aligned
                if ((((size_t)pSrc) & 0x1) != 0) {
                    ch = *pSrc;
                    pSrc++;
                    if (ch > 0x7F) {
                        goto LongCode;
                    }
                    *pTarget = (WCHAR)ch;
                    pTarget++;
                }

                // get pSrc to be 4-byte aligned
                if ((((size_t)pSrc) & 0x2) != 0) {
                    // read two bytes at once; any high bit set means non-ASCII
                    ch = *(USHORT*)pSrc;
                    if ((ch & 0x8080) != 0) {
                        goto LongCodeWithMask16;
                    }

                    // Unfortunately, this is endianess sensitive
#if BIGENDIAN
                    *pTarget = (WCHAR)((ch >> 8) & 0x7F);
                    pSrc += 2;
                    *(pTarget + 1) = (WCHAR)(ch & 0x7F);
                    pTarget += 2;
#else // BIGENDIAN
                    *pTarget = (WCHAR)(ch & 0x7F);
                    pSrc += 2;
                    *(pTarget + 1) = (WCHAR)((ch >> 8) & 0x7F);
                    pTarget += 2;
#endif // BIGENDIAN
                }

                // Run 8 characters at a time!
                while (pTarget < pStop) {
                    // read eight bytes as two ints; any high bit set means non-ASCII
                    ch = *(int*)pSrc;
                    int chb = *(int*)(pSrc + 4);
                    if (((ch | chb) & (int)0x80808080) != 0) {
                        goto LongCodeWithMask32;
                    }

                    // Unfortunately, this is endianess sensitive
#if BIGENDIAN
                    *pTarget = (WCHAR)((ch >> 24) & 0x7F);
                    *(pTarget + 1) = (WCHAR)((ch >> 16) & 0x7F);
                    *(pTarget + 2) = (WCHAR)((ch >> 8) & 0x7F);
                    *(pTarget + 3) = (WCHAR)(ch & 0x7F);
                    pSrc += 8;
                    *(pTarget + 4) = (WCHAR)((chb >> 24) & 0x7F);
                    *(pTarget + 5) = (WCHAR)((chb >> 16) & 0x7F);
                    *(pTarget + 6) = (WCHAR)((chb >> 8) & 0x7F);
                    *(pTarget + 7) = (WCHAR)(chb & 0x7F);
                    pTarget += 8;
#else // BIGENDIAN
                    *pTarget = (WCHAR)(ch & 0x7F);
                    *(pTarget + 1) = (WCHAR)((ch >> 8) & 0x7F);
                    *(pTarget + 2) = (WCHAR)((ch >> 16) & 0x7F);
                    *(pTarget + 3) = (WCHAR)((ch >> 24) & 0x7F);
                    pSrc += 8;
                    *(pTarget + 4) = (WCHAR)(chb & 0x7F);
                    *(pTarget + 5) = (WCHAR)((chb >> 8) & 0x7F);
                    *(pTarget + 6) = (WCHAR)((chb >> 16) & 0x7F);
                    *(pTarget + 7) = (WCHAR)((chb >> 24) & 0x7F);
                    pTarget += 8;
#endif // BIGENDIAN
                }
                break;

                // The multi-byte reads above did not advance pSrc; reduce ch to the
                // first byte of the group and continue one byte at a time.
#if BIGENDIAN
                LongCodeWithMask32 :
                    // be careful about the sign extension
                    ch = (int)(((uint)ch) >> 16);
                LongCodeWithMask16:
                    ch = (int)(((uint)ch) >> 8);
#else // BIGENDIAN
            LongCodeWithMask32:
            LongCodeWithMask16:
                ch &= 0xFF;
#endif // BIGENDIAN
                pSrc++;
                if (ch <= 0x7F) {
                    *pTarget = (WCHAR)ch;
                    pTarget++;
                    continue;
                }

            LongCode:
                // ch holds the lead byte; chc receives the first trail byte
                int chc = *pSrc;
                pSrc++;

                if (
                    // bit 6 has to be non-zero
                    (ch & 0x40) == 0 ||
                    // we are expecting to see trailing bytes like 10vvvvvv
                    (chc & 0xC0) != 0x80)
                {
                    goto BadLongCode;
                }

                chc &= 0x3F;

                // start a new long code
                if ((ch & 0x20) != 0) {

                    // fold the first two bytes together
                    chc |= (ch & 0x0F) << 6;

                    if ((ch & 0x10) != 0) {
                        // 4 byte encoding - surrogate
                        ch = *pSrc;
                        if (
                            // check that bit 4 is zero, the non-shortest form of surrogate
                            // and the valid surrogate range 0x000000 - 0x10FFFF at the same time
                            !InRange(chc >> 4, 0x01, 0x10) ||
                            // we are expecting to see trailing bytes like 10vvvvvv
                            (ch & 0xC0) != 0x80)
                        {
                            goto BadLongCode;
                        }

                        chc = (chc << 6) | (ch & 0x3F);

                        ch = *(pSrc + 1);
                        // we are expecting to see trailing bytes like 10vvvvvv
                        if ((ch & 0xC0) != 0x80) {
                            goto BadLongCode;
                        }
                        pSrc += 2;

                        ch = (chc << 6) | (ch & 0x3F);

                        // write the high surrogate; the low surrogate is written below
                        *pTarget = (WCHAR)(((ch >> 10) & 0x7FF) +
                            (SHORT)(CharUnicodeInfo::HIGH_SURROGATE_START - (0x10000 >> 10)));
                        pTarget++;

                        ch = (ch & 0x3FF) +
                            (SHORT)(CharUnicodeInfo::LOW_SURROGATE_START);

                        // extra byte, we're already planning 2 chars for 2 of these bytes,
                        // but the big loop is testing the target against pStop, so we need
                        // to subtract 2 more or we risk overrunning the input.  Subtract
                        // one here and one below.
                        pStop--;
                    }
                    else {
                        // 3 byte encoding
                        ch = *pSrc;
                        if (
                            // check for non-shortest form of 3 byte seq
                            (chc & (0x1F << 5)) == 0 ||
                            // Can't have surrogates here.
                            (chc & (0xF800 >> 6)) == (0xD800 >> 6) ||
                            // we are expecting to see trailing bytes like 10vvvvvv
                            (ch & 0xC0) != 0x80)
                        {
                            goto BadLongCode;
                        }
                        pSrc++;

                        ch = (chc << 6) | (ch & 0x3F);

                        // extra byte, we're only expecting 1 char for each of these 3 bytes,
                        // but the loop is testing the target (not source) against pStop, so
                        // we need to subtract 2 more or we risk overrunning the input.
                        // Subtract 1 here and one more below
                        pStop--;
                    }
                }
                else {
                    // 2 byte encoding

                    ch &= 0x1F;

                    // check for non-shortest form
                    if (ch <= 1) {
                        goto BadLongCode;
                    }
                    ch = (ch << 6) | chc;
                }

                *pTarget = (WCHAR)ch;
                pTarget++;

                // extra byte, we're only expecting 1 char for each of these 2 bytes,
                // but the loop is testing the target (not source) against pStop.
                // subtract an extra count from pStop so that we don't overrun the input.
                pStop--;
            }
#endif // FASTLOOP

            Contract::Assert(pTarget <= pAllocatedBufferEnd, "[UTF8Encoding.GetChars]pTarget <= pAllocatedBufferEnd");

            // no pending bits at this point
            ch = 0;
            continue;

        BadLongCode:
            // The fast loop consumed the lead byte and one more byte before rejecting
            // the sequence; rewind both and let the slow loop re-decode it (and run
            // the fallback) from the lead byte.
            pSrc -= 2;
            ch = 0;
            continue;
        }

        // Input ended while a multi-byte sequence was still pending
        if (ch != 0)
        {
            // Have to do fallback for invalid bytes
            if (fallback == nullptr)
            {
                fallback = decoderFallback->CreateFallbackBuffer();
                fallback->InternalInitialize(bytes, pAllocatedBufferEnd);
            }

            // This'll back us up the appropriate # of bytes if we didn't get anywhere.
            // Both cursors are passed by address, exactly as in the InvalidByteSequence
            // path inside the loop, so that the fallback output is actually written to
            // the target buffer and a full buffer is reported as a failure.
            if (!FallbackInvalidByteSequence(&pSrc, ch, fallback, &pTarget))
            {
                Contract::Assert(pSrc >= bytes || pTarget == chars,
                    "[UTF8Encoding.GetChars]Expected to throw or remain in byte buffer while flushing");

                // Ran out of buffer space
                // Need to throw an exception?
                fallback->InternalReset();
                ThrowCharsOverflow(pTarget == chars);
            }
            Contract::Assert(pSrc >= bytes,
                "[UTF8Encoding.GetChars]Expected flushing invalid byte sequence to have remained within the byte array");
            ch = 0;
        }

        // Shouldn't have anything in fallback buffer for GetChars
        // (don't have to check m_throwOnOverflow for chars)
        Contract::Assert(fallback == nullptr || fallback->GetRemaining() == 0,
            "[UTF8Encoding.GetChars]Expected empty fallback buffer at end");

        InternalDelete(fallback);

        return PtrDiff(pTarget, chars);
    }