void XMLReader::doInitDecode()

in src/xercesc/internal/XMLReader.cpp [1510:1871]


void XMLReader::doInitDecode()
{
    switch(fEncoding)
    {
        case XMLRecognizer::UCS_4B :
        case XMLRecognizer::UCS_4L :
        {
            // Remove bom if any
            if (((fRawByteBuf[0] == 0x00) && (fRawByteBuf[1] == 0x00) && (fRawByteBuf[2] == 0xFE) && (fRawByteBuf[3] == 0xFF)) ||
                ((fRawByteBuf[0] == 0xFF) && (fRawByteBuf[1] == 0xFE) && (fRawByteBuf[2] == 0x00) && (fRawByteBuf[3] == 0x00))  )
            {
                for (XMLSize_t i = 0; i < fRawBytesAvail; i++)
                    fRawByteBuf[i] = fRawByteBuf[i+4];

                fRawBytesAvail -=4;
            }

            // Look at the raw buffer as UCS4 chars
            const UCS4Ch* asUCS = reinterpret_cast<const UCS4Ch*>(fRawByteBuf);

            while (fRawBufIndex < fRawBytesAvail)
            {
                // Make sure there are at least sizeof(UCS4Ch) bytes to consume.
                if (fRawBufIndex + sizeof(UCS4Ch) > fRawBytesAvail) {
                    fCharsAvail = 0;
                    fRawBufIndex = 0;
                    fMemoryManager->deallocate(fPublicId);
                    fMemoryManager->deallocate(fEncodingStr);
                    ArrayJanitor<XMLCh> janValue(fSystemId, fMemoryManager);
                    ThrowXMLwithMemMgr1
                    (
                        TranscodingException
                        , XMLExcepts::Reader_CouldNotDecodeFirstLine
                        , fSystemId
                        , fMemoryManager
                    );
                }

                // Make sure we don't exhaust the limited prolog buffer size.
                // Leave room for a space added at the end of this function.
                if (fCharsAvail == kCharBufSize - 1) {
                    fCharsAvail = 0;
                    fRawBufIndex = 0;
                    fMemoryManager->deallocate(fPublicId);
                    fMemoryManager->deallocate(fEncodingStr);
                    ArrayJanitor<XMLCh> janValue(fSystemId, fMemoryManager);
                    ThrowXMLwithMemMgr1
                    (
                        TranscodingException
                        , XMLExcepts::Reader_CouldNotDecodeFirstLine
                        , fSystemId
                        , fMemoryManager
                    );
                }

                // Get out the current 4 byte value and inc our raw buf index
                UCS4Ch curVal = *asUCS++;
                fRawBufIndex += sizeof(UCS4Ch);

                // Swap if that is required for this machine
                if (fSwapped)
                    curVal = BitOps::swapBytes(curVal);

                // Make sure its at least semi legal. If not, undo and throw
                if (curVal > 0xFFFF)
                {
                    fCharsAvail = 0;
                    fRawBufIndex = 0;
                    fMemoryManager->deallocate(fPublicId);
                    fMemoryManager->deallocate(fEncodingStr);
                    ArrayJanitor<XMLCh> janValue(fSystemId, fMemoryManager);
                    ThrowXMLwithMemMgr1
                    (
                        TranscodingException
                        , XMLExcepts::Reader_CouldNotDecodeFirstLine
                        , fSystemId
                        , fMemoryManager
                    );
                }

                // Convert the value to an XML char and store it
                fCharSizeBuf[fCharsAvail] = 4;
                fCharBuf[fCharsAvail++] = XMLCh(curVal);

                // Break out on the > character
                if (curVal == chCloseAngle)
                    break;
            }
            break;
        }

        case XMLRecognizer::UTF_8 :
        {
            // If there's a utf-8 BOM  (0xEF 0xBB 0xBF), skip past it.
            //   Don't move to char buf - no one wants to see it.
            //   Note: this causes any encoding= declaration to override
            //         the BOM's attempt to say that the encoding is utf-8.

            // Look at the raw buffer as short chars
            const char* asChars = (const char*)fRawByteBuf;

            if (fRawBytesAvail > XMLRecognizer::fgUTF8BOMLen &&
                XMLString::compareNString(  asChars
                                            , XMLRecognizer::fgUTF8BOM
                                            , XMLRecognizer::fgUTF8BOMLen) == 0)
            {
                fRawBufIndex += XMLRecognizer::fgUTF8BOMLen;
                asChars      += XMLRecognizer::fgUTF8BOMLen;
            }

            //
            //  First check that there are enough bytes to even see the
            //  decl indentifier. If not, get out now with no action since
            //  there is no decl.
            //
            if (fRawBytesAvail < XMLRecognizer::fgASCIIPreLen)
                break;

            // Check for the opening sequence. If not, then no decl
            if (XMLString::compareNString(  asChars
                                            , XMLRecognizer::fgASCIIPre
                                            , XMLRecognizer::fgASCIIPreLen))
            {
                break;
            }

            while (fRawBufIndex < fRawBytesAvail)
            {
                const char curCh = *asChars++;
                fRawBufIndex++;

                // Make sure we don't exhaust the limited prolog buffer size.
                // Leave room for a space added at the end of this function.
                if (fCharsAvail == kCharBufSize - 1) {
                    fCharsAvail = 0;
                    fRawBufIndex = 0;
                    fMemoryManager->deallocate(fPublicId);
                    fMemoryManager->deallocate(fEncodingStr);
                    ArrayJanitor<XMLCh> janValue(fSystemId, fMemoryManager);
                    ThrowXMLwithMemMgr1
                    (
                        TranscodingException
                        , XMLExcepts::Reader_CouldNotDecodeFirstLine
                        , fSystemId
                        , fMemoryManager
                    );
                }

                // Looks ok, so store it
                fCharSizeBuf[fCharsAvail] = 1;
                fCharBuf[fCharsAvail++] = XMLCh(curCh);

                // Break out on a > character
                if (curCh == chCloseAngle)
                    break;

                //
                //  A char greater than 0x7F is not allowed in this case. If
                //  so, undo and throw.
                //
                if (curCh & 0x80)
                {
                    fCharsAvail = 0;
                    fRawBufIndex = 0;
                    fMemoryManager->deallocate(fPublicId);
                    fMemoryManager->deallocate(fEncodingStr);
                    ArrayJanitor<XMLCh> janValue(fSystemId, fMemoryManager);
                    ThrowXMLwithMemMgr1
                    (
                        TranscodingException
                        , XMLExcepts::Reader_CouldNotDecodeFirstLine
                        , fSystemId
                        , fMemoryManager
                    );
                }
            }
            break;
        }

        case XMLRecognizer::UTF_16B :
        case XMLRecognizer::UTF_16L :
        {
            //
            //  If there is a decl here, we just truncate back the characters
            //  as we go. No surrogate creation would be allowed here in legal
            //  XML, so we consider it a transoding error if we find one.
            //
            if (fRawBytesAvail < 2)
                break;

            XMLSize_t postBOMIndex = 0;
            const UTF16Ch* asUTF16 = reinterpret_cast<const UTF16Ch*>(&fRawByteBuf[fRawBufIndex]);
            if ((*asUTF16 == chUnicodeMarker) || (*asUTF16 == chSwappedUnicodeMarker))
            {
                fRawBufIndex += sizeof(UTF16Ch);
                asUTF16++;
                postBOMIndex = fRawBufIndex;
            }

            //  First check that there are enough raw bytes for there to even
            //  be a decl indentifier. If not, then nothing to do.
            //
            if (fRawBytesAvail - fRawBufIndex < XMLRecognizer::fgUTF16PreLen)
            {
                fRawBufIndex = postBOMIndex;
                break;
            }

            //
            //  See we get a match on the prefix. If not, then reset and
            //  break out.
            //
            if (fEncoding == XMLRecognizer::UTF_16B)
            {
                if (memcmp(asUTF16, XMLRecognizer::fgUTF16BPre, XMLRecognizer::fgUTF16PreLen))
                {
                    fRawBufIndex = postBOMIndex;
                    break;
                }
            }
             else
            {
                if (memcmp(asUTF16, XMLRecognizer::fgUTF16LPre, XMLRecognizer::fgUTF16PreLen))
                {
                    fRawBufIndex = postBOMIndex;
                    break;
                }
            }

            while (fRawBufIndex < fRawBytesAvail)
            {
                // Make sure there are at least sizeof(UTF16Ch) bytes to consume.
                if (fRawBufIndex + sizeof(UTF16Ch) > fRawBytesAvail) {
                    fCharsAvail = 0;
                    fRawBufIndex = 0;
                    fMemoryManager->deallocate(fPublicId);
                    fMemoryManager->deallocate(fEncodingStr);
                    ArrayJanitor<XMLCh> janValue(fSystemId, fMemoryManager);
                    ThrowXMLwithMemMgr1
                    (
                        TranscodingException
                        , XMLExcepts::Reader_CouldNotDecodeFirstLine
                        , fSystemId
                        , fMemoryManager
                    );
                }

                // Make sure we don't exhaust the limited prolog buffer size.
                // Leave room for a space added at the end of this function.
                if (fCharsAvail == kCharBufSize - 1) {
                    fCharsAvail = 0;
                    fRawBufIndex = 0;
                    fMemoryManager->deallocate(fPublicId);
                    fMemoryManager->deallocate(fEncodingStr);
                    ArrayJanitor<XMLCh> janValue(fSystemId, fMemoryManager);
                    ThrowXMLwithMemMgr1
                    (
                        TranscodingException
                        , XMLExcepts::Reader_CouldNotDecodeFirstLine
                        , fSystemId
                        , fMemoryManager
                    );
                }

                // Get out the current 2 byte value
                UTF16Ch curVal = *asUTF16++;
                fRawBufIndex += sizeof(UTF16Ch);

                // Swap if that is required for this machine
                if (fSwapped)
                    curVal = BitOps::swapBytes(curVal);

                //
                //  Store it and bump the target index, implicitly converting
                //  if UTF16Ch and XMLCh are not the same size.
                //
                fCharSizeBuf[fCharsAvail] = 2;
                fCharBuf[fCharsAvail++] = curVal;

                // Break out on a > char
                if (curVal == chCloseAngle)
                    break;
            }
            break;
        }

        case XMLRecognizer::EBCDIC :
        {
            //
            //  We use special support in the intrinsic EBCDIC-US transcoder
            //  to go through one char at a time.
            //
            const XMLByte* srcPtr = fRawByteBuf;
            while (1)
            {
                // Transcode one char from the source
                const XMLCh chCur = XMLEBCDICTranscoder::xlatThisOne(*srcPtr++);
                fRawBufIndex++;

                // Make sure we don't exhaust the limited prolog buffer size.
                // Leave room for a space added at the end of this function.
                if (fCharsAvail == kCharBufSize - 1) {
                    fCharsAvail = 0;
                    fRawBufIndex = 0;
                    fMemoryManager->deallocate(fPublicId);
                    fMemoryManager->deallocate(fEncodingStr);
                    ArrayJanitor<XMLCh> janValue(fSystemId, fMemoryManager);
                    ThrowXMLwithMemMgr1
                    (
                        TranscodingException
                        , XMLExcepts::Reader_CouldNotDecodeFirstLine
                        , fSystemId
                        , fMemoryManager
                    );
                }


                //
                //  And put it into the character buffer. This stuff has to
                //  look like it was normally transcoded.
                //
                fCharSizeBuf[fCharsAvail] = 1;
                fCharBuf[fCharsAvail++] = chCur;

                // If its a > char, then break out
                if (chCur == chCloseAngle)
                    break;

                // Watch for using up all input and get out
                if (fRawBufIndex == fRawBytesAvail)
                    break;
            }
            break;
        }

        default :
            // It should never be anything else here
            fMemoryManager->deallocate(fPublicId);
            fMemoryManager->deallocate(fEncodingStr);
            fMemoryManager->deallocate(fSystemId);
            ThrowXMLwithMemMgr(TranscodingException, XMLExcepts::Reader_BadAutoEncoding, fMemoryManager);
            break;
    }

    //
    //  Ok, by the time we get here, if its a legal XML file we have eaten
    //  the XML/TextDecl. So, if we are a PE and are being referenced from
    //  outside a literal, then we need to throw in an arbitrary space that
    //  is required by XML.
    //
    if ((fType == Type_PE) && (fRefFrom == RefFrom_NonLiteral))
        fCharBuf[fCharsAvail++] = chSpace;

    //  Calculate fCharOfsBuf buffer using the elements from fCharBufSize
    if (fCalculateSrcOfs)
    {
        fCharOfsBuf[0] = 0;
        for (XMLSize_t index = 1; index < fCharsAvail; ++index) {
            fCharOfsBuf[index] = fCharOfsBuf[index-1]+fCharSizeBuf[index-1];
        }
    }
}