in src/xercesc/internal/XMLReader.cpp [1510:1871]
void XMLReader::doInitDecode()
{
switch(fEncoding)
{
case XMLRecognizer::UCS_4B :
case XMLRecognizer::UCS_4L :
{
// Remove bom if any
if (((fRawByteBuf[0] == 0x00) && (fRawByteBuf[1] == 0x00) && (fRawByteBuf[2] == 0xFE) && (fRawByteBuf[3] == 0xFF)) ||
((fRawByteBuf[0] == 0xFF) && (fRawByteBuf[1] == 0xFE) && (fRawByteBuf[2] == 0x00) && (fRawByteBuf[3] == 0x00)) )
{
for (XMLSize_t i = 0; i < fRawBytesAvail; i++)
fRawByteBuf[i] = fRawByteBuf[i+4];
fRawBytesAvail -=4;
}
// Look at the raw buffer as UCS4 chars
const UCS4Ch* asUCS = reinterpret_cast<const UCS4Ch*>(fRawByteBuf);
while (fRawBufIndex < fRawBytesAvail)
{
// Make sure there are at least sizeof(UCS4Ch) bytes to consume.
if (fRawBufIndex + sizeof(UCS4Ch) > fRawBytesAvail) {
fCharsAvail = 0;
fRawBufIndex = 0;
fMemoryManager->deallocate(fPublicId);
fMemoryManager->deallocate(fEncodingStr);
ArrayJanitor<XMLCh> janValue(fSystemId, fMemoryManager);
ThrowXMLwithMemMgr1
(
TranscodingException
, XMLExcepts::Reader_CouldNotDecodeFirstLine
, fSystemId
, fMemoryManager
);
}
// Make sure we don't exhaust the limited prolog buffer size.
// Leave room for a space added at the end of this function.
if (fCharsAvail == kCharBufSize - 1) {
fCharsAvail = 0;
fRawBufIndex = 0;
fMemoryManager->deallocate(fPublicId);
fMemoryManager->deallocate(fEncodingStr);
ArrayJanitor<XMLCh> janValue(fSystemId, fMemoryManager);
ThrowXMLwithMemMgr1
(
TranscodingException
, XMLExcepts::Reader_CouldNotDecodeFirstLine
, fSystemId
, fMemoryManager
);
}
// Get out the current 4 byte value and inc our raw buf index
UCS4Ch curVal = *asUCS++;
fRawBufIndex += sizeof(UCS4Ch);
// Swap if that is required for this machine
if (fSwapped)
curVal = BitOps::swapBytes(curVal);
// Make sure its at least semi legal. If not, undo and throw
if (curVal > 0xFFFF)
{
fCharsAvail = 0;
fRawBufIndex = 0;
fMemoryManager->deallocate(fPublicId);
fMemoryManager->deallocate(fEncodingStr);
ArrayJanitor<XMLCh> janValue(fSystemId, fMemoryManager);
ThrowXMLwithMemMgr1
(
TranscodingException
, XMLExcepts::Reader_CouldNotDecodeFirstLine
, fSystemId
, fMemoryManager
);
}
// Convert the value to an XML char and store it
fCharSizeBuf[fCharsAvail] = 4;
fCharBuf[fCharsAvail++] = XMLCh(curVal);
// Break out on the > character
if (curVal == chCloseAngle)
break;
}
break;
}
case XMLRecognizer::UTF_8 :
{
// If there's a utf-8 BOM (0xEF 0xBB 0xBF), skip past it.
// Don't move to char buf - no one wants to see it.
// Note: this causes any encoding= declaration to override
// the BOM's attempt to say that the encoding is utf-8.
// Look at the raw buffer as short chars
const char* asChars = (const char*)fRawByteBuf;
if (fRawBytesAvail > XMLRecognizer::fgUTF8BOMLen &&
XMLString::compareNString( asChars
, XMLRecognizer::fgUTF8BOM
, XMLRecognizer::fgUTF8BOMLen) == 0)
{
fRawBufIndex += XMLRecognizer::fgUTF8BOMLen;
asChars += XMLRecognizer::fgUTF8BOMLen;
}
//
// First check that there are enough bytes to even see the
// decl indentifier. If not, get out now with no action since
// there is no decl.
//
if (fRawBytesAvail < XMLRecognizer::fgASCIIPreLen)
break;
// Check for the opening sequence. If not, then no decl
if (XMLString::compareNString( asChars
, XMLRecognizer::fgASCIIPre
, XMLRecognizer::fgASCIIPreLen))
{
break;
}
while (fRawBufIndex < fRawBytesAvail)
{
const char curCh = *asChars++;
fRawBufIndex++;
// Make sure we don't exhaust the limited prolog buffer size.
// Leave room for a space added at the end of this function.
if (fCharsAvail == kCharBufSize - 1) {
fCharsAvail = 0;
fRawBufIndex = 0;
fMemoryManager->deallocate(fPublicId);
fMemoryManager->deallocate(fEncodingStr);
ArrayJanitor<XMLCh> janValue(fSystemId, fMemoryManager);
ThrowXMLwithMemMgr1
(
TranscodingException
, XMLExcepts::Reader_CouldNotDecodeFirstLine
, fSystemId
, fMemoryManager
);
}
// Looks ok, so store it
fCharSizeBuf[fCharsAvail] = 1;
fCharBuf[fCharsAvail++] = XMLCh(curCh);
// Break out on a > character
if (curCh == chCloseAngle)
break;
//
// A char greater than 0x7F is not allowed in this case. If
// so, undo and throw.
//
if (curCh & 0x80)
{
fCharsAvail = 0;
fRawBufIndex = 0;
fMemoryManager->deallocate(fPublicId);
fMemoryManager->deallocate(fEncodingStr);
ArrayJanitor<XMLCh> janValue(fSystemId, fMemoryManager);
ThrowXMLwithMemMgr1
(
TranscodingException
, XMLExcepts::Reader_CouldNotDecodeFirstLine
, fSystemId
, fMemoryManager
);
}
}
break;
}
case XMLRecognizer::UTF_16B :
case XMLRecognizer::UTF_16L :
{
//
// If there is a decl here, we just truncate back the characters
// as we go. No surrogate creation would be allowed here in legal
// XML, so we consider it a transoding error if we find one.
//
if (fRawBytesAvail < 2)
break;
XMLSize_t postBOMIndex = 0;
const UTF16Ch* asUTF16 = reinterpret_cast<const UTF16Ch*>(&fRawByteBuf[fRawBufIndex]);
if ((*asUTF16 == chUnicodeMarker) || (*asUTF16 == chSwappedUnicodeMarker))
{
fRawBufIndex += sizeof(UTF16Ch);
asUTF16++;
postBOMIndex = fRawBufIndex;
}
// First check that there are enough raw bytes for there to even
// be a decl indentifier. If not, then nothing to do.
//
if (fRawBytesAvail - fRawBufIndex < XMLRecognizer::fgUTF16PreLen)
{
fRawBufIndex = postBOMIndex;
break;
}
//
// See we get a match on the prefix. If not, then reset and
// break out.
//
if (fEncoding == XMLRecognizer::UTF_16B)
{
if (memcmp(asUTF16, XMLRecognizer::fgUTF16BPre, XMLRecognizer::fgUTF16PreLen))
{
fRawBufIndex = postBOMIndex;
break;
}
}
else
{
if (memcmp(asUTF16, XMLRecognizer::fgUTF16LPre, XMLRecognizer::fgUTF16PreLen))
{
fRawBufIndex = postBOMIndex;
break;
}
}
while (fRawBufIndex < fRawBytesAvail)
{
// Make sure there are at least sizeof(UTF16Ch) bytes to consume.
if (fRawBufIndex + sizeof(UTF16Ch) > fRawBytesAvail) {
fCharsAvail = 0;
fRawBufIndex = 0;
fMemoryManager->deallocate(fPublicId);
fMemoryManager->deallocate(fEncodingStr);
ArrayJanitor<XMLCh> janValue(fSystemId, fMemoryManager);
ThrowXMLwithMemMgr1
(
TranscodingException
, XMLExcepts::Reader_CouldNotDecodeFirstLine
, fSystemId
, fMemoryManager
);
}
// Make sure we don't exhaust the limited prolog buffer size.
// Leave room for a space added at the end of this function.
if (fCharsAvail == kCharBufSize - 1) {
fCharsAvail = 0;
fRawBufIndex = 0;
fMemoryManager->deallocate(fPublicId);
fMemoryManager->deallocate(fEncodingStr);
ArrayJanitor<XMLCh> janValue(fSystemId, fMemoryManager);
ThrowXMLwithMemMgr1
(
TranscodingException
, XMLExcepts::Reader_CouldNotDecodeFirstLine
, fSystemId
, fMemoryManager
);
}
// Get out the current 2 byte value
UTF16Ch curVal = *asUTF16++;
fRawBufIndex += sizeof(UTF16Ch);
// Swap if that is required for this machine
if (fSwapped)
curVal = BitOps::swapBytes(curVal);
//
// Store it and bump the target index, implicitly converting
// if UTF16Ch and XMLCh are not the same size.
//
fCharSizeBuf[fCharsAvail] = 2;
fCharBuf[fCharsAvail++] = curVal;
// Break out on a > char
if (curVal == chCloseAngle)
break;
}
break;
}
case XMLRecognizer::EBCDIC :
{
//
// We use special support in the intrinsic EBCDIC-US transcoder
// to go through one char at a time.
//
const XMLByte* srcPtr = fRawByteBuf;
while (1)
{
// Transcode one char from the source
const XMLCh chCur = XMLEBCDICTranscoder::xlatThisOne(*srcPtr++);
fRawBufIndex++;
// Make sure we don't exhaust the limited prolog buffer size.
// Leave room for a space added at the end of this function.
if (fCharsAvail == kCharBufSize - 1) {
fCharsAvail = 0;
fRawBufIndex = 0;
fMemoryManager->deallocate(fPublicId);
fMemoryManager->deallocate(fEncodingStr);
ArrayJanitor<XMLCh> janValue(fSystemId, fMemoryManager);
ThrowXMLwithMemMgr1
(
TranscodingException
, XMLExcepts::Reader_CouldNotDecodeFirstLine
, fSystemId
, fMemoryManager
);
}
//
// And put it into the character buffer. This stuff has to
// look like it was normally transcoded.
//
fCharSizeBuf[fCharsAvail] = 1;
fCharBuf[fCharsAvail++] = chCur;
// If its a > char, then break out
if (chCur == chCloseAngle)
break;
// Watch for using up all input and get out
if (fRawBufIndex == fRawBytesAvail)
break;
}
break;
}
default :
// It should never be anything else here
fMemoryManager->deallocate(fPublicId);
fMemoryManager->deallocate(fEncodingStr);
fMemoryManager->deallocate(fSystemId);
ThrowXMLwithMemMgr(TranscodingException, XMLExcepts::Reader_BadAutoEncoding, fMemoryManager);
break;
}
//
// Ok, by the time we get here, if its a legal XML file we have eaten
// the XML/TextDecl. So, if we are a PE and are being referenced from
// outside a literal, then we need to throw in an arbitrary space that
// is required by XML.
//
if ((fType == Type_PE) && (fRefFrom == RefFrom_NonLiteral))
fCharBuf[fCharsAvail++] = chSpace;
// Calculate fCharOfsBuf buffer using the elements from fCharBufSize
if (fCalculateSrcOfs)
{
fCharOfsBuf[0] = 0;
for (XMLSize_t index = 1; index < fCharsAvail; ++index) {
fCharOfsBuf[index] = fCharOfsBuf[index-1]+fCharSizeBuf[index-1];
}
}
}