in src/xercesc/validators/common/DFAContentModel.cpp [637:1338]
//
// Builds the DFA for this content model from the given content spec tree.
//
// @param curNode  Root of the content spec tree. It is handed to
//                 countLeafNodes() and to buildSyntaxTree().
//
// @throws OutOfMemoryException if the leaf count is so large that adding
//         the EOC leaf, or sizing the leaf pointer array, would overflow.
//
// Members written by this function and left populated on return:
// fLeafCount, fEOCPos, fLeafListType, fEmptyOk, fElemMap, fElemMapType,
// fElemMapSize, fFinalStateFlags, fTransTable, fTransTableSize, plus
// fLeafNameTypeVector (created only if a non-Leaf typed leaf is seen) and
// fCountingStates (created only if at least one element map entry came
// from a repeatable leaf).
//
// fLeafList and fFollowList are working storage: they are allocated here,
// released before returning and reset to NULL. fHeadNode is created and
// deleted here.
//
// NOTE(review): fHeadNode is not reset after being deleted in this
// function, so it keeps pointing at the freed node on return.
//
void DFAContentModel::buildDFA(ContentSpecNode* const curNode)
{
// Index reused by several top-level loops below. Two inner scopes declare
// their own 'index', which shadows this one inside those scopes.
unsigned int index;
//
// The first step we need to take is to rewrite the content model using
// our CMNode objects, and in the process get rid of any repetition short
// cuts, converting them into '*' style repetitions or getting rid of
// repetitions altogether.
//
// The conversions done are:
//
// x+ -> (x|x*)
// x? -> (x|epsilon)
//
// This is a relatively complex scenario. What is happening is that we
// create a top level binary node of which the special EOC value is set
// as the right side node. The left side is set to the rewritten
// syntax tree. The source is the original content model info from the
// decl pool. The rewrite is done by buildSyntaxTree() which recurses the
// decl pool's content of the element and builds a new tree in the
// process.
//
// Note that, during this operation, we set each non-epsilon leaf node's
// DFA state position and count the number of such leafs, which is left
// in the fLeafCount member.
//
fLeafCount=countLeafNodes(curNode);
// Avoid integer overflow in below fLeafCount++ increment
if (fLeafCount > (std::numeric_limits<unsigned int>::max() - 1))
throw OutOfMemoryException();
// The EOC leaf takes the position right after the counted leaves; after
// the post-increment fLeafCount includes the EOC leaf as well.
fEOCPos = fLeafCount++;
// Avoid integer overflow in below memory allocation
if (fLeafCount > (std::numeric_limits<size_t>::max() / sizeof(CMLeaf*)))
throw OutOfMemoryException();
// We need to build an array of references to the non-epsilon
// leaf nodes. We will put them in the array according to their position values
//
fLeafList = (CMLeaf**) fMemoryManager->allocate(fLeafCount*sizeof(CMLeaf*)); //new CMLeaf*[fLeafCount];
// Start with every slot null
memset(fLeafList, 0, fLeafCount*sizeof(CMLeaf*));
// Array of node types, indexed like fLeafList. Unlike fLeafList it is not
// zero-initialized here, and it is not released by this function.
fLeafListType = (ContentSpecNode::NodeTypes*) fMemoryManager->allocate
(
fLeafCount * sizeof(ContentSpecNode::NodeTypes)
); //new ContentSpecNode::NodeTypes[fLeafCount];
//
// And, moving onward... We now need to build the follow position sets
// for all the nodes. So we allocate an array of pointers to state sets,
// one for each leaf node (i.e. each significant DFA position.)
//
fFollowList = (CMStateSet**) fMemoryManager->allocate
(
fLeafCount * sizeof(CMStateSet*)
); //new CMStateSet*[fLeafCount];
memset(fFollowList, 0, fLeafCount*sizeof(CMStateSet*));
// One state set per position, each constructed with fLeafCount as its size
for (index = 0; index < fLeafCount; index++)
fFollowList[index] = new (fMemoryManager) CMStateSet(fLeafCount, fMemoryManager);
// The buildSyntaxTree function will recursively iterate over the ContentSpecNode
// and build the CMNode hierarchy; it will also put every leaf node in the fLeafList
// array, then calculate the first and last position sets of each node. This is
// cached away in each of the nodes.
//
// Along the way we also set the leaf count in each node as the maximum
// state count. They must know this in order to create their first/last
// position sets.
//
// NOTE(review): 'counter' is presumably advanced by buildSyntaxTree() as it
// stores leaves, since it is used further down as the fLeafList slot for
// the EOC leaf - confirm against buildSyntaxTree().
unsigned int counter=0;
CMNode* nodeOrgContent = buildSyntaxTree(curNode, counter);
//
// Check to see whether this content model can handle an empty content,
// which is something we need to optimize by looking now before we
// throw away the info that would tell us that.
//
// If the left node of the head (the top level of the original content)
// is nullable, then it's true.
//
fEmptyOk = nodeOrgContent->isNullable();
//
// And handle specially the EOC node, which also must be numbered and
// counted as a non-epsilon leaf node. It could not be handled in the
// above tree build because it was created before all that started. We
// save the EOC position since it's used during the DFA building loop.
//
// NOTE(review): the 'true' argument presumably makes the leaf adopt the
// QName created here - confirm against the CMLeaf constructor.
CMLeaf* nodeEOC = new (fMemoryManager) CMLeaf
(
new (fMemoryManager) QName
(
XMLUni::fgZeroLenString
, XMLUni::fgZeroLenString
, XMLContentModel::gEOCFakeId
, fMemoryManager
)
, fEOCPos
, true
, fLeafCount
, fMemoryManager
);
// Top level node: the sequence (original content, EOC)
fHeadNode = new (fMemoryManager) CMBinaryOp
(
ContentSpecNode::Sequence
, nodeOrgContent
, nodeEOC
, fLeafCount
, fMemoryManager
);
// Put also the final EOC node in the leaf array. This is a second CMLeaf
// object built from nodeEOC's element and position, not nodeEOC itself.
fLeafList[counter] = new (fMemoryManager) CMLeaf
(
nodeEOC->getElement()
, nodeEOC->getPosition()
, fLeafCount
, fMemoryManager
);
fLeafListType[counter] = ContentSpecNode::Leaf;
//
// Now handle our top level. We use our left child's last pos set and our
// right child's first pos set, so get them now for convenience.
//
const CMStateSet& last = nodeOrgContent->getLastPos();
const CMStateSet& first = nodeEOC->getFirstPos();
//
// Now, for every position which is in our left child's last set
// add all of the states in our right child's first set to the
// follow set for that position.
//
CMStateSetEnumerator enumLast(&last);
while(enumLast.hasMoreElements())
{
// This local 'index' shadows the function-level one
XMLSize_t index=enumLast.nextElement();
*fFollowList[index] |= first;
}
//
// And finally the big push... Now we build the DFA using all the states
// and the tree we've built up. First we set up the various data
// structures we are going to use while we do this.
//
// First of all we need an array of unique element ids in our content
// model. For each transition table entry, we need a set of contiguous
// indices to represent the transitions for a particular input element.
// So we need a zero based range of indexes that map to element types.
// This element map provides that mapping.
//
fElemMap = (QName**) fMemoryManager->allocate
(
fLeafCount * sizeof(QName*)
); //new QName*[fLeafCount];
fElemMapType = (ContentSpecNode::NodeTypes*) fMemoryManager->allocate
(
fLeafCount * sizeof(ContentSpecNode::NodeTypes)
); //new ContentSpecNode::NodeTypes[fLeafCount];
fElemMapSize = 0;
// Occurrence records indexed by element map index. Allocated lazily, the
// first time a repeatable leaf introduces a new element map entry.
Occurence** elemOccurenceMap=0;
for (unsigned int outIndex = 0; outIndex < fLeafCount; outIndex++)
{
// Every slot gets its own QName object, even though only the first
// fElemMapSize slots end up holding real values.
fElemMap[outIndex] = new (fMemoryManager) QName(fMemoryManager);
// If the low four bits of the leaf's type are not the plain Leaf type,
// make sure the name/type vector exists (it is filled in after the loop).
if ( (fLeafListType[outIndex] & 0x0f) != ContentSpecNode::Leaf )
if (!fLeafNameTypeVector)
fLeafNameTypeVector = new (fMemoryManager) ContentLeafNameTypeVector(fMemoryManager);
// Get the current leaf's element index
CMLeaf* leaf=fLeafList[outIndex];
const QName* element = leaf->getElement();
// The raw name is only fetched for DTD content models
// NOTE(review): 'element' is null-checked here but dereferenced without a
// check further down - confirm getElement() cannot return null.
const XMLCh* elementRawName = 0;
if (fDTD && element)
elementRawName = element->getRawName();
// See if the current leaf node's element index is in the list
unsigned int inIndex = 0;
for (; inIndex < fElemMapSize; inIndex++)
{
const QName* inElem = fElemMap[inIndex];
if (fDTD) {
// DTD: elements match on raw name alone
if (XMLString::equals(inElem->getRawName(), elementRawName)) {
break;
}
}
else {
// Otherwise: node type, URI and local part must all match
if ((fElemMapType[inIndex] == fLeafListType[outIndex]) &&
(inElem->getURI() == element->getURI()) &&
(XMLString::equals(inElem->getLocalPart(), element->getLocalPart()))) {
break;
}
}
}
// If it was not in the list, then add it and bump the map size
if (inIndex == fElemMapSize)
{
// Fill in the QName that was already allocated for this slot
fElemMap[fElemMapSize]->setValues(*element);
if(leaf->isRepeatableLeaf())
{
if (elemOccurenceMap == 0) {
elemOccurenceMap = (Occurence**)fMemoryManager->allocate(fLeafCount*sizeof(Occurence*));
memset(elemOccurenceMap, 0, fLeafCount*sizeof(Occurence*));
}
// The cast is guarded by the isRepeatableLeaf() test above
elemOccurenceMap[fElemMapSize] = new (fMemoryManager) Occurence(((CMRepeatingLeaf*)leaf)->getMinOccurs(), ((CMRepeatingLeaf*)leaf)->getMaxOccurs(), fElemMapSize);
}
fElemMapType[fElemMapSize] = fLeafListType[outIndex];
++fElemMapSize;
}
}
// set up the fLeafNameTypeVector object if there is one.
if (fLeafNameTypeVector) {
fLeafNameTypeVector->setValues(fElemMap, fElemMapType, fElemMapSize);
}
/***
* Optimization(Jan, 2001); We sort fLeafList according to
* elemIndex which is *uniquely* associated to each leaf.
* We are *assuming* that each element appears in at least one leaf.
**/
// don't forget to delete it
//
// NOTE(review): the legacy block below declares its own 'leafSorter' in the
// same scope as the unconditional declaration that follows it, so the two
// would clash if OPTIMIZED_BUT_STILL_LINEAR_SEARCH were ever defined.
#ifdef OPTIMIZED_BUT_STILL_LINEAR_SEARCH
int *leafSorter = (int*) fMemoryManager->allocate
(
(fLeafCount + fElemMapSize) * sizeof(int)
); //new int[fLeafCount + fElemMapSize];
unsigned int fSortCount = 0;
for (unsigned int elemIndex = 0; elemIndex < fElemMapSize; elemIndex++)
{
const QName* element = fElemMap[elemIndex];
const XMLCh* elementRawName = 0;
if (fDTD && element)
elementRawName = element->getRawName();
for (unsigned int leafIndex = 0; leafIndex < fLeafCount; leafIndex++)
{
const QName* leaf = fLeafList[leafIndex]->getElement();
if (fDTD) {
if (XMLString::equals(leaf->getRawName(), elementRawName)) {
leafSorter[fSortCount++] = leafIndex;
}
}
else {
if ((fElemMapType[elemIndex] == fLeafListType[leafIndex]) &&
(leaf->getURI() == element->getURI()) &&
(XMLString::equals(leaf->getLocalPart(), element->getLocalPart()))) {
leafSorter[fSortCount++] = leafIndex;
}
}
}
leafSorter[fSortCount++] = -1;
}
#endif
// instead of using a single array with -1 to separate elements, use a bidimensional map
//
// Layout: leafSorter[e][0] is the number of leaves matching element e, and
// leafSorter[e][1..count] are the matching leaf indexes in ascending order
// (they are collected by a single ascending scan of the leaf list).
unsigned int** leafSorter = (unsigned int**)fMemoryManager->allocate(fElemMapSize * sizeof(unsigned int*));
// Scratch buffer big enough for the worst case of every leaf matching
unsigned int* tmpSorter = (unsigned int*)fMemoryManager->allocate(fLeafCount * sizeof(unsigned int));
for (unsigned int elemIndex = 0; elemIndex < fElemMapSize; elemIndex++)
{
const QName* element = fElemMap[elemIndex];
const XMLCh* elementRawName = 0;
if (fDTD && element)
elementRawName = element->getRawName();
unsigned int fSortCount=0;
for (unsigned int leafIndex = 0; leafIndex < fLeafCount; leafIndex++)
{
const QName* leaf = fLeafList[leafIndex]->getElement();
if (fDTD) {
if (XMLString::equals(leaf->getRawName(), elementRawName)) {
tmpSorter[fSortCount++] = leafIndex;
}
}
else {
if ((fElemMapType[elemIndex] == fLeafListType[leafIndex]) &&
(leaf->getURI() == element->getURI()) &&
(XMLString::equals(leaf->getLocalPart(), element->getLocalPart()))) {
tmpSorter[fSortCount++] = leafIndex;
}
}
}
// Copy the matches into an exactly sized row, count first
leafSorter[elemIndex]=(unsigned int*)fMemoryManager->allocate((fSortCount+1) * sizeof(unsigned int));
leafSorter[elemIndex][0]=fSortCount;
// This local 'index' shadows the function-level one
for (unsigned int index=0;index<fSortCount;index++)
leafSorter[elemIndex][index+1]=tmpSorter[index];
}
fMemoryManager->deallocate(tmpSorter);
//
// Next lets create some arrays, some that hold transient info
// during the DFA build and some that are permanent. These are kind of
// sticky since we cannot know how big they will get, but we don't want
// to use any collection type classes because of performance.
//
// Basically they will probably be about fLeafCount*2 on average, but can
// be as large as 2^(fLeafCount*2), worst case. So we start with
// fLeafCount*4 as a middle ground. This will be very unlikely to ever
// have to expand though; if it does, the overhead will be somewhat ugly.
//
unsigned int curArraySize = fLeafCount * 4;
// statesToDo is transient (freed at the end); fFinalStateFlags and
// fTransTable are members and are kept.
CMStateSet** statesToDo = (CMStateSet**)
fMemoryManager->allocate
(
curArraySize * sizeof(CMStateSet*)
); //new const CMStateSet*[curArraySize];
fFinalStateFlags = (bool*) fMemoryManager->allocate
(
curArraySize * sizeof(bool)
); //new bool[curArraySize];
fTransTable = (unsigned int**) fMemoryManager->allocate
(
curArraySize * sizeof(unsigned int*)
); //new unsigned int*[curArraySize];
//
// Ok we start with the initial set as the first pos set of the head node
// (which is the seq node that holds the content model and the EOC node.)
//
// This is a copy, so it remains usable after fHeadNode is deleted below.
CMStateSet* setT = new (fMemoryManager) CMStateSet(fHeadNode->getFirstPos());
//
// Note on memory leak: Bugzilla#2707:
// ===================================
// The CMBinaryOp pointed to by fHeadNode is released here by deleting
// it directly.
//
// fLeafList[] maintains its **OWN** copy of CMLeaf to avoid double deletion
// of CMLeaf.
//
delete fHeadNode;
//
// Init our two state flags. Basically the unmarked state counter is
// always chasing the current state counter. When it catches up, that
// means we made a pass through that did not add any new states to the
// lists, at which time we are done. We could have used a expanding array
// of flags which we used to mark off states as we complete them, but
// this is easier though less readable maybe.
//
unsigned int unmarkedState = 0;
unsigned int curState = 0;
//
// Init the first transition table entry, and put the initial state
// into the states to do list, then bump the current state.
//
fTransTable[curState] = makeDefStateList();
statesToDo[curState] = setT;
curState++;
//
// the stateTable is an auxiliary means to fast
// identification of new state created (instead
// of sequential loop statesToDo to find out),
// while the role that statesToDo plays remain unchanged.
//
RefHashTableOf<XMLInteger, CMStateSetHasher> *stateTable =
new (fMemoryManager) RefHashTableOf<XMLInteger, CMStateSetHasher>
(
curArraySize
, true
, fMemoryManager
);
// NOTE(review): the initial state is not registered in stateTable (the put
// below is commented out), so a later set equal to the initial one would
// presumably not be found by the lookup and would get a new state index.
//stateTable->put((CMStateSet*)setT, new (fMemoryManager) XMLInteger(0));
//
// Ok, almost done with the algorithm from hell... We now enter the
// loop where we go until the states done counter catches up with
// the states to do counter.
//
// Scratch set reused across iterations; it is set back to 0 whenever it
// has been adopted as a new state.
CMStateSet* newSet = 0;
while (unmarkedState < curState)
{
//
// Get the next unmarked state out of the list of states to do.
// And get the associated transition table entry.
//
setT = statesToDo[unmarkedState];
// The row pointer stays usable if the arrays are expanded below: the
// expansion copies the row pointers and frees only the outer arrays.
unsigned int* transEntry = fTransTable[unmarkedState];
// Mark this one final if it contains the EOC state
fFinalStateFlags[unmarkedState] = setT->getBit(fEOCPos);
// Bump up the unmarked state count, marking this state done
unmarkedState++;
#ifdef OPTIMIZED_BUT_STILL_LINEAR_SEARCH
// Optimization(Jan, 2001)
unsigned int sorterIndex = 0;
// Optimization(Jan, 2001)
#endif
// Loop through each possible input symbol in the element map
for (unsigned int elemIndex = 0; elemIndex < fElemMapSize; elemIndex++)
{
//
// Build up a set of states which is the union of all of the
// follow sets of DFA positions that are in the current state. If
// we gave away the new set last time through then create a new
// one. Otherwise, zero out the existing one.
//
if (!newSet)
newSet = new (fMemoryManager) CMStateSet
(
fLeafCount
, fMemoryManager
);
else
newSet->zeroBits();
#ifdef OBSOLETED
// unoptimized code
for (unsigned int leafIndex = 0; leafIndex < fLeafCount; leafIndex++)
{
// If this leaf index (DFA position) is in the current set...
if (setT->getBit(leafIndex))
{
//
// If this leaf is the current input symbol, then we want
// to add its follow list to the set of states to transition
// to from the current state.
//
const QName* leaf = fLeafList[leafIndex]->getElement();
const QName* element = fElemMap[elemIndex];
if (fDTD) {
if (XMLString::equals(leaf->getRawName(), element->getRawName())) {
*newSet |= *fFollowList[leafIndex];
}
}
else {
if ((leaf->getURI() == element->getURI()) &&
(XMLString::equals(leaf->getLocalPart(), element->getLocalPart()))) {
*newSet |= *fFollowList[leafIndex];
}
}
}
} // for leafIndex
#endif
#ifdef OPTIMIZED_BUT_STILL_LINEAR_SEARCH
// Optimization(Jan, 2001)
int leafIndex = leafSorter[sorterIndex++];
while (leafIndex != -1)
{
// If this leaf index (DFA position) is in the current set...
if (setT->getBit(leafIndex))
{
//
// If this leaf is the current input symbol, then we
// want to add its follow list to the set of states to
// transition to from the current state.
//
*newSet |= *fFollowList[leafIndex];
}
leafIndex = leafSorter[sorterIndex++];
} // while (leafIndex != -1)
#endif
// Row for this element: entry 0 is the count, entries 1..count are the
// ascending leaf indexes where this element appears.
unsigned int* fLeafIndexes=leafSorter[elemIndex];
unsigned int fNumItems=fLeafIndexes[0];
if(fNumItems!=0)
{
// The algorithm requires finding the leaf that is present both in the bitfield of the current state, and in the
// list of places where the currently tested item can appear. When this occurs, the follow list of this parent item
// is added to the bitfield representing the next state.
// Both the bitfield and the list of places are sorted, so we can analyze them in two ways; either iterating over the
// parent items, testing the bitfield for the existence of the parent (N times a constant Tb), or by iterating over the
// bitfield (restricted to the range of the sorted list of places), using a binary search to locate the leaf in the
// sorted list of places (M times log(N) testing operations Ts)
// Assuming that the time to test a bit is roughly the same of the time needed to compute the average of two integers,
// plus a couple of comparisons and additions, we compare N against M*log(N) to decide which algorithm should be faster given
// the two sets
if(fNumItems <= setT->getBitCountInRange(fLeafIndexes[1], fLeafIndexes[fNumItems])*log((float)fNumItems))
{
// Strategy 1: walk the list of places and test each one in the bitfield
for(unsigned int i=1; i<=fNumItems; ++i)
if(setT->getBit(fLeafIndexes[i]))
{
//
// If this leaf is the current input symbol, then we
// want to add its follow list to the set of states to
// transition to from the current state.
//
*newSet |= *fFollowList[ fLeafIndexes[i] ];
}
}
else
{
// Strategy 2: walk the set bits and binary search each in the list of places
//
// Further optimization: given that the bitfield enumerator returns the numbers in order,
// every time we raise the lower marker we know it will be true also for the next bits, so
// the next binary search will not start from 1 but from this index
unsigned int lowIndex = 1;
// Start the enumerator from the first index in the sorted list of places,
// as nothing before that point will match
CMStateSetEnumerator enumBits(setT, fLeafIndexes[1]);
while(enumBits.hasMoreElements())
{
unsigned int bitIndex=enumBits.nextElement();
// if this leaf is greater than the last index in the sorted list of places,
// nothing can be found from now on, so get out of here
if(bitIndex > fLeafIndexes[fNumItems])
break;
// Check if this leaf index (DFA position) is in the current set
// (using binary search: the indexes are sorted)
//
// 'first' never drops below 1, so 'last=i-1' cannot wrap below zero
unsigned int first=lowIndex,last=fNumItems,i;
while(first<=last)
{
i=(first+last)/2;
if(fLeafIndexes[i]>bitIndex)
last=i-1;
else if(fLeafIndexes[i]<bitIndex)
lowIndex=first=i+1;
else
{
//
// If this leaf is the current input symbol, then we
// want to add its follow list to the set of states to
// transition to from the current state.
//
*newSet |= *fFollowList[bitIndex];
break;
}
}
}
}
}
//
// If this new set is not empty, then see if it's in the list
// of states to do. If not, then add it.
//
if (!newSet->isEmpty())
{
//
// Search the 'states to do' list to see if this new
// state set is already in there.
//
/***
unsigned int stateIndex = 0;
for (; stateIndex < curState; stateIndex++)
{
if (*statesToDo[stateIndex] == *newSet)
break;
}
***/
// curState doubles as the "not found" value for the lookup result
XMLInteger *stateObj = stateTable->get(newSet);
unsigned int stateIndex = (stateObj == 0 ? curState : stateObj->intValue());
// If we did not find it, then add it
if (stateIndex == curState)
{
//
// Put this new state into the states to do and init
// a new entry at the same index in the transition
// table.
//
statesToDo[curState] = newSet;
fTransTable[curState] = makeDefStateList();
stateTable->put
(
newSet
, new (fMemoryManager) XMLInteger(curState)
);
// We now have a new state to do so bump the count
curState++;
//
// Null out the new set to indicate we adopted it. This
// will cause the creation of a new set on the next time
// around the loop.
//
newSet = 0;
}
//
// Now set this state in the transition table's entry for this
// element (using its index), with the DFA state we will move
// to from the current state when we see this input element.
//
transEntry[elemIndex] = stateIndex;
// Expand the arrays if we're full
if (curState == curArraySize)
{
//
// Yikes, we overflowed the initial array size, so we've
// got to expand all of these arrays. So adjust up the
// size by 50% and allocate new arrays.
//
const unsigned int newSize = (unsigned int)(curArraySize * 1.5);
CMStateSet** newToDo = (CMStateSet**)
fMemoryManager->allocate
(
newSize * sizeof(CMStateSet*)
); //new const CMStateSet*[newSize];
bool* newFinalFlags = (bool*) fMemoryManager->allocate
(
newSize * sizeof(bool)
); //new bool[newSize];
unsigned int** newTransTable = (unsigned int**)
fMemoryManager->allocate
(
newSize * sizeof(unsigned int*)
); //new unsigned int*[newSize];
// Copy over all of the existing content
for (unsigned int expIndex = 0; expIndex < curArraySize; expIndex++)
{
newToDo[expIndex] = statesToDo[expIndex];
newFinalFlags[expIndex] = fFinalStateFlags[expIndex];
newTransTable[expIndex] = fTransTable[expIndex];
}
// Clean up the old stuff
fMemoryManager->deallocate(statesToDo); //delete [] statesToDo;
fMemoryManager->deallocate(fFinalStateFlags); //delete [] fFinalStateFlags;
fMemoryManager->deallocate(fTransTable); //delete [] fTransTable;
// Store the new array size and pointers
curArraySize = newSize;
statesToDo = newToDo;
fFinalStateFlags = newFinalFlags;
fTransTable = newTransTable;
} //if (curState == curArraySize)
} //if (!newSet->isEmpty())
} // for elemIndex
} //while
// Store the current state count in the trans table size
fTransTableSize = curState;
//
// Fill in the occurence information for each looping state
// if we're using counters.
//
if (elemOccurenceMap != 0) {
fCountingStates = (Occurence**)fMemoryManager->allocate(fTransTableSize*sizeof(Occurence*));
memset(fCountingStates, 0, fTransTableSize*sizeof(Occurence*));
for (unsigned int i = 0; i < fTransTableSize; ++i) {
unsigned int * transitions = fTransTable[i];
for (unsigned int j = 0; j < fElemMapSize; ++j) {
// Only the first element on which state i transitions to itself is
// considered: the scan stops there whether or not that element has an
// occurrence record.
if (i == transitions[j]) {
Occurence* old=elemOccurenceMap[j];
if(old!=0)
fCountingStates[i] = new (fMemoryManager) Occurence(old->minOccurs, old->maxOccurs, old->elemIndex);
break;
}
}
}
// fCountingStates holds its own copies, so the temporary records and
// the map itself can be released now.
for (unsigned int j = 0; j < fLeafCount; ++j) {
if(elemOccurenceMap[j]!=0)
delete elemOccurenceMap[j];
}
fMemoryManager->deallocate(elemOccurenceMap);
}
// If the last temp set was not stored, then clean it up
if (newSet)
delete newSet;
//
// Now we can clean up all of the temporary data that was needed during
// DFA build.
//
for (index = 0; index < fLeafCount; index++)
delete fFollowList[index];
fMemoryManager->deallocate(fFollowList); //delete [] fFollowList;
fFollowList = NULL;
//
// Deleting the table deletes all of its data (the XMLInteger values),
// while the keys (the state sets) are deleted through statesToDo
// just below.
//
delete stateTable;
for (index = 0; index < curState; index++)
delete statesToDo[index];
fMemoryManager->deallocate(statesToDo); //delete [] statesToDo;
for (index = 0; index < fLeafCount; index++)
delete fLeafList[index];
fMemoryManager->deallocate(fLeafList); //delete [] fLeafList;
fLeafList = NULL;
#ifdef OPTIMIZED_BUT_STILL_LINEAR_SEARCH
fMemoryManager->deallocate(leafSorter); //delete [] leafSorter;
#endif
// Release each per-element row of the leaf sorter, then the row table
for (index=0; index < fElemMapSize; index++)
fMemoryManager->deallocate(leafSorter[index]);
fMemoryManager->deallocate(leafSorter);
}