in framework/connector-common/src/main/java/org/apache/manifoldcf/connectorcommon/fuzzyml/TagParseState.java [126:938]
/** Feed one character to the lexical state machine.
* This level only identifies tags, comments, CDATA-style escapes, "bang" (&lt;!...&gt;) and
* "question" (&lt;?...&gt;) constructs, and ampersand sequences in body text; no case mapping is attempted.
* Recognized pieces are reported through the note* callbacks.
*@param thisChar is the next character of the input.
*@return true as soon as any callback invoked for this character returns true (presumably a request to
* stop parsing -- confirm against the callers); false otherwise.
*@throws ManifoldCFException if the current state is not one of the known states. NOTE(review): the callbacks
* presumably may throw it as well; their declarations are not visible here.
*/
public boolean dealWithCharacter(char thisChar)
  throws ManifoldCFException
{
  // At this level we want basic lexical analysis - that is, we deal with identifying tags and comments, that's it.
  // We don't even attempt to map to lower case, that's how naive this is.
  switch (currentState)
  {
  case TAGPARSESTATE_NORMAL:
    if (thisChar == '<')
    {
      if (inAmpersand)
      {
        // An unterminated ampersand sequence is cut short by the tag start; flush it first.
        // The result is honored exactly as at every other flush site.
        if (outputAmpBuffer())
          return true;
        inAmpersand = false;
      }
      currentState = TAGPARSESTATE_SAWLEFTANGLE;
    }
    else if (bTagDepth > 0 && thisChar == '>')
    {
      // Output current token, if any
      if (currentTagNameBuffer != null && currentTagNameBuffer.length() > 0)
      {
        currentTagName = currentTagNameBuffer.toString();
        if (noteBTagToken(currentTagName))
          return true;
        currentTagName = null;
        currentTagNameBuffer = null;
      }
      // Close the innermost btag
      if (noteEndBTag())
        return true;
      bTagDepth--;
    }
    else if (bTagDepth == 0)
    {
      // Ordinary body text
      if (inAmpersand)
      {
        if (thisChar == ';')
        {
          // We append the semi so that the output function can make good decisions
          ampBuffer.append(thisChar);
          if (outputAmpBuffer())
            return true;
          inAmpersand = false;
        }
        else if (isWhitespace(thisChar))
        {
          // Interpret ampersand buffer.
          if (outputAmpBuffer())
            return true;
          inAmpersand = false;
          // The whitespace itself is still body text
          if (noteNormalCharacter(thisChar))
            return true;
        }
        else
          ampBuffer.append(thisChar);
      }
      else if (thisChar == '&')
      {
        // Start collecting a new ampersand sequence
        inAmpersand = true;
        ampBuffer.setLength(0);
      }
      else
      {
        if (noteNormalCharacter(thisChar))
          return true;
      }
    }
    else
    {
      // In btag; accumulate tokens
      if (isPunctuation(thisChar))
      {
        // Punctuation ends any token in progress and is itself reported as a one-character token
        if (currentTagNameBuffer != null && currentTagNameBuffer.length() > 0)
        {
          currentTagName = currentTagNameBuffer.toString();
          if (noteBTagToken(currentTagName))
            return true;
          currentTagNameBuffer = null;
          currentTagName = null;
        }
        if (noteBTagToken(String.valueOf(thisChar)))
          return true;
      }
      else if (isWhitespace(thisChar))
      {
        // Whitespace only terminates the token in progress
        if (currentTagNameBuffer != null && currentTagNameBuffer.length() > 0)
        {
          currentTagName = currentTagNameBuffer.toString();
          if (noteBTagToken(currentTagName))
            return true;
          currentTagNameBuffer = null;
          currentTagName = null;
        }
      }
      else
      {
        if (currentTagNameBuffer == null)
          currentTagNameBuffer = newBuffer();
        currentTagNameBuffer.append(thisChar);
      }
    }
    break;

  case TAGPARSESTATE_IN_CDATA_BODY:
    if (thisChar == ']')
      currentState = TAGPARSESTATE_SAWRIGHTBRACKET;
    else
    {
      if (noteEscapedCharacter(thisChar))
        return true;
    }
    break;

  case TAGPARSESTATE_SAWRIGHTBRACKET:
    if (thisChar == ']')
      currentState = TAGPARSESTATE_SAWSECONDRIGHTBRACKET;
    else
    {
      // The held-back ']' was not part of a terminator; emit it followed by this character
      currentState = TAGPARSESTATE_IN_CDATA_BODY;
      if (noteEscapedCharacter(']'))
        return true;
      if (noteEscapedCharacter(thisChar))
        return true;
    }
    break;

  case TAGPARSESTATE_SAWSECONDRIGHTBRACKET:
    if (thisChar == '>')
      currentState = TAGPARSESTATE_NORMAL;
    else if (thisChar == ']')
    {
      // currentstate unchanged; emit the first bracket
      if (noteEscapedCharacter(']'))
        return true;
    }
    else
    {
      // Neither held-back bracket was part of a terminator; emit both, then this character
      currentState = TAGPARSESTATE_IN_CDATA_BODY;
      if (noteEscapedCharacter(']'))
        return true;
      if (noteEscapedCharacter(']'))
        return true;
      if (noteEscapedCharacter(thisChar))
        return true;
    }
    break;

  case TAGPARSESTATE_SAWLEFTANGLE:
    if (thisChar == '!')
      currentState = TAGPARSESTATE_SAWEXCLAMATION;
    else if (thisChar == '?')
    {
      currentState = TAGPARSESTATE_IN_QTAG_NAME;
      currentTagNameBuffer = newBuffer();
    }
    else if (bTagDepth == 0 && thisChar == '/')
    {
      currentState = TAGPARSESTATE_IN_END_TAG_NAME;
      currentTagNameBuffer = newBuffer();
    }
    else if (bTagDepth == 0)
    {
      if (isWhitespace(thisChar) || !acceptNewTag())
      {
        // Not a tag.
        currentState = TAGPARSESTATE_NORMAL;
        if (noteNormalCharacter('<'))
          return true;
        if (noteNormalCharacter(thisChar))
          return true;
      }
      else
      {
        currentState = TAGPARSESTATE_IN_TAG_NAME;
        currentTagNameBuffer = newBuffer();
        currentTagNameBuffer.append(thisChar);
      }
    }
    else
    {
      // in btag, saw left angle, nothing recognizable after - must be a token
      if (noteBTagToken("<"))
        return true;
      if (!isWhitespace(thisChar))
      {
        // Add char to current token buffer.
        currentTagNameBuffer = newBuffer();
        currentTagNameBuffer.append(thisChar);
      }
      currentState = TAGPARSESTATE_NORMAL;
    }
    break;

  case TAGPARSESTATE_SAWEXCLAMATION:
    if (thisChar == '-')
      currentState = TAGPARSESTATE_SAWDASH;
    else if (thisChar == '[')
    {
      currentState = TAGPARSESTATE_IN_BRACKET_TOKEN;
      currentTagNameBuffer = newBuffer();
    }
    else
    {
      // Anything else after "<!" opens a btag; the depth is decremented again when its '>' is seen
      bTagDepth++;
      currentState = TAGPARSESTATE_IN_BANG_TOKEN;
      currentTagNameBuffer = newBuffer();
      if (!isWhitespace(thisChar))
        currentTagNameBuffer.append(thisChar);
    }
    break;

  case TAGPARSESTATE_SAWDASH:
    // "<!-" must be followed by a second dash to open a comment; otherwise the sequence is dropped
    if (thisChar == '-')
      currentState = TAGPARSESTATE_IN_COMMENT;
    else
      currentState = TAGPARSESTATE_NORMAL;
    break;

  case TAGPARSESTATE_IN_COMMENT:
    // We're in a comment.  All we should look for is the end of the comment.
    if (thisChar == '-')
      currentState = TAGPARSESTATE_SAWCOMMENTDASH;
    break;

  case TAGPARSESTATE_SAWCOMMENTDASH:
    if (thisChar == '-')
      currentState = TAGPARSESTATE_SAWSECONDCOMMENTDASH;
    else
      currentState = TAGPARSESTATE_IN_COMMENT;
    break;

  case TAGPARSESTATE_SAWSECONDCOMMENTDASH:
    // Further dashes keep us here, so "--->" also ends the comment
    if (thisChar == '>')
      currentState = TAGPARSESTATE_NORMAL;
    else if (thisChar != '-')
      currentState = TAGPARSESTATE_IN_COMMENT;
    break;

  case TAGPARSESTATE_IN_QTAG_NAME:
    if (isWhitespace(thisChar))
    {
      // Leading whitespace before the name is skipped
      if (currentTagNameBuffer.length() > 0)
      {
        // Done with the tag name!
        currentTagName = currentTagNameBuffer.toString();
        currentTagNameBuffer = null;
        currentAttrList = new ArrayList<AttrNameValue>();
        currentState = TAGPARSESTATE_IN_QTAG_ATTR_NAME;
        currentAttrNameBuffer = newBuffer();
      }
    }
    else if (thisChar == '?')
    {
      if (currentTagNameBuffer.length() > 0)
      {
        currentTagName = currentTagNameBuffer.toString();
        currentTagNameBuffer = null;
        currentAttrList = new ArrayList<AttrNameValue>();
        currentState = TAGPARSESTATE_IN_QTAG_SAW_QUESTION;
        if (noteQTag(currentTagName,currentAttrList))
          return true;
      }
      else
      {
        // "<??" has no name; give up on it
        currentState = TAGPARSESTATE_NORMAL;
        currentTagNameBuffer = null;
      }
    }
    else if (thisChar == '>')
    {
      if (currentTagNameBuffer.length() > 0)
      {
        currentTagName = currentTagNameBuffer.toString();
        currentTagNameBuffer = null;
        currentAttrList = new ArrayList<AttrNameValue>();
      }
      if (currentTagName != null)
      {
        if (noteQTag(currentTagName,currentAttrList))
          return true;
      }
      currentState = TAGPARSESTATE_NORMAL;
      currentTagName = null;
      currentAttrList = null;
    }
    else
      currentTagNameBuffer.append(thisChar);
    break;

  case TAGPARSESTATE_IN_BRACKET_TOKEN:
    // Reached after "<![": collect the token up to the next '['
    if (isWhitespace(thisChar))
    {
      if (currentTagNameBuffer.length() > 0)
      {
        // Done with the bracket token!
        currentTagName = currentTagNameBuffer.toString();
        currentTagNameBuffer = null;
        currentState = TAGPARSESTATE_NEED_FINAL_BRACKET;
      }
    }
    else if (thisChar == '[')
    {
      currentTagName = currentTagNameBuffer.toString();
      currentTagNameBuffer = null;
      currentState = TAGPARSESTATE_IN_CDATA_BODY;
      if (noteEscaped(currentTagName))
        return true;
      currentTagName = null;
    }
    else
      currentTagNameBuffer.append(thisChar);
    break;

  case TAGPARSESTATE_NEED_FINAL_BRACKET:
    // Token already complete; everything up to the '[' is ignored
    if (thisChar == '[')
    {
      if (noteEscaped(currentTagName))
        return true;
      currentTagName = null;
      currentState = TAGPARSESTATE_IN_CDATA_BODY;
    }
    break;

  case TAGPARSESTATE_IN_BANG_TOKEN:
    if (isWhitespace(thisChar))
    {
      if (currentTagNameBuffer.length() > 0)
      {
        // Done with bang token
        currentTagName = currentTagNameBuffer.toString();
        currentTagNameBuffer = null;
        if (noteBTag(currentTagName))
          return true;
        currentTagName = null;
        // Remaining btag content is tokenized in the normal state, where bTagDepth > 0 selects btag handling
        currentState = TAGPARSESTATE_NORMAL;
      }
    }
    else if (thisChar == '>')
    {
      // Also done, but signal end too.
      currentTagName = currentTagNameBuffer.toString();
      currentTagNameBuffer = null;
      if (noteBTag(currentTagName))
        return true;
      currentTagName = null;
      currentState = TAGPARSESTATE_NORMAL;
      if (noteEndBTag())
        return true;
      bTagDepth--;
    }
    else
      currentTagNameBuffer.append(thisChar);
    break;

  case TAGPARSESTATE_IN_TAG_NAME:
    if (isWhitespace(thisChar))
    {
      if (currentTagNameBuffer.length() > 0)
      {
        // Done with the tag name!
        currentTagName = currentTagNameBuffer.toString();
        currentTagNameBuffer = null;
        currentAttrList = new ArrayList<AttrNameValue>();
        currentState = TAGPARSESTATE_IN_ATTR_NAME;
        currentAttrNameBuffer = newBuffer();
      }
    }
    else if (thisChar == '/')
    {
      if (currentTagNameBuffer.length() > 0)
      {
        // Self-closing form: report the tag now, the end tag is reported when '>' arrives
        currentTagName = currentTagNameBuffer.toString();
        currentTagNameBuffer = null;
        currentAttrList = new ArrayList<AttrNameValue>();
        currentState = TAGPARSESTATE_IN_TAG_SAW_SLASH;
        if (noteTag(currentTagName,currentAttrList))
          return true;
      }
      else
      {
        currentState = TAGPARSESTATE_NORMAL;
        currentTagNameBuffer = null;
      }
    }
    else if (thisChar == '>')
    {
      if (currentTagNameBuffer.length() > 0)
      {
        currentTagName = currentTagNameBuffer.toString();
        currentTagNameBuffer = null;
        currentAttrList = new ArrayList<AttrNameValue>();
      }
      if (currentTagName != null)
      {
        if (noteTag(currentTagName,currentAttrList))
          return true;
      }
      currentState = TAGPARSESTATE_NORMAL;
      currentTagName = null;
      currentAttrList = null;
    }
    else
      currentTagNameBuffer.append(thisChar);
    break;

  case TAGPARSESTATE_IN_QTAG_ATTR_NAME:
    if (isWhitespace(thisChar))
    {
      if (currentAttrNameBuffer.length() > 0)
      {
        // Done with attr name!
        currentAttrName = currentAttrNameBuffer.toString();
        currentAttrNameBuffer = null;
        currentState = TAGPARSESTATE_IN_QTAG_ATTR_LOOKING_FOR_VALUE;
      }
    }
    else if (thisChar == '=')
    {
      // An '=' with no preceding name is ignored
      if (currentAttrNameBuffer.length() > 0)
      {
        currentAttrName = currentAttrNameBuffer.toString();
        currentAttrNameBuffer = null;
        currentState = TAGPARSESTATE_IN_QTAG_ATTR_VALUE;
        currentValueBuffer = newBuffer();
      }
    }
    else if (thisChar == '?')
    {
      if (currentAttrNameBuffer.length() > 0)
      {
        currentAttrName = currentAttrNameBuffer.toString();
        currentAttrNameBuffer = null;
      }
      if (currentAttrName != null)
      {
        // Attribute without a value is recorded with an empty value
        currentAttrList.add(new AttrNameValue(currentAttrName,""));
        currentAttrName = null;
      }
      if (noteQTag(currentTagName,currentAttrList))
        return true;
      currentState = TAGPARSESTATE_IN_QTAG_SAW_QUESTION;
    }
    else if (thisChar == '>')
    {
      if (currentAttrNameBuffer.length() > 0)
      {
        currentAttrName = currentAttrNameBuffer.toString();
        currentAttrNameBuffer = null;
      }
      if (currentAttrName != null)
      {
        currentAttrList.add(new AttrNameValue(currentAttrName,""));
        currentAttrName = null;
      }
      currentState = TAGPARSESTATE_NORMAL;
      if (noteQTag(currentTagName,currentAttrList))
        return true;
      currentTagName = null;
      currentAttrList = null;
    }
    else
      currentAttrNameBuffer.append(thisChar);
    break;

  case TAGPARSESTATE_IN_ATTR_NAME:
    if (isWhitespace(thisChar))
    {
      if (currentAttrNameBuffer.length() > 0)
      {
        // Done with attr name!
        currentAttrName = currentAttrNameBuffer.toString();
        currentAttrNameBuffer = null;
        currentState = TAGPARSESTATE_IN_ATTR_LOOKING_FOR_VALUE;
      }
    }
    else if (thisChar == '=')
    {
      // An '=' with no preceding name is ignored
      if (currentAttrNameBuffer.length() > 0)
      {
        currentAttrName = currentAttrNameBuffer.toString();
        currentAttrNameBuffer = null;
        currentState = TAGPARSESTATE_IN_ATTR_VALUE;
        currentValueBuffer = newBuffer();
      }
    }
    else if (thisChar == '/')
    {
      if (currentAttrNameBuffer.length() > 0)
      {
        currentAttrName = currentAttrNameBuffer.toString();
        currentAttrNameBuffer = null;
      }
      if (currentAttrName != null)
      {
        // Attribute without a value is recorded with an empty value
        currentAttrList.add(new AttrNameValue(currentAttrName,""));
        currentAttrName = null;
      }
      if (noteTag(currentTagName,currentAttrList))
        return true;
      currentState = TAGPARSESTATE_IN_TAG_SAW_SLASH;
    }
    else if (thisChar == '>')
    {
      if (currentAttrNameBuffer.length() > 0)
      {
        currentAttrName = currentAttrNameBuffer.toString();
        currentAttrNameBuffer = null;
      }
      if (currentAttrName != null)
      {
        currentAttrList.add(new AttrNameValue(currentAttrName,""));
        currentAttrName = null;
      }
      currentState = TAGPARSESTATE_NORMAL;
      if (noteTag(currentTagName,currentAttrList))
        return true;
      currentTagName = null;
      currentAttrList = null;
    }
    else
      currentAttrNameBuffer.append(thisChar);
    break;

  case TAGPARSESTATE_IN_QTAG_ATTR_LOOKING_FOR_VALUE:
    // An attribute name has been completed; currentAttrName holds it while we look for '='
    if (thisChar == '=')
    {
      currentState = TAGPARSESTATE_IN_QTAG_ATTR_VALUE;
      currentValueBuffer = newBuffer();
    }
    else if (thisChar == '>')
    {
      // Record the pending valueless attribute before closing, as the other terminators do;
      // clearing the name also keeps it from being picked up by a later tag.
      currentAttrList.add(new AttrNameValue(currentAttrName,""));
      currentAttrName = null;
      currentState = TAGPARSESTATE_NORMAL;
      if (noteQTag(currentTagName,currentAttrList))
        return true;
      currentTagName = null;
      currentAttrList = null;
    }
    else if (thisChar == '?')
    {
      currentState = TAGPARSESTATE_IN_QTAG_SAW_QUESTION;
      currentAttrList.add(new AttrNameValue(currentAttrName,""));
      currentAttrName = null;
      if (noteQTag(currentTagName,currentAttrList))
        return true;
    }
    else if (!isWhitespace(thisChar))
    {
      // Start of the next attribute name; the previous attribute had no value
      currentAttrList.add(new AttrNameValue(currentAttrName,""));
      currentState = TAGPARSESTATE_IN_QTAG_ATTR_NAME;
      currentAttrNameBuffer = newBuffer();
      currentAttrNameBuffer.append(thisChar);
      currentAttrName = null;
    }
    break;

  case TAGPARSESTATE_IN_ATTR_LOOKING_FOR_VALUE:
    // An attribute name has been completed; currentAttrName holds it while we look for '='
    if (thisChar == '=')
    {
      currentState = TAGPARSESTATE_IN_ATTR_VALUE;
      currentValueBuffer = newBuffer();
    }
    else if (thisChar == '>')
    {
      // Record the pending valueless attribute before closing, as the other terminators do;
      // clearing the name also keeps it from being picked up by a later tag.
      currentAttrList.add(new AttrNameValue(currentAttrName,""));
      currentAttrName = null;
      currentState = TAGPARSESTATE_NORMAL;
      if (noteTag(currentTagName,currentAttrList))
        return true;
      currentTagName = null;
      currentAttrList = null;
    }
    else if (thisChar == '/')
    {
      currentState = TAGPARSESTATE_IN_TAG_SAW_SLASH;
      currentAttrList.add(new AttrNameValue(currentAttrName,""));
      currentAttrName = null;
      if (noteTag(currentTagName,currentAttrList))
        return true;
    }
    else if (!isWhitespace(thisChar))
    {
      // Start of the next attribute name; the previous attribute had no value
      currentAttrList.add(new AttrNameValue(currentAttrName,""));
      currentState = TAGPARSESTATE_IN_ATTR_NAME;
      currentAttrNameBuffer = newBuffer();
      currentAttrNameBuffer.append(thisChar);
      currentAttrName = null;
    }
    break;

  case TAGPARSESTATE_IN_QTAG_ATTR_VALUE:
    // Just after '='; whitespace is skipped until the value starts
    if (thisChar == '\'')
      currentState = TAGPARSESTATE_IN_QTAG_SINGLE_QUOTES_ATTR_VALUE;
    else if (thisChar == '"')
      currentState = TAGPARSESTATE_IN_QTAG_DOUBLE_QUOTES_ATTR_VALUE;
    else if (!isWhitespace(thisChar))
    {
      currentState = TAGPARSESTATE_IN_QTAG_UNQUOTED_ATTR_VALUE;
      currentValueBuffer.append(thisChar);
    }
    break;

  case TAGPARSESTATE_IN_ATTR_VALUE:
    // Just after '='; whitespace is skipped until the value starts
    if (thisChar == '\'')
      currentState = TAGPARSESTATE_IN_SINGLE_QUOTES_ATTR_VALUE;
    else if (thisChar == '"')
      currentState = TAGPARSESTATE_IN_DOUBLE_QUOTES_ATTR_VALUE;
    else if (thisChar == '/')
      // The slash is held back (not yet appended) until the following character is seen
      currentState = TAGPARSESTATE_IN_UNQUOTED_ATTR_VALUE_SAW_SLASH;
    else if (!isWhitespace(thisChar))
    {
      currentState = TAGPARSESTATE_IN_UNQUOTED_ATTR_VALUE;
      currentValueBuffer.append(thisChar);
    }
    break;

  case TAGPARSESTATE_IN_QTAG_SAW_QUESTION:
    // The qtag was already reported when the '?' was seen; skip to the closing '>'
    if (thisChar == '>')
    {
      // No end-tag notification for this one
      currentState = TAGPARSESTATE_NORMAL;
      currentTagName = null;
      currentAttrList = null;
    }
    break;

  case TAGPARSESTATE_IN_TAG_SAW_SLASH:
    // The tag was already reported when the '/' was seen; the '>' produces the matching end tag
    if (thisChar == '>')
    {
      if (noteEndTag(currentTagName))
        return true;
      currentState = TAGPARSESTATE_NORMAL;
      currentTagName = null;
      currentAttrList = null;
    }
    break;

  case TAGPARSESTATE_IN_END_TAG_NAME:
    if (isWhitespace(thisChar))
    {
      if (currentTagNameBuffer != null && currentTagNameBuffer.length() > 0)
      {
        // Done with the tag name!
        currentTagName = currentTagNameBuffer.toString();
        currentTagNameBuffer = null;
      }
    }
    else if (thisChar == '>')
    {
      if (currentTagNameBuffer != null && currentTagNameBuffer.length() > 0)
      {
        currentTagName = currentTagNameBuffer.toString();
        currentTagNameBuffer = null;
      }
      if (currentTagName != null)
      {
        if (noteEndTag(currentTagName))
          return true;
      }
      currentTagName = null;
      currentState = TAGPARSESTATE_NORMAL;
    }
    else if (currentTagNameBuffer != null)
      // Once the name is complete the buffer is null, so anything else before '>' is ignored
      currentTagNameBuffer.append(thisChar);
    break;

  case TAGPARSESTATE_IN_QTAG_SINGLE_QUOTES_ATTR_VALUE:
    // A line break ends the value just like the closing quote does
    if (thisChar == '\'' || thisChar == '\n' || thisChar == '\r')
    {
      currentAttrList.add(new AttrNameValue(currentAttrName,attributeDecode(currentValueBuffer.toString())));
      currentAttrName = null;
      currentValueBuffer = null;
      currentState = TAGPARSESTATE_IN_QTAG_ATTR_NAME;
      currentAttrNameBuffer = newBuffer();
    }
    else
      currentValueBuffer.append(thisChar);
    break;

  case TAGPARSESTATE_IN_SINGLE_QUOTES_ATTR_VALUE:
    // A line break ends the value just like the closing quote does
    if (thisChar == '\'' || thisChar == '\n' || thisChar == '\r')
    {
      currentAttrList.add(new AttrNameValue(currentAttrName,attributeDecode(currentValueBuffer.toString())));
      currentAttrName = null;
      currentValueBuffer = null;
      currentState = TAGPARSESTATE_IN_ATTR_NAME;
      currentAttrNameBuffer = newBuffer();
    }
    else
      currentValueBuffer.append(thisChar);
    break;

  case TAGPARSESTATE_IN_QTAG_DOUBLE_QUOTES_ATTR_VALUE:
    // A line break ends the value just like the closing quote does
    if (thisChar == '"' || thisChar == '\n' || thisChar == '\r')
    {
      currentAttrList.add(new AttrNameValue(currentAttrName,attributeDecode(currentValueBuffer.toString())));
      currentAttrName = null;
      currentValueBuffer = null;
      currentState = TAGPARSESTATE_IN_QTAG_ATTR_NAME;
      currentAttrNameBuffer = newBuffer();
    }
    else
      currentValueBuffer.append(thisChar);
    break;

  case TAGPARSESTATE_IN_DOUBLE_QUOTES_ATTR_VALUE:
    // A line break ends the value just like the closing quote does
    if (thisChar == '"' || thisChar == '\n' || thisChar == '\r')
    {
      currentAttrList.add(new AttrNameValue(currentAttrName,attributeDecode(currentValueBuffer.toString())));
      currentAttrName = null;
      currentValueBuffer = null;
      currentState = TAGPARSESTATE_IN_ATTR_NAME;
      currentAttrNameBuffer = newBuffer();
    }
    else
      currentValueBuffer.append(thisChar);
    break;

  case TAGPARSESTATE_IN_QTAG_UNQUOTED_ATTR_VALUE:
    if (isWhitespace(thisChar))
    {
      currentAttrList.add(new AttrNameValue(currentAttrName,attributeDecode(currentValueBuffer.toString())));
      currentAttrName = null;
      currentValueBuffer = null;
      currentState = TAGPARSESTATE_IN_QTAG_ATTR_NAME;
      currentAttrNameBuffer = newBuffer();
    }
    else if (thisChar == '?')
    {
      currentAttrList.add(new AttrNameValue(currentAttrName,attributeDecode(currentValueBuffer.toString())));
      // Clear the per-attribute state so it cannot be picked up by a later tag
      currentAttrName = null;
      currentValueBuffer = null;
      // This is a qtag, so it is reported through the qtag callback like in every other qtag state
      if (noteQTag(currentTagName,currentAttrList))
        return true;
      currentState = TAGPARSESTATE_IN_QTAG_SAW_QUESTION;
    }
    else if (thisChar == '>')
    {
      currentAttrList.add(new AttrNameValue(currentAttrName,attributeDecode(currentValueBuffer.toString())));
      currentAttrName = null;
      currentValueBuffer = null;
      currentState = TAGPARSESTATE_NORMAL;
      // This is a qtag, so it is reported through the qtag callback like in every other qtag state
      if (noteQTag(currentTagName,currentAttrList))
        return true;
      currentTagName = null;
      currentAttrList = null;
    }
    else
      currentValueBuffer.append(thisChar);
    break;

  case TAGPARSESTATE_IN_UNQUOTED_ATTR_VALUE_SAW_SLASH:
    // A '/' inside an unquoted value has been held back; in every branch below it is
    // appended to the value, i.e. it is always treated as part of the value.
    if (isWhitespace(thisChar))
    {
      currentValueBuffer.append('/');
      currentAttrList.add(new AttrNameValue(currentAttrName,attributeDecode(currentValueBuffer.toString())));
      currentAttrName = null;
      currentValueBuffer = null;
      currentState = TAGPARSESTATE_IN_ATTR_NAME;
      currentAttrNameBuffer = newBuffer();
    }
    else if (thisChar == '/')
    {
      // Emit the held-back slash; the new one becomes the held-back slash
      currentValueBuffer.append('/');
    }
    else if (thisChar == '>')
    {
      currentValueBuffer.append('/');
      currentAttrList.add(new AttrNameValue(currentAttrName,attributeDecode(currentValueBuffer.toString())));
      currentAttrName = null;
      currentValueBuffer = null;
      currentState = TAGPARSESTATE_NORMAL;
      if (noteTag(currentTagName,currentAttrList))
        return true;
      currentTagName = null;
      currentAttrList = null;
    }
    else
    {
      currentValueBuffer.append('/');
      currentValueBuffer.append(thisChar);
      currentState = TAGPARSESTATE_IN_UNQUOTED_ATTR_VALUE;
    }
    break;

  case TAGPARSESTATE_IN_UNQUOTED_ATTR_VALUE:
    if (isWhitespace(thisChar))
    {
      currentAttrList.add(new AttrNameValue(currentAttrName,attributeDecode(currentValueBuffer.toString())));
      currentAttrName = null;
      currentValueBuffer = null;
      currentState = TAGPARSESTATE_IN_ATTR_NAME;
      currentAttrNameBuffer = newBuffer();
    }
    else if (thisChar == '/')
    {
      // Hold the slash back until the next character is seen
      currentState = TAGPARSESTATE_IN_UNQUOTED_ATTR_VALUE_SAW_SLASH;
    }
    else if (thisChar == '>')
    {
      currentAttrList.add(new AttrNameValue(currentAttrName,attributeDecode(currentValueBuffer.toString())));
      currentAttrName = null;
      currentValueBuffer = null;
      currentState = TAGPARSESTATE_NORMAL;
      if (noteTag(currentTagName,currentAttrList))
        return true;
      currentTagName = null;
      currentAttrList = null;
    }
    else
      currentValueBuffer.append(thisChar);
    break;

  default:
    throw new ManifoldCFException("Invalid state: "+Integer.toString(currentState));
  }
  return false;
}