in enterprise/web.core.syntax/src/org/netbeans/modules/web/core/syntax/deprecated/HtmlSyntax.java [114:621]
/**
 * Scans the characters of {@code buffer} from {@code offset} up to (but not
 * including) {@code stopOffset} and returns the ID of the next HTML token.
 *
 * <p>The scanner is a hand-written state machine driven by the {@code state}
 * field. Inside the main loop three conventions are used consistently:
 * <ul>
 *   <li>{@code break} out of the outer {@code switch} falls through to the
 *       {@code offset++} at the bottom of the loop, i.e. the current
 *       character is consumed;</li>
 *   <li>{@code continue} re-enters the loop without advancing
 *       {@code offset}, i.e. the same character is examined again in the
 *       newly selected state;</li>
 *   <li>{@code return} ends the token; an {@code offset++} just before it
 *       means the current character is consumed before returning, otherwise
 *       the character is left for the next call.</li>
 * </ul>
 *
 * <p>{@code subState} remembers the state to resume once a character
 * reference ({@code &...}) has been scanned.
 *
 * <p>NOTE(review): the token presumably spans {@code tokenOffset..offset}
 * as computed by the caller - confirm against the base syntax class.
 *
 * @return the ID of the recognized token; when the input is exhausted and
 *         {@code lastBuffer} is set, the ID matching the current state;
 *         {@code null} when the input is exhausted and either
 *         {@code lastBuffer} is not set or the state is not listed in the
 *         end-of-input switch
 */
protected TokenID parseToken() {
    char actChar;

    while(offset < stopOffset) {
        actChar = buffer[offset];
        //System.out.println("HtmlSyntax: parseToken tokenOffset=" + tokenOffset + ", actChar='" + actChar + "', offset=" + offset + ", state=" + getStateName(state) +
        // ", stopOffset=" + stopOffset + ", lastBuffer=" + lastBuffer);
        switch( state ) {
            // Start of a new token: the first character selects the branch.
            case INIT: // DONE
                switch( actChar ) {
                    case '<':
                        state = ISA_LT;
                        break;
                    case '&':
                        state = ISA_REF;
                        subState = ISI_TEXT; // resume plain text after the reference
                        break;
                    default:
                        state = ISI_TEXT;
                        break;
                }
                break;

            // Inside plain text: the run ends just before '<' or '&'.
            case ISI_TEXT: // DONE
                switch( actChar ) {
                    case '<':
                    case '&':
                        state = INIT;
                        return HtmlTokenContext.TEXT;
                }
                break;

            // Consume the offending character and report it as an error token.
            case ISI_ERROR: // DONE
                offset++;
                state = INIT;
                return HtmlTokenContext.ERROR;

            // After '<'.
            case ISA_LT: // PENDING other transitions - e.g '<?'
                if( isAZ( actChar ) ) { // <'a..Z'
                    // Return without consuming the letter; it starts the tag name.
                    state = ISI_TAG;
                    return HtmlTokenContext.TAG_OPEN_SYMBOL;
                }
                switch( actChar ) {
                    case '/': // ETAGO - </
                        state = ISA_SLASH;
                        offset++;
                        return HtmlTokenContext.TAG_OPEN_SYMBOL;
                    case '>': // Empty start tag <>, RELAXED
                        offset++;
                        state = INIT;
                        return HtmlTokenContext.TAG_CLOSE_SYMBOL;
                    case '!':
                        state = ISA_SGML_ESCAPE;
                        break;
                    default: // Part of text, RELAXED
                        state = ISI_TEXT;
                        continue; // don't eat the char, maybe its '&'
                }
                break;

            // After '</'.
            case ISA_SLASH: // DONE
                if( isAZ( actChar ) ) { // </'a..Z'
                    state = ISI_ENDTAG;
                    break;
                }
                switch( actChar ) {
                    case '>': // Empty end tag </>, RELAXED
                        offset++;
                        state = INIT;
                        return HtmlTokenContext.TAG_CLOSE_SYMBOL;
                    default: // Part of text, e.g. </3, </'\n', RELAXED
                        state = ISI_TEXT;
                        continue; // don't eat the char
                }
                //break;

            // Inside the name of an end tag.
            case ISI_ENDTAG: // DONE
                if( isName( actChar ) ) break; // Still in endtag identifier, eat next char
                state = ISP_ENDTAG_X;
                return HtmlTokenContext.TAG_CLOSE;

            // After the end tag name: only whitespace, '>' or a new '<' is accepted.
            case ISP_ENDTAG_X: // DONE
                if( isWS( actChar ) ) {
                    state = ISP_ENDTAG_WS;
                    break;
                }
                switch( actChar ) {
                    case '>': // Closing of endtag, e.g. </H6 _>_
                        offset++;
                        state = INIT;
                        return HtmlTokenContext.TAG_CLOSE_SYMBOL;
                    case '<': // next tag, e.g. </H6 _<_, RELAXED
                        state = INIT;
                        continue;
                    default:
                        state = ISI_ERROR;
                        continue; //don't eat
                }
                //break;

            // Whitespace run inside an end tag.
            case ISP_ENDTAG_WS: // DONE
                if( isWS( actChar ) ) break; // eat all WS
                state = ISP_ENDTAG_X;
                return HtmlTokenContext.WS;

            // Inside the name of a start tag.
            case ISI_TAG: // DONE
                if( isName( actChar ) ) break; // Still in tag identifier, eat next char
                state = ISP_TAG_X;
                return HtmlTokenContext.TAG_OPEN;

            // After the start tag name or after an attribute value.
            case ISP_TAG_X: // DONE
                if( isWS( actChar ) ) {
                    state = ISP_TAG_WS;
                    break;
                }
                if( isAZ( actChar ) ) {
                    state = ISI_ARG;
                    break;
                }
                switch( actChar ) {
                    case '/':
                        // Consume the '/' and look at the following character.
                        offset++;
                        state = ISI_TAG_SLASH;
                        continue;
                    case '>':
                        offset++;
                        state = INIT;
                        return HtmlTokenContext.TAG_CLOSE_SYMBOL;
                    case '<':
                        state = INIT;
                        continue; // don't eat it!!!
                    default:
                        state = ISI_ERROR;
                        continue;
                }
                //break;

            // Whitespace run inside a start tag.
            case ISP_TAG_WS: // DONE
                if( isWS( actChar ) ) break; // eat all WS
                state = ISP_TAG_X;
                return HtmlTokenContext.WS;

            // After '/' inside a start tag: only '>' is accepted ("/>").
            case ISI_TAG_SLASH:
                switch( actChar ) {
                    case '>':
                        offset++;
                        state = INIT;
                        return HtmlTokenContext.TAG_CLOSE_SYMBOL;
                    default:
                        state = ISI_ERROR;
                        continue;
                }

            // Inside an attribute name.
            case ISI_ARG: // DONE
                if( isName( actChar ) ) break; // eat next char
                state = ISP_ARG_X;
                return HtmlTokenContext.ARGUMENT;

            // After an attribute name.
            case ISP_ARG_X:
                if( isWS( actChar ) ) {
                    state = ISP_ARG_WS;
                    break;
                }
                if( isAZ( actChar ) ) {
                    state = ISI_ARG;
                    break;
                }
                switch( actChar ) {
                    case '/':
                    case '>':
                        // NOTE(review): ISP_TAG_X reports '>' as TAG_CLOSE_SYMBOL while
                        // this branch reports TAG_OPEN - confirm this is intended.
                        offset++;
                        state = INIT;
                        return HtmlTokenContext.TAG_OPEN;
                    case '<':
                        state = INIT;
                        continue; // don't eat !!!
                    case '=':
                        offset++;
                        state = ISP_EQ;
                        return HtmlTokenContext.OPERATOR;
                    default:
                        state = ISI_ERROR;
                        continue;
                }
                //break;

            // Whitespace run after an attribute name.
            case ISP_ARG_WS:
                if( isWS( actChar ) ) break; // Eat all WhiteSpace
                state = ISP_ARG_X;
                return HtmlTokenContext.WS;

            // After '=': the first character selects the kind of attribute value.
            case ISP_EQ:
                if( isWS( actChar ) ) {
                    state = ISP_EQ_WS;
                    break;
                }
                switch( actChar ) {
                    case '\'':
                        state = ISI_VAL_QUOT;
                        break;
                    case '"':
                        state = ISI_VAL_DQUOT;
                        break;
                    case '>':
                        // NOTE(review): same TAG_OPEN vs. TAG_CLOSE_SYMBOL question
                        // as in ISP_ARG_X - confirm this is intended.
                        offset++;
                        state = INIT;
                        return HtmlTokenContext.TAG_OPEN;
                    default:
                        state = ISI_VAL; //everything else is attribute value
                        break;
                }
                break;

            // Whitespace run after '='.
            case ISP_EQ_WS:
                if( isWS( actChar ) ) break; // Consume all WS
                state = ISP_EQ;
                return HtmlTokenContext.WS;

            // Unquoted attribute value: ends at whitespace, '/', '>' or '<'.
            case ISI_VAL:
                if( !isWS( actChar )
                        && !(actChar == '/' || actChar == '>' || actChar == '<')) break; // Consume whole value
                state = ISP_TAG_X;
                return HtmlTokenContext.VALUE;

            // Single-quoted attribute value.
            case ISI_VAL_QUOT:
                switch( actChar ) {
                    case '\'':
                        offset++; // the closing quote belongs to the value token
                        state = ISP_TAG_X;
                        return HtmlTokenContext.VALUE;
                    case '&':
                        if( offset == tokenOffset ) {
                            // '&' is the first character of this token: scan a
                            // character reference and come back to this state.
                            subState = state;
                            state = ISA_REF;
                            break;
                        } else {
                            // End the value collected so far; '&' starts the next token.
                            return HtmlTokenContext.VALUE;
                        }
                }
                break; // else simply consume next char of VALUE

            // Double-quoted attribute value; mirrors ISI_VAL_QUOT.
            case ISI_VAL_DQUOT:
                switch( actChar ) {
                    case '"':
                        offset++; // the closing quote belongs to the value token
                        state = ISP_TAG_X;
                        return HtmlTokenContext.VALUE;
                    case '&':
                        if( offset == tokenOffset ) {
                            subState = state;
                            state = ISA_REF;
                            break;
                        } else {
                            return HtmlTokenContext.VALUE;
                        }
                }
                break; // else simply consume next char of VALUE

            // After '<!'.
            case ISA_SGML_ESCAPE: // DONE
                if( isAZ(actChar) ) {
                    state = ISI_SGML_DECL;
                    break;
                }
                switch( actChar ) {
                    case '-':
                        state = ISA_SGML_DASH;
                        break;
                    default:
                        state = ISI_TEXT;
                        continue;
                }
                break;

            // After '<!-'.
            case ISA_SGML_DASH: // DONE
                switch( actChar ) {
                    case '-':
                        state = ISI_HTML_COMMENT;
                        break;
                    default:
                        state = ISI_TEXT;
                        continue;
                }
                break;

            // Inside an HTML comment (after '<!--').
            case ISI_HTML_COMMENT: // DONE
                switch( actChar ) {
                    case '-':
                        state = ISA_HTML_COMMENT_DASH;
                        break;
                    //create an HTML comment token for each line of the comment - a performance fix for #43532
                    case '\n':
                        offset++;
                        //leave the same state - we are still in an HTML comment,
                        //we just need to create a token for each line.
                        return HtmlTokenContext.BLOCK_COMMENT;
                }
                break;

            // After a single '-' inside an HTML comment.
            case ISA_HTML_COMMENT_DASH:
                switch( actChar ) {
                    case '-':
                        state = ISI_HTML_COMMENT_WS;
                        break;
                    default:
                        state = ISI_HTML_COMMENT;
                        continue;
                }
                break;

            // After '--' inside an HTML comment: whitespace may precede the closing '>'.
            case ISI_HTML_COMMENT_WS: // DONE
                if( isWS( actChar ) ) break; // Consume all WS
                switch( actChar ) {
                    case '>':
                        offset++;
                        state = INIT;
                        return HtmlTokenContext.BLOCK_COMMENT;
                    default:
                        state = ISI_HTML_COMMENT;
                        continue;
                }
                //break;

            // Inside an SGML declaration ('<!' followed by a letter).
            case ISI_SGML_DECL:
                switch( actChar ) {
                    case '>':
                        offset++;
                        state = INIT;
                        return HtmlTokenContext.DECLARATION;
                    case '-':
                        if( offset == tokenOffset ) {
                            // '-' is the first character of this token: possible
                            // start of a comment inside the declaration.
                            state = ISA_SGML_DECL_DASH;
                            break;
                        } else {
                            // End the declaration part collected so far.
                            return HtmlTokenContext.DECLARATION;
                        }
                }
                break;

            // After '-' inside an SGML declaration.
            case ISA_SGML_DECL_DASH:
                if( actChar == '-' ) {
                    state = ISI_SGML_COMMENT;
                    break;
                } else {
                    state = ISI_SGML_DECL;
                    continue;
                }

            // Inside a '--' comment within an SGML declaration.
            case ISI_SGML_COMMENT:
                switch( actChar ) {
                    case '-':
                        state = ISA_SGML_COMMENT_DASH;
                        break;
                }
                break;

            // After '-' inside an SGML comment: a second '-' closes the comment.
            case ISA_SGML_COMMENT_DASH:
                if( actChar == '-' ) {
                    offset++;
                    state = ISI_SGML_DECL;
                    return HtmlTokenContext.SGML_COMMENT;
                } else {
                    state = ISI_SGML_COMMENT;
                    continue;
                }

            // After '&'.
            case ISA_REF:
                if( isAZ( actChar ) ) {
                    state = ISI_REF_NAME;
                    break;
                }
                if( actChar == '#' ) {
                    state = ISA_REF_HASH;
                    break;
                }
                // Not a reference: resume the surrounding state on the same character.
                state = subState;
                continue;

            // Inside a named reference; a trailing ';' is included in the token.
            case ISI_REF_NAME:
                if( isName( actChar ) ) break;
                if( actChar == ';' ) offset++;
                state = subState;
                return HtmlTokenContext.CHARACTER;

            // After '&#'.
            case ISA_REF_HASH:
                if( actChar >= '0' && actChar <= '9' ) {
                    state = ISI_REF_DEC;
                    break;
                }
                if( actChar == 'x' || actChar == 'X' ) {
                    state = ISA_REF_X;
                    break;
                }
                if( isAZ( actChar ) ) {
                    // A letter other than x/X: consume it and report an error.
                    offset++;
                    state = subState;
                    return HtmlTokenContext.ERROR;
                }
                state = subState;
                continue;

            // Inside a decimal reference; a trailing ';' is included in the token.
            case ISI_REF_DEC:
                if( actChar >= '0' && actChar <= '9' ) break;
                if( actChar == ';' ) offset++;
                state = subState;
                return HtmlTokenContext.CHARACTER;

            // After '&#x': at least one hex digit is required.
            case ISA_REF_X:
                if( (actChar >= '0' && actChar <= '9') ||
                        (actChar >= 'a' && actChar <= 'f') ||
                        (actChar >= 'A' && actChar <= 'F')
                ) {
                    state = ISI_REF_HEX;
                    break;
                }
                state = subState;
                return HtmlTokenContext.ERROR; // error on previous "&#x" sequence

            // Inside a hexadecimal reference; a trailing ';' is included in the token.
            case ISI_REF_HEX:
                if( (actChar >= '0' && actChar <= '9') ||
                        (actChar >= 'a' && actChar <= 'f') ||
                        (actChar >= 'A' && actChar <= 'F')
                ) break;
                if( actChar == ';' ) offset++;
                state = subState;
                return HtmlTokenContext.CHARACTER;
        }

        // Reached only via 'break' from the state switch: consume the character.
        offset++;
    } // end of while(offset...)

    /** At this stage there's no more text in the scanned buffer.
     * Scanner first checks whether this is completely the last
     * available buffer.
     */
    if( lastBuffer ) {
        // No more input will follow: report whatever has been collected as
        // the token type that matches the current state.
        switch( state ) {
            case INIT:
            case ISI_TEXT:
            case ISA_LT:
            case ISA_SLASH:
            case ISA_SGML_ESCAPE:
            case ISA_SGML_DASH:
            case ISI_TAG_SLASH:
                return HtmlTokenContext.TEXT;
            case ISA_REF:
            case ISA_REF_HASH:
                // An unfinished reference is reported as the kind of content it was found in.
                if( subState == ISI_TEXT ) return HtmlTokenContext.TEXT;
                else return HtmlTokenContext.VALUE;
            case ISI_HTML_COMMENT:
            case ISA_HTML_COMMENT_DASH:
            case ISI_HTML_COMMENT_WS:
                return HtmlTokenContext.BLOCK_COMMENT;
            case ISI_TAG:
                return HtmlTokenContext.TAG_OPEN;
            case ISI_ENDTAG:
                return HtmlTokenContext.TAG_CLOSE;
            case ISI_ARG:
                return HtmlTokenContext.ARGUMENT;
            case ISI_ERROR:
                return HtmlTokenContext.ERROR;
            case ISP_ARG_WS:
            case ISP_TAG_WS:
            case ISP_ENDTAG_WS:
            case ISP_EQ_WS:
                return HtmlTokenContext.WS;
            case ISP_ARG_X:
            case ISP_TAG_X:
            case ISP_ENDTAG_X:
            case ISP_EQ:
                return HtmlTokenContext.WS;
            case ISI_VAL:
            case ISI_VAL_QUOT:
            case ISI_VAL_DQUOT:
                return HtmlTokenContext.VALUE;
            case ISI_SGML_DECL:
            case ISA_SGML_DECL_DASH:
                return HtmlTokenContext.DECLARATION;
            case ISI_SGML_COMMENT:
            case ISA_SGML_COMMENT_DASH:
                return HtmlTokenContext.SGML_COMMENT;
            case ISI_REF_NAME:
            case ISI_REF_DEC:
            case ISA_REF_X: // NOTE(review): the in-loop ISA_REF_X failure path returns ERROR instead
            case ISI_REF_HEX:
                return HtmlTokenContext.CHARACTER;
        }
    }

    // Input exhausted and either more buffers are expected or the state is
    // not covered above. NOTE(review): presumably the caller treats null as
    // "no complete token yet" - confirm against the base syntax class.
    return null;
}