protected TokenID parseToken()

in enterprise/web.core.syntax/src/org/netbeans/modules/web/core/syntax/deprecated/HtmlSyntax.java [114:621]


    /**
     * Scans {@code buffer} from {@code offset} towards {@code stopOffset} and
     * returns the id of the next token recognized by the HTML state machine.
     *
     * <p>The method is driven by the {@code state} field, which it both reads
     * and updates, so scanning resumes from the recorded state on the next
     * call. While a character reference ({@code &...;}) is being scanned,
     * {@code subState} holds the state to return to once the reference ends.
     *
     * <p>Inside the loop three idioms are used:
     * <ul>
     *   <li>{@code break} out of the state switch - the current character is
     *       consumed by the increment at the bottom of the loop;</li>
     *   <li>{@code continue} - the current character is NOT consumed and is
     *       examined again under the newly assigned state;</li>
     *   <li>{@code return} - a token ends here; an explicit {@code offset++}
     *       just before the return makes the current character part of the
     *       returned token, otherwise it is left for the next token.</li>
     * </ul>
     *
     * @return the id of the recognized token; {@code null} when the scanned
     *         range is exhausted and either {@code lastBuffer} is false or the
     *         current state has no entry in the end-of-input switch
     */
    protected TokenID parseToken() {
        while(offset < stopOffset) {
            final char actChar = buffer[offset];
            switch( state ) {
            case INIT:              // DONE
                switch( actChar ) {
                case '<':
                    state = ISA_LT;
                    break;
                case '&':
                    // Character reference in plain text; return to text afterwards.
                    state = ISA_REF;
                    subState = ISI_TEXT;
                    break;
                default:
                    state = ISI_TEXT;
                    break;
                }
                break;

            case ISI_TEXT:        // DONE
                switch( actChar ) {
                case '<':
                case '&':
                    // Text ends in front of markup or a reference; the char is not eaten.
                    state = INIT;
                    return HtmlTokenContext.TEXT;
                }
                break;

            case ISI_ERROR:      // DONE
                // Report exactly one offending character as an error token.
                offset++;
                state = INIT;
                return HtmlTokenContext.ERROR;

            case ISA_LT:         // PENDING other transitions - e.g '<?'
                if( isAZ( actChar ) ) {   // <'a..Z'
                    // The '<' alone forms the open symbol; the letter starts the tag name.
                    state = ISI_TAG;
                    return HtmlTokenContext.TAG_OPEN_SYMBOL;
                }
                switch( actChar ) {
                case '/':               // ETAGO - </
                    state = ISA_SLASH;
                    offset++;
                    return HtmlTokenContext.TAG_OPEN_SYMBOL;
                case '>':               // Empty start tag <>, RELAXED
                    offset++;
                    state = INIT;
                    return HtmlTokenContext.TAG_CLOSE_SYMBOL;
                case '!':
                    state = ISA_SGML_ESCAPE;
                    break;
                default:                // Part of text, RELAXED
                    state = ISI_TEXT;
                    continue;             // don't eat the char, maybe it's '&'
                }
                break;

            case ISA_SLASH:        // DONE
                if( isAZ( actChar ) ) {   // </'a..Z'
                    state = ISI_ENDTAG;
                    break;
                }
                switch( actChar ) {
                case '>':               // Empty end tag </>, RELAXED
                    offset++;
                    state = INIT;
                    return HtmlTokenContext.TAG_CLOSE_SYMBOL;
                default:                // Part of text, e.g. </3, </'\n', RELAXED
                    state = ISI_TEXT;
                    continue;             // don't eat the char
                }
                // no break needed: every branch above returns or continues

            case ISI_ENDTAG:        // DONE
                if( isName( actChar ) ) break;    // Still in endtag identifier, eat next char
                state = ISP_ENDTAG_X;
                return HtmlTokenContext.TAG_CLOSE;


            case ISP_ENDTAG_X:      // DONE
                if( isWS( actChar ) ) {
                    state = ISP_ENDTAG_WS;
                    break;
                }
                switch( actChar ) {
                case '>':               // Closing of endtag, e.g. </H6 _>_
                    offset++;
                    state = INIT;
                    return HtmlTokenContext.TAG_CLOSE_SYMBOL;
                case '<':               // next tag, e.g. </H6 _<_, RELAXED
                    state = INIT;
                    continue;
                default:
                    state = ISI_ERROR;
                    continue; // don't eat
                }
                // no break needed: every branch above returns or continues

            case ISP_ENDTAG_WS:      // DONE
                if( isWS( actChar ) ) break;  // eat all WS
                state = ISP_ENDTAG_X;
                return HtmlTokenContext.WS;


            case ISI_TAG:        // DONE
                if( isName( actChar ) ) break;    // Still in tag identifier, eat next char
                state = ISP_TAG_X;
                return HtmlTokenContext.TAG_OPEN;

            case ISP_TAG_X:     // DONE
                if( isWS( actChar ) ) {
                    state = ISP_TAG_WS;
                    break;
                }
                if( isAZ( actChar ) ) {
                    state = ISI_ARG;
                    break;
                }
                switch( actChar ) {
                case '/':
                    // Eat the slash here; ISI_TAG_SLASH then expects the closing '>'.
                    offset++;
                    state = ISI_TAG_SLASH;
                    continue;
                case '>':
                    offset++;
                    state = INIT;
                    return HtmlTokenContext.TAG_CLOSE_SYMBOL;
                case '<':
                    state = INIT;
                    continue;       // don't eat it!!!
                default:
                    state = ISI_ERROR;
                    continue;
                }
                // no break needed: every branch above returns or continues

            case ISP_TAG_WS:        // DONE
                if( isWS( actChar ) ) break;    // eat all WS
                state = ISP_TAG_X;
                return HtmlTokenContext.WS;

            case ISI_TAG_SLASH:
                switch( actChar ) {
                    case '>':
                        // The token returned here covers the previously eaten '/' as well.
                        offset++;
                        state = INIT;
                        return HtmlTokenContext.TAG_CLOSE_SYMBOL;
                    default:
                        state = ISI_ERROR;
                        continue;
                }

            case ISI_ARG:           // DONE
                if( isName( actChar ) ) break; // eat next char
                state = ISP_ARG_X;
                return HtmlTokenContext.ARGUMENT;

            case ISP_ARG_X:
                if( isWS( actChar ) ) {
                    state = ISP_ARG_WS;
                    break;
                }
                if( isAZ( actChar ) ) {
                    state = ISI_ARG;
                    break;
                }
                switch( actChar ) {
                case '/':
                case '>':
                    // NOTE(review): ISP_TAG_X reports '>' as TAG_CLOSE_SYMBOL, whereas this
                    // branch reports TAG_OPEN - confirm with callers whether that is intended.
                    offset++;
                    state = INIT;
                    return HtmlTokenContext.TAG_OPEN;
                case '<':
                    state = INIT;
                    continue;           // don't eat !!!
                case '=':
                    offset++;
                    state = ISP_EQ;
                    return HtmlTokenContext.OPERATOR;
                default:
                    state = ISI_ERROR;
                    continue;
                }
                // no break needed: every branch above returns or continues

            case ISP_ARG_WS:
                if( isWS( actChar ) ) break;    // Eat all WhiteSpace
                state = ISP_ARG_X;
                return HtmlTokenContext.WS;

            case ISP_EQ:
                if( isWS( actChar ) ) {
                    state = ISP_EQ_WS;
                    break;
                }
                switch( actChar ) {
                case '\'':
                    state = ISI_VAL_QUOT;
                    break;
                case '"':
                    state = ISI_VAL_DQUOT;
                    break;
                case '>':
                    // NOTE(review): same TAG_OPEN-for-'>' question as in ISP_ARG_X.
                    offset++;
                    state = INIT;
                    return HtmlTokenContext.TAG_OPEN;
                default:
                    state = ISI_VAL; // everything else is an attribute value
                    break;
                }
                break;

            case ISP_EQ_WS:
                if( isWS( actChar ) ) break;    // Consume all WS
                state = ISP_EQ;
                return HtmlTokenContext.WS;


            case ISI_VAL:
                // An unquoted value runs until whitespace, '/', '>' or '<'.
                if( !isWS( actChar )
                    && !(actChar == '/' || actChar == '>' || actChar == '<')) break;  // Consume whole value
                state = ISP_TAG_X;
                return HtmlTokenContext.VALUE;

            case ISI_VAL_QUOT:
                switch( actChar ) {
                case '\'':
                    offset++;
                    state = ISP_TAG_X;
                    return HtmlTokenContext.VALUE;
                case '&':
                    if( offset == tokenOffset ) {
                        // '&' sits at tokenOffset: scan a reference, then resume this state.
                        subState = state;
                        state = ISA_REF;
                        break;
                    } else {
                        // Emit the value scanned so far; the '&' is handled on the next call.
                        return HtmlTokenContext.VALUE;
                    }
                }
                break;  // else simply consume next char of VALUE

            case ISI_VAL_DQUOT:
                switch( actChar ) {
                case '"':
                    offset++;
                    state = ISP_TAG_X;
                    return HtmlTokenContext.VALUE;
                case '&':
                    if( offset == tokenOffset ) {
                        // '&' sits at tokenOffset: scan a reference, then resume this state.
                        subState = state;
                        state = ISA_REF;
                        break;
                    } else {
                        // Emit the value scanned so far; the '&' is handled on the next call.
                        return HtmlTokenContext.VALUE;
                    }
                }
                break;  // else simply consume next char of VALUE



            case ISA_SGML_ESCAPE:       // DONE
                if( isAZ(actChar) ) {
                    state = ISI_SGML_DECL;
                    break;
                }
                switch( actChar ) {
                case '-':
                    state = ISA_SGML_DASH;
                    break;
                default:
                    state = ISI_TEXT;
                    continue;
                }
                break;

            case ISA_SGML_DASH:       // DONE
                switch( actChar ) {
                case '-':
                    state = ISI_HTML_COMMENT;
                    break;
                default:
                    state = ISI_TEXT;
                    continue;
                }
                break;

            case ISI_HTML_COMMENT:        // DONE
                switch( actChar ) {
                case '-':
                    state = ISA_HTML_COMMENT_DASH;
                    break;
                // create an HTML comment token for each line of the comment - a performance fix for #43532
                case '\n':
                    offset++;
                    // keep the same state - we are still in an HTML comment,
                    // we just need to create a token for each line.
                    return HtmlTokenContext.BLOCK_COMMENT;
                }
                break;

            case ISA_HTML_COMMENT_DASH:
                switch( actChar ) {
                case '-':
                    state = ISI_HTML_COMMENT_WS;
                    break;
                default:
                    state = ISI_HTML_COMMENT;
                    continue;
                }
                break;

            case ISI_HTML_COMMENT_WS:       // DONE
                if( isWS( actChar ) ) break;  // Consume all WS
                switch( actChar ) {
                case '>':
                    offset++;
                    state = INIT;
                    return HtmlTokenContext.BLOCK_COMMENT;
                default:
                    state = ISI_HTML_COMMENT;
                    continue;
                }
                // no break needed: every branch above returns or continues

            case ISI_SGML_DECL:
                switch( actChar ) {
                case '>':
                    offset++;
                    state = INIT;
                    return HtmlTokenContext.DECLARATION;
                case '-':
                    if( offset == tokenOffset ) {
                        state = ISA_SGML_DECL_DASH;
                        break;
                    } else {
                        // Emit the declaration part scanned so far; the '-' is left for the next call.
                        return HtmlTokenContext.DECLARATION;
                    }
                }
                break;

            case ISA_SGML_DECL_DASH:
                if( actChar == '-' ) {
                    state = ISI_SGML_COMMENT;
                    break;
                } else {
                    state = ISI_SGML_DECL;
                    continue;
                }

            case ISI_SGML_COMMENT:
                switch( actChar ) {
                case '-':
                    state = ISA_SGML_COMMENT_DASH;
                    break;
                }
                break;

            case ISA_SGML_COMMENT_DASH:
                if( actChar == '-' ) {
                    // Second dash closes the comment; scanning goes back to the declaration.
                    offset++;
                    state = ISI_SGML_DECL;
                    return HtmlTokenContext.SGML_COMMENT;
                } else {
                    state = ISI_SGML_COMMENT;
                    continue;
                }


            case ISA_REF:
                if( isAZ( actChar ) ) {
                    state = ISI_REF_NAME;
                    break;
                }
                if( actChar == '#' ) {
                    state = ISA_REF_HASH;
                    break;
                }
                // Not a reference after all: resume the interrupted state on this char.
                state = subState;
                continue;

            case ISI_REF_NAME:
                if( isName( actChar ) ) break;
                if( actChar == ';' ) offset++;  // a terminating ';' belongs to the reference
                state = subState;
                return HtmlTokenContext.CHARACTER;

            case ISA_REF_HASH:
                if( actChar >= '0' && actChar <= '9' ) {
                    state = ISI_REF_DEC;
                    break;
                }
                if( actChar == 'x' || actChar == 'X' ) {
                    state = ISA_REF_X;
                    break;
                }
                if( isAZ( actChar ) ) {
                    // "&#" followed by a letter other than x/X: the letter is included in the error token.
                    offset++;
                    state = subState;
                    return HtmlTokenContext.ERROR;
                }
                state = subState;
                continue;

            case ISI_REF_DEC:
                if( actChar >= '0' && actChar <= '9' ) break;
                if( actChar == ';' ) offset++;  // a terminating ';' belongs to the reference
                state = subState;
                return HtmlTokenContext.CHARACTER;

            case ISA_REF_X:
                if( (actChar >= '0' && actChar <= '9') ||
                        (actChar >= 'a' && actChar <= 'f') ||
                        (actChar >= 'A' && actChar <= 'F')
                  ) {
                    state = ISI_REF_HEX;
                    break;
                }
                state = subState;
                return HtmlTokenContext.ERROR;       // error on previous "&#x" sequence

            case ISI_REF_HEX:
                if( (actChar >= '0' && actChar <= '9') ||
                        (actChar >= 'a' && actChar <= 'f') ||
                        (actChar >= 'A' && actChar <= 'F')
                  ) break;
                if( actChar == ';' ) offset++;  // a terminating ';' belongs to the reference
                state = subState;
                return HtmlTokenContext.CHARACTER;
            }

            // Reached only via 'break' out of the state switch: consume the current char.
            offset++;
        } // end of while(offset...)

        /* At this stage there's no more text in the scanned buffer.
         * If this is the last available buffer, the pending state is
         * converted into a final token; otherwise null is returned.
         */
        if( lastBuffer ) {
            switch( state ) {
            case INIT:
            case ISI_TEXT:
            case ISA_LT:
            case ISA_SLASH:
            case ISA_SGML_ESCAPE:
            case ISA_SGML_DASH:
            case ISI_TAG_SLASH:
                return HtmlTokenContext.TEXT;

            case ISA_REF:
            case ISA_REF_HASH:
                // An unfinished reference is reported as whatever it interrupted.
                if( subState == ISI_TEXT ) return HtmlTokenContext.TEXT;
                else return HtmlTokenContext.VALUE;

            case ISI_HTML_COMMENT:
            case ISA_HTML_COMMENT_DASH:
            case ISI_HTML_COMMENT_WS:
                return HtmlTokenContext.BLOCK_COMMENT;

            case ISI_TAG:
                return HtmlTokenContext.TAG_OPEN;
            case ISI_ENDTAG:
                return HtmlTokenContext.TAG_CLOSE;

            case ISI_ARG:
                return HtmlTokenContext.ARGUMENT;

            case ISI_ERROR:
                return HtmlTokenContext.ERROR;

            case ISP_ARG_WS:
            case ISP_TAG_WS:
            case ISP_ENDTAG_WS:
            case ISP_EQ_WS:
                return HtmlTokenContext.WS;

            case ISP_ARG_X:
            case ISP_TAG_X:
            case ISP_ENDTAG_X:
            case ISP_EQ:
                return HtmlTokenContext.WS;

            case ISI_VAL:
            case ISI_VAL_QUOT:
            case ISI_VAL_DQUOT:
                return HtmlTokenContext.VALUE;

            case ISI_SGML_DECL:
            case ISA_SGML_DECL_DASH:
                return HtmlTokenContext.DECLARATION;

            case ISI_SGML_COMMENT:
            case ISA_SGML_COMMENT_DASH:
                return HtmlTokenContext.SGML_COMMENT;

            case ISI_REF_NAME:
            case ISI_REF_DEC:
            case ISA_REF_X:
            case ISI_REF_HEX:
                return HtmlTokenContext.CHARACTER;
            }
        }

        return null;
    }