protected RangeToken parseCharacterClass()

in jena-arq/src/main/java/org/apache/jena/ext/xerces_regex/ParserForXMLSchema.java [196:336]


    protected RangeToken parseCharacterClass(boolean useNrange) throws RegexParseException {
        this.setContext(S_INBRACKETS);
        this.next();                            // '['
        boolean nrange = false;
        boolean wasDecoded = false;     		// used to detect if the last - was escaped.
        RangeToken base = null;
        RangeToken tok;
        if (this.read() == T_CHAR && this.chardata == '^') {
            nrange = true;
            this.next();                        // '^'
            base = RX_Token.createRange();
            base.addRange(0, RX_Token.UTF16_MAX);
            tok = RX_Token.createRange();
        } else {
            tok = RX_Token.createRange();
        }
        int type;
        boolean firstloop = true;
        while ((type = this.read()) != T_EOF) { // Don't use 'cotinue' for this loop.

        	wasDecoded = false;
            // single-range | from-to-range | subtraction
            if (type == T_CHAR && this.chardata == ']' && !firstloop) {
                if (nrange) {
                    base.subtractRanges(tok);
                    tok = base;
                }
                break;
            }
            int c = this.chardata;
            boolean end = false;
            if (type == T_BACKSOLIDUS) {
                switch (c) {
                  case 'd':  case 'D':
                  case 'w':  case 'W':
                  case 's':  case 'S':
                    tok.mergeRanges(this.getTokenForShorthand(c));
                    end = true;
                    break;

                  case 'i':  case 'I':
                  case 'c':  case 'C':
                    c = this.processCIinCharacterClass(tok, c);
                    if (c < 0)  end = true;
                    break;

                  case 'p':
                  case 'P':
                    int pstart = this.offset;
                    RangeToken tok2 = this.processBacksolidus_pP(c);
                    if (tok2 == null)  throw this.ex("parser.atom.5", pstart);
                    tok.mergeRanges(tok2);
                    end = true;
                    break;

                 case '-':
                 	c = this.decodeEscaped();
                 	wasDecoded = true;
                 	break;

                  default:
                    c = this.decodeEscaped();
                } // \ + c
            } // backsolidus
            else if (type == T_XMLSCHEMA_CC_SUBTRACTION && !firstloop) {
                                                // Subraction
                if (nrange) {
                    base.subtractRanges(tok);
                    tok = base;
                }
                RangeToken range2 = this.parseCharacterClass(false);
                tok.subtractRanges(range2);
                if (this.read() != T_CHAR || this.chardata != ']')
                    throw this.ex("parser.cc.5", this.offset);
                break;                          // Exit this loop
            }
            this.next();
            if (!end) {                         // if not shorthands...
                if (type == T_CHAR) {
                    if (c == '[')  throw this.ex("parser.cc.6", this.offset-2);
                    if (c == ']')  throw this.ex("parser.cc.7", this.offset-2);
                    if (c == '-' && this.chardata != ']' && !firstloop)  throw this.ex("parser.cc.8", this.offset-2);	// if regex = '[-]' then invalid
                }
                if (this.read() != T_CHAR || this.chardata != '-' || c == '-' && firstloop) { // Here is no '-'.
                    if (!this.isSet(RegularExpression.IGNORE_CASE) || c > 0xffff) {
                        tok.addRange(c, c);
                    }
                    else {
                        addCaseInsensitiveChar(tok, c);
                    }
                } else {                        // Found '-'
                                                // Is this '-' is a from-to token??
                    this.next(); // Skips '-'
                    if ((type = this.read()) == T_EOF)  throw this.ex("parser.cc.2", this.offset);
                                                // c '-' ']' -> '-' is a single-range.
                    if(type == T_CHAR && this.chardata == ']') {				// if - is at the last position of the group
                        if (!this.isSet(RegularExpression.IGNORE_CASE) || c > 0xffff) {
                    	    tok.addRange(c, c);
                        }
                        else {
                            addCaseInsensitiveChar(tok, c);
                        }
                    	tok.addRange('-', '-');
                    }
                    else if (type == T_XMLSCHEMA_CC_SUBTRACTION) {
                        throw this.ex("parser.cc.8", this.offset-1);
                    } else {

                        int rangeend = this.chardata;
                        if (type == T_CHAR) {
                            if (rangeend == '[')  throw this.ex("parser.cc.6", this.offset-1);
                            if (rangeend == ']')  throw this.ex("parser.cc.7", this.offset-1);
                            if (rangeend == '-')  throw this.ex("parser.cc.8", this.offset-2);
                        }
                        else if (type == T_BACKSOLIDUS)
                            rangeend = this.decodeEscaped();
                        this.next();

                        if (c > rangeend)  throw this.ex("parser.ope.3", this.offset-1);
                        if (!this.isSet(RegularExpression.IGNORE_CASE) ||
                                (c > 0xffff && rangeend > 0xffff)) {
                            tok.addRange(c, rangeend);
                        }
                        else {
                            addCaseInsensitiveCharRange(tok, c, rangeend);
                        }
                    }
                }
            }
            firstloop = false;
        }
        if (this.read() == T_EOF)
            throw this.ex("parser.cc.2", this.offset);
        tok.sortRanges();
        tok.compactRanges();
        //tok.dumpRanges();
        this.setContext(S_NORMAL);
        this.next();                    // Skips ']'

        return tok;
    }