in jena-arq/src/main/java/org/apache/jena/ext/xerces_regex/ParserForXMLSchema.java [196:336]
/**
 * Parses a bracketed character class and returns the ranges it denotes.
 *
 * <p>The current token on entry is skipped as the opening '['. While parsing,
 * the tokenizer context is {@code S_INBRACKETS}; before returning it is reset
 * to {@code S_NORMAL} and the closing ']' is consumed.
 *
 * <p>Supported members:
 * <ul>
 *   <li>a leading '^', which negates the class: the result is
 *       {@code [0, RX_Token.UTF16_MAX]} minus the listed members;</li>
 *   <li>the shorthand escapes d, D, w, W, s, S (merged via
 *       {@code getTokenForShorthand});</li>
 *   <li>the escapes i, I, c, C (delegated to
 *       {@code processCIinCharacterClass});</li>
 *   <li>the escapes p and P (resolved via {@code processBacksolidus_pP});</li>
 *   <li>any other escape, resolved to a single code point by
 *       {@code decodeEscaped};</li>
 *   <li>single characters and {@code from-to} ranges, expanded through
 *       {@code addCaseInsensitiveChar} / {@code addCaseInsensitiveCharRange}
 *       when {@code RegularExpression.IGNORE_CASE} is set;</li>
 *   <li>a {@code T_XMLSCHEMA_CC_SUBTRACTION} token (not as the first member),
 *       after which a nested character class is parsed recursively and
 *       subtracted from what was collected so far.</li>
 * </ul>
 *
 * @param useNrange not read by this implementation.
 *        NOTE(review): presumably kept for signature compatibility with the
 *        parser this class specialises - confirm against the superclass.
 * @return the sorted and compacted range token for the class
 * @throws RegexParseException with key
 *         {@code parser.atom.5} if a p/P escape cannot be resolved;
 *         {@code parser.cc.2} if input ends before the class is closed;
 *         {@code parser.cc.5} if a subtraction is not followed by ']';
 *         {@code parser.cc.6} / {@code parser.cc.7} for an unescaped '[' / ']'
 *         used as a member or range end;
 *         {@code parser.cc.8} for a misplaced '-';
 *         {@code parser.ope.3} if a range start is greater than its end
 */
protected RangeToken parseCharacterClass(boolean useNrange) throws RegexParseException {
    this.setContext(S_INBRACKETS);
    this.next();                            // '['
    boolean nrange = false;
    RangeToken base = null;                 // full code-point range; only set for a negated class
    if (this.read() == T_CHAR && this.chardata == '^') {
        nrange = true;
        this.next();                        // '^'
        base = RX_Token.createRange();
        base.addRange(0, RX_Token.UTF16_MAX);
    }
    RangeToken tok = RX_Token.createRange();
    int type;
    boolean firstloop = true;
    // Don't use 'continue' in this loop: firstloop is cleared at the bottom of
    // the body and must be cleared after every member.
    while ((type = this.read()) != T_EOF) {
        // single-range | from-to-range | subtraction
        if (type == T_CHAR && this.chardata == ']' && !firstloop) {
            // Closing bracket. For a negated class, turn the collected members
            // into their complement.
            if (nrange) {
                base.subtractRanges(tok);
                tok = base;
            }
            break;
        }
        int c = this.chardata;
        boolean end = false;                // true once this member has been fully merged into tok
        if (type == T_BACKSOLIDUS) {
            switch (c) {
              case 'd': case 'D':
              case 'w': case 'W':
              case 's': case 'S':
                tok.mergeRanges(this.getTokenForShorthand(c));
                end = true;
                break;
              case 'i': case 'I':
              case 'c': case 'C':
                c = this.processCIinCharacterClass(tok, c);
                // A negative result means nothing is left to add for this member.
                if (c < 0) {
                    end = true;
                }
                break;
              case 'p':
              case 'P':
                int pstart = this.offset;
                RangeToken tok2 = this.processBacksolidus_pP(c);
                if (tok2 == null) throw this.ex("parser.atom.5", pstart);
                tok.mergeRanges(tok2);
                end = true;
                break;
              default:
                // Every other escape (an escaped '-' included) is resolved
                // to the code point that c then holds.
                c = this.decodeEscaped();
            } // \ + c
        } // backsolidus
        else if (type == T_XMLSCHEMA_CC_SUBTRACTION && !firstloop) {
            // Subtraction: finish the left-hand side (applying negation if
            // requested), then remove the nested class from it.
            if (nrange) {
                base.subtractRanges(tok);
                tok = base;
            }
            RangeToken range2 = this.parseCharacterClass(false);
            tok.subtractRanges(range2);
            if (this.read() != T_CHAR || this.chardata != ']')
                throw this.ex("parser.cc.5", this.offset);
            break;                          // Exit this loop
        }
        this.next();
        if (!end) {                         // if not shorthands...
            if (type == T_CHAR) {
                if (c == '[')  throw this.ex("parser.cc.6", this.offset-2);
                if (c == ']')  throw this.ex("parser.cc.7", this.offset-2);
                // An unescaped '-' that is not the first member is rejected
                // unless the chardata of the following token is ']'.
                if (c == '-' && this.chardata != ']' && !firstloop)  throw this.ex("parser.cc.8", this.offset-2);
            }
            if (this.read() != T_CHAR || this.chardata != '-' || (c == '-' && firstloop)) {
                // No '-' follows (or c is itself a leading '-'): c is a single member.
                if (!this.isSet(RegularExpression.IGNORE_CASE) || c > 0xffff) {
                    tok.addRange(c, c);
                }
                else {
                    addCaseInsensitiveChar(tok, c);
                }
            } else {                        // Found '-'
                // Decide whether this '-' introduces a from-to range.
                this.next();                // Skips '-'
                if ((type = this.read()) == T_EOF)  throw this.ex("parser.cc.2", this.offset);
                if (type == T_CHAR && this.chardata == ']') {
                    // c '-' ']' : the '-' is the last member of the class, so
                    // both c and '-' are added as single characters.
                    if (!this.isSet(RegularExpression.IGNORE_CASE) || c > 0xffff) {
                        tok.addRange(c, c);
                    }
                    else {
                        addCaseInsensitiveChar(tok, c);
                    }
                    tok.addRange('-', '-');
                }
                else if (type == T_XMLSCHEMA_CC_SUBTRACTION) {
                    throw this.ex("parser.cc.8", this.offset-1);
                } else {
                    // c '-' rangeend : a from-to range.
                    int rangeend = this.chardata;
                    if (type == T_CHAR) {
                        if (rangeend == '[')  throw this.ex("parser.cc.6", this.offset-1);
                        if (rangeend == ']')  throw this.ex("parser.cc.7", this.offset-1);
                        if (rangeend == '-')  throw this.ex("parser.cc.8", this.offset-2);
                    }
                    else if (type == T_BACKSOLIDUS)
                        rangeend = this.decodeEscaped();
                    this.next();
                    if (c > rangeend)  throw this.ex("parser.ope.3", this.offset-1);
                    if (!this.isSet(RegularExpression.IGNORE_CASE) ||
                            (c > 0xffff && rangeend > 0xffff)) {
                        tok.addRange(c, rangeend);
                    }
                    else {
                        addCaseInsensitiveCharRange(tok, c, rangeend);
                    }
                }
            }
        }
        firstloop = false;
    }
    // Reaching end of input here means the class was never closed.
    if (this.read() == T_EOF)
        throw this.ex("parser.cc.2", this.offset);
    tok.sortRanges();
    tok.compactRanges();
    this.setContext(S_NORMAL);
    this.next();                            // Skips ']'
    return tok;
}