in jflex/src/main/java/jflex/core/RegExp.java [533:683]
/**
 * Returns the character set for a predefined character class, computing it on the first request
 * and storing it in {@code preclassCache} under {@code type}.
 *
 * @param preclassCache sets computed by earlier calls, keyed by class type; it is consulted first
 *     and receives the newly computed set on a miss
 * @param charClasses supplies the Unicode properties and the maximum character code
 * @param type the {@code sym} constant that selects the predefined class
 * @return the cached set if one is present, otherwise the set computed for {@code type}
 * @throws CharClassException if {@code type} is not one of the predefined classes handled here
 */
private static IntCharSet getPreClass(
    Map<Integer, IntCharSet> preclassCache, CharClasses charClasses, int type) {
  IntCharSet cached = preclassCache.get(type);
  if (cached != null) {
    return cached;
  }

  UnicodeProperties props = charClasses.getUnicodeProperties();
  IntCharSet computed;
  switch (type) {
    case sym.LETTERCLASS:
      computed = props.getIntCharSet("L");
      break;

    case sym.DIGITCLASS:
      computed = props.getIntCharSet("Nd");
      break;

    case sym.DIGITCLASSNOT:
      computed = preClassComplement(props, props.getIntCharSet("Nd"));
      break;

    case sym.UPPERCLASS:
      // "Uppercase" covers more than Uppercase_Letter, but older Unicode versions
      // do not define it; Uppercase_Letter (Lu) is the substitute in that case.
      computed = preClassProperty(props, "Uppercase", "Lu");
      break;

    case sym.LOWERCLASS:
      // "Lowercase" covers more than Lowercase_Letter, but older Unicode versions
      // do not define it; Lowercase_Letter (Ll) is the substitute in that case.
      computed = preClassProperty(props, "Lowercase", "Ll");
      break;

    case sym.WHITESPACECLASS:
      // Unicode 1.1 has no "Whitespace" definition; Space_separator (Zs) is the
      // substitute in that case.
      computed = preClassProperty(props, "Whitespace", "Zs");
      break;

    case sym.WHITESPACECLASSNOT:
      // Same lookup as for the whitespace class, then everything else is kept.
      computed = preClassComplement(props, preClassProperty(props, "Whitespace", "Zs"));
      break;

    case sym.WORDCLASS:
      // UTR#18: \w = [\p{alpha}\p{gc=Mark}\p{digit}\p{gc=Connector_Punctuation}]
      computed = preClassWord(props);
      break;

    case sym.WORDCLASSNOT:
      // UTR#18: \W = [^\p{alpha}\p{gc=Mark}\p{digit}\p{gc=Connector_Punctuation}]
      computed = preClassComplement(props, preClassWord(props));
      break;

    case sym.JLETTERCLASS:
    case sym.JLETTERDIGITCLASS:
      computed = preClassJavaIdent(charClasses, type);
      break;

    default:
      throw new CharClassException("Unknown predefined char class type: " + type);
  }

  preclassCache.put(type, computed);
  return computed;
}

/** Looks up {@code primary}; when that lookup yields null, looks up {@code fallback} instead. */
private static IntCharSet preClassProperty(
    UnicodeProperties props, String primary, String fallback) {
  IntCharSet set = props.getIntCharSet(primary);
  return set != null ? set : props.getIntCharSet(fallback);
}

/** Builds the range from 0 to the maximum code point and subtracts {@code excluded} from it. */
private static IntCharSet preClassComplement(UnicodeProperties props, IntCharSet excluded) {
  IntCharSet everything = IntCharSet.ofCharacterRange(0, props.getMaximumCodePoint());
  everything.sub(excluded);
  return everything;
}

/** Builds a new set holding the alphabetic, mark, decimal digit and connector characters. */
private static IntCharSet preClassWord(UnicodeProperties props) {
  // For Unicode 1.1, "Letter" (L) stands in for "Alphabetic".
  IntCharSet alphabetic = preClassProperty(props, "Alphabetic", "L");
  IntCharSet marks = props.getIntCharSet("M");
  IntCharSet decimalDigits = props.getIntCharSet("Nd");
  IntCharSet connectors = props.getIntCharSet("Pc");
  if (connectors == null) {
    // For Unicode 1.1, "_" stands in for "Connector_Punctuation".
    connectors = IntCharSet.ofCharacter('_');
  }

  IntCharSet word = IntCharSet.copyOf(alphabetic);
  word.add(marks);
  word.add(decimalDigits);
  word.add(connectors);
  return word;
}

/**
 * Collects, as intervals, the runs of consecutive character codes for which
 * {@code checkJPartStart(type, code)} is true.
 */
private static IntCharSet preClassJavaIdent(CharClasses charClasses, int type) {
  IntCharSet set = new IntCharSet();
  int last = charClasses.getMaxCharCode();

  int runStart = 0;
  boolean inRun = checkJPartStart(type, 0);
  int code = 1;
  while (code < last) {
    boolean member = checkJPartStart(type, code);
    if (member != inRun) {
      if (member) {
        // a run opens at this code
        runStart = code;
      } else {
        // the run that was open ended on the previous code
        set.add(new Interval(runStart, code - 1));
      }
    }
    inRun = member;
    code++;
  }

  // The final code is examined after the loop (the loop stops one short of it),
  // so the loop counter never has to step past the maximum character code.
  boolean member = checkJPartStart(type, code);
  if (member) {
    set.add(new Interval(inRun ? runStart : code, code));
  } else if (inRun) {
    set.add(new Interval(runStart, code - 1));
  }
  return set;
}