private static IntCharSet getPreClass()

in jflex/src/main/java/jflex/core/RegExp.java [533:683]


  /**
   * Returns the character set for a predefined character class, computing it on first request and
   * storing it in {@code preclassCache} under {@code type}.
   *
   * <p>The instance stored in the cache is the instance that is returned. NOTE(review): callers
   * presumably must not mutate the returned set, since later lookups hand out the same object -
   * confirm against call sites.
   *
   * @param preclassCache cache of already computed sets, keyed by class type; read and updated
   * @param charClasses supplies the Unicode properties and the maximum character code
   * @param type one of the {@code sym.*CLASS} constants handled below
   * @return the set for {@code type}
   * @throws CharClassException if {@code type} is not one of the handled constants
   */
  private static IntCharSet getPreClass(
      Map<Integer, IntCharSet> preclassCache, CharClasses charClasses, int type) {
    IntCharSet result = preclassCache.get(type);
    if (null != result) {
      return result;
    }

    UnicodeProperties unicodeProperties = charClasses.getUnicodeProperties();
    switch (type) {
      case sym.LETTERCLASS:
        result = unicodeProperties.getIntCharSet("L");
        break;

      case sym.DIGITCLASS:
        result = unicodeProperties.getIntCharSet("Nd");
        break;

      case sym.DIGITCLASSNOT:
        result = preClassComplement(unicodeProperties, unicodeProperties.getIntCharSet("Nd"));
        break;

      case sym.UPPERCLASS:
        // "Uppercase" is more than Uppercase_Letter, but older Unicode
        // versions don't have this definition - fall back to Uppercase_Letter (Lu).
        result = preClassWithFallback(unicodeProperties, "Uppercase", "Lu");
        break;

      case sym.LOWERCLASS:
        // "Lowercase" is more than Lowercase_Letter, but older Unicode
        // versions don't have this definition - fall back to Lowercase_Letter (Ll).
        result = preClassWithFallback(unicodeProperties, "Lowercase", "Ll");
        break;

      case sym.WHITESPACECLASS:
        // Although later versions do, Unicode 1.1 does not have the
        // "Whitespace" definition - fall back to "Space_separator" (Zs).
        result = preClassWithFallback(unicodeProperties, "Whitespace", "Zs");
        break;

      case sym.WHITESPACECLASSNOT:
        // Same lookup as WHITESPACECLASS, then everything except that set.
        result =
            preClassComplement(
                unicodeProperties, preClassWithFallback(unicodeProperties, "Whitespace", "Zs"));
        break;

      case sym.WORDCLASS:
        // UTR#18: \w = [\p{alpha}\p{gc=Mark}\p{digit}\p{gc=Connector_Punctuation}]
        result = preClassWord(unicodeProperties);
        break;

      case sym.WORDCLASSNOT:
        // UTR#18: \W = [^\p{alpha}\p{gc=Mark}\p{digit}\p{gc=Connector_Punctuation}]
        result = preClassComplement(unicodeProperties, preClassWord(unicodeProperties));
        break;

      case sym.JLETTERCLASS:
      case sym.JLETTERDIGITCLASS:
        result = preClassJavaIdentifier(charClasses, type);
        break;

      default:
        throw new CharClassException("Unknown predefined char class type: " + type);
    }

    preclassCache.put(type, result);
    return result;
  }

  /**
   * Looks up {@code property} and, if that lookup yields {@code null}, looks up {@code fallback}
   * instead.
   */
  private static IntCharSet preClassWithFallback(
      UnicodeProperties unicodeProperties, String property, String fallback) {
    IntCharSet set = unicodeProperties.getIntCharSet(property);
    if (null == set) {
      set = unicodeProperties.getIntCharSet(fallback);
    }
    return set;
  }

  /**
   * Builds a new set covering {@code 0} through the maximum code point of {@code
   * unicodeProperties} with {@code excluded} subtracted from it.
   */
  private static IntCharSet preClassComplement(
      UnicodeProperties unicodeProperties, IntCharSet excluded) {
    IntCharSet complement =
        IntCharSet.ofCharacterRange(0, unicodeProperties.getMaximumCodePoint());
    complement.sub(excluded);
    return complement;
  }

  /**
   * Builds a new set that is a copy of the alphabetic set with the "M", "Nd" and connector
   * punctuation sets added.
   */
  private static IntCharSet preClassWord(UnicodeProperties unicodeProperties) {
    // For Unicode 1.1, substitute "Letter" (L) for "Alphabetic".
    IntCharSet alphaClass = preClassWithFallback(unicodeProperties, "Alphabetic", "L");
    IntCharSet markClass = unicodeProperties.getIntCharSet("M");
    IntCharSet digitClass = unicodeProperties.getIntCharSet("Nd");
    IntCharSet connectorPunctClass = unicodeProperties.getIntCharSet("Pc");
    if (null == connectorPunctClass) {
      // For Unicode 1.1, substitute "_" for "Connector_Punctuation".
      connectorPunctClass = IntCharSet.ofCharacter('_');
    }

    // Copy first so the set obtained from unicodeProperties is not modified by the adds.
    IntCharSet wordClass = IntCharSet.copyOf(alphaClass);
    wordClass.add(markClass);
    wordClass.add(digitClass);
    wordClass.add(connectorPunctClass);
    return wordClass;
  }

  /**
   * Collects every code in {@code [0, charClasses.getMaxCharCode()]} for which {@code
   * checkJPartStart(type, c)} is true, added to a new set as maximal contiguous intervals in
   * ascending order.
   */
  private static IntCharSet preClassJavaIdentifier(CharClasses charClasses, int type) {
    IntCharSet members = new IntCharSet();
    int last = charClasses.getMaxCharCode();

    // Start of the run of accepted codes currently being scanned, or -1 when not inside a run.
    int runStart = -1;

    // Inclusive upper bound, so no code above `last` is ever probed.
    // Assumes getMaxCharCode() < Integer.MAX_VALUE so that c++ cannot wrap - TODO confirm.
    for (int c = 0; c <= last; c++) {
      if (checkJPartStart(type, c)) {
        if (runStart < 0) {
          runStart = c;
        }
      } else if (runStart >= 0) {
        members.add(new Interval(runStart, c - 1));
        runStart = -1;
      }
    }

    // A run still open after the scan extends through `last`.
    if (runStart >= 0) {
      members.add(new Interval(runStart, last));
    }
    return members;
  }