private static int getCharacterType()

in WhitespaceTokenizer/src/main/java/org/apache/uima/annotator/WhitespaceTokenizer.java [247:306]


  /**
   * Classifies a single character into one of the tokenizer's character classes, using the
   * Unicode general category reported by {@link Character#getType(char)}.
   *
   * @param character
   *          the character to classify
   * @return one of {@code CH_LETTER}, {@code CH_NUMBER}, {@code CH_WHITESPACE},
   *         {@code CH_PUNCTUATION}, {@code CH_NEWLINE} or {@code CH_SPECIAL}
   */
  private static int getCharacterType(char character) {

    final int category = Character.getType(character);

    // Control characters cannot be classified by category alone: line breaks, tabs and
    // the remaining control codes all share the CONTROL category.
    if (category == Character.CONTROL) {
      if (character == '\n' || character == '\r') {
        return CH_NEWLINE;
      }
      // tab (among others) is a CONTROL character that still counts as whitespace
      return Character.isWhitespace(character) ? CH_WHITESPACE : CH_SPECIAL;
    }

    switch (category) {

    // punctuation characters
    case Character.OTHER_PUNCTUATION:
    case Character.END_PUNCTUATION:
    case Character.START_PUNCTUATION:
    case Character.DASH_PUNCTUATION:
      return CH_PUNCTUATION;

    // explicit Unicode line / paragraph breaks
    case Character.PARAGRAPH_SEPARATOR:
    case Character.LINE_SEPARATOR:
      return CH_NEWLINE;

    // whitespace characters
    // (CONNECTOR_PUNCTUATION is not listed here; it ends up in the fallback branch)
    case Character.SPACE_SEPARATOR:
      return CH_WHITESPACE;

    // number characters
    case Character.OTHER_NUMBER:
    case Character.LETTER_NUMBER:
    case Character.DECIMAL_DIGIT_NUMBER:
      return CH_NUMBER;

    // letter characters, plus marks, private-use, surrogate and modifier-symbol
    // characters, which are all treated like letters
    case Character.MODIFIER_SYMBOL:
    case Character.SURROGATE:
    case Character.PRIVATE_USE:
    case Character.COMBINING_SPACING_MARK:
    case Character.ENCLOSING_MARK:
    case Character.NON_SPACING_MARK:
    case Character.OTHER_LETTER:
    case Character.MODIFIER_LETTER:
    case Character.TITLECASE_LETTER:
    case Character.LOWERCASE_LETTER:
    case Character.UPPERCASE_LETTER:
      return CH_LETTER;

    default:
      // Fallback for every category not matched above. The isWhitespace check is kept
      // out of the common path because it costs slightly more than the switch, and it
      // runs only after the newline categories have been handled so that it cannot
      // swallow characters the calling code expects to see as CH_NEWLINE.
      return Character.isWhitespace(character) ? CH_WHITESPACE : CH_SPECIAL;
    }
  }