public static StringPattern recognize()

in opennlp-tools/src/main/java/opennlp/tools/util/featuregen/StringPattern.java [49:141]


  /**
   * Scans {@code token} once and summarises which kinds of characters it contains.
   *
   * <p>The result starts with every {@code ALL_*} flag set; each character that contradicts
   * a flag clears it, and {@code CONTAINS_*} / {@code INITAL_CAPITAL_LETTER} flags are added
   * as matching characters are seen. Consequently an empty token keeps all {@code ALL_*}
   * flags, because no character is there to clear them.
   *
   * <p>The token is processed code point by code point, so a character outside the Basic
   * Multilingual Plane (encoded as a surrogate pair) is classified once, as a whole.
   *
   * @param token the token to analyse; must not be {@code null}
   * @return a {@code StringPattern} built from the collected flags and the number of
   *         decimal digits found in {@code token}
   * @throws NullPointerException if {@code token} is {@code null}
   */
  public static StringPattern recognize(String token) {

    int pattern = ALL_CAPITAL_LETTER | ALL_LOWERCASE_LETTER | ALL_DIGIT | ALL_LETTERS
        | ALL_HIRAGANA | ALL_KATAKANA;

    int digits = 0;

    int i = 0;
    while (i < token.length()) {
      // Read a full code point: classifying the two halves of a surrogate pair separately
      // would report every supplementary letter or digit as two non-letter characters.
      final int codePoint = token.codePointAt(i);
      final int letterType = Character.getType(codePoint);
      final boolean isLetter = letterType == Character.UPPERCASE_LETTER ||
          letterType == Character.LOWERCASE_LETTER ||
          letterType == Character.TITLECASE_LETTER ||
          letterType == Character.MODIFIER_LETTER ||
          letterType == Character.OTHER_LETTER;

      if (isLetter) {
        pattern |= CONTAINS_LETTERS;
        pattern &= ~ALL_DIGIT;

        if (letterType == Character.UPPERCASE_LETTER) {
          if (i == 0) {
            pattern |= INITAL_CAPITAL_LETTER;
          }

          pattern |= CONTAINS_UPPERCASE;

          pattern &= ~ALL_LOWERCASE_LETTER;
        } else {
          // Any letter that is not upper case rules out "all capital".
          pattern &= ~ALL_CAPITAL_LETTER;
        }
      } else {
        // A non-letter character rules out every letter-only classification.
        pattern &= ~ALL_LETTERS;
        pattern &= ~ALL_CAPITAL_LETTER;
        pattern &= ~ALL_LOWERCASE_LETTER;

        if (letterType == Character.DECIMAL_DIGIT_NUMBER) {
          pattern |= CONTAINS_DIGIT;
          pattern &= ~(ALL_HIRAGANA | ALL_KATAKANA);
          digits++;
        } else {
          pattern &= ~ALL_DIGIT;
        }

        switch (codePoint) {
          case ',':
            pattern |= CONTAINS_COMMA;
            break;

          case '.':
            pattern |= CONTAINS_PERIOD;
            break;

          case '/':
            pattern |= CONTAINS_SLASH;
            break;

          case '-':
            pattern |= CONTAINS_HYPHEN;
            break;

          default:
            break;
        }
      }

      // Japanese script tracking.
      final Character.UnicodeScript us = Character.UnicodeScript.of(codePoint);
      if (us == Character.UnicodeScript.COMMON) {
        // Script-neutral characters break a kana run, except for these three marks,
        // which are allowed inside hiragana and katakana words.
        if (codePoint != '・' && codePoint != 'ー' && codePoint != '〜') {
          pattern &= ~(ALL_HIRAGANA | ALL_KATAKANA);
        }
      } else if (us == Character.UnicodeScript.HIRAGANA) {
        pattern &= ~(ALL_KATAKANA | ALL_LOWERCASE_LETTER);
      } else if (us == Character.UnicodeScript.KATAKANA) {
        pattern &= ~(ALL_HIRAGANA | ALL_LOWERCASE_LETTER);
      } else if (us == Character.UnicodeScript.HAN) {
        pattern &= ~(ALL_HIRAGANA | ALL_KATAKANA | ALL_LOWERCASE_LETTER);
      } else if (us != Character.UnicodeScript.INHERITED) {
        // Latin and every other script (Cyrillic, Greek, Hangul, ...) is neither hiragana
        // nor katakana. INHERITED characters (e.g. combining marks) take the script of the
        // character they attach to, so they leave the kana flags untouched.
        pattern &= ~(ALL_HIRAGANA | ALL_KATAKANA);
      }

      i += Character.charCount(codePoint);
    }

    return new StringPattern(pattern, digits);
  }