in opennlp-tools/src/main/java/opennlp/tools/util/featuregen/StringPattern.java [49:141]
/**
 * Scans {@code token} once and derives a {@link StringPattern} from it.
 * <p>
 * The scan starts with every {@code ALL_*} flag set and clears a flag as soon as
 * a character contradicts it, while {@code CONTAINS_*} flags are switched on
 * when a matching character is seen. Consequently, for an empty token the
 * initial {@code ALL_*} flags are all still set in the result.
 * <p>
 * The token is processed by Unicode code point, so a character encoded as a
 * surrogate pair is classified once, as a whole.
 *
 * @param token the token to classify; must not be {@code null}
 * @return a new {@link StringPattern} holding the computed flags and the number
 *         of decimal digits found in {@code token}
 */
public static StringPattern recognize(String token) {
  int pattern = ALL_CAPITAL_LETTER | ALL_LOWERCASE_LETTER | ALL_DIGIT | ALL_LETTERS
      | ALL_HIRAGANA | ALL_KATAKANA;
  int digits = 0;

  // Advance by code point rather than by char: a supplementary character must
  // not be judged as two unrelated surrogate halves.
  for (int i = 0; i < token.length(); ) {
    final int codePoint = token.codePointAt(i);
    final int letterType = Character.getType(codePoint);

    final boolean isLetter = letterType == Character.UPPERCASE_LETTER
        || letterType == Character.LOWERCASE_LETTER
        || letterType == Character.TITLECASE_LETTER
        || letterType == Character.MODIFIER_LETTER
        || letterType == Character.OTHER_LETTER;

    if (isLetter) {
      pattern |= CONTAINS_LETTERS;
      pattern &= ~ALL_DIGIT;

      if (letterType == Character.UPPERCASE_LETTER) {
        if (i == 0) {
          pattern |= INITAL_CAPITAL_LETTER;
        }
        pattern |= CONTAINS_UPPERCASE;
        pattern &= ~ALL_LOWERCASE_LETTER;
      } else {
        // Any letter that is not upper case rules out "all capital".
        pattern &= ~ALL_CAPITAL_LETTER;
      }
    } else {
      // A non-letter rules out every letter-only classification.
      pattern &= ~(ALL_LETTERS | ALL_CAPITAL_LETTER | ALL_LOWERCASE_LETTER);

      if (letterType == Character.DECIMAL_DIGIT_NUMBER) {
        pattern |= CONTAINS_DIGIT;
        pattern &= ~(ALL_HIRAGANA | ALL_KATAKANA);
        digits++;
      } else {
        pattern &= ~ALL_DIGIT;
      }

      switch (codePoint) {
        case ',':
          pattern |= CONTAINS_COMMA;
          break;
        case '.':
          pattern |= CONTAINS_PERIOD;
          break;
        case '/':
          pattern |= CONTAINS_SLASH;
          break;
        case '-':
          pattern |= CONTAINS_HYPHEN;
          break;
        default:
          break;
      }
    }

    // Script-based checks, needed for the Japanese kana flags.
    final Character.UnicodeScript us = Character.UnicodeScript.of(codePoint);
    if (us == Character.UnicodeScript.COMMON) {
      // Script-neutral characters break the kana-only flags, except for the
      // marks '・', 'ー' and '〜', which are tolerated.
      if (codePoint != '・' && codePoint != 'ー' && codePoint != '〜') {
        pattern &= ~(ALL_HIRAGANA | ALL_KATAKANA);
      }
    } else if (us == Character.UnicodeScript.HIRAGANA) {
      pattern &= ~(ALL_KATAKANA | ALL_LOWERCASE_LETTER);
    } else if (us == Character.UnicodeScript.KATAKANA) {
      pattern &= ~(ALL_HIRAGANA | ALL_LOWERCASE_LETTER);
    } else if (us == Character.UnicodeScript.HAN) {
      pattern &= ~(ALL_HIRAGANA | ALL_KATAKANA | ALL_LOWERCASE_LETTER);
    } else {
      // LATIN and every other script (Cyrillic, Greek, Hangul, ...): a
      // character that is not kana can never leave a kana-only flag standing.
      pattern &= ~(ALL_HIRAGANA | ALL_KATAKANA);
    }

    i += Character.charCount(codePoint);
  }

  return new StringPattern(pattern, digits);
}