in japanese-addon/src/main/java/opennlp/tools/util/featuregen/lang/jpn/FeatureGeneratorUtil.java [41:89]
/**
 * Classifies a token by the kind of characters it consists of.
 *
 * <p>The candidate class is chosen from the first character and then checked
 * against every remaining character; the token is scanned at most once.
 * <ul>
 * <li>{@code "digit"} - every character satisfies {@link Character#isDigit(char)}</li>
 * <li>{@code "hira"} - the first character is in the HIRAGANA block and every
 *     following character is either in that block or one of the three
 *     connector marks (middle dot, prolonged sound mark, wave dash)</li>
 * <li>{@code "kata"} - same rule as {@code "hira"}, for the KATAKANA block</li>
 * <li>{@code "alpha"} - the first character is alphabetic and not in the
 *     CJK_UNIFIED_IDEOGRAPHS block, and every following character is
 *     alphabetic</li>
 * <li>{@code "other"} - anything else, including the empty token</li>
 * </ul>
 *
 * <p>NOTE(review): in the {@code "alpha"} case only the first character is
 * tested against CJK_UNIFIED_IDEOGRAPHS; later characters merely have to be
 * alphabetic. This is kept as-is so that generated features do not change.
 *
 * @param token the token to classify; must not be {@code null}
 * @return one of {@code "digit"}, {@code "hira"}, {@code "kata"},
 *         {@code "alpha"} or {@code "other"}
 * @throws NullPointerException if {@code token} is {@code null}
 */
public static String tokenFeature(String token) {
  Objects.requireNonNull(token, "token must be not null!");
  if (token.isEmpty()) {
    return "other";
  }

  final char first = token.charAt(0);

  if (Character.isDigit(first)) {
    for (int i = 1; i < token.length(); i++) {
      if (!Character.isDigit(token.charAt(i))) {
        return "other";
      }
    }
    return "digit";
  }

  // UnicodeBlock.of(...) yields null for a character that belongs to no
  // defined block. Blocks are compared by identity so that such a character
  // simply matches nothing instead of triggering a NullPointerException.
  final Character.UnicodeBlock firstBlock = Character.UnicodeBlock.of(first);

  if (firstBlock == Character.UnicodeBlock.HIRAGANA) {
    return restIsKanaOf(token, Character.UnicodeBlock.HIRAGANA) ? "hira" : "other";
  }

  if (firstBlock == Character.UnicodeBlock.KATAKANA) {
    return restIsKanaOf(token, Character.UnicodeBlock.KATAKANA) ? "kata" : "other";
  }

  if (Character.isAlphabetic(first)
      && firstBlock != Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS) {
    for (int i = 1; i < token.length(); i++) {
      if (!Character.isAlphabetic(token.charAt(i))) {
        return "other";
      }
    }
    return "alpha";
  }

  return "other";
}

/**
 * Tells whether every character after the first one is either a connector
 * mark or a member of the given Unicode block.
 */
private static boolean restIsKanaOf(String token, Character.UnicodeBlock kanaBlock) {
  for (int i = 1; i < token.length(); i++) {
    final char ch = token.charAt(i);
    // Connector marks are accepted inside both hiragana and katakana tokens.
    if (ch == '・' || ch == 'ー' || ch == '〜') {
      continue;
    }
    // Identity comparison is null-safe: a character outside any block fails here.
    if (Character.UnicodeBlock.of(ch) != kanaBlock) {
      return false;
    }
  }
  return true;
}