public static String tokenFeature()

in japanese-addon/src/main/java/opennlp/tools/util/featuregen/lang/jpn/FeatureGeneratorUtil.java [41:89]


  /**
   * Classifies a token into a coarse character-class feature.
   * <p>
   * The first character selects the candidate class and the remaining
   * characters must be consistent with it:
   * <ul>
   *   <li>{@code "digit"} - every character satisfies {@link Character#isDigit(char)}</li>
   *   <li>{@code "hira"} - the first character is in the HIRAGANA block and every
   *       following character is either in that block or one of the joiner
   *       characters '・', 'ー', '〜'</li>
   *   <li>{@code "kata"} - same rule as {@code "hira"}, for the KATAKANA block</li>
   *   <li>{@code "alpha"} - the first character is alphabetic and not in the
   *       CJK_UNIFIED_IDEOGRAPHS block, and every following character satisfies
   *       {@link Character#isAlphabetic(int)} (only the first character is checked
   *       against the CJK block)</li>
   *   <li>{@code "other"} - anything else, including the empty string</li>
   * </ul>
   * The token is inspected one UTF-16 {@code char} at a time.
   *
   * @param token the token to classify; must not be {@code null}
   * @return one of {@code "digit"}, {@code "hira"}, {@code "kata"},
   *         {@code "alpha"} or {@code "other"}
   * @throws NullPointerException if {@code token} is {@code null}
   */
  public static String tokenFeature(String token) {

    Objects.requireNonNull(token, "token must be not null!");

    if (token.isEmpty()) return "other";

    // The token is scanned only once: the first character picks the class,
    // the rest are validated against it.
    final char first = token.charAt(0);
    if (Character.isDigit(first)) {
      for (int i = 1; i < token.length(); i++) {
        if (!Character.isDigit(token.charAt(i))) return "other";
      }
      return "digit";
    }

    // UnicodeBlock.of(...) returns null for characters outside any defined
    // block, so blocks are compared by identity instead of calling equals()
    // on a possibly-null reference.
    final Character.UnicodeBlock block = Character.UnicodeBlock.of(first);
    if (block == Character.UnicodeBlock.HIRAGANA) {
      return isKanaRun(token, Character.UnicodeBlock.HIRAGANA) ? "hira" : "other";
    }
    if (block == Character.UnicodeBlock.KATAKANA) {
      return isKanaRun(token, Character.UnicodeBlock.KATAKANA) ? "kata" : "other";
    }
    if (Character.isAlphabetic(first)
        && block != Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS) {
      for (int i = 1; i < token.length(); i++) {
        if (!Character.isAlphabetic(token.charAt(i))) return "other";
      }
      return "alpha";
    }

    return "other";
  }

  /**
   * Checks whether every character after the first is either a kana joiner
   * ('・', 'ー', '〜') or belongs to the given Unicode block.
   */
  private static boolean isKanaRun(String token, Character.UnicodeBlock kanaBlock) {
    for (int i = 1; i < token.length(); i++) {
      final char c = token.charAt(i);
      if (c == '・' || c == 'ー' || c == '〜') {
        continue;
      }
      // Identity comparison is null-safe for characters without a block.
      if (Character.UnicodeBlock.of(c) != kanaBlock) {
        return false;
      }
    }
    return true;
  }