in japanese-addon/src/main/java/opennlp/tools/util/featuregen/lang/jpn/TokenPatternFeatureGenerator.java [55:89]
/**
 * Adds pattern features for the token at {@code index}, derived from the
 * sub-tokens that {@code tokenizer} produces for that token.
 *
 * <p>When the token yields exactly one sub-token, a single {@code st=} feature
 * holding the lower-cased original token is added and nothing else. Otherwise
 * the following features are added:
 * <ul>
 *   <li>{@code stn=} – the number of sub-tokens,</li>
 *   <li>{@code pt2=} – the concatenated token-feature classes of every pair of
 *       adjacent sub-tokens,</li>
 *   <li>{@code pt3=} – the same for every run of three adjacent sub-tokens,</li>
 *   <li>{@code st=} – the lower-cased sub-token, for each sub-token in which
 *       the {@code noLetters} pattern finds no match,</li>
 *   <li>{@code pta=} – the concatenated token-feature classes of all
 *       sub-tokens.</li>
 * </ul>
 *
 * @param feats the list the generated features are appended to
 * @param toks  the tokens of the sequence being processed
 * @param index the position in {@code toks} of the token to generate features for
 * @param preds not read by this generator
 */
public void createFeatures(List<String> feats, String[] toks, int index, String[] preds) {
  String[] tokenized = tokenizer.tokenize(toks[index]);
  int count = tokenized.length;

  // A token that is not split further only contributes its lower-cased form.
  if (count == 1) {
    feats.add("st=" + StringUtil.toLowerCase(toks[index]));
    return;
  }

  feats.add("stn=" + count);

  // Compute the feature class of every sub-token exactly once. The bigram,
  // trigram and full-pattern features below all reuse these values instead of
  // re-deriving them for each window a sub-token takes part in.
  // NOTE(review): relies on tokenFeature being a pure function of its argument.
  String[] shapes = new String[count];
  for (int i = 0; i < count; i++) {
    shapes[i] = FeatureGeneratorUtil.tokenFeature(tokenized[i]);
  }

  StringBuilder pattern = new StringBuilder();
  for (int i = 0; i < count; i++) {
    // Bigram of feature classes starting at i.
    if (i < count - 1) {
      feats.add("pt2=" + shapes[i] + shapes[i + 1]);
    }

    // Trigram of feature classes starting at i.
    if (i < count - 2) {
      feats.add("pt3=" + shapes[i] + shapes[i + 1] + shapes[i + 2]);
    }

    pattern.append(shapes[i]);

    // Emit the sub-token itself only when noLetters does not match inside it.
    if (!noLetters.matcher(tokenized[i]).find()) {
      feats.add("st=" + StringUtil.toLowerCase(tokenized[i]));
    }
  }

  feats.add("pta=" + pattern);
}