in opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME.java [375:439]
public static void populatePOSDictionary(ObjectStream<POSSample> samples,
    MutableTagDictionary dict, int cutoff) throws IOException {
  // Purpose: count (word, tag) pairs across all samples read from the stream
  // and write every pair whose count reaches the cutoff into the dictionary.
  //   samples - read until read() returns null
  //   dict    - queried for case sensitivity and existing tags, then updated via put()
  //   cutoff  - minimum count a (word, tag) pair needs in order to be stored
  logger.info("Expanding POS Dictionary ...");
  final long startedAt = System.nanoTime();

  // word -> (tag -> counter for that word/tag pair)
  final Map<String, Map<String, AtomicInteger>> countsByWord = new HashMap<>();

  for (POSSample current = samples.read(); current != null; current = samples.read()) {
    final String[] tokens = current.getSentence();
    final String[] tokenTags = current.getTags();

    for (int idx = 0; idx < tokens.length; idx++) {
      final String token = tokens[idx];

      // tokens in which a digit is recognized are not stored
      if (StringPattern.recognize(token).containsDigit()) {
        continue;
      }

      // the lookup key is lower-cased when the dictionary is case-insensitive
      final String key = dict.isCaseSensitive() ? token : StringUtil.toLowerCase(token);
      final Map<String, AtomicInteger> tagCounts =
          countsByWord.computeIfAbsent(key, unused -> new HashMap<>());

      // tags the dictionary already holds for this word start at the cutoff,
      // so they always pass the threshold check in the second phase
      final String[] knownTags = dict.getTags(key);
      if (knownTags != null) {
        for (String known : knownTags) {
          tagCounts.computeIfAbsent(known, unused -> new AtomicInteger(cutoff));
        }
      }

      // count the tag observed in the sample: first sighting starts at 1,
      // later sightings (or pre-seeded tags) are incremented
      final AtomicInteger counter = tagCounts.get(tokenTags[idx]);
      if (counter == null) {
        tagCounts.put(tokenTags[idx], new AtomicInteger(1));
      } else {
        counter.incrementAndGet();
      }
    }
  }

  // second phase: keep only the tags whose counter reached the cutoff and
  // store them for the word; words without any surviving tag are not written
  for (Entry<String, Map<String, AtomicInteger>> perWord : countsByWord.entrySet()) {
    final String[] keptTags = perWord.getValue().entrySet().stream()
        .filter(perTag -> perTag.getValue().get() >= cutoff)
        .map(Entry::getKey)
        .toArray(String[]::new);

    if (keptTags.length > 0) {
      dict.put(perWord.getKey(), keptTags);
    }
  }

  final long elapsedMillis = (System.nanoTime() - startedAt) / 1000000;
  logger.info("... finished expanding POS Dictionary. [ {} ms]", elapsedMillis);
}