in languagetool-language-modules/nl/src/main/java/org/languagetool/tagging/nl/DutchTagger.java [93:260]
/**
 * Tags every token of a sentence and returns one {@link AnalyzedTokenReadings} per input token.
 *
 * <p>For each token the following lookups are tried, in this order:
 * <ol>
 *   <li>the token as written (after mapping unusual apostrophe characters to {@code '});</li>
 *   <li>its lowercase form, unless the token is already lowercase or is mixed case;</li>
 *   <li>for all-uppercase tokens without readings so far: the first-letter-uppercase form;</li>
 *   <li>if there are still no readings: a variant with accents removed and, where the part after
 *       the hyphen is a known word, the hyphen removed; a hit here flags the token for
 *       ignore-spelling handling;</li>
 *   <li>if there are still no readings and the token is longer than 5 characters: a two-part
 *       compound split obtained from the {@code CompoundAcceptor}.</li>
 * </ol>
 * A token that ends up without any reading gets a single reading whose POS tag and lemma are
 * {@code null}. All readings are attached to the original (non-normalised) token text.
 *
 * @param sentenceTokens the tokens of one sentence
 * @return the readings for each token, in input order; start positions are the running sum of
 *         the original token lengths
 */
public List<AnalyzedTokenReadings> tag(List<String> sentenceTokens) {
  List<AnalyzedTokenReadings> tokenReadings = new ArrayList<>();
  int pos = 0;
  CompoundAcceptor compoundAcceptor = Dutch.getCompoundAcceptor();
  for (String originalWord : sentenceTokens) {
    boolean ignoreSpelling = false;
    // make treatment of weird apostrophes same as in tokenizer (R. Baars, 2020-11-06)
    String word = originalWord.replace('`', '\'').replace('’', '\'').replace('‘', '\'').replace('´', '\'');
    List<AnalyzedToken> l = new ArrayList<>();
    String lowerWord = word.toLowerCase(locale);
    boolean isLowercase = word.equals(lowerWord);
    boolean isMixedCase = StringTools.isMixedCase(word);
    boolean isAllUpper = StringTools.isAllUppercase(word);

    // normal case: look up the normalised word, but assign the readings to the original word
    addTokens(asAnalyzedTokenListForTaggedWords(originalWord, getWordTagger().tag(word)), l);

    // tag non-lowercase (all-uppercase or start-uppercase), but not mixed-case,
    // words with the tags of the lowercase word as well
    if (!isLowercase && !isMixedCase) {
      addTokens(asAnalyzedTokenListForTaggedWords(originalWord, getWordTagger().tag(lowerWord)), l);
    }

    // tag all-uppercase proper nouns
    if (l.isEmpty() && isAllUpper) {
      String firstUpper = StringTools.uppercaseFirstChar(lowerWord);
      addTokens(asAnalyzedTokenListForTaggedWords(originalWord, getWordTagger().tag(firstUpper)), l);
    }

    if (l.isEmpty()) {
      // there is still no POS tag found: retry with accents (and possibly a hyphen) removed
      String simplifiedWord = stripAccentsAndJoinHyphen(word, originalWord);
      if (!simplifiedWord.equals(word)) {
        List<AnalyzedToken> simplifiedTokens =
            asAnalyzedTokenListForTaggedWords(originalWord, getWordTagger().tag(simplifiedWord));
        if (!simplifiedTokens.isEmpty()) {
          // the simplified word exists in the dictionary
          addTokens(simplifiedTokens, l);
          ignoreSpelling = true;
        }
      }
      // tag unknown compound words
      if (l.isEmpty() && word.length() > 5) {
        addUnknownCompoundReadings(word, compoundAcceptor, l);
      }
    }

    if (l.isEmpty()) {
      l.add(new AnalyzedToken(originalWord, null, null));
    }

    AnalyzedTokenReadings atr = new AnalyzedTokenReadings(l, pos);
    if (ignoreSpelling) {
      // it might be a word that needs capitals! Check this in the dictionary
      if (isLowercase) {
        String firstUpperOriginal = StringTools.uppercaseFirstChar(originalWord);
        List<AnalyzedToken> fu =
            asAnalyzedTokenListForTaggedWords(firstUpperOriginal, getWordTagger().tag(firstUpperOriginal));
        if (fu.isEmpty()) {
          // does not exist in the dictionary with a leading capital
          atr.ignoreSpelling();
        } else {
          // there is an uppercased form in the dictionary; so this one is probably wrong
          // TODO clearing the l list does not work here; the 'LIKELY_SPELLING' tag should be removed!
          //  But somehow, this does not work when done here.
          // NOTE(review): atr was already built from l above, so this presumably cannot affect atr
          //  if the constructor copies the list - confirm before changing the order.
          l.clear();
          l.add(new AnalyzedToken(originalWord, null, null));
        }
      } else {
        atr.ignoreSpelling();
      }
    }
    tokenReadings.add(atr);
    // positions advance by the length of the original, non-normalised token
    pos += originalWord.length();
  }
  return tokenReadings;
}

/**
 * Builds the fallback lookup form of a word: accents are removed and, if the part after the
 * hyphen is itself known to the word tagger, the hyphen is removed too.
 *
 * @param word the apostrophe-normalised word (its case is deliberately kept, it is not lowercased)
 * @param originalWord the token as it appeared in the input; used as the token text for the lookup
 * @return the simplified word; equal to {@code word} if no replacement applied
 */
private String stripAccentsAndJoinHyphen(String word, String originalWord) {
  String word2 = word;
  // remove single accented characters
  word2 = PATTERN1_A.matcher(word2).replaceAll("$1a$3");
  word2 = PATTERN1_E.matcher(word2).replaceAll("$1e$3");
  word2 = PATTERN1_I.matcher(word2).replaceAll("$1i$3");
  word2 = PATTERN1_O.matcher(word2).replaceAll("$1o$3");
  word2 = PATTERN1_U.matcher(word2).replaceAll("$1u$3");
  // remove allowed accented characters
  word2 = CHAR_PATTERN_AA.matcher(word2).replaceAll("aa");
  word2 = CHAR_PATTERN_AE.matcher(word2).replaceAll("ae");
  word2 = CHAR_PATTERN_AI.matcher(word2).replaceAll("ai");
  word2 = CHAR_PATTERN_AU.matcher(word2).replaceAll("au");
  word2 = CHAR_PATTERN_EE.matcher(word2).replaceAll("ee");
  word2 = CHAR_PATTERN_EI.matcher(word2).replaceAll("ei");
  word2 = CHAR_PATTERN_EU.matcher(word2).replaceAll("eu");
  word2 = CHAR_PATTERN_IE.matcher(word2).replaceAll("ie");
  word2 = CHAR_PATTERN_OE.matcher(word2).replaceAll("oe");
  word2 = CHAR_PATTERN_OI.matcher(word2).replaceAll("oi");
  word2 = CHAR_PATTERN_OO.matcher(word2).replaceAll("oo");
  word2 = CHAR_PATTERN_OU.matcher(word2).replaceAll("ou");
  word2 = CHAR_PATTERN_UI.matcher(word2).replaceAll("ui");
  word2 = CHAR_PATTERN_UU.matcher(word2).replaceAll("uu");
  word2 = CHAR_PATTERN_IJ.matcher(word2).replaceAll("ij");
  word2 = PATTERN2_A.matcher(word2).replaceAll("$1a$2");
  word2 = PATTERN2_E.matcher(word2).replaceAll("$1e$2");
  word2 = PATTERN2_I.matcher(word2).replaceAll("$1i$2");
  word2 = PATTERN2_O.matcher(word2).replaceAll("$1o$2");
  word2 = PATTERN2_U.matcher(word2).replaceAll("$1u$2");
  // best would be to check the first part as well (uncompound); only the second part is checked
  if (word2.contains("-")) {
    String part2 = HYPHEN1_PATTERN.matcher(word2).replaceAll("$2");
    if (!asAnalyzedTokenListForTaggedWords(originalWord, getWordTagger().tag(part2)).isEmpty()) {
      // word is split on a likely location
      word2 = HYPHEN2_PATTERN.matcher(word2).replaceAll("$1$2");
    }
  }
  return word2;
}

/**
 * Tries to tag an unknown word as a two-part compound and appends the resulting readings.
 * Nothing is added unless the acceptor splits the word into exactly two parts.
 *
 * @param word the apostrophe-normalised word to analyse
 * @param compoundAcceptor provides the candidate split of {@code word}
 * @param readings the list that receives the readings derived from the second part
 */
private void addUnknownCompoundReadings(String word, CompoundAcceptor compoundAcceptor,
                                        List<AnalyzedToken> readings) {
  List<String> parts = compoundAcceptor.getParts(word);
  if (parts.size() != 2) {
    return;
  }
  String part1 = parts.get(0);
  String part2 = parts.get(1);
  AnalyzedTokenReadings part2Readings = tag(Collections.singletonList(part2)).get(0);
  // use the tagger's locale, like the other lowercasing in this class, not the JVM default
  String part1lc = part1.toLowerCase(locale);
  boolean part1EndsWithHyphen = part1.endsWith("-");

  // the membership of part2 in these lists does not depend on the reading, so resolve it once
  String forcedTag = null;
  if (alwaysNeedsHet.contains(part2)) {
    forcedTag = "ZNW:EKV:HET";
  } else if (alwaysNeedsDe.contains(part2)) {
    forcedTag = "ZNW:EKV:DE_";
  } else if (alwaysNeedsMrv.contains(part2)) {
    forcedTag = "ZNW:MRV:DE_";
  }

  for (AnalyzedToken part2Reading : part2Readings) {
    String posTag = part2Reading.getPOSTag();
    if (posTag == null) {
      continue;
    }
    // if part1 ends with a hyphen, check if we are dealing with a geographical compound word
    if (part1EndsWithHyphen && posTag.startsWith("ENM:LOC")) {
      readings.add(new AnalyzedToken(word, posTag, part2));
      break;
    }
    if (posTag.startsWith("ZNW")) {
      String tag = forcedTag != null ? forcedTag : posTag;
      readings.add(new AnalyzedToken(word, tag, part1lc + part2Reading.getLemma()));
      // if any of the lists contains part2 of the compound, stop after adding a single tag
      if (forcedTag != null) {
        break;
      }
    }
  }
}