in languagetool-language-modules/uk/src/main/java/org/languagetool/tagging/uk/CompoundTagger.java [1063:1278]
/**
 * Builds the readings of a hyphenated compound by pairing every reading of its left part
 * with every reading of its right part and keeping the pairs whose POS tags agree.
 *
 * <p>For each pair the first applicable rule wins:
 * <ol>
 *   <li>equal tags (ignoring perf/imperf) for numr/adv/adj/verb, or for intj/noninfl with
 *       equal lemmas;</li>
 *   <li>noun-noun (agreement, min/max exception, anim/inanim fallback);</li>
 *   <li>numr-numr;</li>
 *   <li>noun-numr;</li>
 *   <li>noun followed by a junior/senior adjective;</li>
 *   <li>noun followed by a form of "другий".</li>
 * </ol>
 * Readings produced by the anim/inanim fallback are collected separately and returned only
 * when no other rule produced a reading.
 *
 * @param word the whole compound token; used as the token of every produced reading
 * @param leftAnalyzedTokens readings of the left part of the compound
 * @param rightAnalyzedTokens readings of the right part of the compound
 * @return the produced readings (regular readings are de-duplicated, insertion order kept),
 *         or {@code null} when no pair produced a reading
 */
private List<AnalyzedToken> tagMatch(String word, List<AnalyzedToken> leftAnalyzedTokens, List<AnalyzedToken> rightAnalyzedTokens) {
  List<AnalyzedToken> newAnalyzedTokens = new ArrayList<>();
  // readings from the anim/inanim fallback; only used if newAnalyzedTokens stays empty
  List<AnalyzedToken> newAnalyzedTokensAnimInanim = new ArrayList<>();
  // set when a noun-noun pair with different animacy could not be tagged; used for debug logging
  String animInanimNotTagged = null;

  for (AnalyzedToken leftAnalyzedToken : leftAnalyzedTokens) {
    String leftPosTag = leftAnalyzedToken.getPOSTag();

    if( leftPosTag == null
        || IPOSTag.contains(leftPosTag, IPOSTag.abbr.getText()) )
      continue;

    // we don't want to have v_kly for рибо-полювання
    // but we do for пане-товаришу
    if( leftPosTag.startsWith("noun:inanim") && leftPosTag.contains("v_kly") )
      continue;

    String leftPosTagExtra = "";
    boolean leftNv = false;

    if( leftPosTag.contains(PosTagHelper.NO_VIDMINOK_SUBSTR) ) {
      leftNv = true;
      leftPosTag = leftPosTag.replace(PosTagHelper.NO_VIDMINOK_SUBSTR, "");
    }

    leftPosTag = dropExtra(leftPosTag);

    // move the extra tags out of the left tag; they are re-appended to the resulting tag
    Matcher matcher = EXTRA_TAGS.matcher(leftPosTag);
    if( matcher.find() ) {
      leftPosTagExtra += matcher.group();
      leftPosTag = matcher.replaceAll("");
    }

    for (AnalyzedToken rightAnalyzedToken : rightAnalyzedTokens) {
      String rightPosTag = rightAnalyzedToken.getPOSTag();

      if( rightPosTag == null
          || rightPosTag.contains(IPOSTag.abbr.getText())
          || rightPosTag.contains("v_zna:var") )
        continue;

      if( rightPosTag.startsWith("noun:inanim") ) {
        if( rightPosTag.contains("v_kly") )
          continue;

        // skip Гірник geo for Гірник-спорт
        if( leftPosTag.contains(":geo")
            && ! rightPosTag.contains(":geo")
            && ! rightAnalyzedToken.getLemma().matches("(?iu)ріка|гора|місто|град|поле|море|парк") )
          continue;
      }

      // країни-агресори: v_zna:rare is not wanted here
      if( rightPosTag.startsWith("noun:anim:p:v_zna:rare")
          && leftPosTag.startsWith("noun:inanim") )
        continue;

      // the "no vidminok" marker is only propagated when both parts carry it
      String extraNvTag = "";
      boolean rightNv = false;
      if( rightPosTag.contains(PosTagHelper.NO_VIDMINOK_SUBSTR) ) {
        rightNv = true;
        if( leftNv ) {
          extraNvTag += PosTagHelper.NO_VIDMINOK_SUBSTR;
        }
      }

      rightPosTag = dropExtra(rightPosTag);

      // extra tags of the right part are dropped, not propagated
      Matcher matcherR = EXTRA_TAGS.matcher(rightPosTag);
      if( matcherR.find() ) {
        rightPosTag = matcherR.replaceAll("");
      }

      // suffix appended to every resulting tag, and the default lemma of the compound
      String tagSuffix = extraNvTag + leftPosTagExtra;
      String compoundLemma = leftAnalyzedToken.getLemma() + "-" + rightAnalyzedToken.getLemma();

      if( stripPerfImperf(leftPosTag).equals(stripPerfImperf(rightPosTag))
          && (IPOSTag.startsWith(leftPosTag, IPOSTag.numr, IPOSTag.adv, IPOSTag.adj, IPOSTag.verb)
              || (leftPosTag.matches("intj|noninfl.*")
                  && leftAnalyzedToken.getLemma().equalsIgnoreCase(rightAnalyzedToken.getLemma())) ) ) {
        String newPosTag = leftPosTag + tagSuffix;

        // only one of the parts is adjp: drop the adjp details from the result
        if( leftPosTag.contains("adjp") != rightPosTag.contains("adjp") ) {
          newPosTag = newPosTag.replaceFirst(":adjp:(actv|pasv):(im)?perf", "");
        }

        newAnalyzedTokens.add(new AnalyzedToken(word, newPosTag, compoundLemma));
      }
      // noun-noun
      else if( leftPosTag.startsWith(IPOSTag.noun.getText()) && rightPosTag.startsWith(IPOSTag.noun.getText()) ) {
        String agreedPosTag = getAgreedPosTag(leftPosTag, rightPosTag, leftNv, word);

        if( agreedPosTag == null
            && rightPosTag.startsWith("noun:inanim:m:v_naz")
            && isMinMax(rightAnalyzedToken.getToken()) ) {
          agreedPosTag = leftPosTag;
        }

        if( agreedPosTag == null && ! isSameAnimStatus(leftPosTag, rightPosTag) ) {
          agreedPosTag = tryAnimInanim(leftPosTag, rightPosTag, leftAnalyzedToken.getLemma(), rightAnalyzedToken.getLemma(), leftNv, rightNv, word);

          if( agreedPosTag == null ) {
            animInanimNotTagged = leftPosTag.contains(":anim") ? "anim-inanim" : "inanim-anim";
          }
          else {
            // fallback reading: kept apart from the regular ones
            newAnalyzedTokensAnimInanim.add(new AnalyzedToken(word, agreedPosTag + tagSuffix, compoundLemma));
            continue;
          }
        }

        if( agreedPosTag != null ) {
          newAnalyzedTokens.add(new AnalyzedToken(word, agreedPosTag + tagSuffix, compoundLemma));
        }
      }
      // numr-numr: один-три
      else if( leftPosTag.startsWith(IPOSTag.numr.getText()) && rightPosTag.startsWith(IPOSTag.numr.getText()) ) {
        String agreedPosTag = getNumAgreedPosTag(leftPosTag, rightPosTag, leftNv);
        if( agreedPosTag != null ) {
          // a plural right part makes the whole compound plural
          if( rightPosTag.contains(":p:") && ! agreedPosTag.contains(":p:") ) {
            agreedPosTag = agreedPosTag.replaceFirst(":[mfn]:", ":p:");
          }
          newAnalyzedTokens.add(new AnalyzedToken(word, agreedPosTag + tagSuffix, compoundLemma));
        }
      }
      // noun-numr match
      else if( IPOSTag.startsWith(leftPosTag, IPOSTag.noun) && IPOSTag.startsWith(rightPosTag, IPOSTag.numr) ) {
        if( ! leftAnalyzedToken.getLemma().equals("п'ята") ) {
          // gender tags match
          String leftGenderConj = PosTagHelper.getGenderConj(leftPosTag);
          if( leftGenderConj != null && leftGenderConj.equals(PosTagHelper.getGenderConj(rightPosTag)) ) {
            newAnalyzedTokens.add(new AnalyzedToken(word, leftPosTag + tagSuffix, compoundLemma));
            // година-півтори can be both singular and plural: минула година-півтори, минули година-півтори
            if( ! leftPosTag.contains(":p:") ) {
              newAnalyzedTokens.add(new AnalyzedToken(word, leftPosTag.replaceAll(":[mfn]:", ":p:") + tagSuffix, compoundLemma));
            }
          }
          else {
            // (with different gender tags): сотні (:p:) - дві (:f:)
            String agreedPosTag = getNumAgreedPosTag(leftPosTag, rightPosTag, leftNv);
            if( agreedPosTag != null ) {
              newAnalyzedTokens.add(new AnalyzedToken(word, agreedPosTag + tagSuffix, compoundLemma));
              // рік-два can be both singular and plural: минулий рік-два, минули рік-два
              if( ! agreedPosTag.contains(":p:") ) {
                newAnalyzedTokens.add(new AnalyzedToken(word, agreedPosTag.replaceAll(":[mfn]:", ":p:") + tagSuffix, compoundLemma));
              }
            }
          }
        }
      }
      // noun-adj match: Буш-молодший, рік-два
      // братів-православних is deliberately not handled: a general noun-adj rule
      // gives too many false positives
      //
      // The grouping is explicit so that the "left part is a noun" check guards both
      // alternatives; without the parentheses && bound tighter than || and the
      // junior/senior alternative was accepted for any left POS.
      // NOTE(review): the numr alternative looks already covered by the noun-numr
      // branch above - confirm and drop if so.
      else if( leftPosTag.startsWith(IPOSTag.noun.getText())
          && (IPOSTag.startsWith(rightPosTag, IPOSTag.numr)
              || (IPOSTag.startsWith(rightPosTag, IPOSTag.adj) && isJuniorSenior(leftAnalyzedToken, rightAnalyzedToken))) ) {
        String leftGenderConj = PosTagHelper.getGenderConj(leftPosTag);
        if( leftGenderConj != null && leftGenderConj.equals(PosTagHelper.getGenderConj(rightPosTag)) ) {
          newAnalyzedTokens.add(new AnalyzedToken(word, leftPosTag + tagSuffix, compoundLemma));
        }
      }
      // чарка-друга
      else if( leftPosTag.startsWith(IPOSTag.noun.getText())
          && rightAnalyzedToken.getLemma().equals("другий") ) {
        String leftGenderConj = PosTagHelper.getGenderConj(leftPosTag);
        if( leftGenderConj != null && leftGenderConj.equals(PosTagHelper.getGenderConj(rightPosTag)) ) {
          // the right lemma follows the gender of the left part
          String rightLemma = leftGenderConj.startsWith("m") ? "другий" :
            leftGenderConj.startsWith("f") ? "друга" : "друге";
          newAnalyzedTokens.add(new AnalyzedToken(word, leftPosTag + tagSuffix, leftAnalyzedToken.getLemma() + "-" + rightLemma));
        }
      }
    }
  }

  // day-of-week pairs and month pairs also get a plural variant of the first reading
  // when no plural reading was produced
  if( ! newAnalyzedTokens.isEmpty()
      && ! PosTagHelper.hasPosTagPart(newAnalyzedTokens, ":p:") ) {
    if( (LemmaHelper.hasLemma(leftAnalyzedTokens, LemmaHelper.DAYS_OF_WEEK) && LemmaHelper.hasLemma(rightAnalyzedTokens, LemmaHelper.DAYS_OF_WEEK))
        || (LemmaHelper.hasLemma(leftAnalyzedTokens, LemmaHelper.MONTH_LEMMAS) && LemmaHelper.hasLemma(rightAnalyzedTokens, LemmaHelper.MONTH_LEMMAS)) ) {
      AnalyzedToken firstReading = newAnalyzedTokens.get(0);
      newAnalyzedTokens.add(new AnalyzedToken(word, firstReading.getPOSTag().replaceAll(":[mfn]:", ":p:"), firstReading.getLemma()));
    }
  }

  // remove duplicates
  newAnalyzedTokens = new ArrayList<>(new LinkedHashSet<>(newAnalyzedTokens));

  // nothing matched regularly: fall back to the anim/inanim readings (not de-duplicated)
  if( newAnalyzedTokens.isEmpty() ) {
    newAnalyzedTokens = newAnalyzedTokensAnimInanim;
  }

  if( animInanimNotTagged != null && newAnalyzedTokens.isEmpty() ) {
    compoundDebugLogger.logUnknownCompound(word + " " + animInanimNotTagged);
  }

  return newAnalyzedTokens.isEmpty() ? null : newAnalyzedTokens;
}