in languagetool-language-modules/uk/src/main/java/org/languagetool/tagging/uk/CompoundTagger.java [174:597]
/**
 * Tries to guess the readings of a hyphenated word by splitting it at the last hyphen
 * and running a fixed sequence of rules over the left and right parts. The first rule
 * that applies decides the result, so the order of the checks below is significant.
 *
 * @param word the hyphenated word to analyze
 * @return the guessed readings (may be an empty list for some rules), or {@code null}
 *         when the word has no usable hyphen or no rule produced a result
 */
private List<AnalyzedToken> doGuessCompoundTag(String word) {
  int dashIdx = word.lastIndexOf('-');
  // no hyphen at all (this also covers the empty string), or a trailing hyphen: nothing to split
  if( dashIdx < 0 || dashIdx == word.length() - 1 )
    return null;

  int firstDashIdx = word.indexOf('-');
  // leading hyphen
  if( firstDashIdx == 0 )
    return null;

  boolean startsWithDigit = Character.isDigit(word.charAt(0));

  // more than one hyphen and not a number-like word: delegate to the multi-hyphen helpers
  if( ! startsWithDigit && dashIdx != firstDashIdx ) {
    int dashCount = StringUtils.countMatches(word, "-");

    if( dashCount >= 2
        && dashIdx > firstDashIdx + 1 ) {
      List<AnalyzedToken> tokens = doGuessMultiHyphens(word, firstDashIdx, dashIdx);
      if( tokens != null )
        return tokens;
    }

    if( dashCount == 2
        && dashIdx > firstDashIdx + 1 ) {
      return doGuessTwoHyphens(word, firstDashIdx, dashIdx);
    }

    return null;
  }

  String leftWord = word.substring(0, dashIdx);
  String rightWord = word.substring(dashIdx + 1);

  String leftWordLowerCase = leftWord.toLowerCase(conversionLocale);

  // repeated first letter, e.g. з-зателефоную
  if( leftWord.length() == 1 && rightWord.length() > 3 && rightWord.startsWith(leftWordLowerCase) ) {
    List<TaggedWord> rightWdList = wordTagger.tag(rightWord);
    rightWdList = PosTagHelper.adjust(rightWdList, null, null, ":alt");
    return ukrainianTagger.asAnalyzedTokenListForTaggedWordsInternal(word, rightWdList);
  }

  boolean dashPrefixMatch = dashPrefixes.containsKey( leftWord )
      || dashPrefixes.containsKey( leftWordLowerCase )
      || DASH_PREFIX_LAT_PATTERN.matcher(leftWord).matches();

  // left part starts with a digit or the word starts with Roman numeral letters
  if( ! dashPrefixMatch
      && (startsWithDigit || word.matches("[XLIV]+-.*")) ) {
    return matchDigitCompound(word, leftWord, rightWord);
  }

  // right part starts with a digit
  if( Character.isDigit(rightWord.charAt(0)) ) {
    return matchNumberedProperNoun(word, leftWord, rightWord);
  }

  // авіа..., авто... are written solid (without a hyphen)
  //TODO: but this is possible: авто-пенсіонер
  if( dashPrefixesInvalid.contains(leftWordLowerCase) ) {
    List<TaggedWord> rightWdList = tagEitherCase(rightWord);

    rightWdList = PosTagHelper.filter2(rightWdList, Pattern.compile("(noun|adj)(?!.*pron).*"));

    if( rightWdList.isEmpty() )
      return null;

//    String lemma = leftWord + "-" + rightWdList.get(0).getLemma();
    // a capitalized right part gets no extra tag, anything else is marked ":bad"
    String extraTag = StringTools.isCapitalizedWord(rightWord) ? "" : ":bad";
    rightWdList = PosTagHelper.adjust(rightWdList, leftWord + "-", null, extraTag);
    return ukrainianTagger.asAnalyzedTokenListForTaggedWordsInternal(word, rightWdList);
  }

  // wrong: пів-качана
  if( leftWordLowerCase.equals("пів")
      && Character.isLowerCase(rightWord.charAt(0)) ) {
    List<TaggedWord> rightWdList = tagEitherCase(rightWord);
    List<AnalyzedToken> rightAnalyzedTokens = ukrainianTagger.asAnalyzedTokenListForTaggedWordsInternal(rightWord, rightWdList);
    List<AnalyzedToken> newAnalyzedTokens = addPluralNvTokens(word, rightAnalyzedTokens, ":bad");
    return newAnalyzedTokens;
  }

  List<TaggedWord> leftWdList = tagAsIsAndWithLowerCase(leftWord);

  // стривай-бо, чекай-но, прийшов-таки, такий-от, такий-то, ішов-єм (arch)
  // lowercased with the same locale as leftWordLowerCase so the result does not depend on the JVM default locale
  String rightWordLowerCase = rightWord.toLowerCase(conversionLocale);
  if( rightPartsWithLeftTagMap.containsKey(rightWordLowerCase)
      && ! PosTagHelper.hasPosTagPart2(leftWdList, "abbr") ) {

    if( leftWdList.isEmpty() )
      return null;

    Pattern leftTagRegex = rightPartsWithLeftTagMap.get(rightWordLowerCase);

    List<AnalyzedToken> leftAnalyzedTokens = ukrainianTagger.asAnalyzedTokenListForTaggedWordsInternal(leftWord, leftWdList);
    List<AnalyzedToken> newAnalyzedTokens = new ArrayList<>(leftAnalyzedTokens.size());

    // ignore хто-то
    if( rightWordLowerCase.equals("то")
        && LemmaHelper.hasLemma(leftAnalyzedTokens, Arrays.asList("хто", "що", "чи")) )
      return null;

    for (AnalyzedToken analyzedToken : leftAnalyzedTokens) {
      String posTag = analyzedToken.getPOSTag();

      // skip noun readings when the left part is "як"
      if (leftWord.equalsIgnoreCase("як") && posTag != null && posTag.contains("noun") )
        continue;

      // Both alternatives must sit under the null guard: "&&" binds tighter than "||",
      // so without the grouping a null tag would reach leftTagRegex.matcher(posTag) and throw.
      if( posTag != null
          && ( (leftWordLowerCase.equals("дуже") && posTag.contains("adv"))
               || leftTagRegex.matcher(posTag).matches() ) ) {

        if( rightWord.equals("єм") ) {
          posTag = PosTagHelper.addIfNotContains(posTag, ":arch");
        }

        newAnalyzedTokens.add(new AnalyzedToken(word, posTag, analyzedToken.getLemma()));
      }
    }

    return newAnalyzedTokens.isEmpty() ? null : newAnalyzedTokens;
  }

  // по-болгарськи, по-болгарському
  // append "й" so that the right part can be tagged in its adjective form further below
  if( leftWord.equalsIgnoreCase("по") && SKY_PATTERN.matcher(rightWord).matches() ) {
    rightWord += "й";
  }

  // Пенсильванія-авеню
  if( Character.isUpperCase(leftWord.charAt(0)) && LemmaHelper.CITY_AVENU.contains(rightWordLowerCase) ) {
    String addPos = rightWord.equals("штрассе") ? ":alt" : "";
    return PosTagHelper.generateTokensForNv(word, "f", ":prop" + addPos);
  }

  // Fe-вмісний
  if( rightWordLowerCase.startsWith("вмісн") ) {
    // tag the right part with a "боро" prefix and reuse the resulting tags with the lemma "вмісний"
    String adjustedWord = "боро" + rightWord;
    List<TaggedWord> rightWdList = tagEitherCase(adjustedWord);
    rightWdList = rightWdList.stream().map(wd -> new TaggedWord("вмісний", wd.getPosTag())).collect(Collectors.toList());
    List<AnalyzedToken> rightAnalyzedTokens = ukrainianTagger.asAnalyzedTokenListForTaggedWordsInternal(rightWord, rightWdList);
    return generateTokensWithRighInflected(word, leftWord, rightAnalyzedTokens, IPOSTag.adj.getText(), null, Pattern.compile(":comp."));
  }

  List<TaggedWord> rightWdList = tagEitherCase(rightWord);

  if( word.toLowerCase(conversionLocale).startsWith("напів") ) {
    // напівпольської-напіванглійської
    Matcher napivMatcher = Pattern.compile("напів(.+?)-напів(.+)").matcher(word);
    if( napivMatcher.matches() ) {
      List<TaggedWord> napivLeftWdList = PosTagHelper.adjust(tagAsIsAndWithLowerCase(napivMatcher.group(1)), "напів", null);
      // prefer the tagging of the whole right part, otherwise tag the stem after "напів"
      List<TaggedWord> napivRightWdList = rightWdList.size() > 0 ? rightWdList : PosTagHelper.adjust(tagAsIsAndWithLowerCase(napivMatcher.group(2)), "напів", null);

      if( napivLeftWdList.isEmpty() || napivRightWdList.isEmpty() )
        return null;

      List<AnalyzedToken> napivLeftAnalyzedTokens = ukrainianTagger.asAnalyzedTokenListForTaggedWordsInternal(napivMatcher.group(1), napivLeftWdList);
      List<AnalyzedToken> napivRightAnalyzedTokens = ukrainianTagger.asAnalyzedTokenListForTaggedWordsInternal(napivMatcher.group(2), napivRightWdList);

      List<AnalyzedToken> tagMatch = tagMatch(word, napivLeftAnalyzedTokens, napivRightAnalyzedTokens);
      if( tagMatch != null ) {
        return tagMatch;
      }
    }
  }

  Pattern TAGS_TO_REMOVE = Pattern.compile(":comp.|:predic|:insert");

  List<AnalyzedToken> leftAnalyzedTokens = ukrainianTagger.asAnalyzedTokenListForTaggedWordsInternal(leftWord, leftWdList);

  // гірко-прегірко
  if( rightWord.startsWith("пре") && leftWordLowerCase.equals(rightWord.substring(3).toLowerCase(conversionLocale)) ) {
    if (PosTagHelper.hasPosTagStart2(leftWdList, "adv")) {
      // adverb readings: the whole word is used as the lemma
      return leftAnalyzedTokens.stream()
          .filter(a -> a.getPOSTag() != null && a.getPOSTag().startsWith("adv") )
          .map(a -> new AnalyzedToken(word, TAGS_TO_REMOVE.matcher(a.getPOSTag()).replaceAll(""), word))
          .collect(Collectors.toList());
    }
    // гіркий-прегіркий
    else if( PosTagHelper.hasPosTagStart2(leftWdList, "adj") ) {
      // adjective readings: the lemma is built as "<lemma>-пре<lemma>"
      return leftAnalyzedTokens.stream()
          .filter(a -> a.getPOSTag() != null && a.getPOSTag().startsWith("adj") )
          .map(a -> new AnalyzedToken(word, TAGS_TO_REMOVE.matcher(a.getPOSTag()).replaceAll(""), a.getLemma()+"-пре"+a.getLemma()))
          .collect(Collectors.toList());
    }
  }

  // Мустафа-ага
  if( NAME_SUFFIX.contains(rightWord)
      && PosTagHelper.hasPosTagPart(leftAnalyzedTokens, "name") ) {
    List<TaggedWord> wordList = PosTagHelper.adjust(leftWdList, null, "-" + rightWord);
    return ukrainianTagger.asAnalyzedTokenListForTaggedWordsInternal(word, wordList);
  }

  // lowercase "аль-": look up the capitalized "Аль-..." form and mark the result ":bad"
  if( leftWord.equals("аль") ) {
    String wd = "Аль-" + rightWord;
    List<TaggedWord> wdList = wordTagger.tag(wd);
    if( wdList.size() > 0 ) {
      wdList = PosTagHelper.adjust(wdList, null, null, ":bad");
      return ukrainianTagger.asAnalyzedTokenListForTaggedWordsInternal(wd, wdList);
    }
  }

  // every rule below needs a tagged right part
  if( rightWdList.isEmpty() ) {
    return null;
  }

  List<AnalyzedToken> rightAnalyzedTokens = ukrainianTagger.asAnalyzedTokenListForTaggedWordsInternal(rightWord, rightWdList);

  // півгодини-годину
  if( word.startsWith("пів")
      && PosTagHelper.hasPosTag(leftAnalyzedTokens, Pattern.compile("noun:inanim:p:v_...:nv.*")) ) {
    // take the inanimate noun readings of the right part and replace the gender with plural
    return rightAnalyzedTokens.stream()
        .filter(a -> a.getPOSTag() != null && a.getPOSTag().startsWith("noun:inanim:") )
        .map(a -> new AnalyzedToken(word, a.getPOSTag().replaceFirst(":[mfn]:", ":p:"), word))
        .collect(Collectors.toList());
  }

  if( leftWord.equalsIgnoreCase("по") ) {
    if( rightWord.endsWith("ому") ) {
      return poAdvMatch(word, rightAnalyzedTokens, ADJ_TAG_FOR_PO_ADV_MIS);
    }
    else if( SKYI_PATTERN.matcher(rightWord).matches() ) {
      return poAdvMatch(word, rightAnalyzedTokens, ADJ_TAG_FOR_PO_ADV_NAZ);
    }
    return null;
  }

  // both parts start with an upper-case letter: proper noun pairs
  if( Character.isUpperCase(leftWord.charAt(0)) && Character.isUpperCase(rightWord.charAt(0)) ) {

    // Київ-Прага
    if( PosTagHelper.hasPosTag(leftAnalyzedTokens, GEO_V_NAZ)
        && PosTagHelper.hasPosTag(rightAnalyzedTokens, GEO_V_NAZ) ) {
      return Arrays.asList(new AnalyzedToken(word, "noninfl:prop:geo", word));
    }

    // Хуана-Карлоса
    if( PosTagHelper.hasPosTag(leftAnalyzedTokens, FNAME)
        && PosTagHelper.hasPosTag(rightAnalyzedTokens, FNAME) ) {
      leftAnalyzedTokens = PosTagHelper.filter(leftAnalyzedTokens, Pattern.compile(".*fname.*"));
      rightAnalyzedTokens = PosTagHelper.filter(rightAnalyzedTokens, Pattern.compile(".*fname.*"));
      return tagMatch(word, leftAnalyzedTokens, rightAnalyzedTokens);
    }

    // the couple Карпа-Хансен
    if( PosTagHelper.hasPosTag(leftAnalyzedTokens, LNAME_V_NAZ)
        && PosTagHelper.hasPosTag(rightAnalyzedTokens, LNAME_V_NAZ) ) {
      return Arrays.asList(new AnalyzedToken(word, "noninfl:prop:lname", word));
    }

    // Джеймса-Веніка
    if( PosTagHelper.hasPosTag(leftAnalyzedTokens, LNAME_V_ROD)
        && PosTagHelper.hasPosTag(rightAnalyzedTokens, LNAME_V_ROD) ) {
      return Arrays.asList(new AnalyzedToken(word, "noninfl:prop:lname", word));
    }

    // bad: Квітки-Основ'яненко
    if( PosTagHelper.hasPosTag(leftAnalyzedTokens, NAME)
        && PosTagHelper.hasPosTag(rightAnalyzedTokens, NAME) ) {
      return null;
    }

    // Україна-ЄС
    if( PosTagHelper.hasPosTag(leftAnalyzedTokens, PROP_V_NAZ)
        && PosTagHelper.hasPosTag(rightAnalyzedTokens, PROP_V_NAZ) ) {
      return Arrays.asList(new AnalyzedToken(word, "noninfl:prop", word));
    }
  }

  // exclude: Малишко-це, відносини-коли

//  List<AnalyzedToken> leftAnalyzedTokens = ukrainianTagger.asAnalyzedTokenListForTaggedWordsInternal(leftWord, leftWdList);

  // був-би, but not м-б
  if( leftWord.length() > 1 && BAD_SUFFIX.contains(rightWord) ) {
    // NOTE(review): the result of adjust() is overwritten by the next statement, which starts
    // again from leftWdList, so the "-" + rightWord adjustment is discarded. Confirm whether
    // addIfNotContains was meant to take wordList; kept as is to preserve the current output.
    List<TaggedWord> wordList = PosTagHelper.adjust(leftWdList, null, "-" + rightWord);
    wordList = PosTagHelper.addIfNotContains(leftWdList, ":bad", null);
    List<AnalyzedToken> tagged = ukrainianTagger.asAnalyzedTokenListForTaggedWordsInternal(word, wordList);
    return tagged;
  }

  // both parts are the same word and the left part has one of the lemmas matched below
  if( leftWord.equalsIgnoreCase(rightWord)
      && leftAnalyzedTokens.size() > 0
      && LemmaHelper.hasLemma(leftAnalyzedTokens, Pattern.compile("[ув]?весь|[ву]с[еі]")) ) {
    List<AnalyzedToken> tagMatch = tagMatch(word, leftAnalyzedTokens, rightAnalyzedTokens);
    if( tagMatch != null ) {
      return tagMatch.stream()
          .filter(m -> equalParts(m.getLemma()) )
          .collect(Collectors.toList());
    }
  }

  // a left part with a "pron" reading is rejected unless it also has a "numr" reading
  if( PosTagHelper.hasPosTagPart(leftAnalyzedTokens, "pron")
      && ! PosTagHelper.hasPosTagPart(leftAnalyzedTokens, "numr") )
    return null;

  // a different right part tagged part/conj/pron is rejected unless both parts are numerals
  if( ! leftWord.equalsIgnoreCase(rightWord) && PosTagHelper.hasPosTag(rightAnalyzedTokens, Pattern.compile("(part|conj).*|.*?:pron.*"))
      && ! (PosTagHelper.hasPosTagStart(leftAnalyzedTokens, "numr") && PosTagHelper.hasPosTagStart(rightAnalyzedTokens, "numr")) )
    return null;

  // left part is a single letter or a Latin-letter string, right part is an adjective
  List<AnalyzedToken> adjCompounds = new ArrayList<>();
  if( leftWord.matches("[А-ЯІЇЄҐa-zA-Zα-ωΑ-Ω]|[a-zA-Z-]+") ) {
    if( PosTagHelper.hasPosTag(rightAnalyzedTokens, Pattern.compile("adj(?!.*(pron|bad|slang|arch)).*")) ) {
      adjCompounds = generateTokensWithRighInflected(word, leftWord, rightAnalyzedTokens, IPOSTag.adj.getText(), null, Pattern.compile(":comp."));
    }
  }

  // майстер-класу
  if( dashPrefixMatch
      && ! ( leftWord.equalsIgnoreCase("міді") && LemmaHelper.hasLemma(rightAnalyzedTokens, Arrays.asList("бронза"))) ) {
    List<AnalyzedToken> newTokens = new ArrayList<>();
//    if( leftWord.length() == 1 && leftWord.matches("[a-zA-Zα-ωΑ-Ω]") ) {
//      List<AnalyzedToken> newTokensAdj = getNvPrefixLatWithAdjMatch(word, rightAnalyzedTokens, leftWord);
//      if( newTokensAdj != null ) {
//        newTokens.addAll(newTokensAdj);
//      }
//    }

    String extraTag = "";
    boolean lowerCased = false;
    // an exact-case prefix entry wins over the lower-cased one
    if( dashPrefixes.containsKey( leftWord ) ) {
      extraTag = dashPrefixes.get(leftWord);
    }
    else {
      if( dashPrefixes.containsKey( leftWordLowerCase ) ) {
        extraTag = dashPrefixes.get(leftWordLowerCase);
        if( leftWordLowerCase.matches("[а-яіїєґ']+") ) { // Інтернет-пошуковик
          lowerCased = true;
        }
      }
    }

    List<AnalyzedToken> newTokensNoun = getNvPrefixNounMatch(word, rightAnalyzedTokens, lowerCased ? leftWordLowerCase : leftWord, extraTag);
    if( newTokensNoun != null ) {
      newTokens.addAll(newTokensNoun);
    }

    // топ-десять
    if( leftWord.equalsIgnoreCase("топ") && PosTagHelper.hasPosTagPart(rightAnalyzedTokens, "numr:") ) {
      return generateTokensWithRighInflected(word, leftWord, rightAnalyzedTokens, "numr:", ":bad", null);
    }

    // fall back to the adjective compounds (possibly empty) when no noun reading was produced
    if( newTokens.isEmpty() ) {
      newTokens.addAll(adjCompounds);
    }

    return newTokens;
  }

  if( adjCompounds.size() > 0 )
    return adjCompounds;

  // пів-України
  if( Character.isUpperCase(rightWord.charAt(0)) ) {
    if (word.startsWith("пів-")) {
      List<AnalyzedToken> newAnalyzedTokens = addPluralNvTokens(word, rightAnalyzedTokens, ":up92");
      return newAnalyzedTokens;
    }
    else {
      // we don't want Нью-Париж but want Австрійсько-Карпатський
      if( StringTools.isCapitalizedWord(rightWord)
          || leftWord.endsWith("о")
          || PosTagHelper.hasPosTag(rightAnalyzedTokens, Pattern.compile("adj.*")) ) {

        // tag both Чорноморське/noun and чорноморське/adj
        List<TaggedWord> rightWdList2 = tagAsIsAndWithLowerCase(rightWord);
        List<AnalyzedToken> rightAnalyzedTokens2 = ukrainianTagger.asAnalyzedTokenListForTaggedWordsInternal(rightWord, rightWdList2);

        List<AnalyzedToken> match = tryOWithAdj(word, leftWord, rightAnalyzedTokens2);
        if( match != null )
          return match;
      }

      // Жінка-Актриса
      if( PosTagHelper.hasPosTag(leftAnalyzedTokens, Pattern.compile("noun(?!.prop).*"))
          && PosTagHelper.hasPosTag(rightAnalyzedTokens, Pattern.compile("noun(?!.prop).*")) ) {
        // flow-through: continue with the generic rules below
      }
      else {
        return null;
      }
    }
  }

  // don't allow: Донець-кий, зовнішньо-економічний, мас-штаби
  // allow га-га!
  List<AnalyzedToken> noDashAnalyzedTokens = new ArrayList<>();
  boolean hasIntj = PosTagHelper.hasPosTagStart(leftAnalyzedTokens, "intj");
  if( ! hasIntj ) {
    // tag the word with its hyphens removed; a hit here blocks the part-by-part match below
    String noDashWord = word.replace("-", "");
    List<TaggedWord> noDashWordList = tagAsIsAndWithLowerCase(noDashWord);
    noDashAnalyzedTokens = ukrainianTagger.asAnalyzedTokenListForTaggedWordsInternal(noDashWord, noDashWordList);
  }

  // вгору-вниз, лікар-гомеопат, жило-було
  if( noDashAnalyzedTokens.isEmpty() ) {
    if( ! leftWdList.isEmpty() && (leftWord.length() > 2 || hasIntj) ) {
      List<AnalyzedToken> tagMatch = tagMatch(word, leftAnalyzedTokens, rightAnalyzedTokens);
      if( tagMatch != null ) {
        return tagMatch;
      }
    }
  }

  List<AnalyzedToken> match = tryOWithAdj(word, leftWord, rightAnalyzedTokens);
  if( match != null )
    return match;

  // nothing matched: record the word as an unknown compound
  compoundDebugLogger.logUnknownCompound(word);

  return null;
}