in languagetool-language-modules/uk/src/main/java/org/languagetool/tagging/uk/UkrainianTagger.java [77:214]
/**
 * Produces readings for a word by trying a fixed sequence of heuristics and
 * returning the result of the first one that applies.
 *
 * <p>The checks run in this order: plain numbers, Latin numbers (including the
 * "bad" Cyrillic-letter variant), times, dates, entities containing {@code (} or
 * {@code /}, hashtags, words with capitals inside, word-initial «з» written for
 * «с», «ї» written for «і», a missing apostrophe, a missing hyphen, and finally
 * hyphenated compounds. Anything left over is delegated to
 * {@code compoundTagger.guessOtherTags}.
 *
 * @param word the token text to analyze
 * @param wordTagger tagger used to look up the normalized spellings tried here
 * @return the readings produced by the first matching heuristic; an empty list
 *     when compound guessing throws; otherwise whatever
 *     {@code compoundTagger.guessOtherTags} returns
 */
public List<AnalyzedToken> additionalTags(String word, WordTagger wordTagger) {
  if (NUMBER.matcher(word).matches()) {
    return singleTokenList(word, IPOSTag.number.getText());
  }

  if (LATIN_NUMBER.matcher(word).matches() && !PATTERN_MD.matcher(word).matches()) {
    return singleTokenList(word, "number:latin");
  }

  if (LATIN_NUMBER_CYR.matcher(word).matches()) {
    boolean ordinal = false;
    int dashIdx = word.lastIndexOf('-');
    if (dashIdx > 0) {
      String left = word.substring(0, dashIdx);
      String right = word.substring(dashIdx + 1);
      ordinal = LetterEndingForNumericHelper.isPossibleAdjAdjEnding(left, right);
    }
    // Tagged only when there is no dash at all, or the part after the last dash
    // is accepted as an ending; any other dashed form falls through.
    if (dashIdx == -1 || ordinal) {
      return singleTokenList(word, "number:latin:bad");
    }
  }

  if (TIME.matcher(word).matches()) {
    return singleTokenList(word, IPOSTag.time.getText());
  }

  if (DATE.matcher(word).matches()) {
    return singleTokenList(word, IPOSTag.date.getText());
  }

  // '(' or '/' somewhere after the first character
  if (word.indexOf('(') > 0 || word.indexOf('/') > 0) {
    Set<AnalyzedToken> entityTokens = compoundTagger.generateEntities(word);
    if (!entityTokens.isEmpty()) {
      return new ArrayList<>(entityTokens);
    }
  }

  if (word.startsWith("#") && HASHTAG.matcher(word).matches()) {
    return singleTokenList(word, IPOSTag.hashtag.getText());
  }

  // Capital letters inside the word: look up the lowercased form, mark as ":alt"
  if (word.length() > 5 && CAPS_INSIDE_WORD.matcher(word).matches()) {
    List<TaggedWord> wdList = wordTagger.tag(word.toLowerCase());
    if (!wdList.isEmpty()) {
      wdList = PosTagHelper.adjust(wdList, null, null, ":alt");
      return asAnalyzedTokenListForTaggedWordsInternal(word, wdList);
    }
  }

  // Misspelling: word-initial «з» written instead of «с» before к, п, т, ф or х.
  // Look up the «с» spelling, then restore «з» in the lemma and mark as ":alt".
  if (word.length() > 5 && Z_BEFORE_VOICELESS_PATTERN.matcher(word).matches()) {
    String newWord = word.replaceFirst("^з", "с").replaceFirst("^З", "С");
    List<TaggedWord> wdList = compoundTagger.tagBothCases(newWord, null);
    if (!wdList.isEmpty()) {
      wdList = wdList.stream()
          .map(w -> new TaggedWord(
              w.getLemma().replaceFirst("^с", "з").replaceFirst("^С", "З"),
              PosTagHelper.addIfNotContains(w.getPosTag(), ":alt")))
          .collect(Collectors.toList());
      return asAnalyzedTokenListForTaggedWordsInternal(word, wdList);
    }
  }

  // «ї» written where «і» is expected, e.g. «дївчина»: retry with the
  // YI_PATTERN matches rewritten to use «і», mark as ":alt"
  if (word.length() > 3 && word.contains("ї")) {
    String word2 = YI_PATTERN.matcher(word).replaceAll("$1і");
    List<TaggedWord> wdList = wordTagger.tag(word2);
    if (!wdList.isEmpty()) {
      wdList = PosTagHelper.adjust(wdList, null, null, ":alt");
      return asAnalyzedTokenListForTaggedWordsInternal(word, wdList);
    }
  }

  // Missing apostrophe: insert one at the first MISSING_APO match and look the
  // result up; surviving readings are marked as ":bad"
  if (word.length() > 4) {
    Matcher matcher = MISSING_APO.matcher(word);
    if (matcher.find()) {
      List<TaggedWord> wdList = wordTagger.tag(matcher.replaceFirst("$1'$2"));
      // NOTE(review): filter2 presumably keeps only readings whose tag matches
      // the pattern, i.e. drops already-marked forms — confirm in PosTagHelper.
      wdList = PosTagHelper.filter2(wdList, MISSING_APO_ALLOWED_TAGS_PATTERN);
      if (!wdList.isEmpty()) {
        wdList = wdList.stream()
            .map(w -> new TaggedWord(
                w.getLemma(), PosTagHelper.addIfNotContains(w.getPosTag(), ":bad")))
            .collect(Collectors.toList());
        return asAnalyzedTokenListForTaggedWordsInternal(word, wdList);
      }
    }
  }

  // Missing hyphen: group 1 must be a known word with a "pron" tag part; the
  // readings get "-" + group 2 appended and are marked as ":bad"
  if (word.length() > 5) {
    Matcher matcher = MISSING_HYPHEN.matcher(word);
    if (matcher.matches()) {
      List<TaggedWord> wdList = wordTagger.tag(matcher.group(1).toLowerCase());
      if (!wdList.isEmpty() && PosTagHelper.hasPosTagPart2(wdList, "pron")) {
        wdList = PosTagHelper.adjust(wdList, null, "-" + matcher.group(2).toLowerCase(), ":bad");
        return asAnalyzedTokenListForTaggedWordsInternal(word, wdList);
      }
    }
  }

  // From here on, work with the word stripped of ignored characters.
  String cleanWord = Ukrainian.IGNORED_CHARS.matcher(word).replaceAll("");

  if (cleanWord.length() >= 3 && cleanWord.indexOf('-') > 0) {
    // Compounds with a quoted part, e.g.:
    //   екс-«депутат»
    //   "заступницю"-колаборантку
    if (cleanWord.length() >= 6
        && (COMPOUND_WITH_QUOTES_REGEX.matcher(cleanWord).find()
            || COMPOUND_WITH_QUOTES_REGEX2.matcher(cleanWord).find())) {
      String adjustedWord = QUOTES.matcher(cleanWord).replaceAll("");
      return getAdjustedAnalyzedTokens(cleanWord, adjustedWord, null, null, null);
    }

    try {
      return compoundTagger.guessCompoundTag(cleanWord);
    } catch (Exception e) {
      // Best effort: a failure in compound guessing must not break tagging.
      logger.error("Failed to tag \"" + cleanWord + "\"", e);
      return new ArrayList<>();
    }
  }

  return compoundTagger.guessOtherTags(cleanWord);
}

/** Word-initial «з» (any case) followed by к, п, т, ф or х and at least one more character. */
private static final Pattern Z_BEFORE_VOICELESS_PATTERN = Pattern.compile("(?iu)з[кптфх].+");

/** Matches POS tags that contain none of the listed {@code :marker} parts. */
private static final Pattern MISSING_APO_ALLOWED_TAGS_PATTERN =
    Pattern.compile("(?!.*:(bad|arch|alt|abbr|slang|subst|short|long)).*");

/** Builds a new mutable list holding one reading whose token and lemma are both {@code word}. */
private static List<AnalyzedToken> singleTokenList(String word, String posTag) {
  List<AnalyzedToken> tokens = new ArrayList<>();
  tokens.add(new AnalyzedToken(word, posTag, word));
  return tokens;
}