in languagetool-language-modules/uk/src/main/java/org/languagetool/tagging/uk/UkrainianTagger.java [217:390]
protected List<AnalyzedToken> getAnalyzedTokens(String word) {
  // Looks the word up via the superclass and, when the first reading has no tag,
  // retries with a series of spelling normalizations. Order of attempts:
  //   1. a backtick (not in first position) is replaced by an apostrophe before any lookup;
  //   2. words shorter than 2 characters are returned as the superclass tagged them;
  //   3. for untagged words longer than 2 characters: en-dash -> hyphen, then ONE of the
  //      alternative-spelling conversions, then (if still untagged) the prefixed-adjective,
  //      stretched-vowel and square-bracket fallbacks;
  //   4. all-uppercase words are additionally tried in proper-name capitalization;
  //   5. readings from analyzeAllCapitamizedAdj() are merged in.
  // Steps 4 and 5 run whether or not the word was tagged in the previous steps.
  //
  // NOTE: `word` is reassigned in the en-dash branch, and the reassigned value is what
  // every later step sees, even when the hyphenated lookup itself found nothing.
  if( word.indexOf('`') > 0 ) {
    word = word.replace('`', '\'');
  }

  List<AnalyzedToken> tokens = super.getAnalyzedTokens(word);

  if( word.length() < 2 ) {
    return tokens;
  }

  if( tokens.get(0).hasNoTag() && word.length() > 2 ) {
    String origWord = word;

    if( word.indexOf('\u2013') > 0
        && ALT_DASHES_IN_WORD.matcher(word).find() ) {
      // en-dash used inside a word: retry the lookup with a plain hyphen
      word = origWord.replace('\u2013', '-');

      List<AnalyzedToken> newTokens = super.getAnalyzedTokens(word);

      if( newTokens.size() > 0 && ! newTokens.get(0).hasNoTag() ) {
        // also keep a reading with null tag and lemma for the original en-dash spelling
        newTokens.add(new AnalyzedToken(origWord, null, null));
        tokens = newTokens;
      }
    }
    // The branches below are mutually exclusive: only the first matching conversion is tried.
    // convertTokens() is defined elsewhere; presumably it retries the lookup with the
    // replacement applied and appends the given tag suffix - TODO confirm.
    // try г instead of ґ
    else if( word.contains("ґ") || word.contains("Ґ") ) {
      tokens = convertTokens(tokens, word, "ґ", "г", ":alt");
    }
    else if( word.contains("ія") ) {
      tokens = convertTokens(tokens, word, "ія", "іа", ":alt");
    }
    else if( word.endsWith("тер") ) {
      tokens = convertTokens(tokens, word, "тер", "тр", ":alt");
    }
    else if( word.contains("льо") ) {
      tokens = convertTokens(tokens, word, "льо", "ло", ":alt");
    }
    else if( word.startsWith("сьвя") ) {
      tokens = convertTokens(tokens, word, "сьвя", "свя", ":arch");
    }
    else if( word.startsWith("сьві") ) {
      tokens = convertTokens(tokens, word, "сьві", "сві", ":arch");
    }
    else if( word.contains("ьск") && ! word.endsWith("ская") && ! word.equals("Комсомольском")) {
      tokens = convertTokens(tokens, word, "ьск", "ьськ", ":bad");
    }

    if( tokens.get(0).hasNoTag() ) {

      if( word.length() >= 9 ) {
        // group(1) of the pattern is used as a prefix that is put back in front of the
        // lemma found for group(2); only readings matching ADJ_POS_TAG_PATTERN are requested
        Matcher leftPartMatcher = CompoundTagger.LEFT_O_ADJ_INVALID_PATTERN.matcher(word);
        if( leftPartMatcher.matches() ) {
          String prefix = leftPartMatcher.group(1);
          String adjustedWord = leftPartMatcher.group(2);

          List<AnalyzedToken> newTokens = getAdjustedAnalyzedTokens(word, adjustedWord, ADJ_POS_TAG_PATTERN, null,
              (lemma) -> prefix + lemma);

          if( ! newTokens.isEmpty() ) {
            tokens = newTokens;
          }
        }
      }

      // гааа: collapse a vowel repeated 3+ times into a single one
      if( tokens.get(0).hasNoTag()
          && ! word.equalsIgnoreCase("ііі") ) {  // often stands for Latin number
        Matcher stretchedMatcher = STRETCHED_VOWEL_PATTERN.matcher(word);
        if( stretchedMatcher.find() ) {
          String adjustedWord = stretchedMatcher.replaceAll("$1");

          List<AnalyzedToken> newTokens = getAdjustedAnalyzedTokens(word, adjustedWord, NOT_PROP_NOT_ABBR_POS_TAG_PATTERN, ":alt",
              (lemma) -> lemma);

          if( ! newTokens.isEmpty() ) {
            tokens = newTokens;
          }
        }
      }

      // word with square brackets inside: retry with the brackets removed
      if( tokens.get(0).hasNoTag()
          && word.contains("[") && word.contains("]")
          && UkrainianWordTokenizer.WORDS_WITH_BRACKETS_PATTERN.matcher(word).find() ) {
        String adjustedWord = word.replace("[", "").replace("]", "");

        List<AnalyzedToken> newTokens = getAdjustedAnalyzedTokens(word, adjustedWord, null, ":alt",
            (lemma) -> lemma);

        if( ! newTokens.isEmpty() ) {
          tokens = newTokens;
        }
      }
    }
  }

  // try УКРАЇНА as Україна and СИРІЮ as Сирію
  if( word.length() > 2 && LemmaHelper.isAllUppercaseUk(word) ) {
    String newWord = LemmaHelper.capitalizeProperName(word);

    List<AnalyzedToken> newTokens = getAdjustedAnalyzedTokens(word, newWord, PROP_NOUN_OR_NONINFL_POS_TAG_PATTERN, null, null);

    if( newTokens.size() > 0 ) {
      if( tokens.get(0).hasNoTag() ) {
        //TODO: add special tags if necessary
        tokens = newTokens;
      }
      else {
        tokens.addAll(newTokens);
      }
    }
  }

  // Івано-Франківська as adj from івано-франківський
  List<AnalyzedToken> analyzedTokens = analyzeAllCapitamizedAdj(word);
  if( analyzedTokens.size() > 0 ) {
    if( tokens.get(0).hasNoTag() ) {
      tokens = analyzedTokens;
    }
    else {
      // compound tagging has already been performed and may have added tokens,
      // so only readings that are not present yet are appended
      for(AnalyzedToken token: analyzedTokens) {
        if( ! tokens.contains(token) ) {
          tokens.add(token);
        }
      }
    }
  }

  // Words with masked letters (бл*ть, нах#й) get no special handling in this method.

  return tokens;
}

// Patterns used by getAnalyzedTokens(); compiled once instead of on every call.

// passed to getAdjustedAnalyzedTokens() for the prefixed-adjective fallback
private static final Pattern ADJ_POS_TAG_PATTERN = Pattern.compile("^adj.*");

// a Ukrainian vowel followed by two or more repetitions of itself, ignoring case
private static final Pattern STRETCHED_VOWEL_PATTERN =
    Pattern.compile("([аеєиіїоуюя])\\1{2,}", Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);

// passed to getAdjustedAnalyzedTokens() for the stretched-vowel fallback;
// rejects strings matching "noun.*:prop" or containing "abbr"
private static final Pattern NOT_PROP_NOT_ABBR_POS_TAG_PATTERN = Pattern.compile("(?!noun.*:prop|.*abbr).*");

// passed to getAdjustedAnalyzedTokens() for the all-uppercase fallback
private static final Pattern PROP_NOUN_OR_NONINFL_POS_TAG_PATTERN = Pattern.compile("noun.*?:prop.*|noninfl.*");