protected List getAnalyzedTokens()

in languagetool-language-modules/uk/src/main/java/org/languagetool/tagging/uk/UkrainianTagger.java [217:390]


  /** POS-tag filter selecting adjective readings (tags starting with {@code adj}). */
  private static final Pattern ADJ_POS_TAG_PATTERN = Pattern.compile("^adj.*");

  /** Finds a Ukrainian vowel repeated three or more times in a row, ignoring case (e.g. "гааа"). */
  private static final Pattern REPEATED_VOWEL_PATTERN =
      Pattern.compile("([аеєиіїоуюя])\\1{2,}", Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);

  /** POS-tag filter rejecting tags that start with {@code noun...:prop} or contain {@code abbr}. */
  private static final Pattern NOT_PROP_NOT_ABBR_POS_TAG_PATTERN = Pattern.compile("(?!noun.*:prop|.*abbr).*");

  /** POS-tag filter selecting {@code noun...:prop} and {@code noninfl} readings. */
  private static final Pattern PROP_NOUN_OR_NONINFL_POS_TAG_PATTERN = Pattern.compile("noun.*?:prop.*|noninfl.*");

  /**
   * Tags a single word, falling back to a series of spelling adjustments when the
   * base tagger returns an untagged reading.
   *
   * <p>Steps, in order:
   * <ol>
   *   <li>Backticks are replaced with apostrophes (only when a backtick occurs after the first character),
   *       then the word is tagged by the superclass.</li>
   *   <li>Words shorter than 2 characters are returned as tagged by the superclass.</li>
   *   <li>If the first reading has no tag and the word is longer than 2 characters, exactly one of the
   *       following is tried: en dash replaced by hyphen, or one of the letter-sequence substitutions
   *       handled by {@code convertTokens} (marked {@code :alt}, {@code :arch} or {@code :bad}).</li>
   *   <li>If still untagged: invalid "left-o" adjective prefix, collapsing of repeated vowels,
   *       and removal of square brackets inside the word.</li>
   *   <li>All-uppercase words longer than 2 characters are additionally looked up in capitalized form,
   *       keeping only proper-noun and non-inflected readings.</li>
   *   <li>Readings returned by {@code analyzeAllCapitamizedAdj} are merged in (without duplicates).</li>
   * </ol>
   *
   * <p>NOTE(review): the list returned by the superclass is modified in place ({@code add}/{@code addAll}),
   * so it is assumed to be mutable and non-empty — confirm against the base tagger.
   *
   * @param word the token text to analyze
   * @return the readings for {@code word}; the first element is untagged when nothing matched
   */
  protected List<AnalyzedToken> getAnalyzedTokens(String word) {

    if( word.indexOf('`') > 0 ) {
      word = word.replace('`', '\'');
    }

    List<AnalyzedToken> tokens = super.getAnalyzedTokens(word);

    if( word.length() < 2 ) {
      return tokens;
    }

    if( tokens.get(0).hasNoTag() ) {
      String origWord = word;

      // NOTE: two experiments used to live here as commented-out code and are disabled:
      // tagging unit-like words matching "([ксмнд]|мк)?м[23²³]" and words matching
      // "[0-9]+[а-яїієґa-z]" as "noninfl".

      if( word.length() > 2 ) {
        if( word.indexOf('\u2013') > 0
            && ALT_DASHES_IN_WORD.matcher(word).find() ) {

          // retry with a regular hyphen in place of the en dash
          // NOTE(review): 'word' keeps the hyphenated form from here on, even when the retry
          // yields no tagged reading — confirm this is intended
          word = origWord.replace('\u2013', '-');

          List<AnalyzedToken> newTokens = super.getAnalyzedTokens(word);

          if( ! newTokens.isEmpty() && ! newTokens.get(0).hasNoTag() ) {
            // also keep a reading for the original spelling, with no POS tag and no lemma
            newTokens.add(new AnalyzedToken(origWord, null, null));
            tokens = newTokens;
          }
        }
        // try г instead of ґ
        else if( word.contains("ґ") || word.contains("Ґ") ) {
          tokens = convertTokens(tokens, word, "ґ", "г", ":alt");
        }
        else if( word.contains("ія") ) {
          tokens = convertTokens(tokens, word, "ія", "іа", ":alt");
        }
        else if( word.endsWith("тер") ) {
          tokens = convertTokens(tokens, word, "тер", "тр", ":alt");
        }
        else if( word.contains("льо") ) {
          tokens = convertTokens(tokens, word, "льо", "ло", ":alt");
        }
        else if( word.startsWith("сьвя") ) {
          tokens = convertTokens(tokens, word, "сьвя", "свя", ":arch");
        }
        else if( word.startsWith("сьві") ) {
          tokens = convertTokens(tokens, word, "сьві", "сві", ":arch");
        }
        else if( word.contains("ьск") && ! word.endsWith("ская") && ! word.equals("Комсомольском") ) {
          tokens = convertTokens(tokens, word, "ьск", "ьськ", ":bad");
        }

        // the word is known to be longer than 2 characters here, so no further minimum-length check is needed
        if( tokens.get(0).hasNoTag() ) {
          if( word.length() >= 9 ) {
            Matcher leftOAdjMatcher = CompoundTagger.LEFT_O_ADJ_INVALID_PATTERN.matcher(word);
            if( leftOAdjMatcher.matches() ) {
              String prefix = leftOAdjMatcher.group(1);
              String adjustedWord = leftOAdjMatcher.group(2);
              // tag the second group as an adjective and put the first group back in front of the lemma
              List<AnalyzedToken> newTokens = getAdjustedAnalyzedTokens(word, adjustedWord, ADJ_POS_TAG_PATTERN, null,
                  (lemma) -> prefix + lemma);
              if( ! newTokens.isEmpty() ) {
                tokens = newTokens;
              }
            }
          }

          // гааа
          if( tokens.get(0).hasNoTag()
              && ! word.equalsIgnoreCase("ііі") ) { // often stands for Latin number
            Matcher repeatedVowelMatcher = REPEATED_VOWEL_PATTERN.matcher(word);
            if( repeatedVowelMatcher.find() ) {
              // collapse every run of 3+ identical vowels into a single vowel
              String adjustedWord = repeatedVowelMatcher.replaceAll("$1");
              List<AnalyzedToken> newTokens = getAdjustedAnalyzedTokens(word, adjustedWord, NOT_PROP_NOT_ABBR_POS_TAG_PATTERN, ":alt",
                  (lemma) -> lemma);
              if( ! newTokens.isEmpty() ) {
                tokens = newTokens;
              }
            }
          }

          if( tokens.get(0).hasNoTag()
              && word.contains("[") && word.contains("]")
              && UkrainianWordTokenizer.WORDS_WITH_BRACKETS_PATTERN.matcher(word).find() ) {
            // drop square brackets embedded in the word and tag the remainder
            String adjustedWord = word.replace("[", "").replace("]", "");
            List<AnalyzedToken> newTokens = getAdjustedAnalyzedTokens(word, adjustedWord, null, ":alt",
                (lemma) -> lemma);
            if( ! newTokens.isEmpty() ) {
              tokens = newTokens;
            }
          }
        }
      }
    }

    // try УКРАЇНА as Україна and СИРІЮ as Сирію
    if( word.length() > 2 && LemmaHelper.isAllUppercaseUk(word) ) {

      String newWord = LemmaHelper.capitalizeProperName(word);

      List<AnalyzedToken> newTokens = getAdjustedAnalyzedTokens(word, newWord, PROP_NOUN_OR_NONINFL_POS_TAG_PATTERN, null, null);
      if( ! newTokens.isEmpty() ) {
        if( tokens.get(0).hasNoTag() ) {
          //TODO: add special tags if necessary
          tokens = newTokens;
        }
        else {
          tokens.addAll(newTokens);
        }
      }
    }

    // Івано-Франківська as adj from івано-франківський
    List<AnalyzedToken> analyzedTokens = analyzeAllCapitamizedAdj(word);
    if( ! analyzedTokens.isEmpty() ) {
      if( tokens.get(0).hasNoTag() ) {
        tokens = analyzedTokens;
      }
      else {
        // compound tagging has already been performed and may have added tokens
        for(AnalyzedToken token: analyzedTokens) {
          if( ! tokens.contains(token) ) {
            tokens.add(token);
          }
        }
      }
    }

    // NOTE: an experiment that tagged masked obscene words containing '*' or '#'
    // (e.g. бл*ть, нах#й; with a masked consonant: на#уй) via speller suggestions
    // filtered by ":vulg"/":obsc" tags used to live here as commented-out code and is disabled.

    return tokens;
  }