public List additionalTags()

in languagetool-language-modules/uk/src/main/java/org/languagetool/tagging/uk/UkrainianTagger.java [77:214]


  /**
   * Matches a word starting with «з» followed by one of к/п/т/ф/х (case-insensitive, Unicode-aware).
   * Compiled once here instead of on every call.
   */
  private static final Pattern Z_INSTEAD_OF_S_PATTERN = Pattern.compile("(?iu)з[кптфх].+");

  /**
   * Matches only POS tags that contain none of the listed markers
   * ({@code :bad}, {@code :arch}, {@code :alt}, {@code :abbr}, {@code :slang},
   * {@code :subst}, {@code :short}, {@code :long}). Compiled once here instead of on every call.
   */
  private static final Pattern NO_NONSTANDARD_MARKER_PATTERN =
      Pattern.compile("(?!.*:(bad|arch|alt|abbr|slang|subst|short|long)).*");

  /**
   * Produces readings for a word by trying a fixed sequence of heuristics; the first heuristic
   * that yields a result wins and its readings are returned immediately.
   *
   * <p>Order of the checks: plain number, Latin number, Latin number typed with Cyrillic letters,
   * time, date, entities containing {@code (} or {@code /}, hashtag, capital letter inside the word,
   * leading «з» in place of «с», «ї» in place of «і», missing apostrophe, missing hyphen,
   * hyphenated compounds, and finally {@code compoundTagger.guessOtherTags}.
   *
   * @param word the token text to analyze
   * @param wordTagger tagger used to look up corrected/normalized variants of {@code word}
   * @return the readings found by the first matching heuristic; an empty list if compound
   *         guessing throws; otherwise whatever the compound tagger returns
   *         (NOTE(review): whether that can be {@code null} is not visible here)
   */
  public List<AnalyzedToken> additionalTags(String word, WordTagger wordTagger) {
    if ( NUMBER.matcher(word).matches() ) {
      return singleTokenList(word, IPOSTag.number.getText());
    }

    if ( LATIN_NUMBER.matcher(word).matches() && !PATTERN_MD.matcher(word).matches() ) {
      return singleTokenList(word, "number:latin");
    }

    if ( LATIN_NUMBER_CYR.matcher(word).matches() ) {
      // Tagged as bad only when there is no hyphen at all, or when the part after the last
      // hyphen is accepted by the helper as an ending for the part before it.
      boolean ordinal = false;
      int dashIdx = word.lastIndexOf('-');
      if( dashIdx > 0 ) {
        String left = word.substring(0, dashIdx);
        String right = word.substring(dashIdx + 1);
        ordinal = LetterEndingForNumericHelper.isPossibleAdjAdjEnding(left, right);
      }

      if( dashIdx == -1 || ordinal ) {
        return singleTokenList(word, "number:latin:bad");
      }
      // otherwise fall through to the remaining heuristics
    }

    if ( TIME.matcher(word).matches() ) {
      return singleTokenList(word, IPOSTag.time.getText());
    }

    if ( DATE.matcher(word).matches() ) {
      return singleTokenList(word, IPOSTag.date.getText());
    }

    // '(' or '/' anywhere except the first position
    if ( word.indexOf('(') > 0 || word.indexOf('/') > 0 ) {
      Set<AnalyzedToken> newAnalyzedTokens = compoundTagger.generateEntities(word);

      if ( !newAnalyzedTokens.isEmpty() ) {
        return new ArrayList<>(newAnalyzedTokens);
      }
    }

    if ( word.startsWith("#") && HASHTAG.matcher(word).matches() ) {
      return singleTokenList(word, IPOSTag.hashtag.getText());
    }

    // Capital letter inside the word: look up the lower-cased form and mark readings with ":alt".
    // Locale.ROOT keeps the lookup key independent of the JVM default locale.
    if ( word.length() > 5 && CAPS_INSIDE_WORD.matcher(word).matches() ) {
      List<TaggedWord> wdList = wordTagger.tag(word.toLowerCase(java.util.Locale.ROOT));
      if( !wdList.isEmpty() ) {
        wdList = PosTagHelper.adjust(wdList, null, null, ":alt");
        return asAnalyzedTokenListForTaggedWordsInternal(word, wdList);
      }
    }

    // Spelling error: leading «з» written instead of «с» before к/п/т/ф/х.
    // Tag the «с» variant, then restore «з» in the lemmas and mark readings with ":alt".
    if ( word.length() > 5 && Z_INSTEAD_OF_S_PATTERN.matcher(word).matches() ) {
      String newWord = word.replaceFirst("^з", "с").replaceFirst("^З", "С");
      List<TaggedWord> wdList = compoundTagger.tagBothCases(newWord, null);
      if( !wdList.isEmpty() ) {
        wdList = wdList.stream()
            .map(w -> new TaggedWord(
                w.getLemma().replaceFirst("^с", "з").replaceFirst("^С", "З"),
                PosTagHelper.addIfNotContains(w.getPosTag(), ":alt")))
            .collect(Collectors.toList());
        return asAnalyzedTokenListForTaggedWordsInternal(word, wdList);
      }
    }

    // «ї» where YI_PATTERN expects «і», e.g. «дївчина»: retry with «і» and mark readings with ":alt".
    if( word.length() > 3 && word.contains("ї") ) {
      String word2 = YI_PATTERN.matcher(word).replaceAll("$1і");
      List<TaggedWord> wdList = wordTagger.tag(word2);
      if( !wdList.isEmpty() ) {
        wdList = PosTagHelper.adjust(wdList, null, null, ":alt");
        return asAnalyzedTokenListForTaggedWordsInternal(word, wdList);
      }
    }

    // Missing apostrophe: insert one between the two groups of the first MISSING_APO match,
    // keep only readings without non-standard markers, and mark the result with ":bad".
    if ( word.length() > 4 ) {
      Matcher matcher = MISSING_APO.matcher(word);
      if (matcher.find()) {
        List<TaggedWord> wdList = wordTagger.tag(matcher.replaceFirst("$1'$2"));
        wdList = PosTagHelper.filter2(wdList, NO_NONSTANDARD_MARKER_PATTERN);
        if( !wdList.isEmpty() ) {
          wdList = wdList.stream()
              .map(w -> new TaggedWord(w.getLemma(), PosTagHelper.addIfNotContains(w.getPosTag(), ":bad")))
              .collect(Collectors.toList());
          return asAnalyzedTokenListForTaggedWordsInternal(word, wdList);
        }
      }
    }

    // Missing hyphen: the first group must tag as something with a "pron" part; the second group
    // is passed to PosTagHelper.adjust prefixed with "-" and the readings are marked ":bad".
    // NOTE(review): presumably the third argument of adjust() is a lemma suffix - confirm.
    if ( word.length() > 5 ) {
      Matcher matcher = MISSING_HYPHEN.matcher(word);
      if (matcher.matches()) {
        List<TaggedWord> wdList = wordTagger.tag(matcher.group(1).toLowerCase(java.util.Locale.ROOT));
        if( !wdList.isEmpty() && PosTagHelper.hasPosTagPart2(wdList, "pron") ) {
          wdList = PosTagHelper.adjust(wdList, null,
              "-" + matcher.group(2).toLowerCase(java.util.Locale.ROOT), ":bad");
          return asAnalyzedTokenListForTaggedWordsInternal(word, wdList);
        }
      }
    }

    // From here on work with the word stripped of ignored characters.
    word = Ukrainian.IGNORED_CHARS.matcher(word).replaceAll("");

    if ( word.length() >= 3 && word.indexOf('-') > 0 ) {

      // Compound with one part in quotes, e.g.:
      //   екс-«депутат»
      //   "заступницю"-колаборантку
      // Tag the word with the quotes removed.
      if( word.length() >= 6
          && (COMPOUND_WITH_QUOTES_REGEX.matcher(word).find()
              || COMPOUND_WITH_QUOTES_REGEX2.matcher(word).find()) ) {
        String adjustedWord = QUOTES.matcher(word).replaceAll("");
        return getAdjustedAnalyzedTokens(word, adjustedWord, null, null, null);
      }

      try {
        return compoundTagger.guessCompoundTag(word);
      }
      catch(Exception e) {
        // Deliberate best effort: a failure on one word is logged and yields no readings.
        logger.error("Failed to tag \"" + word + "\"", e);
        return new ArrayList<>();
      }
    }

    return compoundTagger.guessOtherTags(word);
  }

  /**
   * Builds a new mutable one-element list holding an {@code AnalyzedToken} constructed from
   * {@code word}, {@code posTag} and {@code word} again (same argument order as the constructor).
   */
  private static List<AnalyzedToken> singleTokenList(String word, String posTag) {
    List<AnalyzedToken> tokens = new ArrayList<>();
    tokens.add(new AnalyzedToken(word, posTag, word));
    return tokens;
  }