private List doGuessCompoundTag()

in languagetool-language-modules/uk/src/main/java/org/languagetool/tagging/uk/CompoundTagger.java [174:597]


  /**
   * Tries to guess the readings of a hyphenated word.
   * <p>
   * The word is split at its last hyphen into a left and a right part and a sequence of
   * heuristics is applied in order; the first heuristic that applies decides the result.
   * Words with more than one hyphen that do not start with a digit are delegated to
   * {@code doGuessMultiHyphens} / {@code doGuessTwoHyphens}.
   *
   * @param word the token to analyze
   * @return the guessed readings, or {@code null} when no heuristic produced a result
   *         (a few branches may return an empty list instead of {@code null})
   */
  private List<AnalyzedToken> doGuessCompoundTag(String word) {
    int dashIdx = word.lastIndexOf('-');
    // no hyphen at all (substring() below would throw) or a trailing hyphen: nothing to guess
    if( dashIdx < 0 || dashIdx == word.length() - 1 )
      return null;

    int firstDashIdx = word.indexOf('-');
    if( firstDashIdx == 0 )
      return null;

    boolean startsWithDigit = Character.isDigit(word.charAt(0));

    // more than one hyphen and not a number-like word: handled by the multi-hyphen helpers only
    if( ! startsWithDigit && dashIdx != firstDashIdx ) {
      int dashCount = StringUtils.countMatches(word, "-");

      if( dashCount >= 2
          && dashIdx > firstDashIdx + 1 ) {
        List<AnalyzedToken> tokens = doGuessMultiHyphens(word, firstDashIdx, dashIdx);
        if( tokens != null )
          return tokens;
      }
      
      if( dashCount == 2
          && dashIdx > firstDashIdx + 1 ) {
        return doGuessTwoHyphens(word, firstDashIdx, dashIdx);
      }
      
      return null;
    }

    String leftWord = word.substring(0, dashIdx);
    String rightWord = word.substring(dashIdx + 1);
    String leftWordLowerCase = leftWord.toLowerCase(conversionLocale);

    // stuttering, e.g. з-зателефоную: single letter repeated as the first letter of the right part

    if( leftWord.length() == 1 && rightWord.length() > 3 && rightWord.startsWith(leftWordLowerCase) ) {
      List<TaggedWord> rightWdList = wordTagger.tag(rightWord);
      rightWdList = PosTagHelper.adjust(rightWdList, null, null, ":alt");
      return ukrainianTagger.asAnalyzedTokenListForTaggedWordsInternal(word, rightWdList);
    }
    
    
    boolean dashPrefixMatch = dashPrefixes.containsKey( leftWord ) 
        || dashPrefixes.containsKey( leftWordLowerCase ) 
        || DASH_PREFIX_LAT_PATTERN.matcher(leftWord).matches();

    // starts with a digit or with Roman-numeral letters and is not a known dash prefix
    if( ! dashPrefixMatch 
        && (startsWithDigit || word.matches("[XLIV]+-.*")) ) {
      return matchDigitCompound(word, leftWord, rightWord);
    }

    if( Character.isDigit(rightWord.charAt(0)) ) {
      return matchNumberedProperNoun(word, leftWord, rightWord);
    }


    // авіа..., авто... are written as one word (without a hyphen)
    // TODO: but this may occur: авто-пенсіонер
    if( dashPrefixesInvalid.contains(leftWordLowerCase) ) {
      List<TaggedWord> rightWdList = tagEitherCase(rightWord);
      
      rightWdList = PosTagHelper.filter2(rightWdList, Pattern.compile("(noun|adj)(?!.*pron).*"));
      
      if( rightWdList.isEmpty() )
        return null;

      // the hyphenated spelling is marked ":bad" unless the right part is capitalized
      String extraTag = StringTools.isCapitalizedWord(rightWord) ? "" : ":bad";
      rightWdList = PosTagHelper.adjust(rightWdList, leftWord + "-", null, extraTag);
      return ukrainianTagger.asAnalyzedTokenListForTaggedWordsInternal(word, rightWdList);
    }


    // wrong spelling: пів-качана
    if( leftWordLowerCase.equals("пів")
        && Character.isLowerCase(rightWord.charAt(0)) ) {

      List<TaggedWord> rightWdList = tagEitherCase(rightWord);
      List<AnalyzedToken> rightAnalyzedTokens = ukrainianTagger.asAnalyzedTokenListForTaggedWordsInternal(rightWord, rightWdList);

      return addPluralNvTokens(word, rightAnalyzedTokens, ":bad");
    }

    List<TaggedWord> leftWdList = tagAsIsAndWithLowerCase(leftWord);


    // right-hand particles: стривай-бо, чекай-но, прийшов-таки, такий-от, такий-то, ішов-єм (arch)

    // same locale as leftWordLowerCase, so both parts are lower-cased consistently
    String rightWordLowerCase = rightWord.toLowerCase(conversionLocale);
    if( rightPartsWithLeftTagMap.containsKey(rightWordLowerCase) 
        && ! PosTagHelper.hasPosTagPart2(leftWdList, "abbr") ) {

      if( leftWdList.isEmpty() )
        return null;

      Pattern leftTagRegex = rightPartsWithLeftTagMap.get(rightWordLowerCase);

      List<AnalyzedToken> leftAnalyzedTokens = ukrainianTagger.asAnalyzedTokenListForTaggedWordsInternal(leftWord, leftWdList);
      List<AnalyzedToken> newAnalyzedTokens = new ArrayList<>(leftAnalyzedTokens.size());

      // ignore хто-то
      if( rightWordLowerCase.equals("то")
          && LemmaHelper.hasLemma(leftAnalyzedTokens, Arrays.asList("хто", "що", "чи")) )
        return null;

      for (AnalyzedToken analyzedToken : leftAnalyzedTokens) {
        String posTag = analyzedToken.getPOSTag();
        if (leftWord.equalsIgnoreCase("як") && posTag != null && posTag.contains("noun") )
          continue;
          
        // the null check must guard BOTH alternatives: '&&' binds tighter than '||',
        // so without the outer parentheses the regex would be matched against a null tag
        if( posTag != null
            && ( (leftWordLowerCase.equals("дуже") && posTag.contains("adv")) 
              || leftTagRegex.matcher(posTag).matches() ) ) {
          
          if( rightWord.equals("єм") ) {
            posTag = PosTagHelper.addIfNotContains(posTag, ":arch");
          }
          
          newAnalyzedTokens.add(new AnalyzedToken(word, posTag, analyzedToken.getLemma()));
        }
      }

      return newAnalyzedTokens.isEmpty() ? null : newAnalyzedTokens;
    }


    // по-болгарськи, по-болгарському: append "й" so the right part can be tagged below

    if( leftWord.equalsIgnoreCase("по") && SKY_PATTERN.matcher(rightWord).matches() ) {
      rightWord += "й";
    }
    
    // Пенсильванія-авеню

    if( Character.isUpperCase(leftWord.charAt(0)) && LemmaHelper.CITY_AVENU.contains(rightWordLowerCase) ) {
      String addPos = rightWord.equals("штрассе") ? ":alt" : "";
      return PosTagHelper.generateTokensForNv(word, "f", ":prop" + addPos);
    }

    // Fe-вмісний: tag the right part via "боро" + right part, then force the lemma to "вмісний"
    if( rightWordLowerCase.startsWith("вмісн") ) {
      String adjustedWord = "боро" + rightWord;
      List<TaggedWord> rightWdList = tagEitherCase(adjustedWord);
      rightWdList = rightWdList.stream().map(wd -> new TaggedWord("вмісний", wd.getPosTag())).collect(Collectors.toList());
      List<AnalyzedToken> rightAnalyzedTokens = ukrainianTagger.asAnalyzedTokenListForTaggedWordsInternal(rightWord, rightWdList);
      return generateTokensWithRighInflected(word, leftWord, rightAnalyzedTokens, IPOSTag.adj.getText(), null, Pattern.compile(":comp."));
    }

    List<TaggedWord> rightWdList = tagEitherCase(rightWord);
      
     
    if( word.toLowerCase(conversionLocale).startsWith("напів") ) {
      // напівпольської-напіванглійської
      Matcher napivMatcher = Pattern.compile("напів(.+?)-напів(.+)").matcher(word);
      if( napivMatcher.matches() ) {
        List<TaggedWord> napivLeftWdList = PosTagHelper.adjust(tagAsIsAndWithLowerCase(napivMatcher.group(1)), "напів", null);
        List<TaggedWord> napivRightWdList = rightWdList.size() > 0 ? rightWdList : PosTagHelper.adjust(tagAsIsAndWithLowerCase(napivMatcher.group(2)), "напів", null);

        if( napivLeftWdList.isEmpty() || napivRightWdList.isEmpty() )
          return null;
        
        List<AnalyzedToken> napivLeftAnalyzedTokens = ukrainianTagger.asAnalyzedTokenListForTaggedWordsInternal(napivMatcher.group(1), napivLeftWdList);
        List<AnalyzedToken> napivRightAnalyzedTokens = ukrainianTagger.asAnalyzedTokenListForTaggedWordsInternal(napivMatcher.group(2), napivRightWdList);

        List<AnalyzedToken> tagMatch = tagMatch(word, napivLeftAnalyzedTokens, napivRightAnalyzedTokens);
        if( tagMatch != null ) {
          return tagMatch;
        }
      }
    }

    Pattern tagsToRemove = Pattern.compile(":comp.|:predic|:insert");
    List<AnalyzedToken> leftAnalyzedTokens = ukrainianTagger.asAnalyzedTokenListForTaggedWordsInternal(leftWord, leftWdList);
    
    // гірко-прегірко: right part is "пре" + left part
    if( rightWord.startsWith("пре") && leftWordLowerCase.equals(rightWord.substring(3).toLowerCase(conversionLocale)) ) {
      if (PosTagHelper.hasPosTagStart2(leftWdList, "adv")) {

        return leftAnalyzedTokens.stream()
            .filter(a -> a.getPOSTag() != null && a.getPOSTag().startsWith("adv") )
            .map(a -> new AnalyzedToken(word, tagsToRemove.matcher(a.getPOSTag()).replaceAll(""), word))
            .collect(Collectors.toList());
      }
      // гіркий-прегіркий
      else if( PosTagHelper.hasPosTagStart2(leftWdList, "adj") ) {

        return leftAnalyzedTokens.stream()
            .filter(a -> a.getPOSTag() != null && a.getPOSTag().startsWith("adj") )
            .map(a -> new AnalyzedToken(word, tagsToRemove.matcher(a.getPOSTag()).replaceAll(""), a.getLemma()+"-пре"+a.getLemma()))
            .collect(Collectors.toList());
      }
    }

    // Мустафа-ага
    if( NAME_SUFFIX.contains(rightWord)
        && PosTagHelper.hasPosTagPart(leftAnalyzedTokens, "name") ) {
      List<TaggedWord> wordList = PosTagHelper.adjust(leftWdList, null, "-" + rightWord);
      return ukrainianTagger.asAnalyzedTokenListForTaggedWordsInternal(word, wordList);
    }

    // lower-case "аль-": look up the capitalized form and mark the readings ":bad"
    if( leftWord.equals("аль") ) {
      String wd = "Аль-" + rightWord;
      List<TaggedWord> wdList = wordTagger.tag(wd);
      if( wdList.size() > 0 ) {
        wdList = PosTagHelper.adjust(wdList, null, null, ":bad");
        return ukrainianTagger.asAnalyzedTokenListForTaggedWordsInternal(wd, wdList);
      }
    }

    // every heuristic below needs readings for the right part
    if( rightWdList.isEmpty() ) {
      return null;
    }

    List<AnalyzedToken> rightAnalyzedTokens = ukrainianTagger.asAnalyzedTokenListForTaggedWordsInternal(rightWord, rightWdList);

    // півгодини-годину
    if( word.startsWith("пів") 
        && PosTagHelper.hasPosTag(leftAnalyzedTokens, Pattern.compile("noun:inanim:p:v_...:nv.*")) ) {
      
      return rightAnalyzedTokens.stream()
          .filter(a -> a.getPOSTag() != null && a.getPOSTag().startsWith("noun:inanim:") )
          .map(a -> new AnalyzedToken(word, a.getPOSTag().replaceFirst(":[mfn]:", ":p:"), word))
          .collect(Collectors.toList());
      
    }

    if( leftWord.equalsIgnoreCase("по") ) {
      if( rightWord.endsWith("ому") ) {
        return poAdvMatch(word, rightAnalyzedTokens, ADJ_TAG_FOR_PO_ADV_MIS);
      }
      else if( SKYI_PATTERN.matcher(rightWord).matches() ) {
        return poAdvMatch(word, rightAnalyzedTokens, ADJ_TAG_FOR_PO_ADV_NAZ);
      }
      return null;
    }

    // both parts capitalized: pairs of proper nouns
    if( Character.isUpperCase(leftWord.charAt(0)) && Character.isUpperCase(rightWord.charAt(0)) ) {  
        // Київ-Прага
        if( PosTagHelper.hasPosTag(leftAnalyzedTokens, GEO_V_NAZ)
            && PosTagHelper.hasPosTag(rightAnalyzedTokens, GEO_V_NAZ) ) {
          return Arrays.asList(new AnalyzedToken(word, "noninfl:prop:geo", word));
        }
        // Хуана-Карлоса
        if( PosTagHelper.hasPosTag(leftAnalyzedTokens, FNAME)
            && PosTagHelper.hasPosTag(rightAnalyzedTokens, FNAME) ) {
          leftAnalyzedTokens = PosTagHelper.filter(leftAnalyzedTokens, Pattern.compile(".*fname.*"));
          rightAnalyzedTokens = PosTagHelper.filter(rightAnalyzedTokens, Pattern.compile(".*fname.*"));
          return tagMatch(word, leftAnalyzedTokens, rightAnalyzedTokens);
        }
        // e.g. "подружжя Карпа-Хансен"
        if( PosTagHelper.hasPosTag(leftAnalyzedTokens, LNAME_V_NAZ)
            && PosTagHelper.hasPosTag(rightAnalyzedTokens, LNAME_V_NAZ) ) {
          return Arrays.asList(new AnalyzedToken(word, "noninfl:prop:lname", word));
        }
        // Джеймса-Веніка
        if( PosTagHelper.hasPosTag(leftAnalyzedTokens, LNAME_V_ROD)
            && PosTagHelper.hasPosTag(rightAnalyzedTokens, LNAME_V_ROD) ) {
          return Arrays.asList(new AnalyzedToken(word, "noninfl:prop:lname", word));
        }
        // bad: Квітки-Основ'яненко
        if( PosTagHelper.hasPosTag(leftAnalyzedTokens, NAME)
            && PosTagHelper.hasPosTag(rightAnalyzedTokens, NAME) ) {
          return null;
        }
        // Україна-ЄС
        if( PosTagHelper.hasPosTag(leftAnalyzedTokens, PROP_V_NAZ)
            && PosTagHelper.hasPosTag(rightAnalyzedTokens, PROP_V_NAZ) ) {
          return Arrays.asList(new AnalyzedToken(word, "noninfl:prop", word));
        }
    }

    // exclude: Малишко-це, відносини-коли

    // був-би, but not м-б
    if( leftWord.length() > 1 && BAD_SUFFIX.contains(rightWord) ) {
      // NOTE(review): the result of adjust() is overwritten on the next line because
      // addIfNotContains() is given leftWdList rather than wordList, so the "-<right part>"
      // lemma suffix is never applied. Kept as is; confirm which lemma is intended.
      List<TaggedWord> wordList = PosTagHelper.adjust(leftWdList, null, "-" + rightWord);
      wordList = PosTagHelper.addIfNotContains(leftWdList, ":bad", null);
      return ukrainianTagger.asAnalyzedTokenListForTaggedWordsInternal(word, wordList);
    }

    // same word on both sides with one of the lemmas matched by the pattern below
    if( leftWord.equalsIgnoreCase(rightWord)
        && leftAnalyzedTokens.size() > 0
        && LemmaHelper.hasLemma(leftAnalyzedTokens, Pattern.compile("[ув]?весь|[ву]с[еі]")) ) {
      List<AnalyzedToken> tagMatch = tagMatch(word, leftAnalyzedTokens, rightAnalyzedTokens);
      if( tagMatch != null ) {
        return tagMatch.stream()
          .filter(m -> equalParts(m.getLemma()) )
          .collect(Collectors.toList());
      }
    }

    
    // a pronoun on the left is rejected unless the left part also has a numeral reading
    if( PosTagHelper.hasPosTagPart(leftAnalyzedTokens, "pron")
        && ! PosTagHelper.hasPosTagPart(leftAnalyzedTokens, "numr") )
      return null;

    // a particle/conjunction/pronoun on the right is rejected unless both parts are numerals
    if( ! leftWord.equalsIgnoreCase(rightWord) && PosTagHelper.hasPosTag(rightAnalyzedTokens, Pattern.compile("(part|conj).*|.*?:pron.*")) 
        && ! (PosTagHelper.hasPosTagStart(leftAnalyzedTokens, "numr") && PosTagHelper.hasPosTagStart(rightAnalyzedTokens, "numr")) )
      return null;

    // single letter or Latin-letter left part followed by an adjective
    List<AnalyzedToken> adjCompounds = new ArrayList<>();
    if( leftWord.matches("[А-ЯІЇЄҐa-zA-Zα-ωΑ-Ω]|[a-zA-Z-]+") ) {
        if( PosTagHelper.hasPosTag(rightAnalyzedTokens, Pattern.compile("adj(?!.*(pron|bad|slang|arch)).*")) ) {
          adjCompounds = generateTokensWithRighInflected(word, leftWord, rightAnalyzedTokens, IPOSTag.adj.getText(), null, Pattern.compile(":comp."));
        }
    }

    // майстер-класу
    
    if( dashPrefixMatch 
        && ! ( leftWord.equalsIgnoreCase("міді") && LemmaHelper.hasLemma(rightAnalyzedTokens, Arrays.asList("бронза"))) ) {

      List<AnalyzedToken> newTokens = new ArrayList<>();
      
      String extraTag = "";
      boolean lowerCased = false;
      if( dashPrefixes.containsKey( leftWord ) ) {
        extraTag = dashPrefixes.get(leftWord);
      }
      else { 
        if( dashPrefixes.containsKey( leftWordLowerCase ) ) {
          extraTag = dashPrefixes.get(leftWordLowerCase);
          if( leftWordLowerCase.matches("[а-яіїєґ']+") ) { // Інтернет-пошуковик
            lowerCased = true;
          }
        }
      }
      
      List<AnalyzedToken> newTokensNoun = getNvPrefixNounMatch(word, rightAnalyzedTokens, lowerCased ? leftWordLowerCase : leftWord, extraTag);
      if( newTokensNoun != null ) {
        newTokens.addAll(newTokensNoun);
      }
      
      // топ-десять
      if( leftWord.equalsIgnoreCase("топ") && PosTagHelper.hasPosTagPart(rightAnalyzedTokens, "numr:") ) {
        return generateTokensWithRighInflected(word, leftWord, rightAnalyzedTokens, "numr:", ":bad", null);
      }

      // fall back to the adjective compounds when no noun reading was produced
      if( newTokens.isEmpty() ) {
        newTokens.addAll(adjCompounds);
      }
      
      return newTokens;
    }

    if( adjCompounds.size() > 0 )
      return adjCompounds;
    
    // пів-України

    if( Character.isUpperCase(rightWord.charAt(0)) ) {
      if (word.startsWith("пів-")) {
        return addPluralNvTokens(word, rightAnalyzedTokens, ":up92");
      }
      else {
        // we don't want Нью-Париж but want Австрійсько-Карпатський
        if( StringTools.isCapitalizedWord(rightWord)
            || leftWord.endsWith("о")
            || PosTagHelper.hasPosTag(rightAnalyzedTokens, Pattern.compile("adj.*")) ) {

          // tag both Чорноморське/noun and чорноморське/adj
          List<TaggedWord> rightWdList2 = tagAsIsAndWithLowerCase(rightWord);
          List<AnalyzedToken> rightAnalyzedTokens2 = ukrainianTagger.asAnalyzedTokenListForTaggedWordsInternal(rightWord, rightWdList2);

          List<AnalyzedToken> match = tryOWithAdj(word, leftWord, rightAnalyzedTokens2);
          if( match != null )
            return match;
        }

        // Жінka-style pairs such as Жінка-Актриса (two common nouns) continue to the
        // generic matching below; anything else with a capitalized right part is rejected
        if( ! ( PosTagHelper.hasPosTag(leftAnalyzedTokens, Pattern.compile("noun(?!.prop).*")) 
              && PosTagHelper.hasPosTag(rightAnalyzedTokens, Pattern.compile("noun(?!.prop).*")) ) ) {
          return null;
        }
      }
    }

    // don't allow: Донець-кий, зовнішньо-економічний, мас-штаби

    // allow га-га!

    List<AnalyzedToken> noDashAnalyzedTokens = new ArrayList<>();
    
    // unless the left part is an interjection, check whether the word without hyphens is known
    boolean hasIntj = PosTagHelper.hasPosTagStart(leftAnalyzedTokens, "intj");
    if( ! hasIntj ) {
      String noDashWord = word.replace("-", "");
      List<TaggedWord> noDashWordList = tagAsIsAndWithLowerCase(noDashWord);
      noDashAnalyzedTokens = ukrainianTagger.asAnalyzedTokenListForTaggedWordsInternal(noDashWord, noDashWordList);
    }


    // вгору-вниз, лікар-гомеопат, жило-було

    if( noDashAnalyzedTokens.isEmpty() ) {
      if( ! leftWdList.isEmpty() && (leftWord.length() > 2 || hasIntj) ) {
        List<AnalyzedToken> tagMatch = tagMatch(word, leftAnalyzedTokens, rightAnalyzedTokens);
        if( tagMatch != null ) {
          return tagMatch;
        }
      }
    }

    List<AnalyzedToken> match = tryOWithAdj(word, leftWord, rightAnalyzedTokens);
    if( match != null )
      return match;

    // nothing matched: record the word for debugging and give up
    compoundDebugLogger.logUnknownCompound(word);
    
    return null;
  }