int AffixMgr::compound_check_morph()

in extensions/spellcheck/hunspell/src/affixmgr.cxx [2179:2709]


int AffixMgr::compound_check_morph(const char* word,
                                   int len,
                                   short wordnum,
                                   short numsyllable,
                                   short maxwordnum,
                                   short wnum,
                                   hentry** words,
                                   hentry** rwords,
                                   char hu_mov_rule,
                                   std::string& result,
                                   const std::string* partresult) {
  int i;
  short oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2;
  int ok = 0;

  struct hentry* rv = NULL;
  struct hentry* rv_first;
  std::string st;
  char ch;

  int checked_prefix;
  std::string presult;

  int cmin;
  int cmax;

  char affixed = 0;
  hentry** oldwords = words;

  // add a time limit to handle possible
  // combinatorical explosion of the overlapping words

  HUNSPELL_THREAD_LOCAL clock_t timelimit;

  if (wordnum == 0) {
      // get the start time, seeing as we're reusing this set to 0
      // to flag timeout, use clock() + 1 to avoid start clock()
      // of 0 as being a timeout
      timelimit = clock() + 1;
  }
  else if (timelimit != 0 && (clock() > timelimit + TIMELIMIT)) {
      timelimit = 0;
  }

  setcminmax(&cmin, &cmax, word, len);

  st.assign(word);

  for (i = cmin; i < cmax; i++) {
    // go to end of the UTF-8 character
    if (utf8) {
      for (; (st[i] & 0xc0) == 0x80; i++)
        ;
      if (i >= cmax)
        return 0;
    }

    words = oldwords;
    int onlycpdrule = (words) ? 1 : 0;

    do {  // onlycpdrule loop

      if (timelimit == 0)
        return 0;

      oldnumsyllable = numsyllable;
      oldwordnum = wordnum;
      checked_prefix = 0;

      ch = st[i];
      st[i] = '\0';
      sfx = NULL;

      // FIRST WORD

      affixed = 1;

      presult.clear();
      if (partresult)
        presult.append(*partresult);

      rv = lookup(st.c_str());  // perhaps without prefix

      // forbid dictionary stems with COMPOUNDFORBIDFLAG in
      // compound words, overriding the effect of COMPOUNDPERMITFLAG
      if ((rv) && compoundforbidflag &&
              TESTAFF(rv->astr, compoundforbidflag, rv->alen) && !hu_mov_rule)
          continue;

      // search homonym with compound flag
      while ((rv) && !hu_mov_rule &&
             ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
              !((compoundflag && !words && !onlycpdrule &&
                 TESTAFF(rv->astr, compoundflag, rv->alen)) ||
                (compoundbegin && !wordnum && !onlycpdrule &&
                 TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
                (compoundmiddle && wordnum && !words && !onlycpdrule &&
                 TESTAFF(rv->astr, compoundmiddle, rv->alen)) ||
                (!defcpdtable.empty() && onlycpdrule &&
                 ((!words && !wordnum &&
                   defcpd_check(&words, wnum, rv, rwords, 0)) ||
                  (words &&
                   defcpd_check(&words, wnum, rv, rwords, 0))))))) {
        rv = rv->next_homonym;
      }

      if (timelimit == 0)
        return 0;

      if (rv)
        affixed = 0;

      if (rv) {
        presult.push_back(MSEP_FLD);
        presult.append(MORPH_PART);
        presult.append(st.c_str());
        if (!HENTRY_FIND(rv, MORPH_STEM)) {
          presult.push_back(MSEP_FLD);
          presult.append(MORPH_STEM);
          presult.append(st.c_str());
        }
        if (HENTRY_DATA(rv)) {
          presult.push_back(MSEP_FLD);
          presult.append(HENTRY_DATA2(rv));
        }
      }

      if (!rv) {
        if (compoundflag &&
            !(rv =
                  prefix_check(st.c_str(), i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN,
                               compoundflag))) {
          if (((rv = suffix_check(st.c_str(), i, 0, NULL, FLAG_NULL,
                                  compoundflag,
                                  hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
               (compoundmoresuffixes &&
                (rv = suffix_check_twosfx(st.c_str(), i, 0, NULL, compoundflag)))) &&
              !hu_mov_rule && sfx->getCont() &&
              ((compoundforbidflag &&
                TESTAFF(sfx->getCont(), compoundforbidflag,
                        sfx->getContLen())) ||
               (compoundend &&
                TESTAFF(sfx->getCont(), compoundend, sfx->getContLen())))) {
            rv = NULL;
          }
        }

        if (rv ||
            (((wordnum == 0) && compoundbegin &&
              ((rv = suffix_check(st.c_str(), i, 0, NULL, FLAG_NULL,
                                  compoundbegin,
                                  hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
               (compoundmoresuffixes &&
                (rv = suffix_check_twosfx(
                     st.c_str(), i, 0, NULL,
                     compoundbegin))) ||  // twofold suffix+compound
               (rv = prefix_check(st.c_str(), i,
                                  hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN,
                                  compoundbegin)))) ||
             ((wordnum > 0) && compoundmiddle &&
              ((rv = suffix_check(st.c_str(), i, 0, NULL, FLAG_NULL,
                                  compoundmiddle,
                                  hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
               (compoundmoresuffixes &&
                (rv = suffix_check_twosfx(
                     st.c_str(), i, 0, NULL,
                     compoundmiddle))) ||  // twofold suffix+compound
               (rv = prefix_check(st.c_str(), i,
                                  hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN,
                                  compoundmiddle)))))) {
          std::string p;
          if (compoundflag)
            p = affix_check_morph(st.c_str(), i, compoundflag);
          if (p.empty()) {
            if ((wordnum == 0) && compoundbegin) {
              p = affix_check_morph(st.c_str(), i, compoundbegin);
            } else if ((wordnum > 0) && compoundmiddle) {
              p = affix_check_morph(st.c_str(), i, compoundmiddle);
            }
          }
          if (!p.empty()) {
            presult.push_back(MSEP_FLD);
            presult.append(MORPH_PART);
            presult.append(st.c_str());
            line_uniq_app(p, MSEP_REC);
            presult.append(p);
          }
          checked_prefix = 1;
        }
        // else check forbiddenwords
      } else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
                              TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
                              TESTAFF(rv->astr, needaffix, rv->alen))) {
        st[i] = ch;
        continue;
      }

      // check non_compound flag in suffix and prefix
      if ((rv) && !hu_mov_rule &&
          ((pfx && pfx->getCont() &&
            TESTAFF(pfx->getCont(), compoundforbidflag, pfx->getContLen())) ||
           (sfx && sfx->getCont() &&
            TESTAFF(sfx->getCont(), compoundforbidflag, sfx->getContLen())))) {
        continue;
      }

      // check compoundend flag in suffix and prefix
      if ((rv) && !checked_prefix && compoundend && !hu_mov_rule &&
          ((pfx && pfx->getCont() &&
            TESTAFF(pfx->getCont(), compoundend, pfx->getContLen())) ||
           (sfx && sfx->getCont() &&
            TESTAFF(sfx->getCont(), compoundend, sfx->getContLen())))) {
        continue;
      }

      // check compoundmiddle flag in suffix and prefix
      if ((rv) && !checked_prefix && (wordnum == 0) && compoundmiddle &&
          !hu_mov_rule &&
          ((pfx && pfx->getCont() &&
            TESTAFF(pfx->getCont(), compoundmiddle, pfx->getContLen())) ||
           (sfx && sfx->getCont() &&
            TESTAFF(sfx->getCont(), compoundmiddle, sfx->getContLen())))) {
        rv = NULL;
      }

      // check forbiddenwords
      if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
                                 TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen)))
        continue;

      // increment word number, if the second root has a compoundroot flag
      if ((rv) && (compoundroot) &&
          (TESTAFF(rv->astr, compoundroot, rv->alen))) {
        wordnum++;
      }

      // first word is acceptable in compound words?
      if (((rv) &&
           (checked_prefix || (words && words[wnum]) ||
            (compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
            ((oldwordnum == 0) && compoundbegin &&
             TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
            ((oldwordnum > 0) && compoundmiddle &&
             TESTAFF(rv->astr, compoundmiddle, rv->alen))
            // LANG_hu section: spec. Hungarian rule
            || ((langnum == LANG_hu) &&  // hu_mov_rule
                hu_mov_rule && (TESTAFF(rv->astr, 'F', rv->alen) ||
                                TESTAFF(rv->astr, 'G', rv->alen) ||
                                TESTAFF(rv->astr, 'H', rv->alen)))
            // END of LANG_hu section
            ) &&
           !((checkcompoundtriple && !words &&  // test triple letters
              (word[i - 1] == word[i]) &&
              (((i > 1) && (word[i - 1] == word[i - 2])) ||
               ((word[i - 1] == word[i + 1]))  // may be word[i+1] == '\0'
               )) ||
             (
                 // test CHECKCOMPOUNDPATTERN
                 !checkcpdtable.empty() && !words &&
                 cpdpat_check(word, i, rv, NULL, affixed)) ||
             (checkcompoundcase && !words && cpdcase_check(word, i))))
          // LANG_hu section: spec. Hungarian rule
          ||
          ((!rv) && (langnum == LANG_hu) && hu_mov_rule &&
           (rv = affix_check(st.c_str(), i)) &&
           (sfx && sfx->getCont() &&
            (TESTAFF(sfx->getCont(), (unsigned short)'x', sfx->getContLen()) ||
             TESTAFF(sfx->getCont(), (unsigned short)'%', sfx->getContLen()))))
          // END of LANG_hu section
          ) {
        // LANG_hu section: spec. Hungarian rule
        if (langnum == LANG_hu) {
          // calculate syllable number of the word
          numsyllable += get_syllable(st.substr(0, i));

          // + 1 word, if syllable number of the prefix > 1 (hungarian
          // convention)
          if (pfx && (get_syllable(pfx->getKey()) > 1))
            wordnum++;
        }
        // END of LANG_hu section

        // NEXT WORD(S)
        rv_first = rv;
        rv = lookup((word + i));  // perhaps without prefix

        // search homonym with compound flag
        while ((rv) && ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
                        !((compoundflag && !words &&
                           TESTAFF(rv->astr, compoundflag, rv->alen)) ||
                          (compoundend && !words &&
                           TESTAFF(rv->astr, compoundend, rv->alen)) ||
                          (!defcpdtable.empty() && words &&
                           defcpd_check(&words, wnum + 1, rv, NULL, 1))))) {
          rv = rv->next_homonym;
        }

        if (rv && words && words[wnum + 1]) {
          result.append(presult);
          result.push_back(MSEP_FLD);
          result.append(MORPH_PART);
          result.append(word + i);
          if (complexprefixes && HENTRY_DATA(rv))
            result.append(HENTRY_DATA2(rv));
          if (!HENTRY_FIND(rv, MORPH_STEM)) {
            result.push_back(MSEP_FLD);
            result.append(MORPH_STEM);
            result.append(HENTRY_WORD(rv));
          }
          // store the pointer of the hash entry
          if (!complexprefixes && HENTRY_DATA(rv)) {
            result.push_back(MSEP_FLD);
            result.append(HENTRY_DATA2(rv));
          }
          result.push_back(MSEP_REC);
          return 0;
        }

        oldnumsyllable2 = numsyllable;
        oldwordnum2 = wordnum;

        // LANG_hu section: spec. Hungarian rule
        if ((rv) && (langnum == LANG_hu) &&
            (TESTAFF(rv->astr, 'I', rv->alen)) &&
            !(TESTAFF(rv->astr, 'J', rv->alen))) {
          numsyllable--;
        }
        // END of LANG_hu section
        // increment word number, if the second root has a compoundroot flag
        if ((rv) && (compoundroot) &&
            (TESTAFF(rv->astr, compoundroot, rv->alen))) {
          wordnum++;
        }

        // check forbiddenwords
        if ((rv) && (rv->astr) &&
            (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
             TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen))) {
          st[i] = ch;
          continue;
        }

        // second word is acceptable, as a root?
        // hungarian conventions: compounding is acceptable,
        // when compound forms consist of 2 words, or if more,
        // then the syllable number of root words must be 6, or lesser.
        if ((rv) &&
            ((compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
             (compoundend && TESTAFF(rv->astr, compoundend, rv->alen))) &&
            (((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) ||
             ((cpdmaxsyllable != 0) &&
              (numsyllable + get_syllable(std::string(HENTRY_WORD(rv), rv->blen)) <=
               cpdmaxsyllable))) &&
            ((!checkcompounddup || (rv != rv_first)))) {
          // bad compound word
          result.append(presult);
          result.push_back(MSEP_FLD);
          result.append(MORPH_PART);
          result.append(word + i);

          if (HENTRY_DATA(rv)) {
            if (complexprefixes)
              result.append(HENTRY_DATA2(rv));
            if (!HENTRY_FIND(rv, MORPH_STEM)) {
              result.push_back(MSEP_FLD);
              result.append(MORPH_STEM);
              result.append(HENTRY_WORD(rv));
            }
            // store the pointer of the hash entry
            if (!complexprefixes) {
              result.push_back(MSEP_FLD);
              result.append(HENTRY_DATA2(rv));
            }
          }
          result.push_back(MSEP_REC);
          ok = 1;
        }

        numsyllable = oldnumsyllable2;
        wordnum = oldwordnum2;

        // perhaps second word has prefix or/and suffix
        sfx = NULL;
        sfxflag = FLAG_NULL;

        if (compoundflag && !onlycpdrule)
          rv = affix_check((word + i), strlen(word + i), compoundflag);
        else
          rv = NULL;

        if (!rv && compoundend && !onlycpdrule) {
          sfx = NULL;
          pfx = NULL;
          rv = affix_check((word + i), strlen(word + i), compoundend);
        }

        if (!rv && !defcpdtable.empty() && words) {
          rv = affix_check((word + i), strlen(word + i), 0, IN_CPD_END);
          if (rv && words && defcpd_check(&words, wnum + 1, rv, NULL, 1)) {
            std::string m;
            if (compoundflag)
              m = affix_check_morph((word + i), strlen(word + i), compoundflag);
            if (m.empty() && compoundend) {
              m = affix_check_morph((word + i), strlen(word + i), compoundend);
            }
            result.append(presult);
            if (!m.empty()) {
              result.push_back(MSEP_FLD);
              result.append(MORPH_PART);
              result.append(word + i);
              line_uniq_app(m, MSEP_REC);
              result.append(m);
            }
            result.push_back(MSEP_REC);
            ok = 1;
          }
        }

        // check non_compound flag in suffix and prefix
        if ((rv) &&
            ((pfx && pfx->getCont() &&
              TESTAFF(pfx->getCont(), compoundforbidflag, pfx->getContLen())) ||
             (sfx && sfx->getCont() &&
              TESTAFF(sfx->getCont(), compoundforbidflag,
                      sfx->getContLen())))) {
          rv = NULL;
        }

        // check forbiddenwords
        if ((rv) && (rv->astr) &&
            (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
             TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen)) &&
            (!TESTAFF(rv->astr, needaffix, rv->alen))) {
          st[i] = ch;
          continue;
        }

        if (langnum == LANG_hu) {
          // calculate syllable number of the word
          numsyllable += get_syllable(word + i);

          // - affix syllable num.
          // XXX only second suffix (inflections, not derivations)
          if (sfxappnd) {
            std::string tmp(sfxappnd);
            reverseword(tmp);
            numsyllable -= short(get_syllable(tmp) + sfxextra);
          } else {
            numsyllable -= short(sfxextra);
          }

          // + 1 word, if syllable number of the prefix > 1 (hungarian
          // convention)
          if (pfx && (get_syllable(pfx->getKey()) > 1))
            wordnum++;

          // increment syllable num, if last word has a SYLLABLENUM flag
          // and the suffix is beginning `s'

          if (!cpdsyllablenum.empty()) {
            switch (sfxflag) {
              case 'c': {
                numsyllable += 2;
                break;
              }
              case 'J': {
                numsyllable += 1;
                break;
              }
              case 'I': {
                if (rv && TESTAFF(rv->astr, 'J', rv->alen))
                  numsyllable += 1;
                break;
              }
            }
          }
        }

        // increment word number, if the second word has a compoundroot flag
        if ((rv) && (compoundroot) &&
            (TESTAFF(rv->astr, compoundroot, rv->alen))) {
          wordnum++;
        }
        // second word is acceptable, as a word with prefix or/and suffix?
        // hungarian conventions: compounding is acceptable,
        // when compound forms consist 2 word, otherwise
        // the syllable number of root words is 6, or lesser.
        if ((rv) &&
            (((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) ||
             ((cpdmaxsyllable != 0) && (numsyllable <= cpdmaxsyllable))) &&
            ((!checkcompounddup || (rv != rv_first)))) {
          std::string m;
          if (compoundflag)
            m = affix_check_morph((word + i), strlen(word + i), compoundflag);
          if (m.empty() && compoundend) {
            m = affix_check_morph((word + i), strlen(word + i), compoundend);
          }
          result.append(presult);
          if (!m.empty()) {
            result.push_back(MSEP_FLD);
            result.append(MORPH_PART);
            result.append(word + i);
            line_uniq_app(m, MSEP_REC);
            result.push_back(MSEP_FLD);
            result.append(m);
          }
          result.push_back(MSEP_REC);
          ok = 1;
        }

        numsyllable = oldnumsyllable2;
        wordnum = oldwordnum2;

        // perhaps second word is a compound word (recursive call)
        if ((wordnum + 2 < maxwordnum) && (ok == 0)) {
          compound_check_morph((word + i), strlen(word + i), wordnum + 1,
                               numsyllable, maxwordnum, wnum + 1, words, rwords, 0,
                               result, &presult);
        } else {
          rv = NULL;
        }
      }
      st[i] = ch;
      wordnum = oldwordnum;
      numsyllable = oldnumsyllable;

    } while (!defcpdtable.empty() && oldwordnum == 0 &&
             onlycpdrule++ < 1);  // end of onlycpd loop
  }
  return 0;
}