in extensions/spellcheck/hunspell/src/affixmgr.cxx [2179:2709]
int AffixMgr::compound_check_morph(const char* word,
int len,
short wordnum,
short numsyllable,
short maxwordnum,
short wnum,
hentry** words,
hentry** rwords,
char hu_mov_rule,
std::string& result,
const std::string* partresult) {
int i;
short oldnumsyllable, oldnumsyllable2, oldwordnum, oldwordnum2;
int ok = 0;
struct hentry* rv = NULL;
struct hentry* rv_first;
std::string st;
char ch;
int checked_prefix;
std::string presult;
int cmin;
int cmax;
char affixed = 0;
hentry** oldwords = words;
// add a time limit to handle possible
// combinatorical explosion of the overlapping words
HUNSPELL_THREAD_LOCAL clock_t timelimit;
if (wordnum == 0) {
// get the start time, seeing as we're reusing this set to 0
// to flag timeout, use clock() + 1 to avoid start clock()
// of 0 as being a timeout
timelimit = clock() + 1;
}
else if (timelimit != 0 && (clock() > timelimit + TIMELIMIT)) {
timelimit = 0;
}
setcminmax(&cmin, &cmax, word, len);
st.assign(word);
for (i = cmin; i < cmax; i++) {
// go to end of the UTF-8 character
if (utf8) {
for (; (st[i] & 0xc0) == 0x80; i++)
;
if (i >= cmax)
return 0;
}
words = oldwords;
int onlycpdrule = (words) ? 1 : 0;
do { // onlycpdrule loop
if (timelimit == 0)
return 0;
oldnumsyllable = numsyllable;
oldwordnum = wordnum;
checked_prefix = 0;
ch = st[i];
st[i] = '\0';
sfx = NULL;
// FIRST WORD
affixed = 1;
presult.clear();
if (partresult)
presult.append(*partresult);
rv = lookup(st.c_str()); // perhaps without prefix
// forbid dictionary stems with COMPOUNDFORBIDFLAG in
// compound words, overriding the effect of COMPOUNDPERMITFLAG
if ((rv) && compoundforbidflag &&
TESTAFF(rv->astr, compoundforbidflag, rv->alen) && !hu_mov_rule)
continue;
// search homonym with compound flag
while ((rv) && !hu_mov_rule &&
((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
!((compoundflag && !words && !onlycpdrule &&
TESTAFF(rv->astr, compoundflag, rv->alen)) ||
(compoundbegin && !wordnum && !onlycpdrule &&
TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
(compoundmiddle && wordnum && !words && !onlycpdrule &&
TESTAFF(rv->astr, compoundmiddle, rv->alen)) ||
(!defcpdtable.empty() && onlycpdrule &&
((!words && !wordnum &&
defcpd_check(&words, wnum, rv, rwords, 0)) ||
(words &&
defcpd_check(&words, wnum, rv, rwords, 0))))))) {
rv = rv->next_homonym;
}
if (timelimit == 0)
return 0;
if (rv)
affixed = 0;
if (rv) {
presult.push_back(MSEP_FLD);
presult.append(MORPH_PART);
presult.append(st.c_str());
if (!HENTRY_FIND(rv, MORPH_STEM)) {
presult.push_back(MSEP_FLD);
presult.append(MORPH_STEM);
presult.append(st.c_str());
}
if (HENTRY_DATA(rv)) {
presult.push_back(MSEP_FLD);
presult.append(HENTRY_DATA2(rv));
}
}
if (!rv) {
if (compoundflag &&
!(rv =
prefix_check(st.c_str(), i, hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN,
compoundflag))) {
if (((rv = suffix_check(st.c_str(), i, 0, NULL, FLAG_NULL,
compoundflag,
hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
(compoundmoresuffixes &&
(rv = suffix_check_twosfx(st.c_str(), i, 0, NULL, compoundflag)))) &&
!hu_mov_rule && sfx->getCont() &&
((compoundforbidflag &&
TESTAFF(sfx->getCont(), compoundforbidflag,
sfx->getContLen())) ||
(compoundend &&
TESTAFF(sfx->getCont(), compoundend, sfx->getContLen())))) {
rv = NULL;
}
}
if (rv ||
(((wordnum == 0) && compoundbegin &&
((rv = suffix_check(st.c_str(), i, 0, NULL, FLAG_NULL,
compoundbegin,
hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
(compoundmoresuffixes &&
(rv = suffix_check_twosfx(
st.c_str(), i, 0, NULL,
compoundbegin))) || // twofold suffix+compound
(rv = prefix_check(st.c_str(), i,
hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN,
compoundbegin)))) ||
((wordnum > 0) && compoundmiddle &&
((rv = suffix_check(st.c_str(), i, 0, NULL, FLAG_NULL,
compoundmiddle,
hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN)) ||
(compoundmoresuffixes &&
(rv = suffix_check_twosfx(
st.c_str(), i, 0, NULL,
compoundmiddle))) || // twofold suffix+compound
(rv = prefix_check(st.c_str(), i,
hu_mov_rule ? IN_CPD_OTHER : IN_CPD_BEGIN,
compoundmiddle)))))) {
std::string p;
if (compoundflag)
p = affix_check_morph(st.c_str(), i, compoundflag);
if (p.empty()) {
if ((wordnum == 0) && compoundbegin) {
p = affix_check_morph(st.c_str(), i, compoundbegin);
} else if ((wordnum > 0) && compoundmiddle) {
p = affix_check_morph(st.c_str(), i, compoundmiddle);
}
}
if (!p.empty()) {
presult.push_back(MSEP_FLD);
presult.append(MORPH_PART);
presult.append(st.c_str());
line_uniq_app(p, MSEP_REC);
presult.append(p);
}
checked_prefix = 1;
}
// else check forbiddenwords
} else if (rv->astr && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen) ||
TESTAFF(rv->astr, needaffix, rv->alen))) {
st[i] = ch;
continue;
}
// check non_compound flag in suffix and prefix
if ((rv) && !hu_mov_rule &&
((pfx && pfx->getCont() &&
TESTAFF(pfx->getCont(), compoundforbidflag, pfx->getContLen())) ||
(sfx && sfx->getCont() &&
TESTAFF(sfx->getCont(), compoundforbidflag, sfx->getContLen())))) {
continue;
}
// check compoundend flag in suffix and prefix
if ((rv) && !checked_prefix && compoundend && !hu_mov_rule &&
((pfx && pfx->getCont() &&
TESTAFF(pfx->getCont(), compoundend, pfx->getContLen())) ||
(sfx && sfx->getCont() &&
TESTAFF(sfx->getCont(), compoundend, sfx->getContLen())))) {
continue;
}
// check compoundmiddle flag in suffix and prefix
if ((rv) && !checked_prefix && (wordnum == 0) && compoundmiddle &&
!hu_mov_rule &&
((pfx && pfx->getCont() &&
TESTAFF(pfx->getCont(), compoundmiddle, pfx->getContLen())) ||
(sfx && sfx->getCont() &&
TESTAFF(sfx->getCont(), compoundmiddle, sfx->getContLen())))) {
rv = NULL;
}
// check forbiddenwords
if ((rv) && (rv->astr) && (TESTAFF(rv->astr, forbiddenword, rv->alen) ||
TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen)))
continue;
// increment word number, if the second root has a compoundroot flag
if ((rv) && (compoundroot) &&
(TESTAFF(rv->astr, compoundroot, rv->alen))) {
wordnum++;
}
// first word is acceptable in compound words?
if (((rv) &&
(checked_prefix || (words && words[wnum]) ||
(compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
((oldwordnum == 0) && compoundbegin &&
TESTAFF(rv->astr, compoundbegin, rv->alen)) ||
((oldwordnum > 0) && compoundmiddle &&
TESTAFF(rv->astr, compoundmiddle, rv->alen))
// LANG_hu section: spec. Hungarian rule
|| ((langnum == LANG_hu) && // hu_mov_rule
hu_mov_rule && (TESTAFF(rv->astr, 'F', rv->alen) ||
TESTAFF(rv->astr, 'G', rv->alen) ||
TESTAFF(rv->astr, 'H', rv->alen)))
// END of LANG_hu section
) &&
!((checkcompoundtriple && !words && // test triple letters
(word[i - 1] == word[i]) &&
(((i > 1) && (word[i - 1] == word[i - 2])) ||
((word[i - 1] == word[i + 1])) // may be word[i+1] == '\0'
)) ||
(
// test CHECKCOMPOUNDPATTERN
!checkcpdtable.empty() && !words &&
cpdpat_check(word, i, rv, NULL, affixed)) ||
(checkcompoundcase && !words && cpdcase_check(word, i))))
// LANG_hu section: spec. Hungarian rule
||
((!rv) && (langnum == LANG_hu) && hu_mov_rule &&
(rv = affix_check(st.c_str(), i)) &&
(sfx && sfx->getCont() &&
(TESTAFF(sfx->getCont(), (unsigned short)'x', sfx->getContLen()) ||
TESTAFF(sfx->getCont(), (unsigned short)'%', sfx->getContLen()))))
// END of LANG_hu section
) {
// LANG_hu section: spec. Hungarian rule
if (langnum == LANG_hu) {
// calculate syllable number of the word
numsyllable += get_syllable(st.substr(0, i));
// + 1 word, if syllable number of the prefix > 1 (hungarian
// convention)
if (pfx && (get_syllable(pfx->getKey()) > 1))
wordnum++;
}
// END of LANG_hu section
// NEXT WORD(S)
rv_first = rv;
rv = lookup((word + i)); // perhaps without prefix
// search homonym with compound flag
while ((rv) && ((needaffix && TESTAFF(rv->astr, needaffix, rv->alen)) ||
!((compoundflag && !words &&
TESTAFF(rv->astr, compoundflag, rv->alen)) ||
(compoundend && !words &&
TESTAFF(rv->astr, compoundend, rv->alen)) ||
(!defcpdtable.empty() && words &&
defcpd_check(&words, wnum + 1, rv, NULL, 1))))) {
rv = rv->next_homonym;
}
if (rv && words && words[wnum + 1]) {
result.append(presult);
result.push_back(MSEP_FLD);
result.append(MORPH_PART);
result.append(word + i);
if (complexprefixes && HENTRY_DATA(rv))
result.append(HENTRY_DATA2(rv));
if (!HENTRY_FIND(rv, MORPH_STEM)) {
result.push_back(MSEP_FLD);
result.append(MORPH_STEM);
result.append(HENTRY_WORD(rv));
}
// store the pointer of the hash entry
if (!complexprefixes && HENTRY_DATA(rv)) {
result.push_back(MSEP_FLD);
result.append(HENTRY_DATA2(rv));
}
result.push_back(MSEP_REC);
return 0;
}
oldnumsyllable2 = numsyllable;
oldwordnum2 = wordnum;
// LANG_hu section: spec. Hungarian rule
if ((rv) && (langnum == LANG_hu) &&
(TESTAFF(rv->astr, 'I', rv->alen)) &&
!(TESTAFF(rv->astr, 'J', rv->alen))) {
numsyllable--;
}
// END of LANG_hu section
// increment word number, if the second root has a compoundroot flag
if ((rv) && (compoundroot) &&
(TESTAFF(rv->astr, compoundroot, rv->alen))) {
wordnum++;
}
// check forbiddenwords
if ((rv) && (rv->astr) &&
(TESTAFF(rv->astr, forbiddenword, rv->alen) ||
TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen))) {
st[i] = ch;
continue;
}
// second word is acceptable, as a root?
// hungarian conventions: compounding is acceptable,
// when compound forms consist of 2 words, or if more,
// then the syllable number of root words must be 6, or lesser.
if ((rv) &&
((compoundflag && TESTAFF(rv->astr, compoundflag, rv->alen)) ||
(compoundend && TESTAFF(rv->astr, compoundend, rv->alen))) &&
(((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) ||
((cpdmaxsyllable != 0) &&
(numsyllable + get_syllable(std::string(HENTRY_WORD(rv), rv->blen)) <=
cpdmaxsyllable))) &&
((!checkcompounddup || (rv != rv_first)))) {
// bad compound word
result.append(presult);
result.push_back(MSEP_FLD);
result.append(MORPH_PART);
result.append(word + i);
if (HENTRY_DATA(rv)) {
if (complexprefixes)
result.append(HENTRY_DATA2(rv));
if (!HENTRY_FIND(rv, MORPH_STEM)) {
result.push_back(MSEP_FLD);
result.append(MORPH_STEM);
result.append(HENTRY_WORD(rv));
}
// store the pointer of the hash entry
if (!complexprefixes) {
result.push_back(MSEP_FLD);
result.append(HENTRY_DATA2(rv));
}
}
result.push_back(MSEP_REC);
ok = 1;
}
numsyllable = oldnumsyllable2;
wordnum = oldwordnum2;
// perhaps second word has prefix or/and suffix
sfx = NULL;
sfxflag = FLAG_NULL;
if (compoundflag && !onlycpdrule)
rv = affix_check((word + i), strlen(word + i), compoundflag);
else
rv = NULL;
if (!rv && compoundend && !onlycpdrule) {
sfx = NULL;
pfx = NULL;
rv = affix_check((word + i), strlen(word + i), compoundend);
}
if (!rv && !defcpdtable.empty() && words) {
rv = affix_check((word + i), strlen(word + i), 0, IN_CPD_END);
if (rv && words && defcpd_check(&words, wnum + 1, rv, NULL, 1)) {
std::string m;
if (compoundflag)
m = affix_check_morph((word + i), strlen(word + i), compoundflag);
if (m.empty() && compoundend) {
m = affix_check_morph((word + i), strlen(word + i), compoundend);
}
result.append(presult);
if (!m.empty()) {
result.push_back(MSEP_FLD);
result.append(MORPH_PART);
result.append(word + i);
line_uniq_app(m, MSEP_REC);
result.append(m);
}
result.push_back(MSEP_REC);
ok = 1;
}
}
// check non_compound flag in suffix and prefix
if ((rv) &&
((pfx && pfx->getCont() &&
TESTAFF(pfx->getCont(), compoundforbidflag, pfx->getContLen())) ||
(sfx && sfx->getCont() &&
TESTAFF(sfx->getCont(), compoundforbidflag,
sfx->getContLen())))) {
rv = NULL;
}
// check forbiddenwords
if ((rv) && (rv->astr) &&
(TESTAFF(rv->astr, forbiddenword, rv->alen) ||
TESTAFF(rv->astr, ONLYUPCASEFLAG, rv->alen)) &&
(!TESTAFF(rv->astr, needaffix, rv->alen))) {
st[i] = ch;
continue;
}
if (langnum == LANG_hu) {
// calculate syllable number of the word
numsyllable += get_syllable(word + i);
// - affix syllable num.
// XXX only second suffix (inflections, not derivations)
if (sfxappnd) {
std::string tmp(sfxappnd);
reverseword(tmp);
numsyllable -= short(get_syllable(tmp) + sfxextra);
} else {
numsyllable -= short(sfxextra);
}
// + 1 word, if syllable number of the prefix > 1 (hungarian
// convention)
if (pfx && (get_syllable(pfx->getKey()) > 1))
wordnum++;
// increment syllable num, if last word has a SYLLABLENUM flag
// and the suffix is beginning `s'
if (!cpdsyllablenum.empty()) {
switch (sfxflag) {
case 'c': {
numsyllable += 2;
break;
}
case 'J': {
numsyllable += 1;
break;
}
case 'I': {
if (rv && TESTAFF(rv->astr, 'J', rv->alen))
numsyllable += 1;
break;
}
}
}
}
// increment word number, if the second word has a compoundroot flag
if ((rv) && (compoundroot) &&
(TESTAFF(rv->astr, compoundroot, rv->alen))) {
wordnum++;
}
// second word is acceptable, as a word with prefix or/and suffix?
// hungarian conventions: compounding is acceptable,
// when compound forms consist 2 word, otherwise
// the syllable number of root words is 6, or lesser.
if ((rv) &&
(((cpdwordmax == -1) || (wordnum + 1 < cpdwordmax)) ||
((cpdmaxsyllable != 0) && (numsyllable <= cpdmaxsyllable))) &&
((!checkcompounddup || (rv != rv_first)))) {
std::string m;
if (compoundflag)
m = affix_check_morph((word + i), strlen(word + i), compoundflag);
if (m.empty() && compoundend) {
m = affix_check_morph((word + i), strlen(word + i), compoundend);
}
result.append(presult);
if (!m.empty()) {
result.push_back(MSEP_FLD);
result.append(MORPH_PART);
result.append(word + i);
line_uniq_app(m, MSEP_REC);
result.push_back(MSEP_FLD);
result.append(m);
}
result.push_back(MSEP_REC);
ok = 1;
}
numsyllable = oldnumsyllable2;
wordnum = oldwordnum2;
// perhaps second word is a compound word (recursive call)
if ((wordnum + 2 < maxwordnum) && (ok == 0)) {
compound_check_morph((word + i), strlen(word + i), wordnum + 1,
numsyllable, maxwordnum, wnum + 1, words, rwords, 0,
result, &presult);
} else {
rv = NULL;
}
}
st[i] = ch;
wordnum = oldwordnum;
numsyllable = oldnumsyllable;
} while (!defcpdtable.empty() && oldwordnum == 0 &&
onlycpdrule++ < 1); // end of onlycpd loop
}
return 0;
}