in nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala [372:634]
/**
 * Builds an immutable map holding one freshly created value for every
 * combination of exception flag and word form.
 *
 * @param mkT Factory invoked once per key; for each word form it is called
 *            for the `true` flag first and for the `false` flag second.
 */
def mkMap[T](mkT: Unit => T): Map[Key, T] =
    val entries =
        for
            form <- WordForm.values.iterator
            exc <- Iterator(true, false)
        yield ((exc, form): Key) -> mkT(())

    entries.toMap
// Prepares collections.
val mHash = mkMap(_ => new Condition[Word]())
val mScan = mkMap(_ => new Condition[Wildcard]())
// 2. Accumulates data of each parsed line.
for (line <- lines)
def throwError(msg: String): Unit = E(s"Invalid stop word configuration [line=$line, reason=$msg]")
var s = line.trim
// A single-character word must be a letter.
if s.length == 1 && !s.head.isLetter then throwError("Invalid stop word")
def checkSingle(ch: Char): Unit = if s.count(_ == ch) > 1 then throwError(s"Unexpected symbols count: $ch")
// Confusing special symbols.
checkSingle('@')
checkSingle('|')
checkSingle('*')
val isExc = line.head == '~'
if isExc then s = line.drop(1)
val idxPos = s.indexOf("|")
val poses: Map[String, Boolean] =
if idxPos > 0 then
s.
drop(idxPos + 1).
trim.split(" ").
map(_.trim.toUpperCase).
filter(_.nonEmpty).
toSeq.
map(p => if p.head == '~' then p.drop(1).strip -> false else p -> true).
toMap
else
Map.empty
if !poses.keys.forall(POSES.contains) then throwError(s"Invalid POSes: ${poses.keys.mkString(", ")}")
val hasPoses = poses.nonEmpty
if hasPoses then s = s.take(idxPos).trim
val isMultiWord = s.contains(' ')
// Confusing POSes.
if poses.nonEmpty && isMultiWord then throwError("POSes cannot be defined for multiple stopword.")
var isCase = false
if s.head == '@' then
s = s.drop(1)
// Empty word.
if s.isEmpty then throwError("Empty word.")
isCase = true
val idxWild = s.indexOf("*")
if idxWild >= 0 && isMultiWord then throwError("Wildcard cannot be defined for multiple stopword.")
if idxWild < 0 then
val (word, form) =
if isCase then (s, ORIG)
else if !hasPoses then (getStem(s), STEM) else (getStem(s), LEM)
mHash((isExc, form)).addCondition(word, poses)
else
val b = s.take(idxWild)
val e = s.drop(idxWild + 1)
if b.isEmpty && e.isEmpty && !hasPoses then throwError("Too general wildcard definition.")
mScan((isExc, if isCase then ORIG else LEM)).addCondition((b, e), poses)
// 3. Converts data to service format.
/** Copies a mutable map of mutable sets into an immutable map of immutable sets. */
def toImmutable[T](m: mutable.HashMap[String, mutable.HashSet[T]]): Map[String, Set[T]] =
    m.iterator.map { case (key, values) => key -> values.toSet }.toMap
Seq(true, false).map(isExc =>
/**
 * Builds a holder from the condition registered for the given word form and
 * the `isExc` flag of the enclosing scope.
 *
 * The first argument passed to `mkInstance` is the condition's `any` values
 * combined with every value found in its exclusions; the second and third are
 * the immutable copies of its inclusions and exclusions.
 *
 * @param m Conditions keyed by (exception flag, word form).
 * @param form Word form to look up.
 * @param mkInstance Holder factory.
 */
def mkHolder[T, R](
    m: Map[(Boolean, WordForm), Condition[T]],
    form: WordForm,
    mkInstance: (Set[T], Map[String, Set[T]], Map[String, Set[T]]) => R
): R =
    // Single lookup; the three parts are then read in the same order as before.
    val cond = m((isExc, form))
    val anyVals = cond.any.toSet
    val included = toImmutable(cond.incls)
    val excluded = toImmutable(cond.excls)

    mkInstance(anyVals ++ excluded.values.flatten, included, excluded)
def mkHash(form: WordForm): HashHolder = mkHolder(mHash, form, HashHolder.apply)
def mkScan(form: WordForm): ScanHolder = mkHolder(mScan, form, ScanHolder.apply)
isExc -> StopWordHolder(mkHash(STEM), mkHash(LEM), mkHash(ORIG), mkScan(LEM), mkScan(ORIG))
).toMap
/**
 * Tells whether the given POS tag is a verb tag, i.e. starts with `V`.
 * An empty tag is not a verb tag (no exception is thrown for it).
 *
 * @param pos POS tag.
 */
private def isVerb(pos: String): Boolean = pos.startsWith("V")
/**
 * Marks words before stopword.
 *
 * A token is added to `stops` when it is not at `lastIdx`, is neither a stopword
 * nor an exception, its POS is listed in `stopPoses` and the token right after it
 * is a stopword. Passes are repeated until a pass adds nothing new to `stops`.
 *
 * @param ns Sentence.
 * @param stopPoses Stop POSes.
 * @param lastIdx Last index; the token at this position is never examined.
 * @param isException Function which return stop word exception flag.
 * @param stops Stopwords tokens; updated in place.
 * @return Always `true` once no further tokens can be added.
 */
@tailrec
private def markBefore(
    ns: Seq[NCToken],
    stopPoses: Seq[String],
    lastIdx: Int,
    isException: Seq[NCToken] => Boolean,
    stops: mutable.HashSet[NCToken]
): Boolean =
    var added = false

    for ((tok, idx) <- ns.zipWithIndex if idx != lastIdx && !isStopWord(tok) && !isException(Seq(tok)) &&
        stopPoses.contains(getPos(tok)) && isStopWord(ns(idx + 1)))
        // Nothing in this method changes the outcome of the checks above, so a token
        // matching once matches on every pass. Only a token that is new to `stops`
        // justifies another pass - otherwise the recursion would never end.
        if stops.add(tok) then added = true

    if added then markBefore(ns, stopPoses, lastIdx, isException, stops) else true
/**
 * Returns the flag for the given tokens, taking it from the cache when present
 * and otherwise computing it and storing it in the cache.
 *
 * @param toks Tokens; used as the cache key.
 * @param cache Cache map; gets a new entry when `toks` is not cached yet.
 * @param get Calculation method based on given tokens; invoked only on a cache miss.
 */
private def exists(toks: Seq[NCToken], cache: mutable.HashMap[Seq[NCToken], Boolean], get: Seq[NCToken] => Boolean): Boolean =
    cache.getOrElseUpdate(toks, get(toks))
/**
 * Marks as stopwords, words with POS from configured list, which also placed before another stopword.
 *
 * A token is added to `stops` when it is not the last one, is not a stopword, neither
 * its text stem nor its lemma stem is in `exclStems`, its POS is in `POSES` and the
 * token right after it is a stopword. Passes are repeated until a pass adds nothing
 * new to `stops`.
 *
 * @param ns Sentence tokens.
 * @param extraToks Extra data per token; looked up for every token of `ns`.
 * @param stops Stopwords tokens; updated in place.
 */
private def processCommonStops(ns: Seq[NCToken], extraToks: Map[NCToken, TokenExtra], stops: mutable.HashSet[NCToken]): Unit =
    // Index of the last token; it has no successor and is never examined.
    val max = ns.size - 1

    @tailrec
    def processCommonStops0(): Unit =
        var added = false

        for (
            (tok, idx) <- ns.zipWithIndex; extra = extraToks(tok)
            if
                idx != max &&
                !isStopWord(tok) &&
                !exclStems.contains(extra.stemTxt) &&
                !exclStems.contains(extra.stemLemma) &&
                POSES.contains(getPos(tok)) &&
                isStopWord(ns(idx + 1))
        )
            // Nothing here changes the outcome of the checks above between passes, so
            // another pass is started only when a token new to `stops` was found -
            // otherwise the recursion would never end.
            if stops.add(tok) then added = true

        if added then processCommonStops0()

    processCommonStops0()
/** @inheritdoc */
override def enrich(req: NCRequest, cfg: NCModelConfig, toks: List[NCToken]): Unit =
// stopword and exceptions caches for this sentence.
val cacheSw = mutable.HashMap.empty[Seq[NCToken], Boolean]
val cacheEx = mutable.HashMap.empty[Seq[NCToken], Boolean]
def isStop(toks: Seq[NCToken]): Boolean = exists(toks, cacheSw, stopWords.matches)
def isException(toks: Seq[NCToken]): Boolean = exists(toks, cacheEx, exceptions.matches)
val stops = mutable.HashSet.empty[NCToken]
val extraToks =
scala.collection.mutable.LinkedHashMap.empty[NCToken, TokenExtra] ++=
toks.map(t => t -> TokenExtra(t))
for ((tok, extra) <- extraToks)
val idx = tok.getIndex
val pos = getPos(tok)
val lemma = extra.lemma
val st = extra.stemTxt
def isFirst: Boolean = idx == 0
def isLast: Boolean = idx == toks.length - 1
def next(): NCToken = toks(idx + 1)
def prev(): NCToken = toks(idx - 1)
def isCommonVerbs(firstVerb: String, secondVerb: String): Boolean =
isVerb(pos) && lemma == secondVerb ||
(isVerb(pos) && lemma == firstVerb && !isLast && isVerb(getPos(next())) && getLemma(next()) == secondVerb)
// +---------------------------------+
// | Pass #1. |
// | POS tags and manual resolution. |
// +---------------------------------+
val stop = !isException(Seq(tok)) &&
(// Percents after numbers.
// 1. Word from 'percentage' list.
percents.contains(st) &&
// 2. Number before.
!isFirst && getPos(prev()) == "CD" &&
// 3. It's last word or any words after except numbers.
(isLast || getPos(next()) != "CD")
) ||
// be, was, is etc. or has been etc.
isCommonVerbs("have", "be") ||
// be, was, is etc. or have done etc.
isCommonVerbs("have", "do")
if stop then stops += tok
// +--------------------------------------+
// | Pass #2. |
// | Find all words from predefined list. |
// +--------------------------------------+
val buf = mutable.Buffer.empty[Seq[NCToken]]
val mix = tokenMixWithStopWords(toks)
for (toks <- mix if !buf.exists(_.containsSlice(toks)) && isStop(toks) && !isException(toks))
toks.foreach(tok => stops += tok)
buf += toks
// Capture the token mix at this point minus the initial stopword found up to this point.
val origToks: Seq[(Seq[NCToken], String)] =
(for (toks <- mix) yield toks.toSeq).map(s => s -> toStemKey(s)).toSeq
// +--------------------------------------------------+
// | Pass #3. |
// | Check for sentence beginners from external file. |
// +--------------------------------------------------+
val foundKeys = new mutable.HashSet[String]()
// All leading stopwords of the sentence plus the first non-stopword token.
val startToks = toks.takeWhile(isStopWord) ++ toks.find(p => !isStopWord(p)).map(p => p)
for (startTok <- startToks; tup <- origToks.filter(_._1.head == startTok); key = tup._2 if firstWords.contains(key) && !isException(tup._1))
tup._1.foreach(tok => stops += tok)
foundKeys += key
// +-------------------------------------------------+
// | Pass #4. |
// | Check for sentence beginners with ending nouns. |
// +-------------------------------------------------+
for (tup <- origToks; key = tup._2 if !foundKeys.contains(key) && !isException(tup._1))
foundKeys.find(key.startsWith) match
case Some(s) => if nounWords.contains(key.substring(s.length).strip) then tup._1.foreach(tok => stops += tok)
case None => ()
// +-------------------------------------------------+
// | Pass #5. |
// | Mark words with POSes before stopwords. |
// +-------------------------------------------------+
markBefore(toks, STOP_BEFORE_STOP, toks.size - 1, isException, stops)
// +-------------------------------------------------+
// | Pass #6. |
// | Processing additional and excluded stopword. |
// +-------------------------------------------------+
def has(set: Set[String], extra: TokenExtra) = set.contains(extra.stemTxt) || set.contains(extra.stemLemma)
for ((t, extra) <- extraToks if has(addStems, extra)) stops += t
for ((t, _) <- stops.map(t => t -> extraToks(t)).filter { (_, extra) => has(exclSet, extra)}) stops -= t