in nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala [262:359]
def matches(s: String, posOpt: Option[String]): Boolean =
    // A string containing a space is rejected up front and never matches.
    if s.contains(' ') then false
    else
        // From here on a POS is mandatory; its absence is treated as a programming error.
        val pos = posOpt match
            case Some(p) => p
            case _ => throw new AssertionError("Unexpected POS.")

        // Rejected when the POS itself is excluded or the string hits a per-POS exclusion.
        def excluded: Boolean =
            exclPoses.contains(pos) || matches(s, excludes.getOrElse(pos, Set.empty))

        // Accepted when the POS itself is included, or the string hits either the
        // POS-independent set or the per-POS inclusion set.
        def included: Boolean =
            inclPoses.contains(pos) || matches(s, any) || matches(s, includes.getOrElse(pos, Set.empty))

        !excluded && included
/**
  * Stopword data holder bundling all lookup structures consulted for a token sequence.
  *
  * @param stems Holder queried with the stem key of the tokens.
  * @param lemmas Holder queried with the lemma key of the tokens.
  * @param origins Holder queried with the original-text key of the tokens.
  * @param wildcardsLemmas Wildcard holder queried with the lemma key of the tokens.
  * @param wildcardsOrigins Wildcard holder queried with the original-text key of the tokens.
  */
private case class StopWordHolder(
    stems: HashHolder,
    lemmas: HashHolder,
    origins: HashHolder,
    wildcardsLemmas: ScanHolder,
    wildcardsOrigins: ScanHolder
):
    /**
      * Tests the given tokens against every held structure, stopping at the first hit.
      *
      * @param toks Tokens to test. An empty sequence raises an `AssertionError`.
      * @return `true` if any of the holders matches.
      */
    def matches(toks: Seq[NCToken]): Boolean =
        // POS is looked up for a single token only; longer sequences are checked without it.
        val posOpt =
            if toks.isEmpty then throw new AssertionError(s"Unexpected empty tokens.")
            else if toks.size == 1 then getPos(toks.head).?
            else None

        // Hash access.
        def hashHit: Boolean =
            stems.matches(toStemKey(toks), posOpt) ||
            lemmas.matches(toLemmaKey(toks), posOpt) ||
            origins.matches(toOriginalKey(toks), posOpt)

        // Scan access.
        def scanHit: Boolean =
            wildcardsLemmas.matches(toLemmaKey(toks), posOpt) ||
            wildcardsOrigins.matches(toOriginalKey(toks), posOpt)

        hashHit || scanHit
/**
  * Initializes the enricher state:
  *  - stems the additional and excluded stopword sets and validates them,
  *  - stems the percent words,
  *  - loads the bundled English stopword configuration and splits it into
  *    regular stopwords and exceptions,
  *  - generates the first-words and noun-words sets.
  *
  * Fails with the error produced by `E` when a stem contains whitespace or when the
  * same stem is present in both the additional and the excluded sets.
  */
private def init(): Unit =
    addStems = addSet.map(getStem)
    exclStems = exclSet.map(getStem)

    // Rejects any set holding a string with a whitespace character.
    def check(name: String, set: Set[String]): Unit =
        if set.exists(_.exists(_.isWhitespace)) then throw E(s"$name contain a string with whitespaces.")

    check("Additional synonyms", addStems)
    check("Excluded synonyms", exclStems)

    // A stem cannot be both added and excluded.
    val dups = addStems.intersect(exclStems)

    // Explicit `throw`, mirroring `check` above, so the failure always propagates
    // instead of the error value being silently discarded.
    if dups.nonEmpty then
        throw E(s"Duplicate stems detected between additional and excluded stopwords [dups=${dups.mkString(",")}]")

    percents = PERCENTS.map(getStem)

    // Case sensitive.
    val m = readStopWords(U.readLines(res = "stopwords/en_stop_words.txt", filterText = true, log = logger))

    // The map is keyed by the is-exception flag.
    stopWords = m(false)
    exceptions = m(true)

    val gen = new NCEnStopWordGenerator(stemmer)

    firstWords = gen.mkFirstWords()
    nounWords = gen.mkNounWords()
/**
* Parses configuration template.
*
* @param lines Configuration file content.
* @return Stopword holders keyed by the is-exception flag.
*/
private def readStopWords(lines: Iterator[String]): Map[Boolean, StopWordHolder] =
// 1. Prepares accumulation data structure.
enum WordForm:
case STEM, LEM, ORIG
import WordForm.*
class Condition[T]:
val any = mutable.HashSet.empty[T]
val incls = mutable.HashMap.empty[String, mutable.HashSet[T]]
val excls = mutable.HashMap.empty[String, mutable.HashSet[T]]
def addCondition(cond: T, poses: Map[String, Boolean]): Any =
if poses.isEmpty then
any += cond
else
def add(m: mutable.HashMap[String, mutable.HashSet[T]], incl: Boolean): Unit =
poses.filter { (_, isIncl) => isIncl == incl }.keys.foreach(pos =>