in nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala [99:250]
/**
 * Reads the `pos` (part of speech) property of the given token.
 *
 * @param t Token to read the property from.
 * @return Value of the `pos` property as returned by `U.getProperty`.
 */
private def getPos(t: NCToken): String =
    U.getProperty(t, "pos")
/**
 * Reads the `lemma` property of the given token.
 *
 * @param t Token to read the property from.
 * @return Value of the `lemma` property as returned by `U.getProperty`.
 */
private def getLemma(t: NCToken): String =
    U.getProperty(t, "lemma")
/**
 * Checks whether the token's `pos` value is contained in `Q_POS`.
 *
 * @param t Token to check.
 * @return `true` if `Q_POS` contains the token's part of speech.
 */
private def isQuote(t: NCToken): Boolean =
    val pos = getPos(t)
    Q_POS.contains(pos)
/**
 * Builds a key from the lemmas of the given tokens.
 *
 * @param toks Tokens, in the order they should appear in the key.
 * @return Token lemmas joined with a single space.
 */
private def toLemmaKey(toks: Seq[NCToken]): String =
    val lemmas = toks.map(getLemma)
    lemmas.mkString(" ")
/**
 * Builds a key from the original texts of the given tokens.
 *
 * @param toks Tokens, in the order they should appear in the key.
 * @return Token texts (`getText`) joined with a single space.
 */
private def toOriginalKey(toks: Seq[NCToken]): String =
    val texts = toks.map(_.getText)
    texts.mkString(" ")
/**
 * Checks whether the given token is flagged as a stopword.
 *
 * @param t Token to check.
 * @return Value of the boolean `stopword` property, or `false` when the property is absent.
 */
private def isStopWord(t: NCToken): Boolean =
    val flag = t.get[Boolean]("stopword")
    flag.getOrElse(false)
/**
 * Builds all contiguous sub-sequences of the given tokens (see `tokenMix`) and expands each
 * of them into every variant obtained by keeping or dropping its stopword tokens.
 *
 * The result contains no empty sequences and no duplicates. It is ordered by descending
 * sequence length and, for equal lengths, by the index of the first token.
 *
 * @param tokens Tokens.
 * @param maxLen Maximum number of tokens in a contiguous sub-sequence taken from `tokens`.
 */
private[enrichers] def tokenMixWithStopWords(tokens: Seq[NCToken], maxLen: Int = Integer.MAX_VALUE): Seq[Seq[NCToken]] =
    /**
     * Expands a token sequence into all its variants with and without stopwords.
     * Non-stopword tokens are always kept; each stopword token is either kept or dropped.
     *
     * Example:
     * 'A (stop), B, C(stop) -> [A, B, C]; [A, B]; [B, C], [B]
     * 'A, B(stop), C(stop) -> [A, B, C]; [A, B]; [A, C], [A].
     *
     * @param toks Tokens.
     */
    def permutations(toks: Seq[NCToken]): Seq[Seq[NCToken]] =
        // Start from a single empty variant; every token then extends all variants built so far.
        val seed: Seq[Seq[Option[NCToken]]] = IndexedSeq(IndexedSeq.empty)

        val variants = toks.foldLeft(seed) { (acc, t) =>
            // Variants that keep the token always come first ...
            val kept = acc.map(_ :+ t.?)

            // ... followed, for stopwords only, by the variants that skip it.
            if isStopWord(t) then kept ++ acc.map(_ :+ None) else kept
        }

        // Drop the placeholders of skipped tokens and discard variants left with no tokens.
        variants.map(_.flatten).filter(_.nonEmpty)

    val expanded = tokenMix(tokens, maxLen).flatMap(permutations).filter(_.nonEmpty).distinct

    expanded.sortBy(seq => (-seq.length, seq.head.getIndex))
/**
 * Gets all contiguous sub-sequences of the given tokens, longest first.
 *
 * For example, for the tokens "a, b, c, d" this function returns the following
 * token sequences in this order:
 * "a b c d"
 * "a b c"
 * "b c d"
 * "a b"
 * "b c"
 * "c d"
 * "a"
 * "b"
 * "c"
 * "d"
 *
 * Sequences longer than `maxLen` are not produced.
 *
 * NOTE(review): earlier documentation stated that sequences with a quoted token are never
 * returned, but this method itself performs no quote filtering - confirm whether callers
 * are expected to pre-filter the tokens.
 *
 * @param toks Tokens.
 * @param maxLen Maximum number of tokens in the sequence.
 */
private def tokenMix(toks: Seq[NCToken], maxLen: Int = Integer.MAX_VALUE): Seq[Seq[NCToken]] =
    // Longest allowed window first; the range is empty when there are no tokens or `maxLen` < 1.
    val longest = math.min(toks.length, maxLen)

    (longest to 1 by -1).flatMap(n => toks.sliding(n))
import NCEnStopWordsTokenEnricher.*
/**
* Stopword [[NCTokenEnricher token enricher]] for English (EN) language. Stopwords are the words
* which are filtered out (i.e. stopped) before processing of natural language text because they are
* insignificant.
*
* This enricher adds `stopword` boolean [[NCPropertyMap metadata]] property to the [[NCToken token]]
* instance if the word it represents is an English stopword. The value `true` of this metadata property indicates that
* this word is detected as a stopword, `false` value indicates otherwise. This implementation works off the
* algorithm that uses an internal list of English stopwords as well as a procedural logic to determine the stopword
* status of the token. This algorithm should work fine for most general use cases. Users can also add
* additional stopwords or exceptions for the existing ones using corresponding parameters in [[NCEnStopWordsTokenEnricher]]
* constructor.
*
* More information about stopwords can be found at [[https://en.wikipedia.org/wiki/Stop_word]].
*
* **NOTE:** this implementation requires `lemma` and `pos` string [[NCPropertyMap metadata]] properties that
* contain token's lemma and part of speech accordingly. You can configure [[NCOpenNLPTokenEnricher]] with the model
* for English language that would provide these metadata properties before this enricher in your [[NCPipeline pipeline]].
*
* @param addSet User defined collection of additional stopwords. These words will be stemmed by the given `stemmer`
* before attempting to find a match. Default value is an empty set.
* @param exclSet User defined collection of exceptions, i.e. the words which should not be marked as stopwords during
* processing. These words will be stemmed by the given `stemmer` before attempting to find a match.
* Default value is an empty set.
* @param stemmer English stemmer implementation. Default value is the instance of [[org.apache.nlpcraft.nlp.stemmer.NCEnStemmer NCEnStemmer]].
*/
class NCEnStopWordsTokenEnricher(
addSet: Set[String] = Set.empty,
exclSet: Set[String] = Set.empty,
stemmer: NCStemmer = new NCEnStemmer
) extends NCTokenEnricher with LazyLogging:
require(addSet != null, "Additional stopwords cannot be null.")
require(exclSet != null, "Exceptions stopwords cannot be null.")
require(stemmer != null, "Stemmer cannot be null.")
// Mutable state of this enricher. All fields are left at their default (null) value at
// declaration (`= _`) and are declared before the `init()` call below.
// NOTE(review): presumably `init()` assigns all of them - its body is outside this view, confirm.
// Presumably the stems of the user-supplied `addSet` - TODO confirm.
private var addStems: Set[String] = _
// Presumably the stems of the user-supplied `exclSet` - TODO confirm.
private var exclStems: Set[String] = _
private var percents: Set[String] = _
private var stopWords: StopWordHolder = _
private var exceptions: StopWordHolder = _
private var firstWords: Set[String] = _
private var nounWords: Set[String] = _
/**
 * Per-token strings derived from a token.
 *
 * @param lemma Token lemma.
 * @param stemTxt Stem of the token text.
 * @param stemLemma Stem of the token lemma.
 */
private case class TokenExtra(lemma: String, stemTxt: String, stemLemma: String)
private object TokenExtra:
    /**
     * Derives the extra data for the given token.
     *
     * @param t Token to read the text and lemma from.
     */
    def apply(t: NCToken): TokenExtra =
        val lemma = getLemma(t)
        val stemTxt = getStem(t.getText)
        val stemLemma = getStem(lemma)

        new TokenExtra(lemma, stemTxt, stemLemma)
init()
/**
 * Stems the given string after lower-casing it.
 *
 * Lower-casing is done with `Locale.ROOT` so that the result does not depend on the JVM
 * default locale. With the no-argument `toLowerCase` an English "I" becomes a dotless "ı"
 * under e.g. a Turkish default locale, which changes the computed stem.
 *
 * @param s String to stem.
 * @return Result of `stemmer.stem` applied to the lower-cased string.
 */
private def getStem(s: String): String = stemmer.stem(s.toLowerCase(java.util.Locale.ROOT))
/**
 * Builds a key from the stems of the token texts.
 *
 * @param toks Tokens, in the order they should appear in the key.
 * @return Stems of the token texts (see `getStem`) joined with a single space.
 */
private def toStemKey(toks: Seq[NCToken]): String =
    val stems = toks.map(t => getStem(t.getText))
    stems.mkString(" ")
/**
 * Stopword holder used for exact (hash-based) lookups.
 *
 * @param any Words that match regardless of the part of speech.
 * @param includes Words that match only for the part of speech they are keyed by.
 * @param excludes Words that never match for the part of speech they are keyed by.
 */
private case class HashHolder(
    any: Set[Word],
    includes: Map[String, Set[Word]],
    excludes: Map[String, Set[Word]]
):
    /**
     * Checks whether the given word matches this holder.
     *
     * @param s Word to look up.
     * @param posOpt Optional part of speech. Without it only the `any` container is consulted.
     */
    def matches(s: String, posOpt: Option[String]): Boolean =
        // Tells whether the word is registered in the given container under the given POS.
        def listed(byPos: Map[String, Set[Word]], pos: String): Boolean =
            byPos.get(pos).exists(_.contains(s))

        posOpt match
            // An exclusion for this POS always wins.
            case Some(pos) if listed(excludes, pos) => false
            case Some(pos) => any.contains(s) || listed(includes, pos)
            case _ => any.contains(s)
/**
* stopword holder, used for scanning.
*
* @param any Any POSes container.
* @param includes Included by POS container.
* @param excludes Excluded by POS container.
*/
private case class ScanHolder(
any: Set[Wildcard],
includes: Map[String, Set[Wildcard]],
excludes: Map[String, Set[Wildcard]]
):
require(!any.exists { (begin, end) => begin.isEmpty && end.isEmpty })