private def getPos()

in nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala [99:250]


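    // Shorthand accessors for token metadata. The `pos` and `lemma` properties are expected
    // to be set by an upstream enricher (see the class documentation below).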
    private def getPos(t: NCToken): String = U.getProperty(t, "pos")
    private def getLemma(t: NCToken): String = U.getProperty(t, "lemma")
    private def isQuote(t: NCToken): Boolean = Q_POS.contains(getPos(t))
    private def toLemmaKey(toks: Seq[NCToken]): String = toks.map(getLemma).mkString(" ")
    private def toOriginalKey(toks: Seq[NCToken]): String = toks.map(_.getText).mkString(" ")
    private def isStopWord(t: NCToken): Boolean = t.get[Boolean]("stopword").getOrElse(false)

    /**
      * Gets all sequential permutations of tokens in this NLP sentence.
      * This method is similar to `tokenMix`, but it also produces all combinations with
      * stopwords included and excluded.
      *
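      * For example, if `B` is the only stopword among tokens `A B C`, the result is, in order:
      * `[A, B, C]`, `[A, C]`, `[A, B]`, `[B, C]`, `[A]`, `[B]`, `[C]`.
      *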
      * @param tokens Tokens.
      * @param maxLen Maximum number of tokens in the sequence.
      */
    private[enrichers] def tokenMixWithStopWords(tokens: Seq[NCToken], maxLen: Int = Integer.MAX_VALUE): Seq[Seq[NCToken]] =
        /**
          * Gets all combinations for sequence of mandatory tokens with stopwords and without.
          *
          * Example:
          *   A(stop), B, C(stop) -> [A, B, C]; [A, B]; [B, C]; [B]
          *   A, B(stop), C(stop) -> [A, B, C]; [A, B]; [A, C]; [A]
          *
          * @param toks Tokens.
          */
        def permutations(toks: Seq[NCToken]): Seq[Seq[NCToken]] =
            // Extends each partial combination with the next token. A stopword is appended
            // both as `Some(t)` (kept) and as `None` (dropped); a regular token is always kept.
            def multiple(seq: Seq[Seq[Option[NCToken]]], t: NCToken): Seq[Seq[Option[NCToken]]] =
                if seq.isEmpty then
                    if isStopWord(t) then IndexedSeq(IndexedSeq(t.?), IndexedSeq(None)) else IndexedSeq(IndexedSeq(t.?))
                else
                    (for (subSeq <- seq) yield subSeq :+ t.?) ++ (if isStopWord(t) then for (subSeq <- seq) yield subSeq :+ None else Seq.empty)

            var res: Seq[Seq[Option[NCToken]]] = Seq.empty
            for (t <- toks) res = multiple(res, t)
            // Drop `None` placeholders and discard combinations that became empty.
            res.map(_.flatten).filter(_.nonEmpty)

        tokenMix(tokens, maxLen).
            flatMap(permutations).
            filter(_.nonEmpty).
            distinct.
            sortBy(seq => (-seq.length, seq.head.getIndex)) // Longest first, then by sentence position.

    /**
      * Gets all sequential permutations of tokens in this NLP sentence.
      *
      * For example, if NLP sentence contains "a, b, c, d" tokens, then
      * this function will return the sequence of following token sequences in this order:
      * "a b c d"
      * "a b c"
      * "b c d"
      * "a b"
      * "b c"
      * "c d"
      * "a"
      * "b"
      * "c"
      * "d"
      *
      * @param toks Tokens.
      * @param maxLen Maximum number of tokens in the sequence.
      */
    private def tokenMix(toks: Seq[NCToken], maxLen: Int = Integer.MAX_VALUE): Seq[Seq[NCToken]] =
        // All sliding windows of each length from `toks.length` down to 1, capped by `maxLen`.
        (for (n <- toks.length until 0 by -1 if n <= maxLen) yield toks.sliding(n)).flatten

import NCEnStopWordsTokenEnricher.*

/**
  * Stopword [[NCTokenEnricher token enricher]] for the English (EN) language. Stopwords are words
  * that are filtered out (i.e. stopped) before processing of natural language text because they are
  * insignificant.
  *
  * This enricher adds the `stopword` boolean [[NCPropertyMap metadata]] property to a [[NCToken token]]
  * instance if the word it represents is an English stopword. The value `true` of this metadata property
  * indicates that the word was detected as a stopword, `false` indicates otherwise. The implementation
  * combines an internal list of English stopwords with procedural logic to determine the stopword status
  * of a token. This algorithm should work well for most general use cases. Users can also add stopwords,
  * or exceptions for existing ones, using the corresponding parameters of the [[NCEnStopWordsTokenEnricher]]
  * constructor.
  *
  * More information about stopwords can be found at [[https://en.wikipedia.org/wiki/Stop_word]].
  *
  * **NOTE:** this implementation requires the `lemma` and `pos` string [[NCPropertyMap metadata]] properties
  * that contain the token's lemma and part of speech, respectively. You can configure [[NCOpenNLPTokenEnricher]]
  * with a model for the English language to provide these metadata properties; place it before this enricher
  * in your [[NCPipeline pipeline]].
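  *
  * A minimal configuration sketch (the word choices below are illustrative only):
  * {{{
  *     val enricher = new NCEnStopWordsTokenEnricher(
  *         addSet = Set("vs"),   // Additionally treat "vs" as a stopword.
  *         exclSet = Set("the")  // Never mark "the" as a stopword.
  *     )
  *
  *     // After enrichment, the result can be read from a token's metadata:
  *     // tok.get[Boolean]("stopword").getOrElse(false)
  * }}}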
  *
  * @param addSet User-defined collection of additional stopwords. These words will be stemmed by the given `stemmer`
  *         before attempting to find a match. Default value is an empty set.
  * @param exclSet User-defined collection of exceptions, i.e. words which should never be marked as stopwords during
  *         processing. These words will be stemmed by the given `stemmer` before attempting to find a match.
  *         Default value is an empty set.
  * @param stemmer English stemmer implementation. Default value is the instance of [[org.apache.nlpcraft.nlp.stemmer.NCEnStemmer NCEnStemmer]].
  */
class NCEnStopWordsTokenEnricher(
    addSet: Set[String] = Set.empty,
    exclSet: Set[String] = Set.empty,
    stemmer: NCStemmer = new NCEnStemmer
) extends NCTokenEnricher with LazyLogging:
    require(addSet != null, "Additional stopwords cannot be null.")
    require(exclSet != null, "Stopword exceptions cannot be null.")
    require(stemmer != null, "Stemmer cannot be null.")

    private var addStems: Set[String] = _
    private var exclStems: Set[String] = _
    private var percents: Set[String] = _
    private var stopWords: StopWordHolder = _
    private var exceptions: StopWordHolder = _
    private var firstWords: Set[String] = _
    private var nounWords: Set[String] = _

    private case class TokenExtra(lemma: String, stemTxt: String, stemLemma: String)
    private object TokenExtra:
        def apply(t: NCToken): TokenExtra =
            val lemma = getLemma(t)
            new TokenExtra(lemma, getStem(t.getText), getStem(lemma))

    // One-time initialization of the stem sets and stopword holders declared above.
    init()

    private def getStem(s: String): String = stemmer.stem(s.toLowerCase)
    private def toStemKey(toks: Seq[NCToken]): String = toks.map(_.getText).map(getStem).mkString(" ")
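
    // For example, with a typical English stemmer the tokens "Running" and "dogs"
    // would produce the stem key "run dog" (texts are lowercased before stemming).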

    /**
      * Stopword holder used for hash-based lookup.
      *
      * @param any Words matched regardless of POS.
      * @param includes Words matched only when tagged with the given POS.
      * @param excludes Words excluded when tagged with the given POS.
      */
    private case class HashHolder(
        any: Set[Word],
        includes: Map[String, Set[Word]],
        excludes: Map[String, Set[Word]]
    ):
        def matches(s: String, posOpt: Option[String]): Boolean =
            posOpt match
                // POS is known: the word must not be excluded for this POS, and it must be
                // either POS-independent or explicitly included for this POS.
                case Some(pos) =>
                    !excludes.getOrElse(pos, Set.empty).contains(s) &&
                        (any.contains(s) || includes.getOrElse(pos, Set.empty).contains(s))
                // POS is unknown: only POS-independent words can match.
                case _ => any.contains(s)
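
        // Illustration (hypothetical data): with `any = Set("the")` and
        // `excludes = Map("XX" -> Set("the"))`, `matches("the", Option("NN"))` is `true`,
        // while `matches("the", Option("XX"))` is `false`.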

    /**
      * Stopword holder used for wildcard scanning.
      *
      * @param any Wildcards matched regardless of POS.
      * @param includes Wildcards matched only when tagged with the given POS.
      * @param excludes Wildcards excluded when tagged with the given POS.
      */
    private case class ScanHolder(
        any: Set[Wildcard],
        includes: Map[String, Set[Wildcard]],
        excludes: Map[String, Set[Wildcard]]
    ):
        // A wildcard with both an empty prefix and an empty suffix would match any word.
        require(!any.exists { (begin, end) => begin.isEmpty && end.isEmpty }, "Wildcard cannot be empty on both sides.")