def mkMap[T]()

in nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala [372:634]
163 lines of code
72 McCabe index (conditional complexity)

        // Builds an immutable map with one entry per (exception flag, word form) key.
        // `mkT` is invoked once for every key, so each key gets its own value instance.
        def mkMap[T](mkT: Unit => T): Map[Key, T] =
            val entries =
                for
                    form <- WordForm.values.toSeq
                    excFlag <- Seq(true, false)
                yield
                    val key: Key = (excFlag, form)
                    key -> mkT(())
            entries.toMap

        // Prepares collections.
        val mHash = mkMap(_ => new Condition[Word]())
        val mScan = mkMap(_ => new Condition[Wildcard]())

        // 2. Accumulates data of each parsed line.
        for (line <- lines)
            def throwError(msg: String): Unit = E(s"Invalid stop word configuration [line=$line, reason=$msg]")

            var s = line.trim

            // A single-character stop word must be a letter.
            if s.length == 1 && !s.head.isLetter then throwError("Invalid stop word")

            def checkSingle(ch: Char): Unit = if s.count(_ == ch) > 1 then throwError(s"Unexpected symbols count: $ch")

            // Confusing special symbols.
            checkSingle('@')
            checkSingle('|')
            checkSingle('*')

            val isExc = line.head == '~'
            if isExc then s = line.drop(1)
            val idxPos = s.indexOf("|")
            val poses: Map[String, Boolean] =
                if idxPos > 0 then
                    s.
                        drop(idxPos + 1).
                        trim.split(" ").
                        map(_.trim.toUpperCase).
                        filter(_.nonEmpty).
                        toSeq.
                        map(p => if p.head == '~' then p.drop(1).strip -> false else p -> true).
                        toMap
                else
                    Map.empty

            if !poses.keys.forall(POSES.contains) then throwError(s"Invalid POSes: ${poses.keys.mkString(", ")}")
            val hasPoses = poses.nonEmpty
            if hasPoses then s = s.take(idxPos).trim
            val isMultiWord = s.contains(' ')

            // Confusing POSes.
            if poses.nonEmpty && isMultiWord then throwError("POSes cannot be defined for multiple stopword.")
            var isCase = false
            if s.head == '@' then
                s = s.drop(1)
                // Empty word.
                if s.isEmpty then throwError("Empty word.")
                isCase = true
            val idxWild = s.indexOf("*")
            if idxWild >= 0 && isMultiWord then throwError("Wildcard cannot be defined for multiple stopword.")
            if idxWild < 0 then
                val (word, form) =
                    if isCase then (s, ORIG)
                    else if !hasPoses then (getStem(s), STEM) else (getStem(s), LEM)
                mHash((isExc, form)).addCondition(word, poses)
            else
                val b = s.take(idxWild)
                val e = s.drop(idxWild + 1)

                if b.isEmpty && e.isEmpty && !hasPoses then throwError("Too general wildcard definition.")
                mScan((isExc, if isCase then ORIG else LEM)).addCondition((b, e), poses)

        // 3. Converts data to service format.
        // Takes an immutable snapshot of a mutable multimap: every mutable value set is copied into an immutable Set.
        def toImmutable[T](m: mutable.HashMap[String, mutable.HashSet[T]]): Map[String, Set[T]] =
            m.iterator.map((key, values) => key -> values.toSet).toMap

        Seq(true, false).map(isExc =>
            def mkHolder[T, R](
                m: Map[(Boolean, WordForm), Condition[T]],
                form: WordForm,
                mkInstance: (Set[T], Map[String, Set[T]], Map[String, Set[T]]) => R
            ): R =
                val any = m((isExc, form)).any.toSet
                val incl = toImmutable(m((isExc, form)).incls)
                val excl = toImmutable(m((isExc, form)).excls)
                mkInstance(any ++ excl.values.flatten, incl, excl)

            def mkHash(form: WordForm): HashHolder = mkHolder(mHash, form, HashHolder.apply)
            def mkScan(form: WordForm): ScanHolder = mkHolder(mScan, form, ScanHolder.apply)

            isExc -> StopWordHolder(mkHash(STEM), mkHash(LEM), mkHash(ORIG), mkScan(LEM), mkScan(ORIG))
        ).toMap

    /**
      * Checks whether given POS tag starts with 'V'.
      * An empty tag is treated as non-verb instead of failing on the first character lookup.
      *
      * @param pos POS tag.
      */
    private def isVerb(pos: String): Boolean = pos.startsWith("V")

    /**
      * Marks words before stopword: every token which is not a stop word itself, is not an exception,
      * has POS from given list and is immediately followed by a stop word is added to `stops`.
      * Passes are repeated until one of them adds nothing new.
      *
      * @param ns Sentence.
      * @param stopPoses Stop POSes.
      * @param lastIdx Last index. Token at this index is never marked.
      * @param isException Function which return stop word exception flag.
      * @param stops Stopwords tokens. Updated in place.
      * @return Always `true`, returned once a pass marks nothing new.
      */
    @tailrec
    private def markBefore(
        ns: Seq[NCToken],
        stopPoses: Seq[String],
        lastIdx: Int,
        isException: Seq[NCToken] => Boolean,
        stops: mutable.HashSet[NCToken]
    ): Boolean =
        // Guards `ns(idx + 1)` even when `lastIdx` is not the index of the last token.
        val maxIdx = ns.size - 1

        // Tokens already collected in `stops` are not candidates, so every pass that recurses has
        // added at least one new token and the recursion is bounded by the sentence size.
        // Without this guard a matching token would match again on every pass (assuming
        // `isStopWord`, `getPos` and `isException` are deterministic for a given token), because
        // adding to `stops` changes nothing else that the filter reads.
        // NOTE(review): `isStopWord` receives only the token, so tokens collected in `stops` do not
        // count as stop words for the follower check - confirm whether they should.
        val found =
            for
                (tok, idx) <- ns.zipWithIndex
                if idx != lastIdx && idx < maxIdx && !stops.contains(tok) && !isStopWord(tok) &&
                    !isException(Seq(tok)) && stopPoses.contains(getPos(tok)) && isStopWord(ns(idx + 1))
            yield tok

        if found.isEmpty then true
        else
            stops ++= found
            markBefore(ns, stopPoses, lastIdx, isException, stops)

    /**
      * Gets the flag for given tokens from the cache. On a cache miss the flag is calculated
      * with `get`, stored in the cache and returned.
      *
      * @param toks Tokens.
      * @param cache Cache map. Updated on a miss.
      * @param get Calculation method based on given tokens.
      */
    private def exists(toks: Seq[NCToken], cache: mutable.HashMap[Seq[NCToken], Boolean], get: Seq[NCToken] => Boolean): Boolean =
        cache.getOrElseUpdate(toks, get(toks))

    /**
      * Marks as stopwords the words with POS from `POSES` which are placed right before another stopword.
      * Words whose `stemTxt` or `stemLemma` is contained in `exclStems` are never marked.
      *
      * @param ns Sentence tokens.
      * @param extraToks Extra data of the tokens. Looked up for every token of `ns`.
      * @param stops Stopwords tokens. Updated in place.
      */
    private def processCommonStops(ns: Seq[NCToken], extraToks: Map[NCToken, TokenExtra], stops: mutable.HashSet[NCToken]): Unit =
        val max = ns.size - 1

        // Repeats marking passes until one of them adds nothing new.
        // Tokens already collected in `stops` are not candidates, so every pass that recurses has
        // added at least one new token and the recursion is bounded by the sentence size.
        // Without this guard a matching token would match again on every pass (assuming
        // `isStopWord` and `getPos` are deterministic for a given token), because adding to
        // `stops` changes nothing else that the filter reads.
        // NOTE(review): `isStopWord` receives only the token, so tokens collected in `stops` do not
        // count as stop words for the follower check - confirm whether they should.
        @tailrec
        def processCommonStops0(): Unit =
            var changed = false

            for (
                (tok, idx) <- ns.zipWithIndex; extra = extraToks(tok)
                if
                    idx != max &&
                    !stops.contains(tok) &&
                    !isStopWord(tok) &&
                    !exclStems.contains(extra.stemTxt) &&
                    !exclStems.contains(extra.stemLemma) &&
                    POSES.contains(getPos(tok)) &&
                    isStopWord(ns(idx + 1))
            )
                stops += tok
                changed = true

            if changed then processCommonStops0()

        processCommonStops0()

    /** @inheritdoc */
    override def enrich(req: NCRequest, cfg: NCModelConfig, toks: List[NCToken]): Unit =
        // stopword and exceptions caches for this sentence.
        val cacheSw = mutable.HashMap.empty[Seq[NCToken], Boolean]
        val cacheEx = mutable.HashMap.empty[Seq[NCToken], Boolean]

        def isStop(toks: Seq[NCToken]): Boolean = exists(toks, cacheSw, stopWords.matches)
        def isException(toks: Seq[NCToken]): Boolean = exists(toks, cacheEx, exceptions.matches)

        val stops = mutable.HashSet.empty[NCToken]

        val extraToks =
            scala.collection.mutable.LinkedHashMap.empty[NCToken, TokenExtra] ++=
                toks.map(t => t -> TokenExtra(t))

        for ((tok, extra) <- extraToks)
            val idx = tok.getIndex
            val pos = getPos(tok)
            val lemma = extra.lemma
            val st = extra.stemTxt

            def isFirst: Boolean = idx == 0
            def isLast: Boolean = idx == toks.length - 1
            def next(): NCToken = toks(idx + 1)
            def prev(): NCToken = toks(idx - 1)
            def isCommonVerbs(firstVerb: String, secondVerb: String): Boolean =
                isVerb(pos) && lemma == secondVerb ||
                (isVerb(pos) && lemma == firstVerb && !isLast && isVerb(getPos(next())) && getLemma(next()) == secondVerb)

            // +---------------------------------+
            // | Pass #1.                        |
            // | POS tags and manual resolution. |
            // +---------------------------------+
            val stop = !isException(Seq(tok)) &&
                (// Percents after numbers.
                    // 1. Word from 'percentage' list.
                    percents.contains(st) &&
                        // 2. Number before.
                        !isFirst && getPos(prev()) == "CD" &&
                        // 3. It's last word or any words after except numbers.
                        (isLast || getPos(next()) != "CD")
                    ) ||
                // be, was, is etc. or has been etc.
                isCommonVerbs("have", "be") ||
                // be, was, is etc. or have done etc.
                isCommonVerbs("have", "do")
            if stop then stops += tok

        // +--------------------------------------+
        // | Pass #2.                             |
        // | Find all words from predefined list. |
        // +--------------------------------------+
        val buf = mutable.Buffer.empty[Seq[NCToken]]
        val mix = tokenMixWithStopWords(toks)

        for (toks <- mix if !buf.exists(_.containsSlice(toks)) && isStop(toks) && !isException(toks))
            toks.foreach(tok => stops += tok)
            buf += toks

        // Capture the token mix at this point minus the initial stopword found up to this point.
        val origToks: Seq[(Seq[NCToken], String)] =
            (for (toks <- mix) yield toks.toSeq).map(s => s -> toStemKey(s)).toSeq

        // +--------------------------------------------------+
        // | Pass #3.                                         |
        // | Check for sentence beginners from external file. |
        // +--------------------------------------------------+

        val foundKeys = new mutable.HashSet[String]()

        // All sentence first stopword + first non stop word.
        val startToks = toks.takeWhile(isStopWord) ++ toks.find(p => !isStopWord(p)).map(p => p)
        for (startTok <- startToks; tup <- origToks.filter(_._1.head == startTok); key = tup._2 if firstWords.contains(key) && !isException(tup._1))
            tup._1.foreach(tok => stops += tok)
            foundKeys += key

        // +-------------------------------------------------+
        // | Pass #4.                                        |
        // | Check for sentence beginners with ending nouns. |
        // +-------------------------------------------------+
        for (tup <- origToks; key = tup._2 if !foundKeys.contains(key) && !isException(tup._1))
            foundKeys.find(key.startsWith) match
                case Some(s) => if nounWords.contains(key.substring(s.length).strip) then tup._1.foreach(tok => stops += tok)
                case None => ()

        // +-------------------------------------------------+
        // | Pass #5.                                        |
        // | Mark words with POSes before stopwords.         |
        // +-------------------------------------------------+
        markBefore(toks, STOP_BEFORE_STOP, toks.size - 1, isException, stops)

        // +-------------------------------------------------+
        // | Pass #6.                                        |
        // | Processing additional and excluded stopword.    |
        // +-------------------------------------------------+
        def has(set: Set[String], extra: TokenExtra) = set.contains(extra.stemTxt) || set.contains(extra.stemLemma)

        for ((t, extra) <- extraToks if has(addStems, extra)) stops += t
        for ((t, _) <- stops.map(t => t -> extraToks(t)).filter { (_, extra) => has(exclSet, extra)}) stops -= t