private void Backtrace()

in src/Lucene.Net.Analysis.Kuromoji/JapaneseTokenizer.cs [881:1190]


        /// <summary>
        /// Backtraces the best (least-cost) path from <paramref name="endPosData"/> back to
        /// <c>lastBackTracePos</c>, converting each arc on that path into a <c>Token</c>
        /// appended to <c>pending</c>.  Because the walk is backwards, tokens are added in
        /// reverse surface order; they are later served from the end of <c>pending</c>.
        /// In search mode this may also pursue a pruned "2nd best" sub-path so that both a
        /// long compound token and its decompounded pieces are emitted.  On completion,
        /// advances <c>lastBackTracePos</c> and releases the consumed portions of the
        /// character buffer and the positions ring buffer.
        /// </summary>
        /// <param name="endPosData">Lattice position where all live paths converge; the backtrace starts (i.e. ends textually) here.</param>
        /// <param name="fromIDX">Index of the arriving arc in <paramref name="endPosData"/> to begin tracing from.</param>
        private void Backtrace(Position endPosData, int fromIDX)
        {
            int endPos = endPosData.pos;

            /*
             * LUCENE-10059: If the endPos is the same as lastBackTracePos, we don't want to backtrace to
             * avoid an assertion error {@link RollingCharBuffer#get(int)} when it tries to generate an
             * empty buffer
             */
            if (endPos == lastBackTracePos)
            {
                return;
            }

            if (VERBOSE)
            {
                Console.WriteLine("\n  backtrace: endPos=" + endPos + " pos=" + this.pos + "; " + (this.pos - lastBackTracePos) + " characters; last=" + lastBackTracePos + " cost=" + endPosData.costs[fromIDX]);
            }

            // Raw characters spanning the region being backtraced; every token offset
            // computed below is relative to lastBackTracePos within this fragment.
            char[] fragment = buffer.Get(lastBackTracePos, endPos - lastBackTracePos);

            if (dotOut != null)
            {
                dotOut.OnBacktrace(this, positions, lastBackTracePos, endPosData, fromIDX, fragment, end);
            }

            int pos = endPos;
            int bestIDX = fromIDX;
            // Pending compound token (search mode only): held until the alternate
            // decompounded path re-joins the compound's start position.
            Token altToken = null;

            // We trace backwards, so this will be the leftWordID of
            // the token after the one we are now on:
            int lastLeftWordID = -1;

            // Number of (non-punctuation) tokens emitted since the alternate path
            // diverged; becomes the compound altToken's PositionLength when it is output.
            int backCount = 0;

            // TODO: sort of silly to make Token instances here; the
            // back trace has all info needed to generate the
            // token.  So, we could just directly set the attrs,
            // from the backtrace, in incrementToken w/o ever
            // creating Token; we'd have to defer calling freeBefore
            // until after the backtrace was fully "consumed" by
            // incrementToken.

            while (pos > lastBackTracePos)
            {
                //System.out.println("BT: back pos=" + pos + " bestIDX=" + bestIDX);
                Position posData = positions.Get(pos);
                if (Debugging.AssertsEnabled) Debugging.Assert(bestIDX < posData.count);

                // Arc we are traversing: it spans [backPos, pos) with type/ID recorded
                // when the forward pass extended the lattice.
                int backPos = posData.backPos[bestIDX];
                if (Debugging.AssertsEnabled) Debugging.Assert(backPos >= lastBackTracePos,"backPos={0} vs lastBackTracePos={1}", backPos, lastBackTracePos);
                int length = pos - backPos;
                JapaneseTokenizerType backType = posData.backType[bestIDX];
                int backID = posData.backID[bestIDX];
                int nextBestIDX = posData.backIndex[bestIDX];

                if (outputCompounds && searchMode && altToken is null && backType != JapaneseTokenizerType.USER)
                {

                    // In searchMode, if best path had picked a too-long
                    // token, we use the "penalty" to compute the allowed
                    // max cost of an alternate back-trace.  If we find an
                    // alternate back trace with cost below that
                    // threshold, we pursue it instead (but also output
                    // the long token).
                    //System.out.println("    2nd best backPos=" + backPos + " pos=" + pos);

                    int penalty = ComputeSecondBestThreshold(backPos, pos - backPos);

                    if (penalty > 0)
                    {
                        if (VERBOSE)
                        {
                            Console.WriteLine("  compound=" + new string(buffer.Get(backPos, pos - backPos)) + " backPos=" + backPos + " pos=" + pos + " penalty=" + penalty + " cost=" + posData.costs[bestIDX] + " bestIDX=" + bestIDX + " lastLeftID=" + lastLeftWordID);
                        }

                        // Use the penalty to set maxCost on the 2nd best
                        // segmentation:
                        int maxCost = posData.costs[bestIDX] + penalty;
                        if (lastLeftWordID != -1)
                        {
                            // Include the bigram connection cost to the token on our right,
                            // so the comparison against alternates below is apples-to-apples.
                            maxCost += costs.Get(GetDict(backType).GetRightId(backID), lastLeftWordID);
                        }

                        // Now, prune all too-long tokens from the graph:
                        PruneAndRescore(backPos, pos,
                                        posData.backIndex[bestIDX]);

                        // Finally, find 2nd best back-trace and resume
                        // backtrace there:
                        int leastCost = int.MaxValue;
                        int leastIDX = -1;
                        for (int idx = 0; idx < posData.count; idx++)
                        {
                            int cost = posData.costs[idx];
                            //System.out.println("    idx=" + idx + " prevCost=" + cost);

                            if (lastLeftWordID != -1)
                            {
                                cost += costs.Get(GetDict(posData.backType[idx]).GetRightId(posData.backID[idx]),
                                                  lastLeftWordID);
                                //System.out.println("      += bgCost=" + costs.get(getDict(posData.backType[idx]).getRightId(posData.backID[idx]),
                                //lastLeftWordID) + " -> " + cost);
                            }
                            //System.out.println("penalty " + posData.backPos[idx] + " to " + pos);
                            //cost += computePenalty(posData.backPos[idx], pos - posData.backPos[idx]);
                            if (cost < leastCost)
                            {
                                //System.out.println("      ** ");
                                leastCost = cost;
                                leastIDX = idx;
                            }
                        }
                        //System.out.println("  leastIDX=" + leastIDX);

                        if (VERBOSE)
                        {
                            Console.WriteLine("  afterPrune: " + posData.count + " arcs arriving; leastCost=" + leastCost + " vs threshold=" + maxCost + " lastLeftWordID=" + lastLeftWordID);
                        }

                        // Only divert to the 2nd-best path if it is cheap enough AND it
                        // genuinely differs from the compound arc (different start pos):
                        if (leastIDX != -1 && leastCost <= maxCost && posData.backPos[leastIDX] != backPos)
                        {
                            // We should have pruned the altToken from the graph:
                            if (Debugging.AssertsEnabled) Debugging.Assert(posData.backPos[leastIDX] != backPos);

                            // Save the current compound token, to output when
                            // this alternate path joins back:
                            altToken = new Token(backID,
                                                 fragment,
                                                 backPos - lastBackTracePos,
                                                 length,
                                                 backType,
                                                 backPos,
                                                 GetDict(backType));

                            // Redirect our backtrace to 2nd best:
                            bestIDX = leastIDX;
                            nextBestIDX = posData.backIndex[bestIDX];

                            backPos = posData.backPos[bestIDX];
                            length = pos - backPos;
                            backType = posData.backType[bestIDX];
                            backID = posData.backID[bestIDX];
                            backCount = 0;
                            //System.out.println("  do alt token!");

                        }
                        else
                        {
                            // I think in theory it's possible there is no
                            // 2nd best path, which is fine; in this case we
                            // only output the compound token:
                            //System.out.println("  no alt token! bestIDX=" + bestIDX);
                        }
                    }
                }

                // Offset of this token within fragment:
                int offset = backPos - lastBackTracePos;
                if (Debugging.AssertsEnabled) Debugging.Assert(offset >= 0);

                if (altToken != null && altToken.Position >= backPos)
                {

                    // We've backtraced to the position where the
                    // compound token starts; add it now:

                    // The pruning we did when we created the altToken
                    // ensures that the back trace will align back with
                    // the start of the altToken:
                    if (Debugging.AssertsEnabled) Debugging.Assert(altToken.Position == backPos, "{0} vs {1}", altToken.Position, backPos);

                    // NOTE: not quite right: the compound token may
                    // have had all punctuation back traced so far, but
                    // then the decompounded token at this position is
                    // not punctuation.  In this case backCount is 0,
                    // but we should maybe add the altToken anyway...?

                    if (backCount > 0)
                    {
                        // +1 so the compound spans all of its decompounded pieces
                        // in the token graph:
                        backCount++;
                        altToken.PositionLength = backCount;
                        if (VERBOSE)
                        {
                            Console.WriteLine("    add altToken=" + altToken);
                        }
                        pending.Add(altToken);
                    }
                    else
                    {
                        // This means alt token was all punct tokens:
                        if (VERBOSE)
                        {
                            Console.WriteLine("    discard all-punctuation altToken=" + altToken);
                        }
                        if (Debugging.AssertsEnabled) Debugging.Assert(discardPunctuation);
                    }
                    altToken = null;
                }

                IDictionary dict = GetDict(backType);

                if (backType == JapaneseTokenizerType.USER)
                {

                    // Expand the phraseID we recorded into the actual
                    // segmentation:
                    int[] wordIDAndLength = userDictionary.LookupSegmentation(backID);
                    int wordID = wordIDAndLength[0];
                    int current = 0;
                    // wordIDAndLength[0] is the base word ID; entries [1..] are the
                    // segment lengths, each mapping to consecutive word IDs.
                    for (int j = 1; j < wordIDAndLength.Length; j++)
                    {
                        int len = wordIDAndLength[j];
                        //System.out.println("    add user: len=" + len);
                        pending.Add(new Token(wordID + j - 1,
                                              fragment,
                                              current + offset,
                                              len,
                                              JapaneseTokenizerType.USER,
                                              current + backPos,
                                              dict));
                        if (VERBOSE)
                        {
                            Console.WriteLine("    add USER token=" + pending[pending.Count - 1]);
                        }
                        current += len;
                    }

                    // Reverse the tokens we just added, because when we
                    // serve them up from incrementToken we serve in
                    // reverse:
                    pending.Reverse(pending.Count - (wordIDAndLength.Length - 1),
                        wordIDAndLength.Length - 1); // LUCENENET: Converted from reverse on SubList to reverse on List<T> and converted end index to length


                    backCount += wordIDAndLength.Length - 1;
                }
                else
                {

                    if (extendedMode && backType == JapaneseTokenizerType.UNKNOWN)
                    {
                        // In EXTENDED mode we convert unknown word into
                        // unigrams:
                        int unigramTokenCount = 0;
                        for (int i = length - 1; i >= 0; i--)
                        {
                            int charLen = 1;
                            // Keep a surrogate pair together as one 2-char unigram:
                            if (i > 0 && char.IsLowSurrogate(fragment[offset + i]))
                            {
                                i--;
                                charLen = 2;
                            }
                            //System.out.println("    extended tok offset="
                            //+ (offset + i));
                            if (!discardPunctuation || !IsPunctuation(fragment[offset + i]))
                            {
                                pending.Add(new Token(CharacterDefinition.NGRAM,
                                                      fragment,
                                                      offset + i,
                                                      charLen,
                                                      JapaneseTokenizerType.UNKNOWN,
                                                      backPos + i,
                                                      unkDictionary));
                                unigramTokenCount++;
                            }
                        }
                        backCount += unigramTokenCount;

                    }
                    else if (!discardPunctuation || length == 0 || !IsPunctuation(fragment[offset]))
                    {
                        pending.Add(new Token(backID,
                                              fragment,
                                              offset,
                                              length,
                                              backType,
                                              backPos,
                                              dict));
                        if (VERBOSE)
                        {
                            Console.WriteLine("    add token=" + pending[pending.Count - 1]);
                        }
                        backCount++;
                    }
                    else
                    {
                        if (VERBOSE)
                        {
                            Console.WriteLine("    skip punctuation token=" + new string(fragment, offset, length));
                        }
                    }
                }

                // Remember this token's left ID so the next (textually previous) token's
                // connection cost can be computed against it, then step left:
                lastLeftWordID = dict.GetLeftId(backID);
                pos = backPos;
                bestIDX = nextBestIDX;
            }

            lastBackTracePos = endPos;

            if (VERBOSE)
            {
                Console.WriteLine("  freeBefore pos=" + endPos);
            }
            // Notify the circular buffers that we are done with
            // these positions:
            buffer.FreeBefore(endPos);
            positions.FreeBefore(endPos);
        }