private void Parse()

in src/Lucene.Net.Analysis.Kuromoji/JapaneseTokenizer.cs [429:746]


        private void Parse()
        {
            if (VERBOSE)
            {
                Console.WriteLine("\nPARSE");
            }

            // Advances over each position (character):
            while (true)
            {

                if (buffer.Get(pos) == -1)
                {
                    // End
                    break;
                }

                Position posData = positions.Get(pos);
                bool isFrontier = positions.GetNextPos() == pos + 1;

                if (posData.count == 0)
                {
                    // No arcs arrive here; move to next position:
                    if (VERBOSE)
                    {
                        Console.WriteLine("    no arcs in; skip pos=" + pos);
                    }
                    pos++;
                    continue;
                }

                if (pos > lastBackTracePos && posData.count == 1 && isFrontier)
                {
                    //  if (pos > lastBackTracePos && posData.count == 1 && isFrontier) {
                    // We are at a "frontier", and only one node is
                    // alive, so whatever the eventual best path is must
                    // come through this node.  So we can safely commit
                    // to the prefix of the best path at this point:
                    Backtrace(posData, 0);

                    // Re-base cost so we don't risk int overflow:
                    posData.costs[0] = 0;

                    if (pending.Count != 0)
                    {
                        return;
                    }
                    else
                    {
                        // This means the backtrace only produced
                        // punctuation tokens, so we must keep parsing.
                    }
                }

                if (pos - lastBackTracePos >= MAX_BACKTRACE_GAP)
                {
                    // Safety: if we've buffered too much, force a
                    // backtrace now.  We find the least-cost partial
                    // path, across all paths, backtrace from it, and
                    // then prune all others.  Note that this, in
                    // general, can produce the wrong result, if the
                    // total best path did not in fact back trace
                    // through this partial best path.  But it's the
                    // best we can do... (short of not having a
                    // safety!).

                    // First pass: find least cost partial path so far,
                    // including ending at future positions:
                    int leastIDX = -1;
                    int leastCost = int.MaxValue;
                    Position leastPosData = null;
                    for (int pos2 = pos; pos2 < positions.GetNextPos(); pos2++)
                    {
                        Position posData2 = positions.Get(pos2);
                        for (int idx = 0; idx < posData2.count; idx++)
                        {
                            //System.out.println("    idx=" + idx + " cost=" + cost);
                            int cost = posData2.costs[idx];
                            if (cost < leastCost)
                            {
                                leastCost = cost;
                                leastIDX = idx;
                                leastPosData = posData2;
                            }
                        }
                    }

                    // We will always have at least one live path:
                    if (Debugging.AssertsEnabled) Debugging.Assert(leastIDX != -1);

                    // Second pass: prune all but the best path:
                    for (int pos2 = pos; pos2 < positions.GetNextPos(); pos2++)
                    {
                        Position posData2 = positions.Get(pos2);
                        if (posData2 != leastPosData)
                        {
                            posData2.Reset();
                        }
                        else
                        {
                            if (leastIDX != 0)
                            {
                                posData2.costs[0] = posData2.costs[leastIDX];
                                posData2.lastRightID[0] = posData2.lastRightID[leastIDX];
                                posData2.backPos[0] = posData2.backPos[leastIDX];
                                posData2.backIndex[0] = posData2.backIndex[leastIDX];
                                posData2.backID[0] = posData2.backID[leastIDX];
                                posData2.backType[0] = posData2.backType[leastIDX];
                            }
                            posData2.count = 1;
                        }
                    }

                    Backtrace(leastPosData, 0);

                    // Re-base cost so we don't risk int overflow:
                    Arrays.Fill(leastPosData.costs, 0, leastPosData.count, 0);

                    if (pos != leastPosData.pos)
                    {
                        // We jumped into a future position:
                        if (Debugging.AssertsEnabled) Debugging.Assert(pos < leastPosData.pos);
                        pos = leastPosData.pos;
                    }

                    if (pending.Count != 0)
                    {
                        return;
                    }
                    else
                    {
                        // This means the backtrace only produced
                        // punctuation tokens, so we must keep parsing.
                        continue;
                    }
                }

                if (VERBOSE)
                {
                    Console.WriteLine("\n  extend @ pos=" + pos + " char=" + (char)buffer.Get(pos));
                }

                if (VERBOSE)
                {
                    Console.WriteLine("    " + posData.count + " arcs in");
                }

                bool anyMatches = false;

                // First try user dict:
                if (userFST != null)
                {
                    userFST.GetFirstArc(arc);
                    int output = 0;
                    for (int posAhead = posData.pos; ; posAhead++)
                    {
                        int ch = buffer.Get(posAhead);
                        if (ch == -1)
                        {
                            break;
                        }
                        if (userFST.FindTargetArc(ch, arc, arc, posAhead == posData.pos, userFSTReader) is null)
                        {
                            break;
                        }
                        output += (int)arc.Output;
                        if (arc.IsFinal)
                        {
                            if (VERBOSE)
                            {
                                Console.WriteLine("    USER word " + new string(buffer.Get(pos, posAhead - pos + 1)) + " toPos=" + (posAhead + 1));
                            }
                            Add(userDictionary, posData, posAhead + 1, output + (int)arc.NextFinalOutput, JapaneseTokenizerType.USER, false);
                            anyMatches = true;
                        }
                    }
                }

                // TODO: we can be more aggressive about user
                // matches?  if we are "under" a user match then don't
                // extend KNOWN/UNKNOWN paths?

                if (!anyMatches)
                {
                    // Next, try known dictionary matches
                    fst.GetFirstArc(arc);
                    int output = 0;

                    for (int posAhead = posData.pos; ; posAhead++)
                    {
                        int ch = buffer.Get(posAhead);
                        if (ch == -1)
                        {
                            break;
                        }
                        //System.out.println("    match " + (char) ch + " posAhead=" + posAhead);

                        if (fst.FindTargetArc(ch, arc, arc, posAhead == posData.pos, fstReader) is null)
                        {
                            break;
                        }

                        output += (int)arc.Output;

                        // Optimization: for known words that are too-long
                        // (compound), we should pre-compute the 2nd
                        // best segmentation and store it in the
                        // dictionary instead of recomputing it each time a
                        // match is found.

                        if (arc.IsFinal)
                        {
                            dictionary.LookupWordIds(output + (int)arc.NextFinalOutput, wordIdRef);
                            if (VERBOSE)
                            {
                                Console.WriteLine("    KNOWN word " + new string(buffer.Get(pos, posAhead - pos + 1)) + " toPos=" + (posAhead + 1) + " " + wordIdRef.Length + " wordIDs");
                            }
                            for (int ofs = 0; ofs < wordIdRef.Length; ofs++)
                            {
                                Add(dictionary, posData, posAhead + 1, wordIdRef.Int32s[wordIdRef.Offset + ofs], JapaneseTokenizerType.KNOWN, false);
                                anyMatches = true;
                            }
                        }
                    }
                }

                // In the case of normal mode, it doesn't process unknown word greedily.

                if (!searchMode && unknownWordEndIndex > posData.pos)
                {
                    pos++;
                    continue;
                }

                char firstCharacter = (char)buffer.Get(pos);
                if (!anyMatches || characterDefinition.IsInvoke(firstCharacter))
                {

                    // Find unknown match:
                    int characterId = characterDefinition.GetCharacterClass(firstCharacter);
                    bool isPunct = IsPunctuation(firstCharacter);

                    // NOTE: copied from UnknownDictionary.lookup:
                    int unknownWordLength;
                    if (!characterDefinition.IsGroup(firstCharacter))
                    {
                        unknownWordLength = 1;
                    }
                    else
                    {
                        // Extract unknown word. Characters with the same character class are considered to be part of unknown word
                        unknownWordLength = 1;
                        for (int posAhead = pos + 1; unknownWordLength < MAX_UNKNOWN_WORD_LENGTH; posAhead++)
                        {
                            int ch = buffer.Get(posAhead);
                            if (ch == -1)
                            {
                                break;
                            }
                            if (characterId == characterDefinition.GetCharacterClass((char)ch) &&
                                IsPunctuation((char)ch) == isPunct)
                            {
                                unknownWordLength++;
                            }
                            else
                            {
                                break;
                            }
                        }
                    }

                    unkDictionary.LookupWordIds(characterId, wordIdRef); // characters in input text are supposed to be the same
                    if (VERBOSE)
                    {
                        Console.WriteLine("    UNKNOWN word len=" + unknownWordLength + " " + wordIdRef.Length + " wordIDs");
                    }
                    for (int ofs = 0; ofs < wordIdRef.Length; ofs++)
                    {
                        Add(unkDictionary, posData, posData.pos + unknownWordLength, wordIdRef.Int32s[wordIdRef.Offset + ofs], JapaneseTokenizerType.UNKNOWN, false);
                    }

                    unknownWordEndIndex = posData.pos + unknownWordLength;
                }

                pos++;
            }

            end = true;

            if (pos > 0)
            {

                Position endPosData = positions.Get(pos);
                int leastCost = int.MaxValue;
                int leastIDX = -1;
                if (VERBOSE)
                {
                    Console.WriteLine("  end: " + endPosData.count + " nodes");
                }
                for (int idx = 0; idx < endPosData.count; idx++)
                {
                    // Add EOS cost:
                    int cost = endPosData.costs[idx] + costs.Get(endPosData.lastRightID[idx], 0);
                    //System.out.println("    idx=" + idx + " cost=" + cost + " (pathCost=" + endPosData.costs[idx] + " bgCost=" + costs.get(endPosData.lastRightID[idx], 0) + ") backPos=" + endPosData.backPos[idx]);
                    if (cost < leastCost)
                    {
                        leastCost = cost;
                        leastIDX = idx;
                    }
                }

                Backtrace(endPosData, leastIDX);
            }
            else
            {
                // No characters in the input string; return no tokens!
            }
        }