in src/Lucene.Net.Analysis.Kuromoji/JapaneseTokenizer.cs [429:746]
private void Parse()
{
if (VERBOSE)
{
Console.WriteLine("\nPARSE");
}
// Advances over each position (character):
while (true)
{
if (buffer.Get(pos) == -1)
{
// End
break;
}
Position posData = positions.Get(pos);
bool isFrontier = positions.GetNextPos() == pos + 1;
if (posData.count == 0)
{
// No arcs arrive here; move to next position:
if (VERBOSE)
{
Console.WriteLine(" no arcs in; skip pos=" + pos);
}
pos++;
continue;
}
if (pos > lastBackTracePos && posData.count == 1 && isFrontier)
{
// if (pos > lastBackTracePos && posData.count == 1 && isFrontier) {
// We are at a "frontier", and only one node is
// alive, so whatever the eventual best path is must
// come through this node. So we can safely commit
// to the prefix of the best path at this point:
Backtrace(posData, 0);
// Re-base cost so we don't risk int overflow:
posData.costs[0] = 0;
if (pending.Count != 0)
{
return;
}
else
{
// This means the backtrace only produced
// punctuation tokens, so we must keep parsing.
}
}
if (pos - lastBackTracePos >= MAX_BACKTRACE_GAP)
{
// Safety: if we've buffered too much, force a
// backtrace now. We find the least-cost partial
// path, across all paths, backtrace from it, and
// then prune all others. Note that this, in
// general, can produce the wrong result, if the
// total best path did not in fact back trace
// through this partial best path. But it's the
// best we can do... (short of not having a
// safety!).
// First pass: find least cost partial path so far,
// including ending at future positions:
int leastIDX = -1;
int leastCost = int.MaxValue;
Position leastPosData = null;
for (int pos2 = pos; pos2 < positions.GetNextPos(); pos2++)
{
Position posData2 = positions.Get(pos2);
for (int idx = 0; idx < posData2.count; idx++)
{
//System.out.println(" idx=" + idx + " cost=" + cost);
int cost = posData2.costs[idx];
if (cost < leastCost)
{
leastCost = cost;
leastIDX = idx;
leastPosData = posData2;
}
}
}
// We will always have at least one live path:
if (Debugging.AssertsEnabled) Debugging.Assert(leastIDX != -1);
// Second pass: prune all but the best path:
for (int pos2 = pos; pos2 < positions.GetNextPos(); pos2++)
{
Position posData2 = positions.Get(pos2);
if (posData2 != leastPosData)
{
posData2.Reset();
}
else
{
if (leastIDX != 0)
{
posData2.costs[0] = posData2.costs[leastIDX];
posData2.lastRightID[0] = posData2.lastRightID[leastIDX];
posData2.backPos[0] = posData2.backPos[leastIDX];
posData2.backIndex[0] = posData2.backIndex[leastIDX];
posData2.backID[0] = posData2.backID[leastIDX];
posData2.backType[0] = posData2.backType[leastIDX];
}
posData2.count = 1;
}
}
Backtrace(leastPosData, 0);
// Re-base cost so we don't risk int overflow:
Arrays.Fill(leastPosData.costs, 0, leastPosData.count, 0);
if (pos != leastPosData.pos)
{
// We jumped into a future position:
if (Debugging.AssertsEnabled) Debugging.Assert(pos < leastPosData.pos);
pos = leastPosData.pos;
}
if (pending.Count != 0)
{
return;
}
else
{
// This means the backtrace only produced
// punctuation tokens, so we must keep parsing.
continue;
}
}
if (VERBOSE)
{
Console.WriteLine("\n extend @ pos=" + pos + " char=" + (char)buffer.Get(pos));
}
if (VERBOSE)
{
Console.WriteLine(" " + posData.count + " arcs in");
}
bool anyMatches = false;
// First try user dict:
if (userFST != null)
{
userFST.GetFirstArc(arc);
int output = 0;
for (int posAhead = posData.pos; ; posAhead++)
{
int ch = buffer.Get(posAhead);
if (ch == -1)
{
break;
}
if (userFST.FindTargetArc(ch, arc, arc, posAhead == posData.pos, userFSTReader) is null)
{
break;
}
output += (int)arc.Output;
if (arc.IsFinal)
{
if (VERBOSE)
{
Console.WriteLine(" USER word " + new string(buffer.Get(pos, posAhead - pos + 1)) + " toPos=" + (posAhead + 1));
}
Add(userDictionary, posData, posAhead + 1, output + (int)arc.NextFinalOutput, JapaneseTokenizerType.USER, false);
anyMatches = true;
}
}
}
// TODO: we can be more aggressive about user
// matches? if we are "under" a user match then don't
// extend KNOWN/UNKNOWN paths?
if (!anyMatches)
{
// Next, try known dictionary matches
fst.GetFirstArc(arc);
int output = 0;
for (int posAhead = posData.pos; ; posAhead++)
{
int ch = buffer.Get(posAhead);
if (ch == -1)
{
break;
}
//System.out.println(" match " + (char) ch + " posAhead=" + posAhead);
if (fst.FindTargetArc(ch, arc, arc, posAhead == posData.pos, fstReader) is null)
{
break;
}
output += (int)arc.Output;
// Optimization: for known words that are too-long
// (compound), we should pre-compute the 2nd
// best segmentation and store it in the
// dictionary instead of recomputing it each time a
// match is found.
if (arc.IsFinal)
{
dictionary.LookupWordIds(output + (int)arc.NextFinalOutput, wordIdRef);
if (VERBOSE)
{
Console.WriteLine(" KNOWN word " + new string(buffer.Get(pos, posAhead - pos + 1)) + " toPos=" + (posAhead + 1) + " " + wordIdRef.Length + " wordIDs");
}
for (int ofs = 0; ofs < wordIdRef.Length; ofs++)
{
Add(dictionary, posData, posAhead + 1, wordIdRef.Int32s[wordIdRef.Offset + ofs], JapaneseTokenizerType.KNOWN, false);
anyMatches = true;
}
}
}
}
// In the case of normal mode, it doesn't process unknown word greedily.
if (!searchMode && unknownWordEndIndex > posData.pos)
{
pos++;
continue;
}
char firstCharacter = (char)buffer.Get(pos);
if (!anyMatches || characterDefinition.IsInvoke(firstCharacter))
{
// Find unknown match:
int characterId = characterDefinition.GetCharacterClass(firstCharacter);
bool isPunct = IsPunctuation(firstCharacter);
// NOTE: copied from UnknownDictionary.lookup:
int unknownWordLength;
if (!characterDefinition.IsGroup(firstCharacter))
{
unknownWordLength = 1;
}
else
{
// Extract unknown word. Characters with the same character class are considered to be part of unknown word
unknownWordLength = 1;
for (int posAhead = pos + 1; unknownWordLength < MAX_UNKNOWN_WORD_LENGTH; posAhead++)
{
int ch = buffer.Get(posAhead);
if (ch == -1)
{
break;
}
if (characterId == characterDefinition.GetCharacterClass((char)ch) &&
IsPunctuation((char)ch) == isPunct)
{
unknownWordLength++;
}
else
{
break;
}
}
}
unkDictionary.LookupWordIds(characterId, wordIdRef); // characters in input text are supposed to be the same
if (VERBOSE)
{
Console.WriteLine(" UNKNOWN word len=" + unknownWordLength + " " + wordIdRef.Length + " wordIDs");
}
for (int ofs = 0; ofs < wordIdRef.Length; ofs++)
{
Add(unkDictionary, posData, posData.pos + unknownWordLength, wordIdRef.Int32s[wordIdRef.Offset + ofs], JapaneseTokenizerType.UNKNOWN, false);
}
unknownWordEndIndex = posData.pos + unknownWordLength;
}
pos++;
}
end = true;
if (pos > 0)
{
Position endPosData = positions.Get(pos);
int leastCost = int.MaxValue;
int leastIDX = -1;
if (VERBOSE)
{
Console.WriteLine(" end: " + endPosData.count + " nodes");
}
for (int idx = 0; idx < endPosData.count; idx++)
{
// Add EOS cost:
int cost = endPosData.costs[idx] + costs.Get(endPosData.lastRightID[idx], 0);
//System.out.println(" idx=" + idx + " cost=" + cost + " (pathCost=" + endPosData.costs[idx] + " bgCost=" + costs.get(endPosData.lastRightID[idx], 0) + ") backPos=" + endPosData.backPos[idx]);
if (cost < leastCost)
{
leastCost = cost;
leastIDX = idx;
}
}
Backtrace(endPosData, leastIDX);
}
else
{
// No characters in the input string; return no tokens!
}
}