in src/Lucene.Net.Analysis.Kuromoji/JapaneseTokenizer.cs [881:1190]
/// <summary>
/// Walks the Viterbi lattice backwards from the best arrival arc at
/// <paramref name="endPosData"/>.pos down to <c>lastBackTracePos</c>, appending the
/// tokens on the best path to <c>pending</c> (in reverse order — <c>incrementToken</c>
/// serves them back-to-front). Along the way it handles three special cases visible
/// below: USER-dictionary entries are expanded into their recorded segmentation;
/// UNKNOWN tokens are split into unigrams in extended mode; and, when
/// <c>outputCompounds &amp;&amp; searchMode</c>, an over-long token may be re-traced along a
/// second-best path so that both the compound token and its decompounded parts are
/// emitted. On return, <c>lastBackTracePos</c> is advanced to <c>endPos</c> and the
/// rolling char/position buffers are freed up to that position.
/// </summary>
/// <param name="endPosData">Lattice position data for the position to backtrace from.</param>
/// <param name="fromIDX">Index of the best arriving arc within <paramref name="endPosData"/>.</param>
private void Backtrace(Position endPosData, int fromIDX)
{
int endPos = endPosData.pos;
/*
* LUCENE-10059: If the endPos is the same as lastBackTracePos, we don't want to backtrace to
* avoid an assertion error {@link RollingCharBuffer#get(int)} when it tries to generate an
* empty buffer
*/
if (endPos == lastBackTracePos)
{
return;
}
if (VERBOSE)
{
Console.WriteLine("\n backtrace: endPos=" + endPos + " pos=" + this.pos + "; " + (this.pos - lastBackTracePos) + " characters; last=" + lastBackTracePos + " cost=" + endPosData.costs[fromIDX]);
}
// Single snapshot of the characters spanning [lastBackTracePos, endPos); every
// Token created below slices its surface form out of this one array.
char[] fragment = buffer.Get(lastBackTracePos, endPos - lastBackTracePos);
if (dotOut != null)
{
dotOut.OnBacktrace(this, positions, lastBackTracePos, endPosData, fromIDX, fragment, end);
}
int pos = endPos;
int bestIDX = fromIDX;
// Pending compound token while we trace its decompounded second-best path;
// null whenever we are on the ordinary best path.
Token altToken = null;
// We trace backwards, so this will be the leftWordID of
// the token after the one we are now on:
int lastLeftWordID = -1;
// Number of tokens emitted since the current alternate trace began; becomes the
// compound altToken's PositionLength when the two paths rejoin.
int backCount = 0;
// TODO: sort of silly to make Token instances here; the
// back trace has all info needed to generate the
// token. So, we could just directly set the attrs,
// from the backtrace, in incrementToken w/o ever
// creating Token; we'd have to defer calling freeBefore
// until after the backtrace was fully "consumed" by
// incrementToken.
while (pos > lastBackTracePos)
{
//System.out.println("BT: back pos=" + pos + " bestIDX=" + bestIDX);
Position posData = positions.Get(pos);
if (Debugging.AssertsEnabled) Debugging.Assert(bestIDX < posData.count);
int backPos = posData.backPos[bestIDX];
if (Debugging.AssertsEnabled) Debugging.Assert(backPos >= lastBackTracePos,"backPos={0} vs lastBackTracePos={1}", backPos, lastBackTracePos);
int length = pos - backPos;
JapaneseTokenizerType backType = posData.backType[bestIDX];
int backID = posData.backID[bestIDX];
int nextBestIDX = posData.backIndex[bestIDX];
// Only consider splitting a token when we are not already inside an alternate
// trace (altToken is null); USER-dictionary tokens are never decompounded.
if (outputCompounds && searchMode && altToken is null && backType != JapaneseTokenizerType.USER)
{
// In searchMode, if best path had picked a too-long
// token, we use the "penalty" to compute the allowed
// max cost of an alternate back-trace. If we find an
// alternate back trace with cost below that
// threshold, we pursue it instead (but also output
// the long token).
//System.out.println(" 2nd best backPos=" + backPos + " pos=" + pos);
int penalty = ComputeSecondBestThreshold(backPos, pos - backPos);
if (penalty > 0)
{
if (VERBOSE)
{
Console.WriteLine(" compound=" + new string(buffer.Get(backPos, pos - backPos)) + " backPos=" + backPos + " pos=" + pos + " penalty=" + penalty + " cost=" + posData.costs[bestIDX] + " bestIDX=" + bestIDX + " lastLeftID=" + lastLeftWordID);
}
// Use the penalty to set maxCost on the 2nd best
// segmentation:
int maxCost = posData.costs[bestIDX] + penalty;
if (lastLeftWordID != -1)
{
// Include the connection cost to the (already emitted) following token so
// the threshold and the candidate costs below are comparable.
maxCost += costs.Get(GetDict(backType).GetRightId(backID), lastLeftWordID);
}
// Now, prune all too-long tokens from the graph:
PruneAndRescore(backPos, pos,
posData.backIndex[bestIDX]);
// Finally, find 2nd best back-trace and resume
// backtrace there:
int leastCost = int.MaxValue;
int leastIDX = -1;
for (int idx = 0; idx < posData.count; idx++)
{
int cost = posData.costs[idx];
//System.out.println(" idx=" + idx + " prevCost=" + cost);
if (lastLeftWordID != -1)
{
cost += costs.Get(GetDict(posData.backType[idx]).GetRightId(posData.backID[idx]),
lastLeftWordID);
//System.out.println(" += bgCost=" + costs.get(getDict(posData.backType[idx]).getRightId(posData.backID[idx]),
//lastLeftWordID) + " -> " + cost);
}
//System.out.println("penalty " + posData.backPos[idx] + " to " + pos);
//cost += computePenalty(posData.backPos[idx], pos - posData.backPos[idx]);
if (cost < leastCost)
{
//System.out.println(" ** ");
leastCost = cost;
leastIDX = idx;
}
}
//System.out.println(" leastIDX=" + leastIDX);
if (VERBOSE)
{
Console.WriteLine(" afterPrune: " + posData.count + " arcs arriving; leastCost=" + leastCost + " vs threshold=" + maxCost + " lastLeftWordID=" + lastLeftWordID);
}
// Take the alternate only if it is cheap enough AND actually starts at a
// different position than the compound (otherwise it is the same token).
if (leastIDX != -1 && leastCost <= maxCost && posData.backPos[leastIDX] != backPos)
{
// We should have pruned the altToken from the graph:
if (Debugging.AssertsEnabled) Debugging.Assert(posData.backPos[leastIDX] != backPos);
// Save the current compound token, to output when
// this alternate path joins back:
altToken = new Token(backID,
fragment,
backPos - lastBackTracePos,
length,
backType,
backPos,
GetDict(backType));
// Redirect our backtrace to 2nd best:
bestIDX = leastIDX;
nextBestIDX = posData.backIndex[bestIDX];
backPos = posData.backPos[bestIDX];
length = pos - backPos;
backType = posData.backType[bestIDX];
backID = posData.backID[bestIDX];
// Restart the per-alternate token count; see backCount declaration above.
backCount = 0;
//System.out.println(" do alt token!");
}
else
{
// I think in theory it's possible there is no
// 2nd best path, which is fine; in this case we
// only output the compound token:
//System.out.println(" no alt token! bestIDX=" + bestIDX);
}
}
}
// Offset of this token's text within the fragment snapshot.
int offset = backPos - lastBackTracePos;
if (Debugging.AssertsEnabled) Debugging.Assert(offset >= 0);
if (altToken != null && altToken.Position >= backPos)
{
// We've backtraced to the position where the
// compound token starts; add it now:
// The pruning we did when we created the altToken
// ensures that the back trace will align back with
// the start of the altToken:
if (Debugging.AssertsEnabled) Debugging.Assert(altToken.Position == backPos, "{0} vs {1}", altToken.Position, backPos);
// NOTE: not quite right: the compound token may
// have had all punctuation back traced so far, but
// then the decompounded token at this position is
// not punctuation. In this case backCount is 0,
// but we should maybe add the altToken anyway...?
if (backCount > 0)
{
backCount++;
altToken.PositionLength = backCount;
if (VERBOSE)
{
Console.WriteLine(" add altToken=" + altToken);
}
pending.Add(altToken);
}
else
{
// This means alt token was all punct tokens:
if (VERBOSE)
{
Console.WriteLine(" discard all-punctuation altToken=" + altToken);
}
if (Debugging.AssertsEnabled) Debugging.Assert(discardPunctuation);
}
altToken = null;
}
IDictionary dict = GetDict(backType);
if (backType == JapaneseTokenizerType.USER)
{
// Expand the phraseID we recorded into the actual
// segmentation:
// wordIDAndLength[0] is the base word id; elements 1..N are the per-segment lengths.
int[] wordIDAndLength = userDictionary.LookupSegmentation(backID);
int wordID = wordIDAndLength[0];
int current = 0;
for (int j = 1; j < wordIDAndLength.Length; j++)
{
int len = wordIDAndLength[j];
//System.out.println(" add user: len=" + len);
pending.Add(new Token(wordID + j - 1,
fragment,
current + offset,
len,
JapaneseTokenizerType.USER,
current + backPos,
dict));
if (VERBOSE)
{
Console.WriteLine(" add USER token=" + pending[pending.Count - 1]);
}
current += len;
}
// Reverse the tokens we just added, because when we
// serve them up from incrementToken we serve in
// reverse:
pending.Reverse(pending.Count - (wordIDAndLength.Length - 1),
wordIDAndLength.Length - 1); // LUCENENET: Converted from reverse on SubList to reverse on List<T> and converted end index to length
backCount += wordIDAndLength.Length - 1;
}
else
{
if (extendedMode && backType == JapaneseTokenizerType.UNKNOWN)
{
// In EXTENDED mode we convert unknown word into
// unigrams:
int unigramTokenCount = 0;
// Iterate backwards so the unigrams end up in the reverse order that
// incrementToken expects; surrogate pairs are kept as one 2-char token.
for (int i = length - 1; i >= 0; i--)
{
int charLen = 1;
if (i > 0 && char.IsLowSurrogate(fragment[offset + i]))
{
i--;
charLen = 2;
}
//System.out.println(" extended tok offset="
//+ (offset + i));
if (!discardPunctuation || !IsPunctuation(fragment[offset + i]))
{
pending.Add(new Token(CharacterDefinition.NGRAM,
fragment,
offset + i,
charLen,
JapaneseTokenizerType.UNKNOWN,
backPos + i,
unkDictionary));
unigramTokenCount++;
}
}
backCount += unigramTokenCount;
}
// Punctuation check looks only at the first char of the token; length==0 guards
// the fragment access for zero-length tokens.
else if (!discardPunctuation || length == 0 || !IsPunctuation(fragment[offset]))
{
pending.Add(new Token(backID,
fragment,
offset,
length,
backType,
backPos,
dict));
if (VERBOSE)
{
Console.WriteLine(" add token=" + pending[pending.Count - 1]);
}
backCount++;
}
else
{
if (VERBOSE)
{
Console.WriteLine(" skip punctuation token=" + new string(fragment, offset, length));
}
}
}
// This token's left context id becomes the "following token" context for the
// next (earlier) iteration's connection-cost lookups.
lastLeftWordID = dict.GetLeftId(backID);
pos = backPos;
bestIDX = nextBestIDX;
}
lastBackTracePos = endPos;
if (VERBOSE)
{
Console.WriteLine(" freeBefore pos=" + endPos);
}
// Notify the circular buffers that we are done with
// these positions:
buffer.FreeBefore(endPos);
positions.FreeBefore(endPos);
}