in lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/ViterbiNBest.java [179:502]
/**
 * Backtraces the best Viterbi path from {@code endPosData} back to {@code lastBackTracePos},
 * appending one {@link Token} per traversed arc to {@code pending}. Tokens are appended in
 * reverse (right-to-left) order because {@code incrementToken} serves {@code pending} from
 * the end (see the user-dictionary branch below, which explicitly reverses its sub-list).
 *
 * <p>In search mode, when the best path picked an overly long (compound) token, a
 * penalty-bounded second-best path is pursued instead so the decompounded pieces are
 * emitted; the compound token itself is also emitted when {@code outputCompounds} is set.
 * In extended mode, UNKNOWN tokens are split into unigrams (surrogate pairs kept intact).
 * Punctuation tokens are dropped when {@code discardPunctuation} is set.
 *
 * <p>On return, {@code lastBackTracePos} is advanced to {@code endPos} and the rolling
 * character/position buffers are freed up to that position.
 *
 * @param endPosData lattice position at the right edge of the region to backtrace
 * @param fromIDX index of the arc within {@code endPosData} to start following
 * @throws IOException if reading characters from the rolling buffer fails
 */
protected void backtrace(Position endPosData, int fromIDX) throws IOException {
final int endPos = endPosData.getPos();
/*
* LUCENE-10059: If the endPos is the same as lastBackTracePos, we don't want to backtrace to
* avoid an assertion error {@link RollingCharBuffer#get(int)} when it tries to generate an
* empty buffer
*/
if (endPos == lastBackTracePos) {
return;
}
if (VERBOSE) {
System.out.println(
"\n backtrace: endPos="
+ endPos
+ " pos="
+ pos
+ "; "
+ (pos - lastBackTracePos)
+ " characters; last="
+ lastBackTracePos
+ " cost="
+ endPosData.getCost(fromIDX));
}
// All characters spanned by this backtrace; token offsets below are relative to
// lastBackTracePos within this fragment:
final char[] fragment = buffer.get(lastBackTracePos, endPos - lastBackTracePos);
if (dotOut != null) {
dotOut.onBacktrace(
this::getDict, positions, lastBackTracePos, endPosData, fromIDX, fragment, end);
}
// NOTE: this local shadows the instance field 'pos' printed in the VERBOSE output above.
int pos = endPos;
int bestIDX = fromIDX;
// Pending compound token (search mode only); emitted once the alternate
// (decompounded) backtrace rejoins the compound's start position:
Token altToken = null;
// We trace backwards, so this will be the leftWordID of
// the token after the one we are now on:
int lastLeftWordID = -1;
// Number of tokens emitted since the alt-token was created; used as its positionLength:
int backCount = 0;
// TODO: sort of silly to make Token instances here; the
// back trace has all info needed to generate the
// token. So, we could just directly set the attrs,
// from the backtrace, in incrementToken w/o ever
// creating Token; we'd have to defer calling freeBefore
// until after the backtrace was fully "consumed" by
// incrementToken.
// Walk arcs right-to-left until we reach the previous backtrace boundary:
while (pos > lastBackTracePos) {
// System.out.println("BT: back pos=" + pos + " bestIDX=" + bestIDX);
final Position posData = positions.get(pos);
assert bestIDX < posData.getCount();
int backPos = posData.getBackPos(bestIDX);
assert backPos >= lastBackTracePos
: "backPos=" + backPos + " vs lastBackTracePos=" + lastBackTracePos;
int length = pos - backPos;
TokenType backType = posData.getBackType(bestIDX);
int backID = posData.getBackID(bestIDX);
int nextBestIDX = posData.getBackIndex(bestIDX);
if (searchMode && altToken == null && backType != TokenType.USER) {
// In searchMode, if best path had picked a too-long
// token, we use the "penalty" to compute the allowed
// max cost of an alternate back-trace. If we find an
// alternate back trace with cost below that
// threshold, we pursue it instead (but also output
// the long token).
// System.out.println(" 2nd best backPos=" + backPos + " pos=" + pos);
final int penalty = computeSecondBestThreshold(backPos, pos - backPos);
if (penalty > 0) {
if (VERBOSE) {
System.out.println(
" compound="
+ new String(buffer.get(backPos, pos - backPos))
+ " backPos="
+ backPos
+ " pos="
+ pos
+ " penalty="
+ penalty
+ " cost="
+ posData.getCost(bestIDX)
+ " bestIDX="
+ bestIDX
+ " lastLeftID="
+ lastLeftWordID);
}
// Use the penalty to set maxCost on the 2nd best
// segmentation:
int maxCost = posData.getCost(bestIDX) + penalty;
if (lastLeftWordID != -1) {
// Include the connection cost to the token on our right:
maxCost += costs.get(getDict(backType).getRightId(backID), lastLeftWordID);
}
// Now, prune all too-long tokens from the graph:
pruneAndRescore(backPos, pos, posData.getBackIndex(bestIDX));
// Finally, find 2nd best back-trace and resume
// backtrace there:
int leastCost = Integer.MAX_VALUE;
int leastIDX = -1;
for (int idx = 0; idx < posData.getCount(); idx++) {
int cost = posData.getCost(idx);
// System.out.println(" idx=" + idx + " prevCost=" + cost);
if (lastLeftWordID != -1) {
cost +=
costs.get(
getDict(posData.getBackType(idx)).getRightId(posData.getBackID(idx)),
lastLeftWordID);
// System.out.println(" += bgCost=" +
// costs.get(getDict(posData.backType[idx]).getRightId(posData.backID[idx]),
// lastLeftWordID) + " -> " + cost);
}
// System.out.println("penalty " + posData.backPos[idx] + " to " + pos);
// cost += computePenalty(posData.backPos[idx], pos - posData.backPos[idx]);
if (cost < leastCost) {
// System.out.println(" ** ");
leastCost = cost;
leastIDX = idx;
}
}
// System.out.println(" leastIDX=" + leastIDX);
if (VERBOSE) {
System.out.println(
" afterPrune: "
+ posData.getCount()
+ " arcs arriving; leastCost="
+ leastCost
+ " vs threshold="
+ maxCost
+ " lastLeftWordID="
+ lastLeftWordID);
}
// Only divert to the 2nd best path if it is cheap enough and genuinely
// different (does not start at the same backPos as the compound):
if (leastIDX != -1 && leastCost <= maxCost && posData.getBackPos(leastIDX) != backPos) {
// We should have pruned the altToken from the graph:
assert posData.getBackPos(leastIDX) != backPos;
// Save the current compound token, to output when
// this alternate path joins back:
altToken =
new Token(
fragment,
backPos - lastBackTracePos,
length,
backPos,
backPos + length,
backID,
backType,
getDict(backType).getMorphAttributes());
// Redirect our backtrace to 2nd best:
bestIDX = leastIDX;
nextBestIDX = posData.getBackIndex(bestIDX);
backPos = posData.getBackPos(bestIDX);
length = pos - backPos;
backType = posData.getBackType(bestIDX);
backID = posData.getBackID(bestIDX);
backCount = 0;
// System.out.println(" do alt token!");
} else {
// I think in theory it's possible there is no
// 2nd best path, which is fine; in this case we
// only output the compound token:
// System.out.println(" no alt token! bestIDX=" + bestIDX);
}
}
}
// Offset of the current token's start within the fragment:
final int offset = backPos - lastBackTracePos;
assert offset >= 0;
if (altToken != null && altToken.getStartOffset() >= backPos) {
if (outputCompounds) {
// We've backtraced to the position where the
// compound token starts; add it now:
// The pruning we did when we created the altToken
// ensures that the back trace will align back with
// the start of the altToken:
assert altToken.getStartOffset() == backPos
: altToken.getStartOffset() + " vs " + backPos;
// NOTE: not quite right: the compound token may
// have had all punctuation back traced so far, but
// then the decompounded token at this position is
// not punctuation. In this case backCount is 0,
// but we should maybe add the altToken anyway...?
if (backCount > 0) {
backCount++;
altToken.setPositionLength(backCount);
if (VERBOSE) {
System.out.println(" add altToken=" + altToken);
}
pending.add(altToken);
} else {
// This means alt token was all punct tokens:
if (VERBOSE) {
System.out.println(" discard all-punctuation altToken=" + altToken);
}
assert discardPunctuation;
}
}
altToken = null;
}
// Dictionary that corresponds to this arc's token type (known/unknown/user):
final Dictionary<? extends JaMorphData> dict = getDict(backType);
if (backType == TokenType.USER) {
// Expand the phraseID we recorded into the actual
// segmentation:
final int[] wordIDAndLength = userDictionary.lookupSegmentation(backID);
int wordID = wordIDAndLength[0];
int current = 0;
for (int j = 1; j < wordIDAndLength.length; j++) {
final int len = wordIDAndLength[j];
// System.out.println(" add user: len=" + len);
int startOffset = current + backPos;
pending.add(
new Token(
fragment,
current + offset,
len,
startOffset,
startOffset + len,
wordID + j - 1,
TokenType.USER,
dict.getMorphAttributes()));
if (VERBOSE) {
System.out.println(" add USER token=" + pending.get(pending.size() - 1));
}
current += len;
}
// Reverse the tokens we just added, because when we
// serve them up from incrementToken we serve in
// reverse:
Collections.reverse(
pending.subList(pending.size() - (wordIDAndLength.length - 1), pending.size()));
backCount += wordIDAndLength.length - 1;
} else {
if (extendedMode && backType == TokenType.UNKNOWN) {
// In EXTENDED mode we convert unknown word into
// unigrams:
int unigramTokenCount = 0;
for (int i = length - 1; i >= 0; i--) {
int charLen = 1;
// Keep surrogate pairs together as a single 2-char unigram:
if (i > 0 && Character.isLowSurrogate(fragment[offset + i])) {
i--;
charLen = 2;
}
// System.out.println(" extended tok offset="
// + (offset + i));
if (!discardPunctuation || !isPunctuation(fragment[offset + i])) {
int startOffset = backPos + i;
pending.add(
new Token(
fragment,
offset + i,
charLen,
startOffset,
startOffset + charLen,
CharacterDefinition.NGRAM,
TokenType.UNKNOWN,
unkDictionary.getMorphAttributes()));
unigramTokenCount++;
}
}
backCount += unigramTokenCount;
} else if (!discardPunctuation || length == 0 || !isPunctuation(fragment[offset])) {
pending.add(
new Token(
fragment,
offset,
length,
backPos,
backPos + length,
backID,
backType,
dict.getMorphAttributes()));
if (VERBOSE) {
System.out.println(" add token=" + pending.get(pending.size() - 1));
}
backCount++;
} else {
if (VERBOSE) {
System.out.println(
" skip punctuation token=" + new String(fragment, offset, length));
}
}
}
// This token's left id is the "last left id" seen by the next (leftward) iteration,
// used there for connection-cost lookups:
lastLeftWordID = dict.getLeftId(backID);
pos = backPos;
bestIDX = nextBestIDX;
}
lastBackTracePos = endPos;
if (VERBOSE) {
System.out.println(" freeBefore pos=" + endPos);
}
// Notify the circular buffers that we are done with
// these positions:
buffer.freeBefore(endPos);
positions.freeBefore(endPos);
}