in lucene/analysis/common/src/java/org/apache/lucene/analysis/morph/Viterbi.java [104:409]
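// Viterbi forward pass: advances over the input one character position at a
// time, extending the lattice with candidate tokens from the user dictionary,
// the known-word dictionary (both FSTs) and the unknown-word handler. When the
// best path is forced (a single-node frontier) or the buffered gap grows too
// large, backtrace() commits tokens for the best prefix found so far.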
public final void forward() throws IOException {
  if (VERBOSE) {
    System.out.println("\nPARSE");
  }

  // Index of the last character of unknown word:
  int unknownWordEndIndex = -1;
  // Maximum posAhead of user word in the entire input
  int userWordMaxPosAhead = -1;

  // Advances over each position (character):
  while (buffer.get(pos) != -1) {
    final Position posData = positions.get(pos);
    final boolean isFrontier = positions.getNextPos() == pos + 1;
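
    // posData.count is the number of partial paths ending at this position;
    // parallel arrays (costs, backPos, backIndex, ...) record each path's
    // cost so far and its back-pointers for the eventual backtrace.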
    if (posData.count == 0) {
      // No arcs arrive here; move to next position:
      if (VERBOSE) {
        System.out.println(" no arcs in; skip pos=" + pos);
      }
      pos++;
      continue;
    }

    if (pos > lastBackTracePos && posData.count == 1 && isFrontier) {
      // We are at a "frontier", and only one node is
      // alive, so whatever the eventual best path is must
      // come through this node. So we can safely commit
      // to the prefix of the best path at this point:
      if (outputNBest) {
        backtraceNBest(posData, false);
      }
      backtrace(posData, 0);
      if (outputNBest) {
        fixupPendingList();
      }

      // Re-base cost so we don't risk int overflow:
      posData.costs[0] = 0;
      if (pending.size() > 0) {
        return;
      } else {
        // This means the backtrace only produced
        // punctuation tokens, so we must keep parsing.
      }
    }

    if (pos - lastBackTracePos >= MAX_BACKTRACE_GAP) {
      // Safety: if we've buffered too much, force a
      // backtrace now. We find the least-cost partial
      // path, across all paths, backtrace from it, and
      // then prune all others. Note that this, in
      // general, can produce the wrong result, if the
      // total best path did not in fact back trace
      // through this partial best path. But it's the
      // best we can do... (short of not having a
      // safety!).

      // First pass: find least cost partial path so far,
      // including ending at future positions:
      int leastIDX = -1;
      int leastCost = Integer.MAX_VALUE;
      Position leastPosData = null;
      for (int pos2 = pos; pos2 < positions.getNextPos(); pos2++) {
        final Position posData2 = positions.get(pos2);
        for (int idx = 0; idx < posData2.count; idx++) {
          // System.out.println(" idx=" + idx + " cost=" + cost);
          final int cost = posData2.costs[idx];
          if (cost < leastCost) {
            leastCost = cost;
            leastIDX = idx;
            leastPosData = posData2;
          }
        }
      }

      // We will always have at least one live path:
      assert leastIDX != -1;

      if (outputNBest) {
        backtraceNBest(leastPosData, false);
      }

      // Second pass: prune all but the best path:
      for (int pos2 = pos; pos2 < positions.getNextPos(); pos2++) {
        final Position posData2 = positions.get(pos2);
        if (posData2 != leastPosData) {
          posData2.reset();
        } else {
          if (leastIDX != 0) {
            posData2.costs[0] = posData2.costs[leastIDX];
            posData2.lastRightID[0] = posData2.lastRightID[leastIDX];
            posData2.backPos[0] = posData2.backPos[leastIDX];
            posData2.backWordPos[0] = posData2.backWordPos[leastIDX];
            posData2.backIndex[0] = posData2.backIndex[leastIDX];
            posData2.backID[0] = posData2.backID[leastIDX];
            posData2.backType[0] = posData2.backType[leastIDX];
          }
          posData2.count = 1;
        }
      }

      backtrace(leastPosData, 0);
      if (outputNBest) {
        fixupPendingList();
      }

      // Re-base cost so we don't risk int overflow:
      Arrays.fill(leastPosData.costs, 0, leastPosData.count, 0);

      if (pos != leastPosData.pos) {
        // We jumped into a future position:
        assert pos < leastPosData.pos;
        pos = leastPosData.pos;
      }
      if (pending.size() > 0) {
        return;
      } else {
        // This means the backtrace only produced
        // punctuation tokens, so we must keep parsing.
        continue;
      }
    }
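
    // Extend the lattice from this position: try the user dictionary first,
    // then (if nothing matched) the known-word dictionary, and finally the
    // unknown-word rules.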
    if (VERBOSE) {
      System.out.println(
          "\n extend @ pos="
              + pos
              + " char="
              + (char) buffer.get(pos)
              + " hex="
              + Integer.toHexString(buffer.get(pos)));
    }
    if (VERBOSE) {
      System.out.println(" " + posData.count + " arcs in");
    }

    if (enableSpacePenaltyFactor
        && Character.getType(buffer.get(pos)) == Character.SPACE_SEPARATOR) {
      // We add a single space separator as a prefix of the terms that we extract.
      // This information is needed to compute the space penalty factor of each term.
      // These whitespace prefixes are removed when the final tokens are generated,
      // or added as separate tokens when discardPunctuation is unset.
      if (buffer.get(++pos) == -1) {
        pos = posData.pos;
      }
    }
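
    // Dictionary matching walks an FST over the characters starting at pos;
    // `output` accumulates the arc outputs along the way, and when an arc is
    // final the accumulated output identifies a complete dictionary entry
    // ending at posAhead.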
    boolean anyMatches = false;

    // First try user dict:
    if (userFST != null) {
      userFST.getFirstArc(arc);
      int output = 0;
      int maxPosAhead = 0;
      int outputMaxPosAhead = 0;
      int arcFinalOutMaxPosAhead = 0;

      for (int posAhead = pos; ; posAhead++) {
        final int ch = buffer.get(posAhead);
        if (ch == -1) {
          break;
        }
        if (userFST.findTargetArc(ch, arc, arc, posAhead == pos, userFSTReader) == null) {
          break;
        }
        output += arc.output().intValue();
        if (arc.isFinal()) {
          maxPosAhead = posAhead;
          outputMaxPosAhead = output;
          arcFinalOutMaxPosAhead = arc.nextFinalOutput().intValue();
          anyMatches = true;
          if (!outputLongestUserEntryOnly) {
            // Add all matched user entries.
            add(
                userDictionary.getMorphAttributes(),
                posData,
                pos,
                posAhead + 1,
                output + arc.nextFinalOutput().intValue(),
                TokenType.USER,
                false);
          }
        }
      }

      // Longest matching for user word
      if (anyMatches && maxPosAhead > userWordMaxPosAhead) {
        if (outputLongestUserEntryOnly) {
          if (VERBOSE) {
            System.out.println(
                " USER word "
                    + new String(buffer.get(pos, maxPosAhead - pos + 1))
                    + " toPos="
                    + (maxPosAhead + 1));
          }
          add(
              userDictionary.getMorphAttributes(),
              posData,
              pos,
              maxPosAhead + 1,
              outputMaxPosAhead + arcFinalOutMaxPosAhead,
              TokenType.USER,
              false);
        }
        userWordMaxPosAhead = Math.max(userWordMaxPosAhead, maxPosAhead);
      }
    }

    // TODO: we can be more aggressive about user
    // matches? if we are "under" a user match then don't
    // extend KNOWN/UNKNOWN paths?

    if (!anyMatches) {
      // Next, try known dictionary matches
      fst.getFirstArc(arc);
      int output = 0;
      for (int posAhead = pos; ; posAhead++) {
        final int ch = buffer.get(posAhead);
        if (ch == -1) {
          break;
        }
        // System.out.println(" match " + (char) ch + " posAhead=" + posAhead);
        if (fst.findTargetArc(ch, arc, arc, posAhead == pos, fstReader) == null) {
          break;
        }

        output += arc.output().intValue();

        // Optimization: for known words that are too-long
        // (compound), we should pre-compute the 2nd
        // best segmentation and store it in the
        // dictionary instead of recomputing it each time a
        // match is found.
        if (arc.isFinal()) {
          dictionary.lookupWordIds(output + arc.nextFinalOutput().intValue(), wordIdRef);
          if (VERBOSE) {
            System.out.println(
                " KNOWN word "
                    + new String(buffer.get(pos, posAhead - pos + 1))
                    + " toPos="
                    + (posAhead + 1)
                    + " "
                    + wordIdRef.length
                    + " wordIDs");
          }
          for (int ofs = 0; ofs < wordIdRef.length; ofs++) {
            add(
                dictionary.getMorphAttributes(),
                posData,
                pos,
                posAhead + 1,
                wordIdRef.ints[wordIdRef.offset + ofs],
                TokenType.KNOWN,
                false);
            anyMatches = true;
          }
        }
      }
    }
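
    // Unknown words: let processUnknownWord() add candidates, unless an
    // unknown word started earlier already covers this position (tracked via
    // unknownWordEndIndex).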
    if (!shouldSkipProcessUnknownWord(unknownWordEndIndex, posData)) {
      int unknownWordLength = processUnknownWord(anyMatches, posData);
      unknownWordEndIndex = posData.pos + unknownWordLength;
    }

    pos++;
  }
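
  // Input exhausted: add the EOS transition cost to each surviving path,
  // pick the cheapest, and backtrace from it to emit the remaining tokens.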
  end = true;
  if (pos > 0) {
    final Position endPosData = positions.get(pos);
    int leastCost = Integer.MAX_VALUE;
    int leastIDX = -1;
    if (VERBOSE) {
      System.out.println(" end: " + endPosData.count + " nodes");
    }
    for (int idx = 0; idx < endPosData.count; idx++) {
      // Add EOS cost:
      final int cost = endPosData.costs[idx] + costs.get(endPosData.lastRightID[idx], 0);
      // System.out.println(" idx=" + idx + " cost=" + cost + " (pathCost="
      //     + endPosData.costs[idx] + " bgCost=" + costs.get(endPosData.lastRightID[idx], 0)
      //     + ") backPos=" + endPosData.backPos[idx]);
      if (cost < leastCost) {
        leastCost = cost;
        leastIDX = idx;
      }
    }

    if (outputNBest) {
      backtraceNBest(endPosData, true);
    }
    backtrace(endPosData, leastIDX);
    if (outputNBest) {
      fixupPendingList();
    }
  } else {
    // No characters in the input string; return no tokens!
  }
}