in lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/Viterbi.java [155:341]
/**
 * Follows the back pointers recorded in {@code positions} from {@code endPosData} down to {@code
 * lastBackTracePos}, appending the tokens met along the way to {@code pending}. Afterwards {@code
 * lastBackTracePos} is moved to the end position and both buffers are told that everything before
 * it may be released.
 *
 * <p>Because the walk runs backwards, tokens reach {@code pending} from the last word of the span
 * to the first.
 *
 * @param endPosData position the walk starts from; the call is a no-op when its position equals
 *     {@code lastBackTracePos}
 * @param fromIDX index of the entry of {@code endPosData} whose back pointers are followed
 */
protected void backtrace(Position endPosData, int fromIDX) {
  final int endPos = endPosData.getPos();
  if (endPos == lastBackTracePos) {
    // Nothing has been added since the previous back trace.
    return;
  }

  if (VERBOSE) {
    // "pos" below is the field, not the walk cursor used in the loop further down.
    System.out.println(
        "\n backtrace: endPos="
            + endPos
            + " pos="
            + pos
            + "; "
            + (pos - lastBackTracePos)
            + " characters; last="
            + lastBackTracePos
            + " cost="
            + endPosData.getCost(fromIDX));
  }

  // Characters of the span being traced; index 0 corresponds to lastBackTracePos.
  final char[] fragment = buffer.get(lastBackTracePos, endPos - lastBackTracePos);

  if (dotOut != null) {
    dotOut.onBacktrace(
        this::getDict, positions, lastBackTracePos, endPosData, fromIDX, fragment, end);
  }

  // TODO: creating Token instances here is somewhat wasteful: the back trace already holds
  // everything needed to produce a token, so incrementToken could set the attributes straight
  // from it. That would require postponing freeBefore until incrementToken has fully consumed
  // the back trace.
  int cursor = endPos;
  int pathIdx = fromIDX;
  while (cursor > lastBackTracePos) {
    final Position node = positions.get(cursor);
    assert pathIdx < node.getCount();

    final int prevPos = node.getBackPos(pathIdx);
    final int wordStart = node.getBackWordPos(pathIdx);
    assert prevPos >= lastBackTracePos
        : "backPos=" + prevPos + " vs lastBackTracePos=" + lastBackTracePos;

    // Word length, leading whitespace excluded.
    final int wordLen = cursor - wordStart;
    final TokenType wordType = node.getBackType(pathIdx);
    final int wordId = node.getBackID(pathIdx);
    final int prevIdx = node.getBackIndex(pathIdx);

    // Where the word (after its leading whitespace) begins inside fragment.
    final int wordOffset = wordStart - lastBackTracePos;
    assert wordOffset >= 0;

    final Dictionary<? extends KoMorphData> dict = getDict(wordType);

    if (outputUnknownUnigrams && wordType == TokenType.UNKNOWN) {
      // Split the unknown word into one token per char, or per two chars when the trailing
      // char is a low surrogate; emitted from the end of the word to its start.
      int tail = wordLen - 1;
      while (tail >= 0) {
        final boolean twoChars =
            tail > 0 && Character.isLowSurrogate(fragment[wordOffset + tail]);
        final int charLen = twoChars ? 2 : 1;
        final int head = tail - charLen + 1;
        final DictionaryToken unigram =
            new DictionaryToken(
                TokenType.UNKNOWN,
                unkDictionary.getMorphAttributes(),
                CharacterDefinition.NGRAM,
                fragment,
                wordOffset + head,
                charLen,
                wordStart + head,
                wordStart + head + charLen);
        pending.add(unigram);
        if (VERBOSE) {
          System.out.println(" add token=" + pending.get(pending.size() - 1));
        }
        tail = head - 1;
      }
    } else {
      final DictionaryToken word =
          new DictionaryToken(
              wordType,
              dict.getMorphAttributes(),
              wordId,
              fragment,
              wordOffset,
              wordLen,
              wordStart,
              wordStart + wordLen);
      final boolean keepWhole =
          word.getPOSType() == POS.Type.MORPHEME
              || mode == KoreanTokenizer.DecompoundMode.NONE;
      // Morphemes are only looked up when the word is a candidate for decompounding.
      final KoMorphData.Morpheme[] parts = keepWhole ? null : word.getMorphemes();

      if (keepWhole) {
        if (!shouldFilterToken(word)) {
          pending.add(word);
          if (VERBOSE) {
            System.out.println(" add token=" + pending.get(pending.size() - 1));
          }
        }
      } else if (parts == null) {
        pending.add(word);
        if (VERBOSE) {
          System.out.println(" add token=" + pending.get(pending.size() - 1));
        }
      } else {
        // Decompose: emit the parts from last to first.
        int endOffset = wordStart + wordLen;
        int posLen = 0;
        for (int idx = parts.length - 1; idx >= 0; idx--) {
          final KoMorphData.Morpheme part = parts[idx];
          final int partStart;
          final int partEnd;
          if (word.getPOSType() == POS.Type.COMPOUND) {
            // Offsets are carved out of the word, right to left, by surface form length.
            partStart = endOffset - part.surfaceForm.length();
            assert partStart >= 0;
            partEnd = endOffset;
          } else {
            // Otherwise every part spans the offsets of the whole word.
            partStart = word.getStartOffset();
            partEnd = word.getEndOffset();
          }
          final Token partToken =
              new DecompoundToken(part.posTag, part.surfaceForm, partStart, partEnd, wordType);
          if (idx == 0 && mode == KoreanTokenizer.DecompoundMode.MIXED) {
            partToken.setPositionIncrement(0);
          }
          ++posLen;
          endOffset -= part.surfaceForm.length();
          pending.add(partToken);
          if (VERBOSE) {
            System.out.println(" add token=" + pending.get(pending.size() - 1));
          }
        }
        if (mode == KoreanTokenizer.DecompoundMode.MIXED) {
          // MIXED mode also emits the undecomposed word, spanning all of its parts.
          word.setPositionLength(Math.max(1, posLen));
          pending.add(word);
          if (VERBOSE) {
            System.out.println(" add token=" + pending.get(pending.size() - 1));
          }
        }
      }
    }

    if (!discardPunctuation && backPosDiffers(prevPos, wordStart)) {
      // Emit a token for the whitespace between this word and the previous one.
      final int gapOffset = prevPos - lastBackTracePos;
      final int gapLen = wordStart - prevPos;
      unkDictionary.lookupWordIds(characterDefinition.getCharacterClass(' '), wordIdRef);
      final DictionaryToken spaceToken =
          new DictionaryToken(
              TokenType.UNKNOWN,
              unkDictionary.getMorphAttributes(),
              wordIdRef.ints[wordIdRef.offset],
              fragment,
              gapOffset,
              gapLen,
              prevPos,
              prevPos + gapLen);
      pending.add(spaceToken);
    }

    cursor = prevPos;
    pathIdx = prevIdx;
  }

  lastBackTracePos = endPos;
  if (VERBOSE) {
    System.out.println(" freeBefore pos=" + endPos);
  }
  // Tell both buffers that positions before endPos are no longer needed.
  buffer.freeBefore(endPos);
  positions.freeBefore(endPos);
}

/** Returns whether the word start differs from the back position, i.e. whitespace precedes it. */
private static boolean backPosDiffers(int backPos, int backWordPos) {
  return backWordPos != backPos;
}