protected void backtrace()

in lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/Viterbi.java [155:341]


  /**
   * Follows the back pointers from {@code endPosData} down to {@code lastBackTracePos} and appends
   * the tokens found on that path to {@code pending}.
   *
   * <p>The path is walked backwards, so the token nearest to the end position is appended first.
   * Once the walk is complete, {@code lastBackTracePos} is moved to the end position and both
   * {@code buffer} and {@code positions} are told to free everything before it. The call is a
   * no-op when the end position equals {@code lastBackTracePos}.
   *
   * @param endPosData position the backtrace starts from; its {@code getPos()} becomes the new
   *     {@code lastBackTracePos}
   * @param fromIDX index of the path entry inside {@code endPosData} to start from
   */
  protected void backtrace(Position endPosData, int fromIDX) {
    final int endPos = endPosData.getPos();
    if (endPos == lastBackTracePos) {
      // Zero-length span: nothing to emit.
      return;
    }

    if (VERBOSE) {
      // "pos" below is the instance field; this method declares no local of that name.
      System.out.println(
          "\n  backtrace: endPos="
              + endPos
              + " pos="
              + pos
              + "; "
              + (pos - lastBackTracePos)
              + " characters; last="
              + lastBackTracePos
              + " cost="
              + endPosData.getCost(fromIDX));
    }

    // Characters in [lastBackTracePos, endPos); every token below indexes into this array
    // relative to lastBackTracePos.
    final char[] fragment = buffer.get(lastBackTracePos, endPos - lastBackTracePos);

    if (dotOut != null) {
      dotOut.onBacktrace(
          this::getDict, positions, lastBackTracePos, endPosData, fromIDX, fragment, end);
    }

    // TODO: sort of silly to make Token instances here; the
    // back trace has all info needed to generate the
    // token.  So, we could just directly set the attrs,
    // from the backtrace, in incrementToken w/o ever
    // creating Token; we'd have to defer calling freeBefore
    // until after the backtrace was fully "consumed" by
    // incrementToken.

    int cursor = endPos;
    int pathIdx = fromIDX;
    while (cursor > lastBackTracePos) {
      final Position node = positions.get(cursor);
      assert pathIdx < node.getCount();

      final int prevPos = node.getBackPos(pathIdx);
      final int wordStart = node.getBackWordPos(pathIdx);
      assert prevPos >= lastBackTracePos
          : "backPos=" + prevPos + " vs lastBackTracePos=" + lastBackTracePos;
      // Word length, not counting the whitespace that precedes the word.
      final int wordLen = cursor - wordStart;
      final TokenType wordType = node.getBackType(pathIdx);
      final int entryId = node.getBackID(pathIdx);
      final int prevIdx = node.getBackIndex(pathIdx);
      // Where the word (after its leading whitespace) begins inside fragment.
      final int wordOffset = wordStart - lastBackTracePos;
      assert wordOffset >= 0;

      final Dictionary<? extends KoMorphData> dict = getDict(wordType);

      if (outputUnknownUnigrams && wordType == TokenType.UNKNOWN) {
        // outputUnknownUnigrams: split the unknown word into NGRAM tokens, last one first.
        int idx = wordLen - 1;
        while (idx >= 0) {
          // A low surrogate that is not the first char of the word is emitted together with
          // the char in front of it.
          final boolean twoChars =
              idx > 0 && Character.isLowSurrogate(fragment[wordOffset + idx]);
          final int charLen = twoChars ? 2 : 1;
          final int first = idx - charLen + 1;
          pending.add(
              new DictionaryToken(
                  TokenType.UNKNOWN,
                  unkDictionary.getMorphAttributes(),
                  CharacterDefinition.NGRAM,
                  fragment,
                  wordOffset + first,
                  charLen,
                  wordStart + first,
                  wordStart + first + charLen));
          if (VERBOSE) {
            System.out.println("    add token=" + pending.get(pending.size() - 1));
          }
          idx = first - 1;
        }
      } else {
        final DictionaryToken whole =
            new DictionaryToken(
                wordType,
                dict.getMorphAttributes(),
                entryId,
                fragment,
                wordOffset,
                wordLen,
                wordStart,
                wordStart + wordLen);
        final boolean keepWhole =
            whole.getPOSType() == POS.Type.MORPHEME
                || mode == KoreanTokenizer.DecompoundMode.NONE;
        // Morphemes are only looked up when the token is a candidate for decompounding.
        final KoMorphData.Morpheme[] parts = keepWhole ? null : whole.getMorphemes();

        if (keepWhole) {
          // Only this branch consults shouldFilterToken.
          if (shouldFilterToken(whole) == false) {
            pending.add(whole);
            if (VERBOSE) {
              System.out.println("    add token=" + pending.get(pending.size() - 1));
            }
          }
        } else if (parts == null) {
          // No morpheme list available: emit the token unchanged.
          pending.add(whole);
          if (VERBOSE) {
            System.out.println("    add token=" + pending.get(pending.size() - 1));
          }
        } else {
          // Decompose into morphemes, last morpheme first.
          final boolean compound = whole.getPOSType() == POS.Type.COMPOUND;
          int partEnd = wordStart + wordLen;
          for (int m = parts.length - 1; m >= 0; m--) {
            final KoMorphData.Morpheme part = parts[m];
            final int surfaceLen = part.surfaceForm().length();
            final int partStartOffset;
            final int partEndOffset;
            if (compound) {
              // COMPOUND: each morpheme gets its own slice, carved from the end of the word.
              partStartOffset = partEnd - surfaceLen;
              partEndOffset = partEnd;
              assert partStartOffset >= 0;
            } else {
              // Any other type: every morpheme reuses the offsets of the whole token.
              partStartOffset = whole.getStartOffset();
              partEndOffset = whole.getEndOffset();
            }
            final Token partToken =
                new DecompoundToken(
                    part.posTag(), part.surfaceForm(), partStartOffset, partEndOffset, wordType);
            if (m == 0 && mode == KoreanTokenizer.DecompoundMode.MIXED) {
              // In MIXED mode the whole token is added below as well; the first morpheme gets
              // a position increment of 0.
              partToken.setPositionIncrement(0);
            }
            partEnd -= surfaceLen;
            pending.add(partToken);
            if (VERBOSE) {
              System.out.println("    add token=" + pending.get(pending.size() - 1));
            }
          }
          if (mode == KoreanTokenizer.DecompoundMode.MIXED) {
            // One morpheme token was added per array entry, so the whole token spans that many.
            whole.setPositionLength(Math.max(1, parts.length));
            pending.add(whole);
            if (VERBOSE) {
              System.out.println("    add token=" + pending.get(pending.size() - 1));
            }
          }
        }
      }

      if (discardPunctuation == false && wordStart != prevPos) {
        // Add a token for the whitespace between this word and the previous position.
        final int gapLen = wordStart - prevPos;
        unkDictionary.lookupWordIds(characterDefinition.getCharacterClass(' '), wordIdRef);
        pending.add(
            new DictionaryToken(
                TokenType.UNKNOWN,
                unkDictionary.getMorphAttributes(),
                wordIdRef.ints[wordIdRef.offset],
                fragment,
                prevPos - lastBackTracePos,
                gapLen,
                prevPos,
                prevPos + gapLen));
      }

      // Step to the previous node on the path.
      cursor = prevPos;
      pathIdx = prevIdx;
    }

    lastBackTracePos = endPos;

    if (VERBOSE) {
      System.out.println("  freeBefore pos=" + endPos);
    }
    // Notify the circular buffers that we are done with
    // these positions:
    buffer.freeBefore(endPos);
    positions.freeBefore(endPos);
  }