public List<LookupResult> lookup(CharSequence key, Set<BytesRef> contexts, int num)

in lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FreeTextSuggester.java [436:723]

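A minimal usage sketch for context (hedged: the analyzer, reader, and field
name below are illustrative assumptions, not taken from this file):

  // Build an n-gram model over an indexed text field, then complete "foo ":
  FreeTextSuggester suggester = new FreeTextSuggester(new StandardAnalyzer());
  suggester.build(new LuceneDictionary(indexReader, "body")); // hypothetical reader/field
  List<LookupResult> hits = suggester.lookup("foo ", null, 5); // contexts must be null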

  public List<LookupResult> lookup(final CharSequence key, Set<BytesRef> contexts, int num)
      throws IOException {
    if (contexts != null) {
      throw new IllegalArgumentException("this suggester doesn't support contexts");
    }
    if (fst == null) {
      throw new IllegalStateException("Lookup not supported at this time");
    }

    try (TokenStream ts = queryAnalyzer.tokenStream("", key.toString())) {
      TermToBytesRefAttribute termBytesAtt = ts.addAttribute(TermToBytesRefAttribute.class);
      OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
      PositionLengthAttribute posLenAtt = ts.addAttribute(PositionLengthAttribute.class);
      PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
      ts.reset();

      BytesRefBuilder[] lastTokens = new BytesRefBuilder[grams];
      // System.out.println("lookup: key='" + key + "'");

      // Run full analysis, but save only the
      // last 1gram, last 2gram, etc.:
      int maxEndOffset = -1;
      boolean sawRealToken = false;
      while (ts.incrementToken()) {
        BytesRef tokenBytes = termBytesAtt.getBytesRef();
        sawRealToken |= tokenBytes.length > 0;
        // TODO: this is somewhat iffy; today, ShingleFilter
        // sets posLen to the gram count; maybe we should make
        // a separate dedicated att for this?
        int gramCount = posLenAtt.getPositionLength();

        assert gramCount <= grams;

        // Safety: make sure the recalculated count "agrees":
        if (countGrams(tokenBytes) != gramCount) {
          throw new IllegalArgumentException(
              "tokens must not contain separator byte; got token="
                  + tokenBytes
                  + " but gramCount="
                  + gramCount
                  + " does not match recalculated count="
                  + countGrams(tokenBytes));
        }
        maxEndOffset = Math.max(maxEndOffset, offsetAtt.endOffset());
        BytesRefBuilder b = new BytesRefBuilder();
        b.append(tokenBytes);
        lastTokens[gramCount - 1] = b;
      }
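      // At this point lastTokens[i] (if non-null) holds the trailing
      // (i+1)-gram of the query, e.g. for "a b c" with grams=3:
      // lastTokens = {"c", "b<SEP>c", "a<SEP>b<SEP>c"}.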
      ts.end();

      if (!sawRealToken) {
        throw new IllegalArgumentException(
            "no tokens produced by analyzer, or the only tokens were empty strings");
      }

      // Carefully fill last tokens with _ tokens;
      // ShingleFilter apparently won't emit "only hole"
      // tokens:
      int endPosInc = posIncAtt.getPositionIncrement();

      // Note this will also be true if input is the empty
      // string (in which case we saw no tokens and
      // maxEndOffset is still -1), which in fact works out OK
      // because we fill the unigram with an empty BytesRef
      // below:
      boolean lastTokenEnded = offsetAtt.endOffset() > maxEndOffset || endPosInc > 0;
      // System.out.println("maxEndOffset=" + maxEndOffset + " vs " + offsetAtt.endOffset());

      if (lastTokenEnded) {
        // System.out.println("  lastTokenEnded");
        // If user hit space after the last token, then
        // "upgrade" all tokens.  This way "foo " will suggest
        // all bigrams starting w/ foo, and not any unigrams
        // starting with "foo":
        for (int i = grams - 1; i > 0; i--) {
          BytesRefBuilder token = lastTokens[i - 1];
          if (token == null) {
            continue;
          }
          token.append(separator);
          lastTokens[i] = token;
        }
        lastTokens[0] = new BytesRefBuilder();
      }
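      // e.g. with grams=2 and input "foo ", lastTokens becomes
      // {"" (empty), "foo<SEP>"}: only the bigram model will predict,
      // completing words that followed "foo" in the training text.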

      Arc<Long> arc = new Arc<>();

      BytesReader bytesReader = fst.getBytesReader();

      // Try highest order models first, and if they return
      // results, return that; else, fallback:
      double backoff = 1.0;
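      // "Stupid backoff" smoothing: each step down to a lower-order model
      // multiplies the discount by ALPHA (0.4), e.g. unigram predictions
      // from a trigram suggester are scaled by 0.4 * 0.4 = 0.16.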

      List<LookupResult> results = new ArrayList<>(num);

      // We only add a given suffix once, from the highest
      // order model that saw it; for subsequent lower order
      // models we skip it:
      final Set<BytesRef> seen = new HashSet<>();

      for (int gram = grams - 1; gram >= 0; gram--) {
        BytesRefBuilder token = lastTokens[gram];
        // Don't make unigram predictions from empty string:
        if (token == null || (token.length() == 0 && key.length() > 0)) {
          // Input didn't have enough tokens:
          // System.out.println("  gram=" + gram + ": skip: not enough input");
          continue;
        }

        if (endPosInc > 0 && gram <= endPosInc) {
          // Skip hole-only predictions; in theory we
          // shouldn't have to do this, but we'd need to fix
          // ShingleFilter to produce only-hole tokens:
          // System.out.println("  break: only holes now");
          break;
        }

        // System.out.println("try " + (gram+1) + " gram token=" + token.utf8ToString());

        // TODO: we could add fuzziness here
        // match the prefix portion exactly
        // Pair<Long,BytesRef> prefixOutput = null;
        Long prefixOutput = null;
        try {
          prefixOutput = lookupPrefix(fst, bytesReader, token.get(), arc);
        } catch (IOException bogus) {
          throw new RuntimeException(bogus);
        }
        // System.out.println("  prefixOutput=" + prefixOutput);

        if (prefixOutput == null) {
          // This model never saw this prefix, e.g. the
          // trigram model never saw context "purple mushroom"
          backoff *= ALPHA;
          continue;
        }

        // TODO: we could do this division at build time, and
        // bake it into the FST?

        // Denominator for computing scores from current
        // model's predictions:
        long contextCount = totTokens;

        BytesRef lastTokenFragment = null;

        for (int i = token.length() - 1; i >= 0; i--) {
          if (token.byteAt(i) == separator) {
            BytesRef context = new BytesRef(token.bytes(), 0, i);
            Long output = Util.get(fst, Util.toIntsRef(context, new IntsRefBuilder()));
            assert output != null;
            contextCount = decodeWeight(output);
            lastTokenFragment = new BytesRef(token.bytes(), i + 1, token.length() - i - 1);
            break;
          }
        }
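        // e.g. for token "foo<SEP>bar": context = "foo", whose stored
        // weight becomes the denominator, and lastTokenFragment = "bar";
        // for a unigram there is no separator, so contextCount stays
        // totTokens.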

        final BytesRefBuilder finalLastToken = new BytesRefBuilder();
        if (lastTokenFragment == null) {
          finalLastToken.copyBytes(token.get());
        } else {
          finalLastToken.copyBytes(lastTokenFragment);
        }

        CharsRefBuilder spare = new CharsRefBuilder();

        // complete top-N
        TopResults<Long> completions = null;
        try {

          // Because we store multiple models in one FST
          // (1gram, 2gram, 3gram), we must restrict the
          // search so that it only considers the current
          // model.  For highest order model, this is not
          // necessary since all completions in the FST
          // must be from this model, but for lower order
          // models we have to filter out the higher order
          // ones:

          // Must do num+seen.size() for queue depth because we may
          // reject up to seen.size() paths in acceptResult():
          Util.TopNSearcher<Long> searcher =
              new Util.TopNSearcher<>(fst, num, num + seen.size(), Comparator.naturalOrder()) {

                BytesRefBuilder scratchBytes = new BytesRefBuilder();

                @Override
                protected void addIfCompetitive(Util.FSTPath<Long> path) {
                  if (path.arc.label() != separator) {
                    // System.out.println("    keep path: " + Util.toBytesRef(path.input, new
                    // BytesRef()).utf8ToString() + "; " + path + "; arc=" + path.arc);
                    super.addIfCompetitive(path);
                  } else {
                    // System.out.println("    prevent path: " + Util.toBytesRef(path.input, new
                    // BytesRef()).utf8ToString() + "; " + path + "; arc=" + path.arc);
                  }
                }

                @Override
                protected boolean acceptResult(IntsRef input, Long output) {
                  Util.toBytesRef(input, scratchBytes);
                  finalLastToken.grow(finalLastToken.length() + scratchBytes.length());
                  int lenSav = finalLastToken.length();
                  finalLastToken.append(scratchBytes);
                  // System.out.println("    accept? input='" + scratchBytes.utf8ToString() + "';
                  // lastToken='" + finalLastToken.utf8ToString() + "'; return " +
                  // (seen.contains(finalLastToken) == false));
                  boolean ret = seen.contains(finalLastToken.get()) == false;

                  finalLastToken.setLength(lenSav);
                  return ret;
                }
              };

          // since this search is initialized with a single start node
          // it is okay to start with an empty input path here
          searcher.addStartPaths(arc, prefixOutput, true, new IntsRefBuilder());

          completions = searcher.search();
          assert completions.isComplete;
        } catch (IOException bogus) {
          throw new RuntimeException(bogus);
        }

        int prefixLength = token.length();

        BytesRefBuilder suffix = new BytesRefBuilder();
        // System.out.println("    " + completions.length + " completions");

        nextCompletion:
        for (Result<Long> completion : completions) {
          token.setLength(prefixLength);
          // append suffix
          Util.toBytesRef(completion.input(), suffix);
          token.append(suffix);

          // System.out.println("    completion " + token.utf8ToString());

          // Skip this path if a higher-order model already
          // saw/predicted its last token:
          BytesRef lastToken = token.get();
          for (int i = token.length() - 1; i >= 0; i--) {
            if (token.byteAt(i) == separator) {
              assert token.length() - i - 1 > 0;
              lastToken = new BytesRef(token.bytes(), i + 1, token.length() - i - 1);
              break;
            }
          }
          if (seen.contains(lastToken)) {
            // System.out.println("      skip dup " + lastToken.utf8ToString());
            continue nextCompletion;
          }
          seen.add(BytesRef.deepCopyOf(lastToken));
          spare.copyUTF8Bytes(token.get());
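          // Score = backoff * count(completion) / count(context), scaled
          // into long range via Long.MAX_VALUE so results from different
          // order models sort together on one axis.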
          LookupResult result =
              new LookupResult(
                  spare.toString(),
                  (long)
                      (Long.MAX_VALUE
                          * backoff
                          * ((double) decodeWeight(completion.output()))
                          / contextCount));
          results.add(result);
          assert results.size() == seen.size();
          // System.out.println("  add result=" + result);
        }
        backoff *= ALPHA;
      }

      results.sort(
          (a, b) -> {
            if (a.value > b.value) {
              return -1;
            } else if (a.value < b.value) {
              return 1;
            } else {
              // Tie break by UTF16 sort order:
              return ((String) a.key).compareTo((String) b.key);
            }
          });

      if (results.size() > num) {
        results.subList(num, results.size()).clear();
      }

      return results;
    }
  }