public List lookup()

in languagetool-core/src/main/java/org/languagetool/rules/spelling/symspell/implementation/SymSpell.java [313:503]


  public List<SuggestItem> lookup(String input, Verbosity verbosity, int maxEditDistance) {
    //verbosity=Top: the suggestion with the highest term frequency of the suggestions of smallest edit distance found
    //verbosity=Closest: all suggestions of smallest edit distance found, the suggestions are ordered by term frequency
    //verbosity=All: all suggestions <= maxEditDistance, the suggestions are ordered by edit distance, then by term frequency (slower, no early termination)

    // maxEditDistance used in lookup can't be bigger than the maxDictionaryEditDistance
    // used to construct the underlying dictionary structure.
    if (maxEditDistance > maxDictionaryEditDistance) {
      throw new IllegalArgumentException("Dist to big: " + maxEditDistance);
    }

    List<SuggestItem> suggestions = new ArrayList<>();
    int inputLen = input.length();

    // early exit - word is too big to possibly match any words
    if (inputLen - maxEditDistance > maxLength) {
      return suggestions;
    }

    // deletes we've considered already
    HashSet<String> consideredDeletes = new HashSet<>();
    // suggestions we've considered already
    HashSet<String> consideredSuggestions = new HashSet<>();
    long suggestionCount;

    // quick look for exact match
    if (words.containsKey(input)) {
      suggestionCount = words.get(input);
      suggestions.add(new SuggestItem(input, 0, suggestionCount));
      // early exit - return exact match, unless caller wants all matches
      if (verbosity != Verbosity.All) {
        return suggestions;
      }
    }
    consideredSuggestions.add(input); // input considered in above.

    int maxEditDistance2 = maxEditDistance;
    int candidatePointer = 0;
    List<String> candidates = new ArrayList<>();

    //add original prefix
    int inputPrefixLen = inputLen;
    if (inputPrefixLen > prefixLength) {
      inputPrefixLen = prefixLength;
      candidates.add(input.substring(0, inputPrefixLen));
    } else {
      candidates.add(input);
    }

    EditDistance distanceComparer = new EditDistance(input, this.distanceAlgorithm);
    while (candidatePointer < candidates.size()) {
      String candidate = candidates.get(candidatePointer++);
      int candidateLen = candidate.length();
      int lengthDiff = inputPrefixLen - candidateLen;

      //early termination if distance higher than suggestion distance
      if (lengthDiff > maxEditDistance2) {
        // skip to next candidate if Verbosity.All, look no further if Verbosity.Top or Closest
        // (candidates are ordered by delete distance, so none are closer than current)
        if (verbosity == Verbosity.All) {
          continue;
        }
        break;
      }

      //read candidate entry from dictionary
      if (deletes.containsKey(getStringHash(candidate))) {
        String[] dictSuggestions = deletes.get(getStringHash(candidate));
        //iterate through suggestions (to other correct dictionary items) of delete item and add them to suggestion list
        for (String suggestion : dictSuggestions) {
          if (suggestion.equals(input)) {
            continue;
          }
          int suggestionLen = suggestion.length();

          if ((Math.abs(suggestionLen - inputLen) > maxEditDistance2) // input/suggestion diff > allowed/current best distance
            || (suggestionLen < candidateLen) // sugg must be for a different delete String, in same bin only because of hash collision
            || (suggestionLen == candidateLen && !suggestion.equals(candidate))) // if sugg len = delete len, then it either equals delete or is in same bin only because of hash collision
          {
            continue;
          }

          int suggPrefixLen = Math.min(suggestionLen, prefixLength);
          if (suggPrefixLen > inputPrefixLen && (suggPrefixLen - candidateLen) > maxEditDistance2) {
            continue;
          }

          //True Damerau-Levenshtein Edit Distance: adjust distance, if both distances > 0
          //We allow simultaneous edits (deletes) of maxEditDistance on on both the dictionary and the input term.
          //For replaces and adjacent transposes the resulting edit distance stays <= maxEditDistance.
          //For inserts and deletes the resulting edit distance might exceed maxEditDistance.
          //To prevent suggestions of a higher edit distance, we need to calculate the resulting edit distance, if there are simultaneous edits on both sides.
          //Example: (bank==bnak and bank==bink, but bank!=kanb and bank!=xban and bank!=baxn for maxEditDistance=1)
          //Two deletes on each side of a pair makes them all equal, but the first two pairs have edit distance=1, the others edit distance=2.
          int distance;
          int min = 0;
          if (candidateLen == 0) {
            //suggestions which have no common chars with input (inputLen<=maxEditDistance && suggestionLen<=maxEditDistance)
            distance = Math.max(inputLen, suggestionLen);
            if (distance > maxEditDistance2 || !consideredSuggestions.add(suggestion)) {
              continue;
            }
          } else if (suggestionLen == 1) {
            if (input.indexOf(suggestion.charAt(0)) < 0) {
              distance = inputLen;
            } else {
              distance = inputLen - 1;
            }
            if (distance > maxEditDistance2 || !consideredSuggestions.add(suggestion)) {
              continue;
            }
          } else
            //number of edits in prefix == maxeditdistance  && no identical suffix
            //, then editdistance > maxEditDistance and no need for Levenshtein calculation
            //      (inputLen >= prefixLength) && (suggestionLen >= prefixLength)
            if ((prefixLength - maxEditDistance == candidateLen)
              && (((min = Math.min(inputLen, suggestionLen) - prefixLength) > 1)
              && !(input.substring(inputLen + 1 - min).equals(suggestion.substring(suggestionLen + 1 - min))))
              || ((min > 0) && (input.charAt(inputLen - min) != suggestion.charAt(suggestionLen - min))
              && ((input.charAt(inputLen - min - 1) != suggestion.charAt(suggestionLen - min))
              || (input.charAt(inputLen - min) != suggestion.charAt(suggestionLen - min - 1))))) {
              continue;
            } else {
              // deleteInSuggestionPrefix is somewhat expensive, and only pays off when verbosity is Top or Closest.
              if ((verbosity != Verbosity.All && !deleteInSuggestionPrefix(candidate, candidateLen, suggestion, suggestionLen))
                || !consideredSuggestions.add(suggestion)) {
                continue;
              }
              distance = distanceComparer.compare(suggestion, maxEditDistance2);
              if (distance < 0) {
                continue;
              }
            }

          //save some time
          //do not process higher distances than those already found, if verbosity<All (note: maxEditDistance2 will always equal maxEditDistance when Verbosity.All)
          if (distance <= maxEditDistance2) {
            suggestionCount = words.get(suggestion);
            SuggestItem si = new SuggestItem(suggestion, distance, suggestionCount);
            if (suggestions.size() > 0) {
              switch (verbosity) {
                case Closest:
                  //we will calculate DamLev distance only to the smallest found distance so far
                  if (distance < maxEditDistance2) {
                    suggestions.clear();
                  }
                  break;
                case Top:
                  if (distance < maxEditDistance2 || suggestionCount > suggestions.get(0).count) {
                    maxEditDistance2 = distance;
                    suggestions.set(0, si);
                  }
                  continue;
              }
            }
            if (verbosity != Verbosity.All) {
              maxEditDistance2 = distance;
            }
            suggestions.add(si);
          }
        }
      }

      //add edits
      //derive edits (deletes) from candidate (input) and add them to candidates list
      //this is a recursive process until the maximum edit distance has been reached
      if ((lengthDiff < maxEditDistance) && (candidateLen <= prefixLength)) {
        //save some time
        //do not create edits with edit distance smaller than suggestions already found
        if (verbosity != Verbosity.All && lengthDiff >= maxEditDistance2) {
          continue;
        }

        for (int i = 0; i < candidateLen; i++) {
          StringBuilder sb = new StringBuilder(candidate);
          sb.deleteCharAt(i);
          String delete = sb.toString();

          if (consideredDeletes.add(delete)) {
            candidates.add(delete);
          }
        }
      }
    }

    //sort by ascending edit distance, then by descending word frequency
    if (suggestions.size() > 1) {
      Collections.sort(suggestions);
    }
    return suggestions;
  }