in languagetool-core/src/main/java/org/languagetool/rules/spelling/symspell/implementation/SymSpell.java [313:503]
public List<SuggestItem> lookup(String input, Verbosity verbosity, int maxEditDistance) {
//verbosity=Top: the suggestion with the highest term frequency of the suggestions of smallest edit distance found
//verbosity=Closest: all suggestions of smallest edit distance found, the suggestions are ordered by term frequency
//verbosity=All: all suggestions <= maxEditDistance, the suggestions are ordered by edit distance, then by term frequency (slower, no early termination)
// maxEditDistance used in lookup can't be bigger than the maxDictionaryEditDistance
// used to construct the underlying dictionary structure.
if (maxEditDistance > maxDictionaryEditDistance) {
throw new IllegalArgumentException("Dist to big: " + maxEditDistance);
}
List<SuggestItem> suggestions = new ArrayList<>();
int inputLen = input.length();
// early exit - word is too big to possibly match any words
if (inputLen - maxEditDistance > maxLength) {
return suggestions;
}
// deletes we've considered already
HashSet<String> consideredDeletes = new HashSet<>();
// suggestions we've considered already
HashSet<String> consideredSuggestions = new HashSet<>();
long suggestionCount;
// quick look for exact match
if (words.containsKey(input)) {
suggestionCount = words.get(input);
suggestions.add(new SuggestItem(input, 0, suggestionCount));
// early exit - return exact match, unless caller wants all matches
if (verbosity != Verbosity.All) {
return suggestions;
}
}
consideredSuggestions.add(input); // input considered in above.
int maxEditDistance2 = maxEditDistance;
int candidatePointer = 0;
List<String> candidates = new ArrayList<>();
//add original prefix
int inputPrefixLen = inputLen;
if (inputPrefixLen > prefixLength) {
inputPrefixLen = prefixLength;
candidates.add(input.substring(0, inputPrefixLen));
} else {
candidates.add(input);
}
EditDistance distanceComparer = new EditDistance(input, this.distanceAlgorithm);
while (candidatePointer < candidates.size()) {
String candidate = candidates.get(candidatePointer++);
int candidateLen = candidate.length();
int lengthDiff = inputPrefixLen - candidateLen;
//early termination if distance higher than suggestion distance
if (lengthDiff > maxEditDistance2) {
// skip to next candidate if Verbosity.All, look no further if Verbosity.Top or Closest
// (candidates are ordered by delete distance, so none are closer than current)
if (verbosity == Verbosity.All) {
continue;
}
break;
}
//read candidate entry from dictionary
if (deletes.containsKey(getStringHash(candidate))) {
String[] dictSuggestions = deletes.get(getStringHash(candidate));
//iterate through suggestions (to other correct dictionary items) of delete item and add them to suggestion list
for (String suggestion : dictSuggestions) {
if (suggestion.equals(input)) {
continue;
}
int suggestionLen = suggestion.length();
if ((Math.abs(suggestionLen - inputLen) > maxEditDistance2) // input/suggestion diff > allowed/current best distance
|| (suggestionLen < candidateLen) // sugg must be for a different delete String, in same bin only because of hash collision
|| (suggestionLen == candidateLen && !suggestion.equals(candidate))) // if sugg len = delete len, then it either equals delete or is in same bin only because of hash collision
{
continue;
}
int suggPrefixLen = Math.min(suggestionLen, prefixLength);
if (suggPrefixLen > inputPrefixLen && (suggPrefixLen - candidateLen) > maxEditDistance2) {
continue;
}
//True Damerau-Levenshtein Edit Distance: adjust distance, if both distances > 0
//We allow simultaneous edits (deletes) of maxEditDistance on on both the dictionary and the input term.
//For replaces and adjacent transposes the resulting edit distance stays <= maxEditDistance.
//For inserts and deletes the resulting edit distance might exceed maxEditDistance.
//To prevent suggestions of a higher edit distance, we need to calculate the resulting edit distance, if there are simultaneous edits on both sides.
//Example: (bank==bnak and bank==bink, but bank!=kanb and bank!=xban and bank!=baxn for maxEditDistance=1)
//Two deletes on each side of a pair makes them all equal, but the first two pairs have edit distance=1, the others edit distance=2.
int distance;
int min = 0;
if (candidateLen == 0) {
//suggestions which have no common chars with input (inputLen<=maxEditDistance && suggestionLen<=maxEditDistance)
distance = Math.max(inputLen, suggestionLen);
if (distance > maxEditDistance2 || !consideredSuggestions.add(suggestion)) {
continue;
}
} else if (suggestionLen == 1) {
if (input.indexOf(suggestion.charAt(0)) < 0) {
distance = inputLen;
} else {
distance = inputLen - 1;
}
if (distance > maxEditDistance2 || !consideredSuggestions.add(suggestion)) {
continue;
}
} else
//number of edits in prefix == maxeditdistance && no identical suffix
//, then editdistance > maxEditDistance and no need for Levenshtein calculation
// (inputLen >= prefixLength) && (suggestionLen >= prefixLength)
if ((prefixLength - maxEditDistance == candidateLen)
&& (((min = Math.min(inputLen, suggestionLen) - prefixLength) > 1)
&& !(input.substring(inputLen + 1 - min).equals(suggestion.substring(suggestionLen + 1 - min))))
|| ((min > 0) && (input.charAt(inputLen - min) != suggestion.charAt(suggestionLen - min))
&& ((input.charAt(inputLen - min - 1) != suggestion.charAt(suggestionLen - min))
|| (input.charAt(inputLen - min) != suggestion.charAt(suggestionLen - min - 1))))) {
continue;
} else {
// deleteInSuggestionPrefix is somewhat expensive, and only pays off when verbosity is Top or Closest.
if ((verbosity != Verbosity.All && !deleteInSuggestionPrefix(candidate, candidateLen, suggestion, suggestionLen))
|| !consideredSuggestions.add(suggestion)) {
continue;
}
distance = distanceComparer.compare(suggestion, maxEditDistance2);
if (distance < 0) {
continue;
}
}
//save some time
//do not process higher distances than those already found, if verbosity<All (note: maxEditDistance2 will always equal maxEditDistance when Verbosity.All)
if (distance <= maxEditDistance2) {
suggestionCount = words.get(suggestion);
SuggestItem si = new SuggestItem(suggestion, distance, suggestionCount);
if (suggestions.size() > 0) {
switch (verbosity) {
case Closest:
//we will calculate DamLev distance only to the smallest found distance so far
if (distance < maxEditDistance2) {
suggestions.clear();
}
break;
case Top:
if (distance < maxEditDistance2 || suggestionCount > suggestions.get(0).count) {
maxEditDistance2 = distance;
suggestions.set(0, si);
}
continue;
}
}
if (verbosity != Verbosity.All) {
maxEditDistance2 = distance;
}
suggestions.add(si);
}
}
}
//add edits
//derive edits (deletes) from candidate (input) and add them to candidates list
//this is a recursive process until the maximum edit distance has been reached
if ((lengthDiff < maxEditDistance) && (candidateLen <= prefixLength)) {
//save some time
//do not create edits with edit distance smaller than suggestions already found
if (verbosity != Verbosity.All && lengthDiff >= maxEditDistance2) {
continue;
}
for (int i = 0; i < candidateLen; i++) {
StringBuilder sb = new StringBuilder(candidate);
sb.deleteCharAt(i);
String delete = sb.toString();
if (consideredDeletes.add(delete)) {
candidates.add(delete);
}
}
}
}
//sort by ascending edit distance, then by descending word frequency
if (suggestions.size() > 1) {
Collections.sort(suggestions);
}
return suggestions;
}