in vsintegration/src/FSharp.LanguageService.Base/PatternMatcher/EditDistance.cs [181:556]
private static int GetEditDistanceWorker(ArraySlice<char> source, ArraySlice<char> target, int threshold)
{
// Note: sourceLength will always be smaller or equal to targetLength.
//
// Also Note: sourceLength and targetLength values will mutate and represent the lengths
// of the portions of the arrays we want to compare. However, even after mutation, hte
// invariant htat sourceLength is <= targetLength will remain.
Debug.Assert(source.Length <= target.Length);
// First:
// Determine the common prefix/suffix portions of the strings. We don't even need to
// consider them as they won't add anything to the edit cost.
while (source.Length > 0 && source[source.Length - 1] == target[target.Length - 1])
{
source.SetLength(source.Length - 1);
target.SetLength(target.Length - 1);
}
while (source.Length > 0 && source[0] == target[0])
{
source.MoveStartForward(amount: 1);
target.MoveStartForward(amount: 1);
}
// 'sourceLength' and 'targetLength' are now the lengths of the substrings of our strings that we
// want to compare. 'startIndex' is the starting point of the substrings in both array.
//
// If we've matched all of the 'source' string in the prefix and suffix of 'target'. then the edit
// distance is just whatever operations we have to create the remaining target substring.
//
// Note: we don't have to check if targetLength is 0. That's because targetLength being zero would
// necessarily mean that sourceLength is 0.
var sourceLength = source.Length;
var targetLength = target.Length;
if (sourceLength == 0)
{
return targetLength <= threshold ? targetLength : BeyondThreshold;
}
// The is the minimum number of edits we'd have to make. i.e. if 'source' and
// 'target' are the same length, then we might not need to make any edits. However,
// if target has length 10 and source has length 7, then we're going to have to
// make at least 3 edits no matter what.
var minimumEditCount = targetLength - sourceLength;
Debug.Assert(minimumEditCount >= 0);
// If the number of edits we'd have to perform is greater than our threshold, then
// there's no point in even continuing.
if (minimumEditCount > threshold)
{
return BeyondThreshold;
}
// Say we want to find the edit distance between "sunday" and "saturday". Our initial
// matrix will be:
//
// (Note: for purposes of this explanation we will not be trimming off the common
// prefix/suffix of the strings. That optimization does not affect any of the
// remainder of the explanation).
//
// s u n d a y
// ----------------
// |∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞
// |∞ 0 1 2 3 4 5 6
// s |∞ 1
// a |∞ 2
// t |∞ 3
// u |∞ 4
// r |∞ 5
// d |∞ 6
// a |∞ 7
// y |∞ 8
//
// Note that the matrix will always be square, or a rectangle that is taller htan it is
// longer. Our 'source' is at the top, and our 'target' is on the left. The edit distance
// between any prefix of 'source' and any prefix of 'target' can then be found in
// the unfilled area of the matrix. Specifically, if we have source.substring(0, m) and
// target.substring(0, n), then the edit distance for them can be found at matrix position
// (m+1, n+1). This is why the 1'th row and 1'th column can be prefilled. They represent
// the cost to go from the empty target to the full source or the empty source to the full
// target (respectively). So, if we wanted to know the edit distance between "sun" and
// "sat", we'd look at (3+1, 3+1). It then follows that our final edit distance between
// the full source and target is in the lower right corner of this matrix.
//
// If we fill out the matrix fully we'll get:
//
// s u n d a y <-- source
// ----------------
// |∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞
// |∞ 0 1 2 3 4 5 6
// s |∞ 1 0 1 2 3 4 5
// a |∞ 2 1 1 2 3 3 4
// t |∞ 3 2 2 2 3 4 4
// u |∞ 4 3 2 3 3 4 5
// r |∞ 5 4 3 3 4 4 5
// d |∞ 6 5 4 4 3 4 5
// a |∞ 7 6 5 5 4 3 4
// y |∞ 8 7 6 6 5 4 3 <--
// ^
// |
//
// So in this case, the edit distance is 3. Or, specifically, the edits:
//
// Sunday -> Replace("n", "r") ->
// Surday -> Insert("a") ->
// Saurday -> Insert("t") ->
// Saturday
//
//
// Now: in the case where we want to know what the edit distance actually is (for example
// when making a BKTree), we must fill out this entire array to get the true edit distance.
//
// However, in some cases we can do a bit better. For example, if a client only wants to
// the edit distance *when the edit distance will be less than some threshold* then we do
// not need to examine the entire matrix. We only want to examine until the point where
// we realize that, no matter what, our final edit distance will be more than that threshold
// (at which point we can return early).
//
// Some things are trivially easy to check. First, the edit distance between two strings is at
// *best* the difference of their lengths. i.e. if i have "aa" and "aaaaa" then the edit
// distance is 3 (the difference of 5 and 2). If our threshold is less then 3 then there
// is no way these two strings could match. So we can leave early if we can tell it would
// simply be impossible to get an edit distance within the specified threshold.
//
// Second, let's look at our matrix again:
//
// s u n d a y
// ----------------
// |∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞
// |∞ 0 1 2 3 4 5 6
// s |∞ 1
// a |∞ 2
// t |∞ 3
// u |∞ 4
// r |∞ 5
// d |∞ 6
// a |∞ 7
// y |∞ 8 *
//
// We want to know what the value is at *, and we want to stop as early as possible if it
// is greater than our threshold.
//
// Given the edit distance rules we observe edit distance at any point (i,j) in the matrix will
// always be greater than or equal to the value in (i-1, j-1). i.e. the edit distance of
// any two strings is going to be *at best* equal to the edit distance of those two strings
// without their final characters. If their final characters are the same, they'll ahve the
// same edit distance. If they are different, the edit distance will be greater. Given
// that we know the final edit distance is in the lower right, we can discover something
// useful in the matrix.
//
// s u n d a y
// ----------------
// |∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞
// |∞ 0 1 2 3 4 5 6
// s |∞ 1
// a |∞ 2
// t |∞ 3 `
// u |∞ 4 `
// r |∞ 5 `
// d |∞ 6 `
// a |∞ 7 `
// y |∞ 8 *
//
// The slashes are the "bottom" diagonal leading to the lower right. The value in the
// lower right will be strictly equal to or greater than any value on this diagonal.
// Thus, if that value exceeds the threshold, we know we can stop immediately as the
// total edit distance must be greater than the threshold.
//
// We can use similar logic to avoid even having to examine more of the matrix when we
// have a threshold. First, consider the same diagonal.
//
// s u n d a y
// ----------------
// |∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞
// |∞ 0 1 2 3 4 5 6
// s |∞ 1
// a |∞ 2
// t |∞ 3 `
// u |∞ 4 ` x
// r |∞ 5 ` |
// d |∞ 6 ` |
// a |∞ 7 ` |
// y |∞ 8 *
//
// And then consider a point above that diagonal (indicated by x). In the example
// above, the edit distance to * from 'x' will be (x+4). If, for example, threshold
// was '2', then it would be impossible for the path from 'x' to provide a good
// enough edit distance *ever*. Similarly:
//
// s u n d a y
// ----------------
// |∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞
// |∞ 0 1 2 3 4 5 6
// s |∞ 1
// a |∞ 2
// t |∞ 3 `
// u |∞ 4 `
// r |∞ 5 `
// d |∞ 6 `
// a |∞ 7 `
// y |∞ 8 y - - *
//
// Here we see that the final edit distance will be "y+3". Again, if the edit
// distance threshold is less than 3, then no path from y will provide a good
// enough edit distance.
//
// So, if we had an edit distance threshold of 3, then the range around that
// bottom diagonal that we should consider checking is:
//
// s u n d a y
// ----------------
// |∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞
// |∞ 0 1 2 3 4 5 6
// s |∞ 1 | |
// a |∞ 2 | | |
// t |∞ 3 ` | | |
// u |∞ 4 - ` | | |
// r |∞ 5 - - ` | | |
// d |∞ 6 - - - ` | |
// a |∞ 7 - - - ` |
// y |∞ 8 - - - *
//
// Now, also consider that it will take a minimum of targetLength-sourceLength edits
// just to move to the lower diagonal from the upper diagonal. That leaves
// 'threshold - (targetLength - sourceLength)' edits remaining. In this example, that
// means '3 - (8 - 6)' = 1. Because of this our lower diagonal offset is capped at:
//
// s u n d a y
// ----------------
// |∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞
// |∞ 0 1 2 3 4 5 6
// s |∞ 1 | |
// a |∞ 2 | | |
// t |∞ 3 ` | | |
// u |∞ 4 - ` | | |
// r |∞ 5 - ` | | |
// d |∞ 6 - ` | |
// a |∞ 7 - ` |
// y |∞ 8 - *
//
// If we mark the upper diagonal appropriately we see the matrix as:
//
// s u n d a y
// ----------------
// |∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞
// |∞ 0 1 2 3 4 5 6
// s |∞ 1 ` |
// a |∞ 2 ` |
// t |∞ 3 ` ` |
// u |∞ 4 - ` ` |
// r |∞ 5 - ` ` |
// d |∞ 6 - ` `
// a |∞ 7 - `
// y |∞ 8 - *
//
// Or, effectively, we only need to examine 'threshold - (targetLength - sourceLength)'
// above and below the diagonals.
//
// In practice, when a threshold is provided it is normally capped at '2'. Given that,
// the most around the diagonal we'll ever have to check is +/- 2 elements. i.e. with
// strings of length 10 we'd only check:
//
// a b c d e f g h i j
// ------------------------
// |∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞
// |∞ 0 1 2 3 4 5 6 7 8 9 10
// m |∞ 1 * * *
// n |∞ 2 * * * *
// o |∞ 3 * * * * *
// p |∞ 4 * * * * *
// q |∞ 5 * * * * *
// r |∞ 6 * * * * *
// s |∞ 7 * * * * *
// t |∞ 8 * * * * *
// u |∞ 9 * * * *
// v |∞10 * * *
//
// or 10+18+16=44. Or only 44%. if our threshold is two and our strings differ by length
// 2 then we have:
//
// a b c d e f g h
// --------------------
// |∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞ ∞
// |∞ 0 1 2 3 4 5 6 7 8
// m |∞ 1 *
// n |∞ 2 * *
// o |∞ 3 * * *
// p |∞ 4 * * *
// q |∞ 5 * * *
// r |∞ 6 * * *
// s |∞ 7 * * *
// t |∞ 8 * * *
// u |∞ 9 * *
// v |∞10 *
//
// Then we examine 8+8+8=24 out of 80, or only 30% of the matrix. As the strings
// get larger, the savings increase as well.
// --------------------------------------------------------------------------------
// The highest cost it can be to convert a source to target is targetLength. i.e.
// changing all the characters in source to target (which would be be 'sourceLength'
// changes), and then adding all the missing characters in 'target' (which is
// 'targetLength' - 'sourceLength' changes). Combined that's 'targetLength'.
//
// So we can just cap our threshold here. This makes some of the walking code
// below simpler.
threshold = Math.Min(threshold, targetLength);
var offset = threshold - minimumEditCount;
Debug.Assert(offset >= 0);
var matrix = GetMatrix(sourceLength + 2, targetLength + 2);
var characterToLastSeenIndex_inSource = t_dictionaryPool.Value;
characterToLastSeenIndex_inSource.Clear();
for (int i = 1; i <= sourceLength; i++)
{
var lastMatchIndex_inTarget = 0;
var sourceChar = source[i - 1];
// Determinethe portion of the column we actually want to examine.
var jStart = Math.Max(1, i - offset);
var jEnd = Math.Min(targetLength, i + minimumEditCount + offset);
// If we're examining only a subportion of the column, then we need to make sure
// that the values outside that range are set to Infinity. That way we don't
// consider them when we look through edit paths from above (for this column) or
// from the left (for the next column).
if (jStart > 1)
{
matrix[i + 1, jStart] = Infinity;
}
if (jEnd < targetLength)
{
matrix[i + 1, jEnd + 2] = Infinity;
}
for (int j = jStart; j <= jEnd; j++)
{
var targetChar = target[j - 1];
var i1 = GetValue(characterToLastSeenIndex_inSource, targetChar);
var j1 = lastMatchIndex_inTarget;
var matched = sourceChar == targetChar;
if (matched)
{
lastMatchIndex_inTarget = j;
}
matrix[i + 1, j + 1] = Min(
matrix[i, j] + (matched ? 0 : 1),
matrix[i + 1, j] + 1,
matrix[i, j + 1] + 1,
matrix[i1, j1] + (i - i1 - 1) + 1 + (j - j1 - 1));
}
characterToLastSeenIndex_inSource[sourceChar] = i;
// Recall that minimumEditCount is simply the difference in length of our two
// strings. So matrix[i+1,i+1] is the cost for the upper-left diagonal of the
// matrix. matrix[i+1,i+1+minimumEditCount] is the cost for the lower right diagonal.
// Here we are simply getting the lowest cost edit of hese two substrings so far.
// If this lowest cost edit is greater than our threshold, then there is no need
// to proceed.
if (matrix[i + 1, i + minimumEditCount + 1] > threshold)
{
return BeyondThreshold;
}
}
return matrix[sourceLength + 1, targetLength + 1];
}