cli/levenshtein.go (158 lines of code) (raw):
//
// Copyright (c) 2013 Kilian Evang and contributors
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
//
// https://github.com/texttheater/golang-levenshtein
//
// This package implements the Levenshtein algorithm for computing the
// similarity between two strings. The central function is MatrixForStrings,
// which computes the Levenshtein matrix. The functions DistanceForMatrix,
// EditScriptForMatrix and RatioForMatrix read various interesting properties
// off the matrix. The package also provides the convenience functions
// DistanceForStrings, EditScriptForStrings and RatioForStrings for going
// directly from two strings to the property of interest.
//
package cli
import (
"fmt"
"io"
"os"
)
type EditOperation int
const (
Ins = iota
Del
Sub
Match
)
type EditScript []EditOperation
type MatchFunction func(rune, rune) bool
type Options struct {
InsCost int
DelCost int
SubCost int
Matches MatchFunction
}
// DefaultOptions is the default options: insertion cost is 1, deletion cost is
// 1, substitution cost is 2, and two runes match iff they are the same.
var DefaultOptions Options = Options{
InsCost: 1,
DelCost: 1,
SubCost: 2,
Matches: func(sourceCharacter rune, targetCharacter rune) bool {
return sourceCharacter == targetCharacter
},
}
func (operation EditOperation) String() string {
if operation == Match {
return "match"
} else if operation == Ins {
return "ins"
} else if operation == Sub {
return "sub"
}
return "del"
}
// DistanceForStrings returns the edit distance between source and target.
//
// It has a runtime proportional to len(source) * len(target) and memory use
// proportional to len(target).
func DistanceForStrings(source []rune, target []rune, op Options) int {
// Note: This algorithm is a specialization of MatrixForStrings.
// MatrixForStrings returns the full edit matrix. However, we only need a
// single value (see DistanceForMatrix) and the main loop of the algorithm
// only uses the current and previous row. As such we create a 2D matrix,
// but with height 2 (enough to store current and previous row).
height := len(source) + 1
width := len(target) + 1
matrix := make([][]int, 2)
// Initialize trivial distances (from/to empty string). That is, fill
// the left column and the top row with row/column indices.
for i := 0; i < 2; i++ {
matrix[i] = make([]int, width)
matrix[i][0] = i
}
for j := 1; j < width; j++ {
matrix[0][j] = j
}
// Fill in the remaining cells: for each prefix pair, choose the
// (edit history, operation) pair with the lowest cost.
for i := 1; i < height; i++ {
cur := matrix[i%2]
prev := matrix[(i-1)%2]
cur[0] = i
for j := 1; j < width; j++ {
delCost := prev[j] + op.DelCost
matchSubCost := prev[j-1]
if !op.Matches(source[i-1], target[j-1]) {
matchSubCost += op.SubCost
}
insCost := cur[j-1] + op.InsCost
cur[j] = min(delCost, min(matchSubCost, insCost))
}
}
return matrix[(height-1)%2][width-1]
}
// DistanceForMatrix reads the edit distance off the given Levenshtein matrix.
func DistanceForMatrix(matrix [][]int) int {
return matrix[len(matrix)-1][len(matrix[0])-1]
}
// RatioForStrings returns the Levenshtein ratio for the given strings. The
// ratio is computed as follows:
//
// (sourceLength + targetLength - distance) / (sourceLength + targetLength)
func RatioForStrings(source []rune, target []rune, op Options) float64 {
matrix := MatrixForStrings(source, target, op)
return RatioForMatrix(matrix)
}
// RatioForMatrix returns the Levenshtein ratio for the given matrix. The ratio
// is computed as follows:
//
// (sourceLength + targetLength - distance) / (sourceLength + targetLength)
func RatioForMatrix(matrix [][]int) float64 {
sourcelength := len(matrix) - 1
targetlength := len(matrix[0]) - 1
sum := sourcelength + targetlength
if sum == 0 {
return 0
}
dist := DistanceForMatrix(matrix)
return float64(sum-dist) / float64(sum)
}
// MatrixForStrings generates a 2-D array representing the dynamic programming
// table used by the Levenshtein algorithm, as described e.g. here:
// http://www.let.rug.nl/kleiweg/lev/
// The reason for putting the creation of the table into a separate function is
// that it cannot only be used for reading of the edit distance between two
// strings, but also e.g. to backtrace an edit script that provides an
// alignment between the characters of both strings.
func MatrixForStrings(source []rune, target []rune, op Options) [][]int {
// Make a 2-D matrix. Rows correspond to prefixes of source, columns to
// prefixes of target. Cells will contain edit distances.
// Cf. http://www.let.rug.nl/~kleiweg/lev/levenshtein.html
height := len(source) + 1
width := len(target) + 1
matrix := make([][]int, height)
// Initialize trivial distances (from/to empty string). That is, fill
// the left column and the top row with row/column indices.
for i := 0; i < height; i++ {
matrix[i] = make([]int, width)
matrix[i][0] = i
}
for j := 1; j < width; j++ {
matrix[0][j] = j
}
// Fill in the remaining cells: for each prefix pair, choose the
// (edit history, operation) pair with the lowest cost.
for i := 1; i < height; i++ {
for j := 1; j < width; j++ {
delCost := matrix[i-1][j] + op.DelCost
matchSubCost := matrix[i-1][j-1]
if !op.Matches(source[i-1], target[j-1]) {
matchSubCost += op.SubCost
}
insCost := matrix[i][j-1] + op.InsCost
matrix[i][j] = min(delCost, min(matchSubCost,
insCost))
}
}
//LogMatrix(source, target, matrix)
return matrix
}
// EditScriptForStrings returns an optimal edit script to turn source into
// target.
func EditScriptForStrings(source []rune, target []rune, op Options) EditScript {
return backtrace(len(source), len(target),
MatrixForStrings(source, target, op), op)
}
// EditScriptForMatrix returns an optimal edit script based on the given
// Levenshtein matrix.
func EditScriptForMatrix(matrix [][]int, op Options) EditScript {
return backtrace(len(matrix)-1, len(matrix[0])-1, matrix, op)
}
// WriteMatrix writes a visual representation of the given matrix for the given
// strings to the given writer.
func WriteMatrix(source []rune, target []rune, matrix [][]int, writer io.Writer) {
fmt.Fprintf(writer, " ")
for _, targetRune := range target {
fmt.Fprintf(writer, " %c", targetRune)
}
fmt.Fprintf(writer, "\n")
fmt.Fprintf(writer, " %2d", matrix[0][0])
for j := range target {
fmt.Fprintf(writer, " %2d", matrix[0][j+1])
}
fmt.Fprintf(writer, "\n")
for i, sourceRune := range source {
fmt.Fprintf(writer, "%c %2d", sourceRune, matrix[i+1][0])
for j := range target {
fmt.Fprintf(writer, " %2d", matrix[i+1][j+1])
}
fmt.Fprintf(writer, "\n")
}
}
// LogMatrix writes a visual representation of the given matrix for the given
// strings to os.Stderr. This function is deprecated, use
// WriteMatrix(source, target, matrix, os.Stderr) instead.
func LogMatrix(source []rune, target []rune, matrix [][]int) {
WriteMatrix(source, target, matrix, os.Stderr)
}
func backtrace(i int, j int, matrix [][]int, op Options) EditScript {
if i > 0 && matrix[i-1][j]+op.DelCost == matrix[i][j] {
return append(backtrace(i-1, j, matrix, op), Del)
}
if j > 0 && matrix[i][j-1]+op.InsCost == matrix[i][j] {
return append(backtrace(i, j-1, matrix, op), Ins)
}
if i > 0 && j > 0 && matrix[i-1][j-1]+op.SubCost == matrix[i][j] {
return append(backtrace(i-1, j-1, matrix, op), Sub)
}
if i > 0 && j > 0 && matrix[i-1][j-1] == matrix[i][j] {
return append(backtrace(i-1, j-1, matrix, op), Match)
}
return []EditOperation{}
}
func min(a int, b int) int {
if b < a {
return b
}
return a
}