- # --------------------------------------------------------------------
- # function: Return the Levenshtein distance (also called Edit distance)
- # between two strings
- #
- # The Levenshtein distance (LD) is a measure of similarity between two
- # strings, denoted here by s1 and s2. The distance is the number of
- # deletions, insertions or substitutions required to transform s1 into
- # s2. The greater the distance, the more different the strings are.
- #
- # The algorithm employs a proximity matrix, which denotes the distances
- # between substrings of the two given strings. Read the embedded comments
- # for more info. If you want a deep understanding of the algorithm, print
- # the matrix for some test strings and study it
- #
- # The beauty of this system is that nothing is magical -
- # the distance is intuitively understandable by humans
- #
- # The distance is named after the Russian scientist Vladimir Levenshtein,
- # who devised the algorithm in 1965
- # --------------------------------------------------------------------
- sub levenshtein
- {
- my ($s1, $s2) = @_;
- my ($len1, $len2) = (length $s1, length $s2);
- # If one of the strings is empty,
- # the distance is the length of the other string
- return $len2 if ($len1 == 0);
- return $len1 if ($len2 == 0);
- my %mat;
- # Init the distance matrix
- #
- # The first row to 0..$len1
- # The first column to 0..$len2
- # The rest to 0
- #
- # The first row and column are initialized so to denote distance from the empty string
- for (my $i = 0; $i <= $len1; ++$i)
- {
- for (my $j = 0; $j <= $len2; ++$j)
- {
- $mat{$i}{$j} = 0;
- $mat{0}{$j} = $j;
- }
- $mat{$i}{0} = $i;
- }
- # Some char-by-char processing is ahead, so prepare array of chars from the strings
- my @ar1 = split(//, $s1);
- my @ar2 = split(//, $s2);
- for (my $i = 1; $i <= $len1; ++$i)
- {
- for (my $j = 1; $j <= $len2; ++$j)
- {
- # Set the cost to 1 iff the ith char of $s1
- # equals the jth of $s2
- #
- # Denotes a substitution cost. When the char are equal
- # there is no need to substitute, so the cost is 0
- my $cost = ($ar1[$i-1] eq $ar2[$j-1]) ? 0 : 1;
- # Cell $mat{$i}{$j} equals the minimum of:
- #
- # - The cell immediately above plus 1
- # - The cell immediately to the left plus 1
- # - The cell diagonally above and to the left plus the cost
- #
- # We can either insert a new char, delete a char or
- # substitute an existing char (with an associated cost)
- $mat{$i}{$j} = min([$mat{$i-1}{$j} + 1,
- $mat{$i}{$j-1} + 1,
- $mat{$i-1}{$j-1} + $cost]);
- }
- }
- # Finally, the Levenshtein distance equals the rightmost bottom cell of the matrix
- # Note that $mat{$x}{$y} denotes the distance between the substrings : 1..$x and 1..$y
- return $mat{$len1}{$len2};
- }
- #该片段来自于http://www.codesnippet.cn/detail/010320132262.html
来源: http://www.codesnippet.cn/detail/010320132262.html