From 1746258d0e3015e41a7bae6d348d076282989295 Mon Sep 17 00:00:00 2001 From: saahil-mahato Date: Mon, 9 Oct 2023 15:02:04 +0545 Subject: [PATCH 1/7] Add Damerau-Levenshtein distance algorithm --- strings/damerau_levenshtein_distance.py | 73 +++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 strings/damerau_levenshtein_distance.py diff --git a/strings/damerau_levenshtein_distance.py b/strings/damerau_levenshtein_distance.py new file mode 100644 index 000000000000..9f62165b0f34 --- /dev/null +++ b/strings/damerau_levenshtein_distance.py @@ -0,0 +1,73 @@ +""" +This script is a implementation of the Damerau-Levenshtein distance algorithm. + +It's an algorithm that measures the edit distance between two string sequences + +More information about this algorithm can be found in this wikipedia article: +https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance +""" + + +def damerau_levenshtein_distance(first_string: str, second_string: str) -> int: + """ + Implements the Damerau-Levenshtein distance algorithm that measures + the edit distance between two string. This function calculates the true + Damerau-Levenshtein distance with adjacent transpositions. + + Parameters: + first_string (string): The first string + second_string (string): The second string + + Returns: + distance (int): The edit distance between the first and second strings + + >>> damerau_levenshtein_distance("cat", "cut") + 1 + >>> damerau_levenshtein_distance("kitten", "sitting") + 3 + >>> damerau_levenshtein_distance("hello", "world") + 4 + >>> damerau_levenshtein_distance("book", "back") + 2 + >>> damerau_levenshtein_distance("container", "containment") + 3 + """ + + # Create a dynamic programming matrix to store the distances + dp_matrix = [ + [None] * (len(second_string) + 1) for _ in range(len(first_string) + 1) + ] + + # Initialize the matrix + for i in range(len(first_string) + 1): + dp_matrix[i][0] = i + for j in range(len(second_string) + 1): + dp_matrix[0][j] = j + + # Fill the matrix + for i in range(1, len(first_string) + 1): + for j in range(1, len(second_string) + 1): + cost = 0 if first_string[i - 1] == second_string[j - 1] else 1 + + dp_matrix[i][j] = min( + dp_matrix[i - 1][j] + 1, # Deletion + dp_matrix[i][j - 1] + 1, # Insertion + dp_matrix[i - 1][j - 1] + cost, # Substitution + ) + + # Calculate Transposition + if ( + i > 1 + and j > 1 + and first_string[i - 1] == second_string[j - 2] + and first_string[i - 2] == second_string[j - 1] + ): + dp_matrix[i][j] = min(dp_matrix[i][j], dp_matrix[i - 2][j - 2] + cost) + + return dp_matrix[len(first_string)][len(second_string)] + + +if __name__ == "__main__": + import doctest + + doctest.testmod() From c482b187b7ff458bd3ab4ee086b40f1aee10c145 Mon Sep 17 00:00:00 2001 From: saahil-mahato Date: Mon, 9 Oct 2023 15:24:09 +0545 Subject: [PATCH 2/7] fix: precommit check --- strings/damerau_levenshtein_distance.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/strings/damerau_levenshtein_distance.py b/strings/damerau_levenshtein_distance.py index 9f62165b0f34..5cad9993ed1a 100644 --- a/strings/damerau_levenshtein_distance.py +++ b/strings/damerau_levenshtein_distance.py @@ -34,9 +34,7 @@ def damerau_levenshtein_distance(first_string: str, second_string: str) -> int: """ # Create a dynamic programming matrix to store the distances - dp_matrix = [ - [None] * (len(second_string) + 1) for _ in range(len(first_string) + 1) - ] + dp_matrix = [[0] * (len(second_string) + 1) for _ in range(len(first_string) + 1)] # Initialize the matrix for i in range(len(first_string) + 1): From b15b42980f7039ed432c6c003a266878c35c175c Mon Sep 17 00:00:00 2001 From: saahil-mahato Date: Fri, 13 Oct 2023 10:59:33 +0545 Subject: [PATCH 3/7] fix: doc correction --- strings/damerau_levenshtein_distance.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/strings/damerau_levenshtein_distance.py b/strings/damerau_levenshtein_distance.py index 5cad9993ed1a..848316c24795 100644 --- a/strings/damerau_levenshtein_distance.py +++ b/strings/damerau_levenshtein_distance.py @@ -11,8 +11,7 @@ def damerau_levenshtein_distance(first_string: str, second_string: str) -> int: """ Implements the Damerau-Levenshtein distance algorithm that measures - the edit distance between two string. This function calculates the true - Damerau-Levenshtein distance with adjacent transpositions. + the edit distance between two string. Parameters: first_string (string): The first string @@ -53,14 +52,13 @@ def damerau_levenshtein_distance(first_string: str, second_string: str) -> int: dp_matrix[i - 1][j - 1] + cost, # Substitution ) - # Calculate Transposition if ( i > 1 and j > 1 and first_string[i - 1] == second_string[j - 2] and first_string[i - 2] == second_string[j - 1] ): - dp_matrix[i][j] = min(dp_matrix[i][j], dp_matrix[i - 2][j - 2] + cost) + dp_matrix[i][j] = min(dp_matrix[i][j], dp_matrix[i - 2][j - 2] + cost) # Transposition return dp_matrix[len(first_string)][len(second_string)] From 816119d6efef91c84840c428b6a3ad1f3c00e201 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 13 Oct 2023 05:15:25 +0000 Subject: [PATCH 4/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- strings/damerau_levenshtein_distance.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/strings/damerau_levenshtein_distance.py b/strings/damerau_levenshtein_distance.py index 848316c24795..1c07355031aa 100644 --- a/strings/damerau_levenshtein_distance.py +++ b/strings/damerau_levenshtein_distance.py @@ -58,7 +58,9 @@ def damerau_levenshtein_distance(first_string: str, second_string: str) -> int: and first_string[i - 1] == second_string[j - 2] and first_string[i - 2] == second_string[j - 1] ): - dp_matrix[i][j] = min(dp_matrix[i][j], dp_matrix[i - 2][j - 2] + cost) # Transposition + dp_matrix[i][j] = min( + dp_matrix[i][j], dp_matrix[i - 2][j - 2] + cost + ) # Transposition return dp_matrix[len(first_string)][len(second_string)] From dd0cc2857899e64c4e424b705d8f723ee0329591 Mon Sep 17 00:00:00 2001 From: saahil-mahato Date: Fri, 13 Oct 2023 17:42:50 +0545 Subject: [PATCH 5/7] refactor: use variable for length and doc correction --- strings/damerau_levenshtein_distance.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/strings/damerau_levenshtein_distance.py b/strings/damerau_levenshtein_distance.py index 1c07355031aa..f98aa61b872e 100644 --- a/strings/damerau_levenshtein_distance.py +++ b/strings/damerau_levenshtein_distance.py @@ -11,14 +11,14 @@ def damerau_levenshtein_distance(first_string: str, second_string: str) -> int: """ Implements the Damerau-Levenshtein distance algorithm that measures - the edit distance between two string. + the edit distance between two strings. Parameters: - first_string (string): The first string - second_string (string): The second string + first_string: The first string to compare + second_string: The second string to compare Returns: - distance (int): The edit distance between the first and second strings + distance: The edit distance between the first and second strings >>> damerau_levenshtein_distance("cat", "cut") 1 @@ -32,18 +32,23 @@ def damerau_levenshtein_distance(first_string: str, second_string: str) -> int: 3 """ + length_of_first_string = len(first_string) + length_of_second_string = len(second_string) + # Create a dynamic programming matrix to store the distances - dp_matrix = [[0] * (len(second_string) + 1) for _ in range(len(first_string) + 1)] + dp_matrix = [ + [0] * (length_of_second_string + 1) for _ in range(length_of_first_string + 1) + ] # Initialize the matrix - for i in range(len(first_string) + 1): + for i in range(length_of_first_string + 1): dp_matrix[i][0] = i - for j in range(len(second_string) + 1): + for j in range(length_of_second_string + 1): dp_matrix[0][j] = j # Fill the matrix - for i in range(1, len(first_string) + 1): - for j in range(1, len(second_string) + 1): + for i in range(1, length_of_first_string + 1): + for j in range(1, length_of_second_string + 1): cost = 0 if first_string[i - 1] == second_string[j - 1] else 1 dp_matrix[i][j] = min( @@ -62,7 +67,7 @@ def damerau_levenshtein_distance(first_string: str, second_string: str) -> int: dp_matrix[i][j], dp_matrix[i - 2][j - 2] + cost ) # Transposition - return dp_matrix[len(first_string)][len(second_string)] + return dp_matrix[length_of_first_string][length_of_second_string] if __name__ == "__main__": From 5b30676055595c2b5f670f78edac300749e19da6 Mon Sep 17 00:00:00 2001 From: Christian Clauss Date: Fri, 13 Oct 2023 15:14:26 +0200 Subject: [PATCH 6/7] Update damerau_levenshtein_distance.py --- strings/damerau_levenshtein_distance.py | 28 +++++++++++-------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/strings/damerau_levenshtein_distance.py b/strings/damerau_levenshtein_distance.py index f98aa61b872e..e0c068e3a302 100644 --- a/strings/damerau_levenshtein_distance.py +++ b/strings/damerau_levenshtein_distance.py @@ -30,26 +30,23 @@ def damerau_levenshtein_distance(first_string: str, second_string: str) -> int: 2 >>> damerau_levenshtein_distance("container", "containment") 3 + >>> damerau_levenshtein_distance("container", "containment") + 3 """ - - length_of_first_string = len(first_string) - length_of_second_string = len(second_string) - # Create a dynamic programming matrix to store the distances - dp_matrix = [ - [0] * (length_of_second_string + 1) for _ in range(length_of_first_string + 1) - ] + dp_matrix = [[0] * (len(second_string) + 1) for _ in range(len(first_string) + 1)] # Initialize the matrix - for i in range(length_of_first_string + 1): + for i in range(len(first_string) + 1): dp_matrix[i][0] = i - for j in range(length_of_second_string + 1): + for j in range(len(second_string) + 1): dp_matrix[0][j] = j # Fill the matrix - for i in range(1, length_of_first_string + 1): - for j in range(1, length_of_second_string + 1): - cost = 0 if first_string[i - 1] == second_string[j - 1] else 1 + # for i in range(1, length_of_first_string + 1): + for i, first_char in enumerate(first_string, start=1): + for j, second_char in enumerate(second_string, start=1): + cost = int(first_char != second_char) dp_matrix[i][j] = min( dp_matrix[i - 1][j] + 1, # Deletion @@ -63,11 +60,10 @@ def damerau_levenshtein_distance(first_string: str, second_string: str) -> int: and first_string[i - 1] == second_string[j - 2] and first_string[i - 2] == second_string[j - 1] ): - dp_matrix[i][j] = min( - dp_matrix[i][j], dp_matrix[i - 2][j - 2] + cost - ) # Transposition + # Transposition + dp_matrix[i][j] = min(dp_matrix[i][j], dp_matrix[i - 2][j - 2] + cost) - return dp_matrix[length_of_first_string][length_of_second_string] + return dp_matrix[-1][-1] if __name__ == "__main__": From 7fda1de41a371778a2b7b8be0c548fa25fb7a37e Mon Sep 17 00:00:00 2001 From: Christian Clauss Date: Fri, 13 Oct 2023 15:18:33 +0200 Subject: [PATCH 7/7] Update damerau_levenshtein_distance.py --- strings/damerau_levenshtein_distance.py | 1 - 1 file changed, 1 deletion(-) diff --git a/strings/damerau_levenshtein_distance.py b/strings/damerau_levenshtein_distance.py index e0c068e3a302..72de019499e2 100644 --- a/strings/damerau_levenshtein_distance.py +++ b/strings/damerau_levenshtein_distance.py @@ -43,7 +43,6 @@ def damerau_levenshtein_distance(first_string: str, second_string: str) -> int: dp_matrix[0][j] = j # Fill the matrix - # for i in range(1, length_of_first_string + 1): for i, first_char in enumerate(first_string, start=1): for j, second_char in enumerate(second_string, start=1): cost = int(first_char != second_char)