Skip to content

Commit

Permalink
Merge pull request #9 from opensanctions/faster-levenshtein
Browse files Browse the repository at this point in the history
feat: switch to a faster levenshtein implementation
  • Loading branch information
pudo authored Aug 26, 2024
2 parents cfc8a58 + 4f23029 commit 6893234
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 7 deletions.
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ dependencies = [
"banal >= 1.0.6, < 1.1.0",
"normality >= 2.4.0, < 3.0.0",
"jellyfish >= 1.0.0, < 2.0.0",
"rapidfuzz >= 3.9.0, < 4.0.0",
"fingerprints >= 1.0.1, < 2.0.0",
"python-stdnum >= 1.16, < 2.0.0",
"pytz >= 2021.1",
Expand Down
15 changes: 8 additions & 7 deletions rigour/text/distance.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
import math
from typing import Optional
from functools import lru_cache
from jellyfish import damerau_levenshtein_distance, levenshtein_distance
from jellyfish import jaro_winkler_similarity
from rapidfuzz.distance import Levenshtein, DamerauLevenshtein, JaroWinkler

from rigour import env

Expand All @@ -23,7 +22,7 @@ def dam_levenshtein(left: str, right: str) -> int:
"""
if left == right:
return 0
return damerau_levenshtein_distance(left[:MAX_TEXT], right[:MAX_TEXT])
return DamerauLevenshtein.distance(left[:MAX_TEXT], right[:MAX_TEXT])


@lru_cache(maxsize=CACHE)
Expand All @@ -37,7 +36,7 @@ def levenshtein(left: str, right: str) -> int:
Returns:
An integer of changed characters.
"""
return levenshtein_distance(left[:MAX_TEXT], right[:MAX_TEXT])
return Levenshtein.distance(left[:MAX_TEXT], right[:MAX_TEXT])


def levenshtein_similarity(
Expand All @@ -46,7 +45,7 @@ def levenshtein_similarity(
max_edits: Optional[int] = env.LEVENSHTEIN_MAX_EDITS,
max_percent: float = env.LEVENSHTEIN_MAX_PERCENT,
) -> float:
"""Compute the levenshtein similarity of two strings. The similarity is
"""Compute the Damerau Levenshtein similarity of two strings. The similarity is
the percentage distance measured against the length of the longest string.
Args:
Expand Down Expand Up @@ -96,7 +95,9 @@ def is_levenshtein_plausible(
"""
pct_edits = math.ceil(min(len(left), len(right)) * max_percent)
max_edits_ = min(max_edits, pct_edits) if max_edits is not None else pct_edits
return dam_levenshtein(left, right) <= max_edits_
return (
DamerauLevenshtein.distance(left, right, score_cutoff=max_edits_) <= max_edits_
)


@lru_cache(maxsize=CACHE)
Expand All @@ -110,5 +111,5 @@ def jaro_winkler(left: str, right: str) -> float:
Returns:
A float between 0.0 and 1.0.
"""
score = jaro_winkler_similarity(left[:MAX_TEXT], right[:MAX_TEXT])
score = JaroWinkler.normalized_similarity(left[:MAX_TEXT], right[:MAX_TEXT])
return score if score > 0.6 else 0.0

0 comments on commit 6893234

Please sign in to comment.