Skip to content

Commit

Permalink
fix bug in JaroWinkler / Jaro
Browse files Browse the repository at this point in the history
  • Loading branch information
maxbachmann committed Nov 2, 2022
1 parent 3b6fac0 commit 735f701
Show file tree
Hide file tree
Showing 7 changed files with 150 additions and 12 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
## Changelog


### [2.13.1] - 2022-11-02
#### Fixed
- fix bug in `JaroWinkler.normalized_similarity` and `Jaro.normalized_similarity`
leading to incorrect results when used in combination with the process module

### [2.13.0] - 2022-10-30
#### Fixed
- fix bug in `Levenshtein.editops` leading to crashes when used with `score_hint`
Expand Down
2 changes: 1 addition & 1 deletion docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
author = "Max Bachmann"

# The full version, including alpha/beta/rc tags
release = "2.13.0"
release = "2.13.1"


# -- General configuration ---------------------------------------------------
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def show_message(*lines):

setup_args = {
"name": "rapidfuzz",
"version": "2.13.0",
"version": "2.13.1",
"extras_require": {"full": ["numpy"]},
"url": "https://github.com/maxbachmann/RapidFuzz",
"author": "Max Bachmann",
Expand Down
2 changes: 1 addition & 1 deletion src/rapidfuzz/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"""
__author__: str = "Max Bachmann"
__license__: str = "MIT"
__version__: str = "2.13.0"
__version__: str = "2.13.1"

from rapidfuzz import distance, fuzz, process, string_metric, utils

Expand Down
4 changes: 2 additions & 2 deletions src/rapidfuzz/distance/metrics_cpp.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -875,7 +875,7 @@ jaro_normalized_distance._RF_Scorer = PyCapsule_New(&JaroDistanceContext, NULL,

cdef RF_Scorer JaroSimilarityContext = CreateScorerContext(NoKwargsInit, GetScorerFlagsJaroSimilarity, JaroSimilarityInit)
jaro_similarity._RF_Scorer = PyCapsule_New(&JaroSimilarityContext, NULL, NULL)
jaro_normalized_similarity._RF_Scorer = PyCapsule_New(&JaroDistanceContext, NULL, NULL)
jaro_normalized_similarity._RF_Scorer = PyCapsule_New(&JaroSimilarityContext, NULL, NULL)


###############################################
Expand Down Expand Up @@ -947,7 +947,7 @@ jaro_winkler_normalized_distance._RF_Scorer = PyCapsule_New(&JaroWinklerDistance

cdef RF_Scorer JaroWinklerSimilarityContext = CreateScorerContext(JaroWinklerKwargsInit, GetScorerFlagsJaroWinklerSimilarity, JaroWinklerSimilarityInit)
jaro_winkler_similarity._RF_Scorer = PyCapsule_New(&JaroWinklerSimilarityContext, NULL, NULL)
jaro_winkler_normalized_similarity._RF_Scorer = PyCapsule_New(&JaroWinklerDistanceContext, NULL, NULL)
jaro_winkler_normalized_similarity._RF_Scorer = PyCapsule_New(&JaroWinklerSimilarityContext, NULL, NULL)

###############################################
# Postfix
Expand Down
93 changes: 93 additions & 0 deletions tests/distance/test_Jaro.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
#!/usr/bin/env python

import unittest
import pytest

from rapidfuzz.distance import Jaro_cpp, Jaro_py
from rapidfuzz import process_cpp, process_py


def scorer(scorer, s1, s2, **kwargs):
    """Score ``s1`` against ``s2`` directly and through every process API.

    The direct result ``scorer(s1, s2, **kwargs)`` is compared with the score
    reported by ``extractOne``, ``extract`` and ``cdist`` of both
    ``process_cpp`` and ``process_py`` (all called with ``processor=None``).

    :param scorer: callable under test, invoked as ``scorer(s1, s2, **kwargs)``
    :param s1: query sequence
    :param s2: the single choice the query is compared against
    :param kwargs: extra keyword arguments forwarded to every call
    :return: the score of the direct ``scorer`` call
    :raises AssertionError: if any process result differs from the direct score
    """
    score1 = scorer(s1, s2, **kwargs)
    score2 = process_cpp.extractOne(
        s1, [s2], processor=None, scorer=scorer, **kwargs
    )[1]
    score3 = process_cpp.extract(
        s1, [s2], processor=None, scorer=scorer, **kwargs
    )[0][1]
    score4 = process_cpp.cdist(
        [s1], [s2], processor=None, scorer=scorer, **kwargs
    )[0][0]
    score5 = process_py.extractOne(
        s1, [s2], processor=None, scorer=scorer, **kwargs
    )[1]
    score6 = process_py.extract(
        s1, [s2], processor=None, scorer=scorer, **kwargs
    )[0][1]
    score7 = process_py.cdist(
        [s1], [s2], processor=None, scorer=scorer, **kwargs
    )[0][0]

    # ``pytest.approx(a, b)`` would treat ``b`` as the relative tolerance and
    # never compare the two values, so the comparison has to be spelled out
    # with ``==``.
    assert pytest.approx(score1) == score2
    assert pytest.approx(score1) == score3
    assert pytest.approx(score1) == score4
    assert pytest.approx(score1) == score5
    assert pytest.approx(score1) == score6
    assert pytest.approx(score1) == score7
    return score1


def jaro_distance(s1, s2, **kwargs):
    """Return the Jaro distance after cross-checking all implementations.

    The Python and C++ implementations of both ``distance`` and
    ``normalized_distance`` are evaluated through ``scorer`` and have to agree.

    :param s1: first sequence
    :param s2: second sequence
    :param kwargs: extra keyword arguments forwarded to every scorer
    :return: the result of ``Jaro_py.distance``
    :raises AssertionError: if the implementations disagree
    """
    dist1 = scorer(Jaro_py.distance, s1, s2, **kwargs)
    dist2 = scorer(Jaro_cpp.distance, s1, s2, **kwargs)
    # Check the normalized variants too instead of repeating ``distance``,
    # mirroring how the similarity helper covers ``normalized_similarity``.
    dist3 = scorer(Jaro_py.normalized_distance, s1, s2, **kwargs)
    dist4 = scorer(Jaro_cpp.normalized_distance, s1, s2, **kwargs)

    # ``pytest.approx(a, b)`` would use ``b`` as a tolerance and compare
    # nothing; an explicit ``==`` is required.
    assert pytest.approx(dist1) == dist2
    assert pytest.approx(dist1) == dist3
    assert pytest.approx(dist1) == dist4
    return dist1


def jaro_similarity(s1, s2, **kwargs):
    """Return the Jaro similarity after cross-checking all implementations.

    The Python and C++ implementations of ``similarity`` and
    ``normalized_similarity`` are evaluated through ``scorer`` and have to
    agree with each other and with ``1.0 - jaro_distance(s1, s2)``.

    :param s1: first sequence
    :param s2: second sequence
    :param kwargs: extra keyword arguments forwarded to every scorer
    :return: the result of ``Jaro_py.similarity``
    :raises AssertionError: if the implementations disagree
    """
    sim1 = scorer(Jaro_py.similarity, s1, s2, **kwargs)
    sim2 = scorer(Jaro_cpp.similarity, s1, s2, **kwargs)
    sim3 = scorer(Jaro_py.normalized_similarity, s1, s2, **kwargs)
    sim4 = scorer(Jaro_cpp.normalized_similarity, s1, s2, **kwargs)
    sim5 = 1.0 - jaro_distance(s1, s2, **kwargs)

    # ``pytest.approx(a, b)`` would use ``b`` as a tolerance and compare
    # nothing; an explicit ``==`` is required.
    assert pytest.approx(sim1) == sim2
    assert pytest.approx(sim1) == sim3
    assert pytest.approx(sim1) == sim4
    assert pytest.approx(sim1) == sim5

    return sim1


class JaroTest(unittest.TestCase):
    """Regression tests for the Jaro similarity implementations."""

    def _jaro_similarity(self, s1, s2, result):
        """Assert that the similarity equals ``result`` in both argument orders."""
        forward = jaro_similarity(s1, s2)
        self.assertAlmostEqual(forward, result, places=4)
        backward = jaro_similarity(s2, s1)
        self.assertAlmostEqual(backward, result, places=4)

    def test_hash_special_case(self):
        """Sequences of integers containing the values -1 and -2."""
        left = [0, -1]
        right = [0, -2]
        self._jaro_similarity(left, right, 0.66666)

    def test_edge_case_lengths(self):
        """Empty, very short and longer-than-64-element inputs."""
        # empty and tiny inputs
        self._jaro_similarity("", "", 0)
        self._jaro_similarity("0", "0", 1)
        self._jaro_similarity("00", "00", 1)
        self._jaro_similarity("0", "00", 0.83333)

        # lengths around 64 elements
        self._jaro_similarity("0" * 65, "0" * 65, 1)
        self._jaro_similarity("0" * 64, "0" * 65, 0.99487)
        self._jaro_similarity("0" * 63, "0" * 65, 0.98974)

        first = "10000000000000000000000000000000000000000000000000000000000000020"
        second = "00000000000000000000000000000000000000000000000000000000000000000"
        self._jaro_similarity(first, second, 0.97948)

        first = "00000000000000100000000000000000000000010000000000000000000000000"
        second = "0000000000000000000000000000000000000000000000000000000000000000000000000000001"
        self._jaro_similarity(second, first, 0.92223)

        first = "00000000000000000000000000000000000000000000000000000000000000000"
        second = (
            "010000000000000000000000000000000000000000000000000000000000000000"
            "00000000000000000000000000000000000000000000000000000000000000"
        )
        self._jaro_similarity(second, first, 0.83593)


# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()
53 changes: 46 additions & 7 deletions tests/distance/test_JaroWinkler.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,57 @@
#!/usr/bin/env python

import unittest
import pytest

from rapidfuzz.distance import JaroWinkler_cpp, JaroWinkler_py
from rapidfuzz import process_cpp, process_py


def scorer(scorer, s1, s2, **kwargs):
    """Score ``s1`` against ``s2`` directly and through every process API.

    The direct result ``scorer(s1, s2, **kwargs)`` is compared with the score
    reported by ``extractOne``, ``extract`` and ``cdist`` of both
    ``process_cpp`` and ``process_py`` (all called with ``processor=None``).

    :param scorer: callable under test, invoked as ``scorer(s1, s2, **kwargs)``
    :param s1: query sequence
    :param s2: the single choice the query is compared against
    :param kwargs: extra keyword arguments forwarded to every call
    :return: the score of the direct ``scorer`` call
    :raises AssertionError: if any process result differs from the direct score
    """
    score1 = scorer(s1, s2, **kwargs)
    score2 = process_cpp.extractOne(
        s1, [s2], processor=None, scorer=scorer, **kwargs
    )[1]
    score3 = process_cpp.extract(
        s1, [s2], processor=None, scorer=scorer, **kwargs
    )[0][1]
    score4 = process_cpp.cdist(
        [s1], [s2], processor=None, scorer=scorer, **kwargs
    )[0][0]
    score5 = process_py.extractOne(
        s1, [s2], processor=None, scorer=scorer, **kwargs
    )[1]
    score6 = process_py.extract(
        s1, [s2], processor=None, scorer=scorer, **kwargs
    )[0][1]
    score7 = process_py.cdist(
        [s1], [s2], processor=None, scorer=scorer, **kwargs
    )[0][0]

    # ``pytest.approx(a, b)`` would treat ``b`` as the relative tolerance and
    # never compare the two values, so the comparison has to be spelled out
    # with ``==``.
    assert pytest.approx(score1) == score2
    assert pytest.approx(score1) == score3
    assert pytest.approx(score1) == score4
    assert pytest.approx(score1) == score5
    assert pytest.approx(score1) == score6
    assert pytest.approx(score1) == score7
    return score1


def jarowinkler_distance(s1, s2, **kwargs):
    """Return the Jaro-Winkler distance after cross-checking all implementations.

    The Python and C++ implementations of both ``distance`` and
    ``normalized_distance`` are evaluated through ``scorer`` and have to agree.

    :param s1: first sequence
    :param s2: second sequence
    :param kwargs: extra keyword arguments forwarded to every scorer
    :return: the result of ``JaroWinkler_py.distance``
    :raises AssertionError: if the implementations disagree
    """
    dist1 = scorer(JaroWinkler_py.distance, s1, s2, **kwargs)
    dist2 = scorer(JaroWinkler_cpp.distance, s1, s2, **kwargs)
    # Check the normalized variants too instead of repeating ``distance``,
    # mirroring how the similarity helper covers ``normalized_similarity``.
    dist3 = scorer(JaroWinkler_py.normalized_distance, s1, s2, **kwargs)
    dist4 = scorer(JaroWinkler_cpp.normalized_distance, s1, s2, **kwargs)

    # ``pytest.approx(a, b)`` would use ``b`` as a tolerance and compare
    # nothing; an explicit ``==`` is required.
    assert pytest.approx(dist1) == dist2
    assert pytest.approx(dist1) == dist3
    assert pytest.approx(dist1) == dist4
    return dist1


def isclose(a, b, rel_tol=1e-09, abs_tol=0.0):
    """Tell whether ``a`` and ``b`` are equal within the given tolerances.

    The allowed difference is the larger of ``abs_tol`` and ``rel_tol`` scaled
    by the larger magnitude of the two values.
    """
    difference = abs(a - b)
    largest_magnitude = max(abs(a), abs(b))
    allowed = max(rel_tol * largest_magnitude, abs_tol)
    return difference <= allowed

def jarowinkler_similarity(s1, s2, **kwargs):
    """Return the Jaro-Winkler similarity after cross-checking all implementations.

    The Python and C++ implementations of ``similarity`` and
    ``normalized_similarity`` are evaluated through ``scorer`` and have to
    agree with each other and with ``1.0 - jarowinkler_distance(s1, s2)``.

    :param s1: first sequence
    :param s2: second sequence
    :param kwargs: extra keyword arguments forwarded to every scorer
    :return: the result of ``JaroWinkler_py.similarity``
    :raises AssertionError: if the implementations disagree
    """
    sim1 = scorer(JaroWinkler_py.similarity, s1, s2, **kwargs)
    sim2 = scorer(JaroWinkler_cpp.similarity, s1, s2, **kwargs)
    sim3 = scorer(JaroWinkler_py.normalized_similarity, s1, s2, **kwargs)
    sim4 = scorer(JaroWinkler_cpp.normalized_similarity, s1, s2, **kwargs)
    sim5 = 1.0 - jarowinkler_distance(s1, s2, **kwargs)

    # ``pytest.approx(a, b)`` would use ``b`` as a tolerance and compare
    # nothing; an explicit ``==`` is required.
    assert pytest.approx(sim1) == sim2
    assert pytest.approx(sim1) == sim3
    assert pytest.approx(sim1) == sim4
    assert pytest.approx(sim1) == sim5

    # Hand the verified score back so callers can compare it with an
    # expected value.
    return sim1

def jarowinkler_similarity(*args, **kwargs):
    """Return the Jaro-Winkler similarity of the Python implementation.

    The C++ implementation is called with the same arguments and has to
    produce a result that ``isclose`` accepts as equal.
    """
    py_score = JaroWinkler_py.similarity(*args, **kwargs)
    cpp_score = JaroWinkler_cpp.similarity(*args, **kwargs)
    assert isclose(py_score, cpp_score)
    return py_score


Expand Down

0 comments on commit 735f701

Please sign in to comment.