From 735f7019e517fbfac25de395dcdb3a9b456b26d7 Mon Sep 17 00:00:00 2001
From: Max Bachmann
Date: Wed, 2 Nov 2022 20:11:43 +0100
Subject: [PATCH] fix bug in JaroWinkler / Jaro

---
Review notes (text in this section is ignored when the patch is applied):
- The new test helpers compare scores with `pytest.approx(a) == b`. The form
  `assert pytest.approx(a, b)` passes `b` as the relative tolerance and
  asserts on an always-truthy object, so it can never fail.
- The `*_distance` helpers cross-check `normalized_distance` against
  `distance` instead of calling `distance` twice.
- Line counts of every hunk are unchanged by these two adjustments.

 CHANGELOG.md                           |  6 ++
 docs/conf.py                           |  2 +-
 setup.py                               |  2 +-
 src/rapidfuzz/__init__.py              |  2 +-
 src/rapidfuzz/distance/metrics_cpp.pyx |  4 +-
 tests/distance/test_Jaro.py            | 93 ++++++++++++++++++++++++++
 tests/distance/test_JaroWinkler.py     | 53 +++++++++++++--
 7 files changed, 150 insertions(+), 12 deletions(-)
 create mode 100644 tests/distance/test_Jaro.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 76b86f0f..1f078ca2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,11 @@
 ## Changelog
 
+
+### [2.13.1] - 2022-11-02
+#### Fixed
+- fix bug in `JaroWinkler.normalized_similarity` and `Jaro.normalized_similarity`
+  leading to incorrect results when used in combination with the process module
+
 ### [2.13.0] - 2022-10-30
 #### Fixed
 - fix bug in `Levenshtein.editops` leading to crashes when used with `score_hint`
diff --git a/docs/conf.py b/docs/conf.py
index 63e938e8..3a76fed5 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -22,7 +22,7 @@ author = "Max Bachmann"
 
 
 # The full version, including alpha/beta/rc tags
-release = "2.13.0"
+release = "2.13.1"
 
 
 # -- General configuration ---------------------------------------------------
diff --git a/setup.py b/setup.py
index 5ff87904..70842b77 100644
--- a/setup.py
+++ b/setup.py
@@ -13,7 +13,7 @@ def show_message(*lines):
 
 setup_args = {
     "name": "rapidfuzz",
-    "version": "2.13.0",
+    "version": "2.13.1",
     "extras_require": {"full": ["numpy"]},
     "url": "https://github.com/maxbachmann/RapidFuzz",
     "author": "Max Bachmann",
diff --git a/src/rapidfuzz/__init__.py b/src/rapidfuzz/__init__.py
index a4ab10f3..f0790795 100644
--- a/src/rapidfuzz/__init__.py
+++ b/src/rapidfuzz/__init__.py
@@ -3,7 +3,7 @@
 """
 __author__: str = "Max Bachmann"
 __license__: str = "MIT"
-__version__: str = "2.13.0"
+__version__: str = "2.13.1"
 
 from rapidfuzz import distance, fuzz, process, string_metric, utils
 
diff --git a/src/rapidfuzz/distance/metrics_cpp.pyx b/src/rapidfuzz/distance/metrics_cpp.pyx
index badcb159..d9a42d42 100644
--- a/src/rapidfuzz/distance/metrics_cpp.pyx
+++ b/src/rapidfuzz/distance/metrics_cpp.pyx
@@ -875,7 +875,7 @@ jaro_normalized_distance._RF_Scorer = PyCapsule_New(&JaroDistanceContext, NULL,
 
 cdef RF_Scorer JaroSimilarityContext = CreateScorerContext(NoKwargsInit, GetScorerFlagsJaroSimilarity, JaroSimilarityInit)
 jaro_similarity._RF_Scorer = PyCapsule_New(&JaroSimilarityContext, NULL, NULL)
-jaro_normalized_similarity._RF_Scorer = PyCapsule_New(&JaroDistanceContext, NULL, NULL)
+jaro_normalized_similarity._RF_Scorer = PyCapsule_New(&JaroSimilarityContext, NULL, NULL)
 
 
 ###############################################
@@ -947,7 +947,7 @@ jaro_winkler_normalized_distance._RF_Scorer = PyCapsule_New(&JaroWinklerDistance
 
 cdef RF_Scorer JaroWinklerSimilarityContext = CreateScorerContext(JaroWinklerKwargsInit, GetScorerFlagsJaroWinklerSimilarity, JaroWinklerSimilarityInit)
 jaro_winkler_similarity._RF_Scorer = PyCapsule_New(&JaroWinklerSimilarityContext, NULL, NULL)
-jaro_winkler_normalized_similarity._RF_Scorer = PyCapsule_New(&JaroWinklerDistanceContext, NULL, NULL)
+jaro_winkler_normalized_similarity._RF_Scorer = PyCapsule_New(&JaroWinklerSimilarityContext, NULL, NULL)
 
 ###############################################
 # Postfix
diff --git a/tests/distance/test_Jaro.py b/tests/distance/test_Jaro.py
new file mode 100644
index 00000000..a309a13d
--- /dev/null
+++ b/tests/distance/test_Jaro.py
@@ -0,0 +1,93 @@
+#!/usr/bin/env python
+
+import unittest
+import pytest
+
+from rapidfuzz.distance import Jaro_cpp, Jaro_py
+from rapidfuzz import process_cpp, process_py
+
+
+def scorer(scorer, s1, s2, **kwargs):
+    score1 = scorer(s1, s2, **kwargs)
+    score2 = process_cpp.extractOne(s1, [s2], processor=None, scorer=scorer, **kwargs)[
+        1
+    ]
+    score3 = process_cpp.extract(s1, [s2], processor=None, scorer=scorer, **kwargs)[0][
+        1
+    ]
+    score4 = process_cpp.cdist([s1], [s2], processor=None, scorer=scorer, **kwargs)[0][
+        0
+    ]
+    score5 = process_py.extractOne(s1, [s2], processor=None, scorer=scorer, **kwargs)[1]
+    score6 = process_py.extract(s1, [s2], processor=None, scorer=scorer, **kwargs)[0][1]
+    score7 = process_py.cdist([s1], [s2], processor=None, scorer=scorer, **kwargs)[0][0]
+    assert pytest.approx(score1) == score2
+    assert pytest.approx(score1) == score3
+    assert pytest.approx(score1) == score4
+    assert pytest.approx(score1) == score5
+    assert pytest.approx(score1) == score6
+    assert pytest.approx(score1) == score7
+    return score1
+
+
+def jaro_distance(s1, s2, **kwargs):
+    sim1 = scorer(Jaro_py.distance, s1, s2, **kwargs)
+    sim2 = scorer(Jaro_cpp.distance, s1, s2, **kwargs)
+    sim3 = scorer(Jaro_py.normalized_distance, s1, s2, **kwargs)
+    sim4 = scorer(Jaro_cpp.normalized_distance, s1, s2, **kwargs)
+    assert pytest.approx(sim1) == sim2
+    assert pytest.approx(sim1) == sim3
+    assert pytest.approx(sim1) == sim4
+    return sim1
+
+
+def jaro_similarity(s1, s2, **kwargs):
+    sim1 = scorer(Jaro_py.similarity, s1, s2, **kwargs)
+    sim2 = scorer(Jaro_cpp.similarity, s1, s2, **kwargs)
+    sim3 = scorer(Jaro_py.normalized_similarity, s1, s2, **kwargs)
+    sim4 = scorer(Jaro_cpp.normalized_similarity, s1, s2, **kwargs)
+    sim5 = 1.0 - jaro_distance(s1, s2, **kwargs)
+    assert pytest.approx(sim1) == sim2
+    assert pytest.approx(sim1) == sim3
+    assert pytest.approx(sim1) == sim4
+    assert pytest.approx(sim1) == sim5
+
+    return sim1
+
+
+class JaroTest(unittest.TestCase):
+    def _jaro_similarity(self, s1, s2, result):
+        self.assertAlmostEqual(jaro_similarity(s1, s2), result, places=4)
+        self.assertAlmostEqual(jaro_similarity(s2, s1), result, places=4)
+
+    def test_hash_special_case(self):
+        self._jaro_similarity([0, -1], [0, -2], 0.66666)
+
+    def test_edge_case_lengths(self):
+        self._jaro_similarity("", "", 0)
+        self._jaro_similarity("0", "0", 1)
+        self._jaro_similarity("00", "00", 1)
+        self._jaro_similarity("0", "00", 0.83333)
+
+        self._jaro_similarity("0" * 65, "0" * 65, 1)
+        self._jaro_similarity("0" * 64, "0" * 65, 0.99487)
+        self._jaro_similarity("0" * 63, "0" * 65, 0.98974)
+
+        s1 = "10000000000000000000000000000000000000000000000000000000000000020"
+        s2 = "00000000000000000000000000000000000000000000000000000000000000000"
+        self._jaro_similarity(s1, s2, 0.97948)
+
+        s1 = "00000000000000100000000000000000000000010000000000000000000000000"
+        s2 = "0000000000000000000000000000000000000000000000000000000000000000000000000000001"
+        self._jaro_similarity(s2, s1, 0.92223)
+
+        s1 = "00000000000000000000000000000000000000000000000000000000000000000"
+        s2 = (
+            "010000000000000000000000000000000000000000000000000000000000000000"
+            "00000000000000000000000000000000000000000000000000000000000000"
+        )
+        self._jaro_similarity(s2, s1, 0.83593)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/distance/test_JaroWinkler.py b/tests/distance/test_JaroWinkler.py
index 872dd72d..a71693c8 100644
--- a/tests/distance/test_JaroWinkler.py
+++ b/tests/distance/test_JaroWinkler.py
@@ -1,18 +1,57 @@
 #!/usr/bin/env python
 
 import unittest
+import pytest
 
 from rapidfuzz.distance import JaroWinkler_cpp, JaroWinkler_py
+from rapidfuzz import process_cpp, process_py
+
+
+def scorer(scorer, s1, s2, **kwargs):
+    score1 = scorer(s1, s2, **kwargs)
+    score2 = process_cpp.extractOne(s1, [s2], processor=None, scorer=scorer, **kwargs)[
+        1
+    ]
+    score3 = process_cpp.extract(s1, [s2], processor=None, scorer=scorer, **kwargs)[0][
+        1
+    ]
+    score4 = process_cpp.cdist([s1], [s2], processor=None, scorer=scorer, **kwargs)[0][
+        0
+    ]
+    score5 = process_py.extractOne(s1, [s2], processor=None, scorer=scorer, **kwargs)[1]
+    score6 = process_py.extract(s1, [s2], processor=None, scorer=scorer, **kwargs)[0][1]
+    score7 = process_py.cdist([s1], [s2], processor=None, scorer=scorer, **kwargs)[0][0]
+    assert pytest.approx(score1) == score2
+    assert pytest.approx(score1) == score3
+    assert pytest.approx(score1) == score4
+    assert pytest.approx(score1) == score5
+    assert pytest.approx(score1) == score6
+    assert pytest.approx(score1) == score7
+    return score1
+
+
+def jarowinkler_distance(s1, s2, **kwargs):
+    sim1 = scorer(JaroWinkler_py.distance, s1, s2, **kwargs)
+    sim2 = scorer(JaroWinkler_cpp.distance, s1, s2, **kwargs)
+    sim3 = scorer(JaroWinkler_py.normalized_distance, s1, s2, **kwargs)
+    sim4 = scorer(JaroWinkler_cpp.normalized_distance, s1, s2, **kwargs)
+    assert pytest.approx(sim1) == sim2
+    assert pytest.approx(sim1) == sim3
+    assert pytest.approx(sim1) == sim4
+    return sim1
 
 
-def isclose(a, b, rel_tol=1e-09, abs_tol=0.0):
-    return abs(a - b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol)
-
+def jarowinkler_similarity(s1, s2, **kwargs):
+    sim1 = scorer(JaroWinkler_py.similarity, s1, s2, **kwargs)
+    sim2 = scorer(JaroWinkler_cpp.similarity, s1, s2, **kwargs)
+    sim3 = scorer(JaroWinkler_py.normalized_similarity, s1, s2, **kwargs)
+    sim4 = scorer(JaroWinkler_cpp.normalized_similarity, s1, s2, **kwargs)
+    sim5 = 1.0 - jarowinkler_distance(s1, s2, **kwargs)
+    assert pytest.approx(sim1) == sim2
+    assert pytest.approx(sim1) == sim3
+    assert pytest.approx(sim1) == sim4
+    assert pytest.approx(sim1) == sim5
 
-def jarowinkler_similarity(*args, **kwargs):
-    sim1 = JaroWinkler_py.similarity(*args, **kwargs)
-    sim2 = JaroWinkler_cpp.similarity(*args, **kwargs)
-    assert isclose(sim1, sim2)
     return sim1
 
 