From 8dcf88ddb6d2fd274f63a0d1b115caf85630a8f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADt=20Novotn=C3=BD?= Date: Sun, 16 May 2021 01:44:14 +0200 Subject: [PATCH] Use DAWG for fast approximate kNN over Levenshtein distance --- gensim/similarities/__init__.py | 6 +- gensim/similarities/levenshtein.py | 127 ++++++++--------------------- gensim/test/test_similarities.py | 64 --------------- setup.py | 2 +- 4 files changed, 37 insertions(+), 162 deletions(-) diff --git a/gensim/similarities/__init__.py b/gensim/similarities/__init__.py index c9fa8d8966..2e8a917e65 100644 --- a/gensim/similarities/__init__.py +++ b/gensim/similarities/__init__.py @@ -6,13 +6,13 @@ import warnings try: import Levenshtein # noqa:F401 - import vptree # noqa:F401 + import lexpy # noqa:F401 except ImportError: msg = ( "The gensim.similarities.levenshtein submodule is disabled, because the optional " "Levenshtein and " - "vptree packages are unavailable. " - "Install Levenhstein and vptree (e.g. `pip install python-Levenshtein vptree`) to " + "lexpy packages are unavailable. " + "Install Levenshtein and lexpy (e.g. `pip install python-Levenshtein lexpy`) to " "suppress this warning." ) warnings.warn(msg) diff --git a/gensim/similarities/levenshtein.py b/gensim/similarities/levenshtein.py index d0ac75411d..13f1334f4c 100644 --- a/gensim/similarities/levenshtein.py +++ b/gensim/similarities/levenshtein.py @@ -10,94 +10,12 @@ import itertools import logging -from math import floor from gensim.similarities.termsim import TermSimilarityIndex logger = logging.getLogger(__name__) -def levdist(t1, t2, max_distance=float("inf")): - """Get the Levenshtein distance between two terms. - - Return the Levenshtein distance between two terms. The distance is a - number between <1.0, inf>, higher is less similar. - - Parameters - ---------- - t1 : {bytes, str, unicode} - The first compared term. - t2 : {bytes, str, unicode} - The second compared term. 
- max_distance : {int, float}, optional - If you don't care about distances larger than a known threshold, a more - efficient code path can be taken. For terms that are clearly "too far - apart", we will not compute the distance exactly, but we will return - `max(len(t1), len(t2))` more quickly, meaning "more than - `max_distance`". - Default: always compute distance exactly, no threshold clipping. - - Returns - ------- - int - The Levenshtein distance between `t1` and `t2`. - - """ - import Levenshtein - - distance = Levenshtein.distance(t1, t2) - if distance > max_distance: - return max(len(t1), len(t2)) - return distance - - -def levsim(t1, t2, alpha=1.8, beta=5.0, min_similarity=0.0): - """Get the Levenshtein similarity between two terms. - - Return the Levenshtein similarity between two terms. The similarity is a - number between <0.0, 1.0>, higher is more similar. - - Parameters - ---------- - t1 : {bytes, str, unicode} - The first compared term. - t2 : {bytes, str, unicode} - The second compared term. - alpha : float, optional - The multiplicative factor alpha defined by Charlet and Damnati (2017). - beta : float, optional - The exponential factor beta defined by Charlet and Damnati (2017). - min_similarity : {int, float}, optional - If you don't care about similarities smaller than a known threshold, a - more efficient code path can be taken. For terms that are clearly "too - far apart", we will not compute the distance exactly, but we will - return zero more quickly, meaning "less than `min_similarity`". - Default: always compute similarity exactly, no threshold clipping. - - Returns - ------- - float - The Levenshtein similarity between `t1` and `t2`. - - Notes - ----- - This notion of Levenshtein similarity was first defined in section 2.2 of - `Delphine Charlet and Geraldine Damnati, "SimBow at SemEval-2017 Task 3: - Soft-Cosine Semantic Similarity between Questions for Community Question - Answering", 2017 `_. 
- - """ - assert alpha >= 0 - assert beta >= 0 - - max_lengths = max(len(t1), len(t2)) or 1 - min_similarity = float(max(min(min_similarity, 1.0), 0.0)) - max_distance = int(floor(max_lengths * (1 - (min_similarity / alpha) ** (1 / beta)))) - distance = levdist(t1, t2, max_distance) - similarity = alpha * (1 - distance * 1.0 / max_lengths)**beta - return similarity - - class LevenshteinSimilarityIndex(TermSimilarityIndex): r""" Computes Levenshtein similarities between terms and retrieves most similar @@ -105,7 +23,8 @@ class LevenshteinSimilarityIndex(TermSimilarityIndex): Notes ----- - This implementation uses a VP-Tree for metric indexing. + This implementation uses a Directed Acyclic Word Graph (DAWG) + for fast nearest-neighbor retrieval of the most similar terms. Parameters ---------- @@ -115,17 +34,25 @@ class LevenshteinSimilarityIndex(TermSimilarityIndex): The multiplicative factor alpha defined by [charletetal17]_. beta : float, optional The exponential factor beta defined by [charletetal17]_. + max_distance : int, optional + The maximum Levenshtein distance of the most similar terms. + Keeping this value below 3 has a significant impact on the + retrieval performance. Default is 1. Attributes ---------- dictionary : :class:`~gensim.corpora.dictionary.Dictionary` A dictionary that specifies the considered terms. - alpha : float, optional + alpha : float The multiplicative factor alpha defined by [charletetal17]_. - beta : float, optional + beta : float The exponential factor beta defined by [charletetal17]_. - index : :class:`vptree.VPTree` - The VP-Tree metric index. + index : :class:`lexpy.dawg.DAWG` + The DAWG nearest-neighbor search index. + max_distance : int + The maximum Levenshtein distance of the most similar terms. + Keeping this value below 3 has a significant impact on the + retrieval performance. See Also -------- @@ -145,17 +72,29 @@ class LevenshteinSimilarityIndex(TermSimilarityIndex): https://www.aclweb.org/anthology/S17-2051/. 
""" - def __init__(self, dictionary, alpha=1.8, beta=5.0): - from vptree import VPTree - + def __init__(self, dictionary, alpha=1.8, beta=5.0, max_distance=1): self.dictionary = dictionary self.alpha = alpha self.beta = beta - terms = list(self.dictionary.values()) - self.index = VPTree(terms, levdist) + self.max_distance = max_distance + + from lexpy.dawg import DAWG + + self.index = DAWG() + terms = sorted(self.dictionary.values()) + self.index.add_all(terms) + self.index.reduce() + super(LevenshteinSimilarityIndex, self).__init__() + def _levsim(self, t1, t2): + from Levenshtein import distance + + max_lengths = max(len(t1), len(t2)) or 1 + similarity = self.alpha * (1.0 - distance(t1, t2) * 1.0 / max_lengths)**self.beta + return similarity + def most_similar(self, t1, topn=10): - terms = [term for _, term in self.index.get_n_nearest_neighbors(t1, int(topn + 1))] - most_similar = ((t2, levsim(t1, t2, self.alpha, self.beta)) for t2 in terms if t1 != t2) + terms = self.index.search_within_distance(t1, self.max_distance) + most_similar = ((t2, self._levsim(t1, t2)) for t2 in terms if t1 != t2) return itertools.islice(most_similar, topn) diff --git a/gensim/test/test_similarities.py b/gensim/test/test_similarities.py index f6be0c32fe..0cb0ca56b2 100644 --- a/gensim/test/test_similarities.py +++ b/gensim/test/test_similarities.py @@ -33,7 +33,6 @@ from gensim.similarities import SparseTermSimilarityMatrix from gensim.similarities import LevenshteinSimilarityIndex from gensim.similarities.docsim import _nlargest -from gensim.similarities.levenshtein import levdist, levsim try: from pyemd import emd # noqa:F401 @@ -1544,69 +1543,6 @@ def test_inner_product_corpus_corpus_true_true(self): self.assertTrue(numpy.allclose(expected_result, result.todense())) -class TestLevenshteinDistance(unittest.TestCase): - @unittest.skipIf(LevenshteinSimilarityIndex is None, "gensim.similarities.levenshtein is disabled") - def test_max_distance(self): - t1 = "holiday" - t2 = "day" - 
max_distance = max(len(t1), len(t2)) - - self.assertEqual(4, levdist(t1, t2)) - self.assertEqual(4, levdist(t1, t2, 4)) - self.assertEqual(max_distance, levdist(t1, t2, 2)) - self.assertEqual(max_distance, levdist(t1, t2, -2)) - - -class TestLevenshteinSimilarity(unittest.TestCase): - @unittest.skipIf(LevenshteinSimilarityIndex is None, "gensim.similarities.levenshtein is disabled") - def test_empty_strings(self): - t1 = "" - t2 = "" - alpha = 1.8 - - self.assertEqual(alpha, levsim(t1, t2)) - - @unittest.skipIf(LevenshteinSimilarityIndex is None, "gensim.similarities.levenshtein is disabled") - def test_negative_hyperparameters(self): - t1 = "holiday" - t2 = "day" - alpha = 2.0 - beta = 2.0 - - with self.assertRaises(AssertionError): - levsim(t1, t2, -alpha, beta) - - with self.assertRaises(AssertionError): - levsim(t1, t2, alpha, -beta) - - with self.assertRaises(AssertionError): - levsim(t1, t2, -alpha, -beta) - - @unittest.skipIf(LevenshteinSimilarityIndex is None, "gensim.similarities.levenshtein is disabled") - def test_min_similarity(self): - t1 = "holiday" - t2 = "day" - alpha = 2.0 - beta = 2.0 - similarity = alpha * (1 - 4.0 / 7)**beta - assert similarity > 0.1 and similarity < 0.5 - - self.assertAlmostEqual(similarity, levsim(t1, t2, alpha, beta)) - - self.assertAlmostEqual(similarity, levsim(t1, t2, alpha, beta, -2)) - self.assertAlmostEqual(similarity, levsim(t1, t2, alpha, beta, -2.0)) - - self.assertAlmostEqual(similarity, levsim(t1, t2, alpha, beta, 0)) - self.assertAlmostEqual(similarity, levsim(t1, t2, alpha, beta, 0.0)) - - self.assertEqual(similarity, levsim(t1, t2, alpha, beta, 0.1)) - self.assertEqual(0.0, levsim(t1, t2, alpha, beta, 0.5)) - self.assertEqual(0.0, levsim(t1, t2, alpha, beta, 1.0)) - - self.assertEqual(0.0, levsim(t1, t2, alpha, beta, 2)) - self.assertEqual(0.0, levsim(t1, t2, alpha, beta, 2.0)) - - class TestLevenshteinSimilarityIndex(unittest.TestCase): def setUp(self): self.documents = [[u"government", u"denied", u"holiday"], 
[u"holiday", u"slowing", u"hollingworth"]] diff --git a/setup.py b/setup.py index bad9c3e2ce..6d6ff3fa52 100644 --- a/setup.py +++ b/setup.py @@ -279,7 +279,7 @@ def run(self): 'pyemd', 'nmslib', 'python-Levenshtein >= 0.10.2', - 'vptree >= 1.2', + 'lexpy >= 0.9.8', ]) # Add additional requirements for testing on Linux that are skipped on Windows.