Test that the tfidf parameter has desired effect on similarity_matrix

piskvorky · Mar 17, 2018 · 609fce3 · 609fce3
1 parent 6dcb687
commit 609fce3
Showing 1 changed file with 28 additions and 2 deletions.
diff --git a/gensim/test/test_keyedvectors.py b/gensim/test/test_keyedvectors.py
@@ -15,7 +15,7 @@
 import numpy as np
 
 from gensim.corpora import Dictionary
-from gensim.models import KeyedVectors as EuclideanKeyedVectors
+from gensim.models import KeyedVectors as EuclideanKeyedVectors, TfidfModel
 from gensim.test.utils import datapath
 
 
@@ -56,7 +56,33 @@ def test_similarity_matrix(self):
         similarity_matrix = self.vectors.similarity_matrix(dictionary, nonzero_limit=3).todense()
         self.assertEquals(20, np.sum(similarity_matrix == 0))
 
-        # TODO: Add unit test to check that supplied tfidf has desired effect
+        # Check that processing rows in the order given by IDF has the desired effect.
+
+        # Build a small four-term dictionary and a TF-IDF model over it for the comparisons below:
+        documents = [["honour", "understanding"], ["understanding", "mean", "knop"]]
+        dictionary = Dictionary(documents)
+        tfidf = TfidfModel(dictionary=dictionary)
+
+        # All terms except for "understanding" have IDF of log2(2 / 1) = log2(2) = 1.0.
+        # The term "understanding" has IDF of log2(2 / 2) = log2(1) = 0.
+        #
+        # If we do not pass the tfidf parameter to the similarity_matrix
+        # method, then we process rows from 1 to 4. If we do pass the tfidf
+        # parameter to the similarity_matrix method, then we first process the
+        # rows 1, 3, 4 that correspond to terms with IDF of 1.0 and then row 2
+        # that corresponds to the term "understanding" with IDF of 0. Since the
+        # method is greedy, we end up with two different similarity matrices.
+
+        self.assertTrue(
+            np.isclose(
+                self.vectors.similarity_matrix(dictionary, nonzero_limit=2).todense(),
+                np.array([[1, 0.9348248, 0, 0], [0.9348248, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]])))
+        self.assertTrue(
+            np.isclose(
+                self.vectors.similarity_matrix(dictionary, tfidf, nonzero_limit=2).todense(),
+                np.array([
+                    [1, 0.9348248, 0, 0.9112908], [0.9348248, 1, 0.90007025, 0],
+                    [0, 0.90007025, 1, 0], [0.9112908, 0, 0, 1]])))
 
     def test_most_similar(self):
         """Test most_similar returns expected results."""